summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorGravatar Yuri Kunde Schlesner2017-01-27 14:29:10 -0300
committerGravatar GitHub2017-01-27 14:29:10 -0300
commitbf14f4be2263b4769e97800b35951717192c2d1c (patch)
tree9c1c47f5a05e9907257f620d8426a0cebaf0cf78 /src
parentSDL: Select audio device (#2403) (diff)
parentVideoCore/Shader: Move entry_point to SetupBatch (diff)
downloadyuzu-bf14f4be2263b4769e97800b35951717192c2d1c.tar.gz
yuzu-bf14f4be2263b4769e97800b35951717192c2d1c.tar.xz
yuzu-bf14f4be2263b4769e97800b35951717192c2d1c.zip
Merge pull request #2346 from yuriks/shader-refactor2
More shader refactoring
Diffstat (limited to 'src')
-rw-r--r--src/citra_qt/debugger/graphics/graphics_vertex_shader.cpp7
-rw-r--r--src/citra_qt/debugger/graphics/graphics_vertex_shader.h1
-rw-r--r--src/video_core/CMakeLists.txt6
-rw-r--r--src/video_core/command_processor.cpp22
-rw-r--r--src/video_core/pica.cpp2
-rw-r--r--src/video_core/shader/shader.cpp102
-rw-r--r--src/video_core/shader/shader.h70
-rw-r--r--src/video_core/shader/shader_interpreter.cpp49
-rw-r--r--src/video_core/shader/shader_interpreter.h26
-rw-r--r--src/video_core/shader/shader_jit_x64.cpp890
-rw-r--r--src/video_core/shader/shader_jit_x64.h115
-rw-r--r--src/video_core/shader/shader_jit_x64_compiler.cpp884
-rw-r--r--src/video_core/shader/shader_jit_x64_compiler.h125
13 files changed, 1189 insertions, 1110 deletions
diff --git a/src/citra_qt/debugger/graphics/graphics_vertex_shader.cpp b/src/citra_qt/debugger/graphics/graphics_vertex_shader.cpp
index ff2e7e363..f37524190 100644
--- a/src/citra_qt/debugger/graphics/graphics_vertex_shader.cpp
+++ b/src/citra_qt/debugger/graphics/graphics_vertex_shader.cpp
@@ -18,7 +18,9 @@
18#include "citra_qt/util/util.h" 18#include "citra_qt/util/util.h"
19#include "video_core/pica.h" 19#include "video_core/pica.h"
20#include "video_core/pica_state.h" 20#include "video_core/pica_state.h"
21#include "video_core/shader/debug_data.h"
21#include "video_core/shader/shader.h" 22#include "video_core/shader/shader.h"
23#include "video_core/shader/shader_interpreter.h"
22 24
23using nihstro::OpCode; 25using nihstro::OpCode;
24using nihstro::Instruction; 26using nihstro::Instruction;
@@ -518,8 +520,9 @@ void GraphicsVertexShaderWidget::Reload(bool replace_vertex_data, void* vertex_d
518 info.labels.insert({entry_point, "main"}); 520 info.labels.insert({entry_point, "main"});
519 521
520 // Generate debug information 522 // Generate debug information
521 debug_data = Pica::g_state.vs.ProduceDebugInfo(input_vertex, num_attributes, shader_config, 523 Pica::Shader::InterpreterEngine shader_engine;
522 shader_setup); 524 shader_engine.SetupBatch(shader_setup, entry_point);
525 debug_data = shader_engine.ProduceDebugInfo(shader_setup, input_vertex, num_attributes);
523 526
524 // Reload widget state 527 // Reload widget state
525 for (int attr = 0; attr < num_attributes; ++attr) { 528 for (int attr = 0; attr < num_attributes; ++attr) {
diff --git a/src/citra_qt/debugger/graphics/graphics_vertex_shader.h b/src/citra_qt/debugger/graphics/graphics_vertex_shader.h
index bedea0bed..3292573f3 100644
--- a/src/citra_qt/debugger/graphics/graphics_vertex_shader.h
+++ b/src/citra_qt/debugger/graphics/graphics_vertex_shader.h
@@ -8,6 +8,7 @@
8#include <QTreeView> 8#include <QTreeView>
9#include "citra_qt/debugger/graphics/graphics_breakpoint_observer.h" 9#include "citra_qt/debugger/graphics/graphics_breakpoint_observer.h"
10#include "nihstro/parser_shbin.h" 10#include "nihstro/parser_shbin.h"
11#include "video_core/shader/debug_data.h"
11#include "video_core/shader/shader.h" 12#include "video_core/shader/shader.h"
12 13
13class QLabel; 14class QLabel;
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 6ca319b59..d55b84ce0 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -50,10 +50,12 @@ set(HEADERS
50 50
51if(ARCHITECTURE_x86_64) 51if(ARCHITECTURE_x86_64)
52 set(SRCS ${SRCS} 52 set(SRCS ${SRCS}
53 shader/shader_jit_x64.cpp) 53 shader/shader_jit_x64.cpp
54 shader/shader_jit_x64_compiler.cpp)
54 55
55 set(HEADERS ${HEADERS} 56 set(HEADERS ${HEADERS}
56 shader/shader_jit_x64.h) 57 shader/shader_jit_x64.h
58 shader/shader_jit_x64_compiler.h)
57endif() 59endif()
58 60
59create_directory_groups(${SRCS} ${HEADERS}) 61create_directory_groups(${SRCS} ${HEADERS})
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index ea58e9f54..eb79974a8 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -142,16 +142,18 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
142 MICROPROFILE_SCOPE(GPU_Drawing); 142 MICROPROFILE_SCOPE(GPU_Drawing);
143 immediate_attribute_id = 0; 143 immediate_attribute_id = 0;
144 144
145 Shader::UnitState shader_unit; 145 auto* shader_engine = Shader::GetEngine();
146 g_state.vs.Setup(); 146 shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset);
147 147
148 // Send to vertex shader 148 // Send to vertex shader
149 if (g_debug_context) 149 if (g_debug_context)
150 g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, 150 g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation,
151 static_cast<void*>(&immediate_input)); 151 static_cast<void*>(&immediate_input));
152 g_state.vs.Run(shader_unit, immediate_input, regs.vs.num_input_attributes + 1); 152 Shader::UnitState shader_unit;
153 Shader::OutputVertex output_vertex = 153 shader_unit.LoadInputVertex(immediate_input, regs.vs.num_input_attributes + 1);
154 shader_unit.output_registers.ToVertex(regs.vs); 154 shader_engine->Run(g_state.vs, shader_unit);
155 auto output_vertex = Shader::OutputVertex::FromRegisters(
156 shader_unit.registers.output, regs, regs.vs.output_mask);
155 157
156 // Send to renderer 158 // Send to renderer
157 using Pica::Shader::OutputVertex; 159 using Pica::Shader::OutputVertex;
@@ -243,8 +245,10 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
243 unsigned int vertex_cache_pos = 0; 245 unsigned int vertex_cache_pos = 0;
244 vertex_cache_ids.fill(-1); 246 vertex_cache_ids.fill(-1);
245 247
248 auto* shader_engine = Shader::GetEngine();
246 Shader::UnitState shader_unit; 249 Shader::UnitState shader_unit;
247 g_state.vs.Setup(); 250
251 shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset);
248 252
249 for (unsigned int index = 0; index < regs.num_vertices; ++index) { 253 for (unsigned int index = 0; index < regs.num_vertices; ++index) {
250 // Indexed rendering doesn't use the start offset 254 // Indexed rendering doesn't use the start offset
@@ -283,10 +287,12 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
283 if (g_debug_context) 287 if (g_debug_context)
284 g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, 288 g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation,
285 (void*)&input); 289 (void*)&input);
286 g_state.vs.Run(shader_unit, input, loader.GetNumTotalAttributes()); 290 shader_unit.LoadInputVertex(input, loader.GetNumTotalAttributes());
291 shader_engine->Run(g_state.vs, shader_unit);
287 292
288 // Retrieve vertex from register data 293 // Retrieve vertex from register data
289 output_vertex = shader_unit.output_registers.ToVertex(regs.vs); 294 output_vertex = Shader::OutputVertex::FromRegisters(shader_unit.registers.output,
295 regs, regs.vs.output_mask);
290 296
291 if (is_indexed) { 297 if (is_indexed) {
292 vertex_cache[vertex_cache_pos] = output_vertex; 298 vertex_cache[vertex_cache_pos] = output_vertex;
diff --git a/src/video_core/pica.cpp b/src/video_core/pica.cpp
index ce2bd455e..b4a77c632 100644
--- a/src/video_core/pica.cpp
+++ b/src/video_core/pica.cpp
@@ -499,7 +499,7 @@ void Init() {
499} 499}
500 500
501void Shutdown() { 501void Shutdown() {
502 Shader::ClearCache(); 502 Shader::Shutdown();
503} 503}
504 504
505template <typename T> 505template <typename T>
diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp
index 7ae57e619..2da50bd62 100644
--- a/src/video_core/shader/shader.cpp
+++ b/src/video_core/shader/shader.cpp
@@ -2,14 +2,8 @@
2// Licensed under GPLv2 or any later version 2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included. 3// Refer to the license.txt file included.
4 4
5#include <atomic>
6#include <cmath> 5#include <cmath>
7#include <cstring> 6#include <cstring>
8#include <unordered_map>
9#include <utility>
10#include <boost/range/algorithm/fill.hpp>
11#include "common/bit_field.h"
12#include "common/hash.h"
13#include "common/logging/log.h" 7#include "common/logging/log.h"
14#include "common/microprofile.h" 8#include "common/microprofile.h"
15#include "video_core/pica.h" 9#include "video_core/pica.h"
@@ -25,7 +19,8 @@ namespace Pica {
25 19
26namespace Shader { 20namespace Shader {
27 21
28OutputVertex OutputRegisters::ToVertex(const Regs::ShaderConfig& config) const { 22OutputVertex OutputVertex::FromRegisters(Math::Vec4<float24> output_regs[16], const Regs& regs,
23 u32 output_mask) {
29 // Setup output data 24 // Setup output data
30 OutputVertex ret; 25 OutputVertex ret;
31 // TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to 26 // TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to
@@ -33,13 +28,13 @@ OutputVertex OutputRegisters::ToVertex(const Regs::ShaderConfig& config) const {
33 unsigned index = 0; 28 unsigned index = 0;
34 for (unsigned i = 0; i < 7; ++i) { 29 for (unsigned i = 0; i < 7; ++i) {
35 30
36 if (index >= g_state.regs.vs_output_total) 31 if (index >= regs.vs_output_total)
37 break; 32 break;
38 33
39 if ((config.output_mask & (1 << i)) == 0) 34 if ((output_mask & (1 << i)) == 0)
40 continue; 35 continue;
41 36
42 const auto& output_register_map = g_state.regs.vs_output_attributes[index]; 37 const auto& output_register_map = regs.vs_output_attributes[index];
43 38
44 u32 semantics[4] = {output_register_map.map_x, output_register_map.map_y, 39 u32 semantics[4] = {output_register_map.map_x, output_register_map.map_y,
45 output_register_map.map_z, output_register_map.map_w}; 40 output_register_map.map_z, output_register_map.map_w};
@@ -47,7 +42,7 @@ OutputVertex OutputRegisters::ToVertex(const Regs::ShaderConfig& config) const {
47 for (unsigned comp = 0; comp < 4; ++comp) { 42 for (unsigned comp = 0; comp < 4; ++comp) {
48 float24* out = ((float24*)&ret) + semantics[comp]; 43 float24* out = ((float24*)&ret) + semantics[comp];
49 if (semantics[comp] != Regs::VSOutputAttributes::INVALID) { 44 if (semantics[comp] != Regs::VSOutputAttributes::INVALID) {
50 *out = value[i][comp]; 45 *out = output_regs[i][comp];
51 } else { 46 } else {
52 // Zero output so that attributes which aren't output won't have denormals in them, 47 // Zero output so that attributes which aren't output won't have denormals in them,
53 // which would slow us down later. 48 // which would slow us down later.
@@ -76,86 +71,41 @@ OutputVertex OutputRegisters::ToVertex(const Regs::ShaderConfig& config) const {
76 return ret; 71 return ret;
77} 72}
78 73
79#ifdef ARCHITECTURE_x86_64 74void UnitState::LoadInputVertex(const InputVertex& input, int num_attributes) {
80static std::unordered_map<u64, std::unique_ptr<JitShader>> shader_map; 75 // Setup input register table
81static const JitShader* jit_shader; 76 const auto& attribute_register_map = g_state.regs.vs.input_register_map;
82#endif // ARCHITECTURE_x86_64 77
78 for (int i = 0; i < num_attributes; i++)
79 registers.input[attribute_register_map.GetRegisterForAttribute(i)] = input.attr[i];
80}
81
82MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240));
83 83
84void ClearCache() {
85#ifdef ARCHITECTURE_x86_64 84#ifdef ARCHITECTURE_x86_64
86 shader_map.clear(); 85static std::unique_ptr<JitX64Engine> jit_engine;
87#endif // ARCHITECTURE_x86_64 86#endif // ARCHITECTURE_x86_64
88} 87static InterpreterEngine interpreter_engine;
89 88
90void ShaderSetup::Setup() { 89ShaderEngine* GetEngine() {
91#ifdef ARCHITECTURE_x86_64 90#ifdef ARCHITECTURE_x86_64
91 // TODO(yuriks): Re-initialize on each change rather than being persistent
92 if (VideoCore::g_shader_jit_enabled) { 92 if (VideoCore::g_shader_jit_enabled) {
93 u64 cache_key = 93 if (jit_engine == nullptr) {
94 Common::ComputeHash64(&g_state.vs.program_code, sizeof(g_state.vs.program_code)) ^ 94 jit_engine = std::make_unique<JitX64Engine>();
95 Common::ComputeHash64(&g_state.vs.swizzle_data, sizeof(g_state.vs.swizzle_data));
96
97 auto iter = shader_map.find(cache_key);
98 if (iter != shader_map.end()) {
99 jit_shader = iter->second.get();
100 } else {
101 auto shader = std::make_unique<JitShader>();
102 shader->Compile();
103 jit_shader = shader.get();
104 shader_map[cache_key] = std::move(shader);
105 } 95 }
96 return jit_engine.get();
106 } 97 }
107#endif // ARCHITECTURE_x86_64 98#endif // ARCHITECTURE_x86_64
108}
109
110MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240));
111
112void ShaderSetup::Run(UnitState& state, const InputVertex& input, int num_attributes) {
113 auto& config = g_state.regs.vs;
114 auto& setup = g_state.vs;
115
116 MICROPROFILE_SCOPE(GPU_Shader);
117 99
118 // Setup input register table 100 return &interpreter_engine;
119 const auto& attribute_register_map = config.input_register_map; 101}
120
121 for (int i = 0; i < num_attributes; i++)
122 state.registers.input[attribute_register_map.GetRegisterForAttribute(i)] = input.attr[i];
123
124 state.conditional_code[0] = false;
125 state.conditional_code[1] = false;
126 102
103void Shutdown() {
127#ifdef ARCHITECTURE_x86_64 104#ifdef ARCHITECTURE_x86_64
128 if (VideoCore::g_shader_jit_enabled) { 105 jit_engine = nullptr;
129 jit_shader->Run(setup, state, config.main_offset);
130 } else {
131 DebugData<false> dummy_debug_data;
132 RunInterpreter(setup, state, dummy_debug_data, config.main_offset);
133 }
134#else
135 DebugData<false> dummy_debug_data;
136 RunInterpreter(setup, state, dummy_debug_data, config.main_offset);
137#endif // ARCHITECTURE_x86_64 106#endif // ARCHITECTURE_x86_64
138} 107}
139 108
140DebugData<true> ShaderSetup::ProduceDebugInfo(const InputVertex& input, int num_attributes,
141 const Regs::ShaderConfig& config,
142 const ShaderSetup& setup) {
143 UnitState state;
144 DebugData<true> debug_data;
145
146 // Setup input register table
147 boost::fill(state.registers.input, Math::Vec4<float24>::AssignToAll(float24::Zero()));
148 const auto& attribute_register_map = config.input_register_map;
149 for (int i = 0; i < num_attributes; i++)
150 state.registers.input[attribute_register_map.GetRegisterForAttribute(i)] = input.attr[i];
151
152 state.conditional_code[0] = false;
153 state.conditional_code[1] = false;
154
155 RunInterpreter(setup, state, debug_data, config.main_offset);
156 return debug_data;
157}
158
159} // namespace Shader 109} // namespace Shader
160 110
161} // namespace Pica 111} // namespace Pica
diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h
index 2b07759b9..44d9f76c3 100644
--- a/src/video_core/shader/shader.h
+++ b/src/video_core/shader/shader.h
@@ -6,7 +6,6 @@
6 6
7#include <array> 7#include <array>
8#include <cstddef> 8#include <cstddef>
9#include <memory>
10#include <type_traits> 9#include <type_traits>
11#include <nihstro/shader_bytecode.h> 10#include <nihstro/shader_bytecode.h>
12#include "common/assert.h" 11#include "common/assert.h"
@@ -15,7 +14,6 @@
15#include "common/vector_math.h" 14#include "common/vector_math.h"
16#include "video_core/pica.h" 15#include "video_core/pica.h"
17#include "video_core/pica_types.h" 16#include "video_core/pica_types.h"
18#include "video_core/shader/debug_data.h"
19 17
20using nihstro::RegisterType; 18using nihstro::RegisterType;
21using nihstro::SourceRegister; 19using nihstro::SourceRegister;
@@ -75,19 +73,13 @@ struct OutputVertex {
75 ret.Lerp(factor, v1); 73 ret.Lerp(factor, v1);
76 return ret; 74 return ret;
77 } 75 }
76
77 static OutputVertex FromRegisters(Math::Vec4<float24> output_regs[16], const Regs& regs,
78 u32 output_mask);
78}; 79};
79static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD"); 80static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD");
80static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size"); 81static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size");
81 82
82struct OutputRegisters {
83 OutputRegisters() = default;
84
85 alignas(16) Math::Vec4<float24> value[16];
86
87 OutputVertex ToVertex(const Regs::ShaderConfig& config) const;
88};
89static_assert(std::is_pod<OutputRegisters>::value, "Structure is not POD");
90
91/** 83/**
92 * This structure contains the state information that needs to be unique for a shader unit. The 3DS 84 * This structure contains the state information that needs to be unique for a shader unit. The 3DS
93 * has four shader units that process shaders in parallel. At the present, Citra only implements a 85 * has four shader units that process shaders in parallel. At the present, Citra only implements a
@@ -100,11 +92,10 @@ struct UnitState {
100 // required to be 16-byte aligned. 92 // required to be 16-byte aligned.
101 alignas(16) Math::Vec4<float24> input[16]; 93 alignas(16) Math::Vec4<float24> input[16];
102 alignas(16) Math::Vec4<float24> temporary[16]; 94 alignas(16) Math::Vec4<float24> temporary[16];
95 alignas(16) Math::Vec4<float24> output[16];
103 } registers; 96 } registers;
104 static_assert(std::is_pod<Registers>::value, "Structure is not POD"); 97 static_assert(std::is_pod<Registers>::value, "Structure is not POD");
105 98
106 OutputRegisters output_registers;
107
108 bool conditional_code[2]; 99 bool conditional_code[2];
109 100
110 // Two Address registers and one loop counter 101 // Two Address registers and one loop counter
@@ -130,7 +121,7 @@ struct UnitState {
130 static size_t OutputOffset(const DestRegister& reg) { 121 static size_t OutputOffset(const DestRegister& reg) {
131 switch (reg.GetRegisterType()) { 122 switch (reg.GetRegisterType()) {
132 case RegisterType::Output: 123 case RegisterType::Output:
133 return offsetof(UnitState, output_registers.value) + 124 return offsetof(UnitState, registers.output) +
134 reg.GetIndex() * sizeof(Math::Vec4<float24>); 125 reg.GetIndex() * sizeof(Math::Vec4<float24>);
135 126
136 case RegisterType::Temporary: 127 case RegisterType::Temporary:
@@ -142,13 +133,17 @@ struct UnitState {
142 return 0; 133 return 0;
143 } 134 }
144 } 135 }
145};
146 136
147/// Clears the shader cache 137 /**
148void ClearCache(); 138 * Loads the unit state with an input vertex.
139 *
140 * @param input Input vertex into the shader
141 * @param num_attributes The number of vertex shader attributes to load
142 */
143 void LoadInputVertex(const InputVertex& input, int num_attributes);
144};
149 145
150struct ShaderSetup { 146struct ShaderSetup {
151
152 struct { 147 struct {
153 // The float uniforms are accessed by the shader JIT using SSE instructions, and are 148 // The float uniforms are accessed by the shader JIT using SSE instructions, and are
154 // therefore required to be 16-byte aligned. 149 // therefore required to be 16-byte aligned.
@@ -173,32 +168,37 @@ struct ShaderSetup {
173 std::array<u32, 1024> program_code; 168 std::array<u32, 1024> program_code;
174 std::array<u32, 1024> swizzle_data; 169 std::array<u32, 1024> swizzle_data;
175 170
171 /// Data private to ShaderEngines
172 struct EngineData {
173 unsigned int entry_point;
174 /// Used by the JIT, points to a compiled shader object.
175 const void* cached_shader = nullptr;
176 } engine_data;
177};
178
179class ShaderEngine {
180public:
181 virtual ~ShaderEngine() = default;
182
176 /** 183 /**
177 * Performs any shader unit setup that only needs to happen once per shader (as opposed to once 184 * Performs any shader unit setup that only needs to happen once per shader (as opposed to once
178 * per vertex, which would happen within the `Run` function). 185 * per vertex, which would happen within the `Run` function).
179 */ 186 */
180 void Setup(); 187 virtual void SetupBatch(ShaderSetup& setup, unsigned int entry_point) = 0;
181
182 /**
183 * Runs the currently setup shader
184 * @param state Shader unit state, must be setup per shader and per shader unit
185 * @param input Input vertex into the shader
186 * @param num_attributes The number of vertex shader attributes
187 */
188 void Run(UnitState& state, const InputVertex& input, int num_attributes);
189 188
190 /** 189 /**
191 * Produce debug information based on the given shader and input vertex 190 * Runs the currently setup shader.
192 * @param input Input vertex into the shader 191 *
193 * @param num_attributes The number of vertex shader attributes 192 * @param setup Shader engine state, must be setup with SetupBatch on each shader change.
194 * @param config Configuration object for the shader pipeline 193 * @param state Shader unit state, must be setup with input data before each shader invocation.
195 * @param setup Setup object for the shader pipeline
196 * @return Debug information for this shader with regards to the given vertex
197 */ 194 */
198 DebugData<true> ProduceDebugInfo(const InputVertex& input, int num_attributes, 195 virtual void Run(const ShaderSetup& setup, UnitState& state) const = 0;
199 const Regs::ShaderConfig& config, const ShaderSetup& setup);
200}; 196};
201 197
198// TODO(yuriks): Remove and make it non-global state somewhere
199ShaderEngine* GetEngine();
200void Shutdown();
201
202} // namespace Shader 202} // namespace Shader
203 203
204} // namespace Pica 204} // namespace Pica
diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp
index 20fb9754b..c0c89b857 100644
--- a/src/video_core/shader/shader_interpreter.cpp
+++ b/src/video_core/shader/shader_interpreter.cpp
@@ -7,10 +7,12 @@
7#include <cmath> 7#include <cmath>
8#include <numeric> 8#include <numeric>
9#include <boost/container/static_vector.hpp> 9#include <boost/container/static_vector.hpp>
10#include <boost/range/algorithm/fill.hpp>
10#include <nihstro/shader_bytecode.h> 11#include <nihstro/shader_bytecode.h>
11#include "common/assert.h" 12#include "common/assert.h"
12#include "common/common_types.h" 13#include "common/common_types.h"
13#include "common/logging/log.h" 14#include "common/logging/log.h"
15#include "common/microprofile.h"
14#include "common/vector_math.h" 16#include "common/vector_math.h"
15#include "video_core/pica_state.h" 17#include "video_core/pica_state.h"
16#include "video_core/pica_types.h" 18#include "video_core/pica_types.h"
@@ -37,12 +39,15 @@ struct CallStackElement {
37}; 39};
38 40
39template <bool Debug> 41template <bool Debug>
40void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug>& debug_data, 42static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug>& debug_data,
41 unsigned offset) { 43 unsigned offset) {
42 // TODO: Is there a maximal size for this? 44 // TODO: Is there a maximal size for this?
43 boost::container::static_vector<CallStackElement, 16> call_stack; 45 boost::container::static_vector<CallStackElement, 16> call_stack;
44 u32 program_counter = offset; 46 u32 program_counter = offset;
45 47
48 state.conditional_code[0] = false;
49 state.conditional_code[1] = false;
50
46 auto call = [&program_counter, &call_stack](u32 offset, u32 num_instructions, u32 return_offset, 51 auto call = [&program_counter, &call_stack](u32 offset, u32 num_instructions, u32 return_offset,
47 u8 repeat_count, u8 loop_increment) { 52 u8 repeat_count, u8 loop_increment) {
48 // -1 to make sure when incrementing the PC we end up at the correct offset 53 // -1 to make sure when incrementing the PC we end up at the correct offset
@@ -73,9 +78,9 @@ void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug>
73 } 78 }
74 }; 79 };
75 80
76 const auto& uniforms = g_state.vs.uniforms; 81 const auto& uniforms = setup.uniforms;
77 const auto& swizzle_data = g_state.vs.swizzle_data; 82 const auto& swizzle_data = setup.swizzle_data;
78 const auto& program_code = g_state.vs.program_code; 83 const auto& program_code = setup.program_code;
79 84
80 // Placeholder for invalid inputs 85 // Placeholder for invalid inputs
81 static float24 dummy_vec4_float24[4]; 86 static float24 dummy_vec4_float24[4];
@@ -170,7 +175,7 @@ void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug>
170 175
171 float24* dest = 176 float24* dest =
172 (instr.common.dest.Value() < 0x10) 177 (instr.common.dest.Value() < 0x10)
173 ? &state.output_registers.value[instr.common.dest.Value().GetIndex()][0] 178 ? &state.registers.output[instr.common.dest.Value().GetIndex()][0]
174 : (instr.common.dest.Value() < 0x20) 179 : (instr.common.dest.Value() < 0x20)
175 ? &state.registers.temporary[instr.common.dest.Value().GetIndex()][0] 180 ? &state.registers.temporary[instr.common.dest.Value().GetIndex()][0]
176 : dummy_vec4_float24; 181 : dummy_vec4_float24;
@@ -513,7 +518,7 @@ void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug>
513 518
514 float24* dest = 519 float24* dest =
515 (instr.mad.dest.Value() < 0x10) 520 (instr.mad.dest.Value() < 0x10)
516 ? &state.output_registers.value[instr.mad.dest.Value().GetIndex()][0] 521 ? &state.registers.output[instr.mad.dest.Value().GetIndex()][0]
517 : (instr.mad.dest.Value() < 0x20) 522 : (instr.mad.dest.Value() < 0x20)
518 ? &state.registers.temporary[instr.mad.dest.Value().GetIndex()][0] 523 ? &state.registers.temporary[instr.mad.dest.Value().GetIndex()][0]
519 : dummy_vec4_float24; 524 : dummy_vec4_float24;
@@ -647,9 +652,33 @@ void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug>
647 } 652 }
648} 653}
649 654
650// Explicit instantiation 655void InterpreterEngine::SetupBatch(ShaderSetup& setup, unsigned int entry_point) {
651template void RunInterpreter(const ShaderSetup&, UnitState&, DebugData<false>&, unsigned offset); 656 ASSERT(entry_point < 1024);
652template void RunInterpreter(const ShaderSetup&, UnitState&, DebugData<true>&, unsigned offset); 657 setup.engine_data.entry_point = entry_point;
658}
659
660MICROPROFILE_DECLARE(GPU_Shader);
661
662void InterpreterEngine::Run(const ShaderSetup& setup, UnitState& state) const {
663
664 MICROPROFILE_SCOPE(GPU_Shader);
665
666 DebugData<false> dummy_debug_data;
667 RunInterpreter(setup, state, dummy_debug_data, setup.engine_data.entry_point);
668}
669
670DebugData<true> InterpreterEngine::ProduceDebugInfo(const ShaderSetup& setup,
671 const InputVertex& input,
672 int num_attributes) const {
673 UnitState state;
674 DebugData<true> debug_data;
675
676 // Setup input register table
677 boost::fill(state.registers.input, Math::Vec4<float24>::AssignToAll(float24::Zero()));
678 state.LoadInputVertex(input, num_attributes);
679 RunInterpreter(setup, state, debug_data, setup.engine_data.entry_point);
680 return debug_data;
681}
653 682
654} // namespace 683} // namespace
655 684
diff --git a/src/video_core/shader/shader_interpreter.h b/src/video_core/shader/shader_interpreter.h
index d31dcd7a6..d6c0e2d8c 100644
--- a/src/video_core/shader/shader_interpreter.h
+++ b/src/video_core/shader/shader_interpreter.h
@@ -4,18 +4,28 @@
4 4
5#pragma once 5#pragma once
6 6
7#include "video_core/shader/debug_data.h"
8#include "video_core/shader/shader.h"
9
7namespace Pica { 10namespace Pica {
8 11
9namespace Shader { 12namespace Shader {
10 13
11struct UnitState; 14class InterpreterEngine final : public ShaderEngine {
12 15public:
13template <bool Debug> 16 void SetupBatch(ShaderSetup& setup, unsigned int entry_point) override;
14struct DebugData; 17 void Run(const ShaderSetup& setup, UnitState& state) const override;
15 18
16template <bool Debug> 19 /**
17void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug>& debug_data, 20 * Produce debug information based on the given shader and input vertex
18 unsigned offset); 21 * @param input Input vertex into the shader
22 * @param num_attributes The number of vertex shader attributes
23 * @param config Configuration object for the shader pipeline
24 * @return Debug information for this shader with regards to the given vertex
25 */
26 DebugData<true> ProduceDebugInfo(const ShaderSetup& setup, const InputVertex& input,
27 int num_attributes) const;
28};
19 29
20} // namespace 30} // namespace
21 31
diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
index c588b778b..0ee0dd9ef 100644
--- a/src/video_core/shader/shader_jit_x64.cpp
+++ b/src/video_core/shader/shader_jit_x64.cpp
@@ -1,888 +1,48 @@
1// Copyright 2015 Citra Emulator Project 1// Copyright 2016 Citra Emulator Project
2// Licensed under GPLv2 or any later version 2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included. 3// Refer to the license.txt file included.
4 4
5#include <algorithm> 5#include "common/hash.h"
6#include <cmath> 6#include "common/microprofile.h"
7#include <cstdint>
8#include <nihstro/shader_bytecode.h>
9#include <smmintrin.h>
10#include <xmmintrin.h>
11#include "common/assert.h"
12#include "common/logging/log.h"
13#include "common/vector_math.h"
14#include "common/x64/cpu_detect.h"
15#include "common/x64/xbyak_abi.h"
16#include "common/x64/xbyak_util.h"
17#include "video_core/pica_state.h"
18#include "video_core/pica_types.h"
19#include "video_core/shader/shader.h" 7#include "video_core/shader/shader.h"
20#include "video_core/shader/shader_jit_x64.h" 8#include "video_core/shader/shader_jit_x64.h"
21 9#include "video_core/shader/shader_jit_x64_compiler.h"
22using namespace Common::X64;
23using namespace Xbyak::util;
24using Xbyak::Label;
25using Xbyak::Reg32;
26using Xbyak::Reg64;
27using Xbyak::Xmm;
28 10
29namespace Pica { 11namespace Pica {
30
31namespace Shader { 12namespace Shader {
32 13
33typedef void (JitShader::*JitFunction)(Instruction instr); 14JitX64Engine::JitX64Engine() = default;
34 15JitX64Engine::~JitX64Engine() = default;
35const JitFunction instr_table[64] = {
36 &JitShader::Compile_ADD, // add
37 &JitShader::Compile_DP3, // dp3
38 &JitShader::Compile_DP4, // dp4
39 &JitShader::Compile_DPH, // dph
40 nullptr, // unknown
41 &JitShader::Compile_EX2, // ex2
42 &JitShader::Compile_LG2, // lg2
43 nullptr, // unknown
44 &JitShader::Compile_MUL, // mul
45 &JitShader::Compile_SGE, // sge
46 &JitShader::Compile_SLT, // slt
47 &JitShader::Compile_FLR, // flr
48 &JitShader::Compile_MAX, // max
49 &JitShader::Compile_MIN, // min
50 &JitShader::Compile_RCP, // rcp
51 &JitShader::Compile_RSQ, // rsq
52 nullptr, // unknown
53 nullptr, // unknown
54 &JitShader::Compile_MOVA, // mova
55 &JitShader::Compile_MOV, // mov
56 nullptr, // unknown
57 nullptr, // unknown
58 nullptr, // unknown
59 nullptr, // unknown
60 &JitShader::Compile_DPH, // dphi
61 nullptr, // unknown
62 &JitShader::Compile_SGE, // sgei
63 &JitShader::Compile_SLT, // slti
64 nullptr, // unknown
65 nullptr, // unknown
66 nullptr, // unknown
67 nullptr, // unknown
68 nullptr, // unknown
69 &JitShader::Compile_NOP, // nop
70 &JitShader::Compile_END, // end
71 nullptr, // break
72 &JitShader::Compile_CALL, // call
73 &JitShader::Compile_CALLC, // callc
74 &JitShader::Compile_CALLU, // callu
75 &JitShader::Compile_IF, // ifu
76 &JitShader::Compile_IF, // ifc
77 &JitShader::Compile_LOOP, // loop
78 nullptr, // emit
79 nullptr, // sete
80 &JitShader::Compile_JMP, // jmpc
81 &JitShader::Compile_JMP, // jmpu
82 &JitShader::Compile_CMP, // cmp
83 &JitShader::Compile_CMP, // cmp
84 &JitShader::Compile_MAD, // madi
85 &JitShader::Compile_MAD, // madi
86 &JitShader::Compile_MAD, // madi
87 &JitShader::Compile_MAD, // madi
88 &JitShader::Compile_MAD, // madi
89 &JitShader::Compile_MAD, // madi
90 &JitShader::Compile_MAD, // madi
91 &JitShader::Compile_MAD, // madi
92 &JitShader::Compile_MAD, // mad
93 &JitShader::Compile_MAD, // mad
94 &JitShader::Compile_MAD, // mad
95 &JitShader::Compile_MAD, // mad
96 &JitShader::Compile_MAD, // mad
97 &JitShader::Compile_MAD, // mad
98 &JitShader::Compile_MAD, // mad
99 &JitShader::Compile_MAD, // mad
100};
101
102// The following is used to alias some commonly used registers. Generally, RAX-RDX and XMM0-XMM3 can
103// be used as scratch registers within a compiler function. The other registers have designated
104// purposes, as documented below:
105 16
106/// Pointer to the uniform memory 17void JitX64Engine::SetupBatch(ShaderSetup& setup, unsigned int entry_point) {
107static const Reg64 SETUP = r9; 18 ASSERT(entry_point < 1024);
108/// The two 32-bit VS address offset registers set by the MOVA instruction 19 setup.engine_data.entry_point = entry_point;
109static const Reg64 ADDROFFS_REG_0 = r10;
110static const Reg64 ADDROFFS_REG_1 = r11;
111/// VS loop count register (Multiplied by 16)
112static const Reg32 LOOPCOUNT_REG = r12d;
113/// Current VS loop iteration number (we could probably use LOOPCOUNT_REG, but this quicker)
114static const Reg32 LOOPCOUNT = esi;
115/// Number to increment LOOPCOUNT_REG by on each loop iteration (Multiplied by 16)
116static const Reg32 LOOPINC = edi;
117/// Result of the previous CMP instruction for the X-component comparison
118static const Reg64 COND0 = r13;
119/// Result of the previous CMP instruction for the Y-component comparison
120static const Reg64 COND1 = r14;
121/// Pointer to the UnitState instance for the current VS unit
122static const Reg64 STATE = r15;
123/// SIMD scratch register
124static const Xmm SCRATCH = xmm0;
125/// Loaded with the first swizzled source register, otherwise can be used as a scratch register
126static const Xmm SRC1 = xmm1;
127/// Loaded with the second swizzled source register, otherwise can be used as a scratch register
128static const Xmm SRC2 = xmm2;
129/// Loaded with the third swizzled source register, otherwise can be used as a scratch register
130static const Xmm SRC3 = xmm3;
131/// Additional scratch register
132static const Xmm SCRATCH2 = xmm4;
133/// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one
134static const Xmm ONE = xmm14;
135/// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR
136static const Xmm NEGBIT = xmm15;
137 20
138// State registers that must not be modified by external functions calls 21 u64 code_hash = Common::ComputeHash64(&setup.program_code, sizeof(setup.program_code));
139// Scratch registers, e.g., SRC1 and SCRATCH, have to be saved on the side if needed 22 u64 swizzle_hash = Common::ComputeHash64(&setup.swizzle_data, sizeof(setup.swizzle_data));
140static const BitSet32 persistent_regs = BuildRegSet({
141 // Pointers to register blocks
142 SETUP, STATE,
143 // Cached registers
144 ADDROFFS_REG_0, ADDROFFS_REG_1, LOOPCOUNT_REG, COND0, COND1,
145 // Constants
146 ONE, NEGBIT,
147});
148 23
149/// Raw constant for the source register selector that indicates no swizzling is performed 24 u64 cache_key = code_hash ^ swizzle_hash;
150static const u8 NO_SRC_REG_SWIZZLE = 0x1b; 25 auto iter = cache.find(cache_key);
151/// Raw constant for the destination register enable mask that indicates all components are enabled 26 if (iter != cache.end()) {
152static const u8 NO_DEST_REG_MASK = 0xf; 27 setup.engine_data.cached_shader = iter->second.get();
153
154/**
155 * Get the vertex shader instruction for a given offset in the current shader program
156 * @param offset Offset in the current shader program of the instruction
157 * @return Instruction at the specified offset
158 */
159static Instruction GetVertexShaderInstruction(size_t offset) {
160 return {g_state.vs.program_code[offset]};
161}
162
163static void LogCritical(const char* msg) {
164 LOG_CRITICAL(HW_GPU, "%s", msg);
165}
166
167void JitShader::Compile_Assert(bool condition, const char* msg) {
168 if (!condition) {
169 mov(ABI_PARAM1, reinterpret_cast<size_t>(msg));
170 CallFarFunction(*this, LogCritical);
171 }
172}
173
174/**
175 * Loads and swizzles a source register into the specified XMM register.
176 * @param instr VS instruction, used for determining how to load the source register
177 * @param src_num Number indicating which source register to load (1 = src1, 2 = src2, 3 = src3)
178 * @param src_reg SourceRegister object corresponding to the source register to load
179 * @param dest Destination XMM register to store the loaded, swizzled source register
180 */
181void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg,
182 Xmm dest) {
183 Reg64 src_ptr;
184 size_t src_offset;
185
186 if (src_reg.GetRegisterType() == RegisterType::FloatUniform) {
187 src_ptr = SETUP;
188 src_offset = ShaderSetup::GetFloatUniformOffset(src_reg.GetIndex());
189 } else { 28 } else {
190 src_ptr = STATE; 29 auto shader = std::make_unique<JitShader>();
191 src_offset = UnitState::InputOffset(src_reg); 30 shader->Compile(&setup.program_code, &setup.swizzle_data);
192 } 31 setup.engine_data.cached_shader = shader.get();
193 32 cache.emplace_hint(iter, cache_key, std::move(shader));
194 int src_offset_disp = (int)src_offset;
195 ASSERT_MSG(src_offset == src_offset_disp, "Source register offset too large for int type");
196
197 unsigned operand_desc_id;
198
199 const bool is_inverted =
200 (0 != (instr.opcode.Value().GetInfo().subtype & OpCode::Info::SrcInversed));
201
202 unsigned address_register_index;
203 unsigned offset_src;
204
205 if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD ||
206 instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) {
207 operand_desc_id = instr.mad.operand_desc_id;
208 offset_src = is_inverted ? 3 : 2;
209 address_register_index = instr.mad.address_register_index;
210 } else {
211 operand_desc_id = instr.common.operand_desc_id;
212 offset_src = is_inverted ? 2 : 1;
213 address_register_index = instr.common.address_register_index;
214 }
215
216 if (src_num == offset_src && address_register_index != 0) {
217 switch (address_register_index) {
218 case 1: // address offset 1
219 movaps(dest, xword[src_ptr + ADDROFFS_REG_0 + src_offset_disp]);
220 break;
221 case 2: // address offset 2
222 movaps(dest, xword[src_ptr + ADDROFFS_REG_1 + src_offset_disp]);
223 break;
224 case 3: // address offset 3
225 movaps(dest, xword[src_ptr + LOOPCOUNT_REG.cvt64() + src_offset_disp]);
226 break;
227 default:
228 UNREACHABLE();
229 break;
230 }
231 } else {
232 // Load the source
233 movaps(dest, xword[src_ptr + src_offset_disp]);
234 }
235
236 SwizzlePattern swiz = {g_state.vs.swizzle_data[operand_desc_id]};
237
238 // Generate instructions for source register swizzling as needed
239 u8 sel = swiz.GetRawSelector(src_num);
240 if (sel != NO_SRC_REG_SWIZZLE) {
241 // Selector component order needs to be reversed for the SHUFPS instruction
242 sel = ((sel & 0xc0) >> 6) | ((sel & 3) << 6) | ((sel & 0xc) << 2) | ((sel & 0x30) >> 2);
243
244 // Shuffle inputs for swizzle
245 shufps(dest, dest, sel);
246 }
247
248 // If the source register should be negated, flip the negative bit using XOR
249 const bool negate[] = {swiz.negate_src1, swiz.negate_src2, swiz.negate_src3};
250 if (negate[src_num - 1]) {
251 xorps(dest, NEGBIT);
252 } 33 }
253} 34}
254 35
255void JitShader::Compile_DestEnable(Instruction instr, Xmm src) { 36MICROPROFILE_DECLARE(GPU_Shader);
256 DestRegister dest;
257 unsigned operand_desc_id;
258 if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD ||
259 instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) {
260 operand_desc_id = instr.mad.operand_desc_id;
261 dest = instr.mad.dest.Value();
262 } else {
263 operand_desc_id = instr.common.operand_desc_id;
264 dest = instr.common.dest.Value();
265 }
266
267 SwizzlePattern swiz = {g_state.vs.swizzle_data[operand_desc_id]};
268
269 size_t dest_offset_disp = UnitState::OutputOffset(dest);
270
271 // If all components are enabled, write the result to the destination register
272 if (swiz.dest_mask == NO_DEST_REG_MASK) {
273 // Store dest back to memory
274 movaps(xword[STATE + dest_offset_disp], src);
275
276 } else {
277 // Not all components are enabled, so mask the result when storing to the destination
278 // register...
279 movaps(SCRATCH, xword[STATE + dest_offset_disp]);
280
281 if (Common::GetCPUCaps().sse4_1) {
282 u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) |
283 ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1);
284 blendps(SCRATCH, src, mask);
285 } else {
286 movaps(SCRATCH2, src);
287 unpckhps(SCRATCH2, SCRATCH); // Unpack X/Y components of source and destination
288 unpcklps(SCRATCH, src); // Unpack Z/W components of source and destination
289
290 // Compute selector to selectively copy source components to destination for SHUFPS
291 // instruction
292 u8 sel = ((swiz.DestComponentEnabled(0) ? 1 : 0) << 0) |
293 ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) |
294 ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) |
295 ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6);
296 shufps(SCRATCH, SCRATCH2, sel);
297 }
298
299 // Store dest back to memory
300 movaps(xword[STATE + dest_offset_disp], SCRATCH);
301 }
302}
303
304void JitShader::Compile_SanitizedMul(Xmm src1, Xmm src2, Xmm scratch) {
305 movaps(scratch, src1);
306 cmpordps(scratch, src2);
307
308 mulps(src1, src2);
309 37
310 movaps(src2, src1); 38void JitX64Engine::Run(const ShaderSetup& setup, UnitState& state) const {
311 cmpunordps(src2, src2); 39 ASSERT(setup.engine_data.cached_shader != nullptr);
312 40
313 xorps(scratch, src2); 41 MICROPROFILE_SCOPE(GPU_Shader);
314 andps(src1, scratch);
315}
316
317void JitShader::Compile_EvaluateCondition(Instruction instr) {
318 // Note: NXOR is used below to check for equality
319 switch (instr.flow_control.op) {
320 case Instruction::FlowControlType::Or:
321 mov(eax, COND0);
322 mov(ebx, COND1);
323 xor(eax, (instr.flow_control.refx.Value() ^ 1));
324 xor(ebx, (instr.flow_control.refy.Value() ^ 1));
325 or (eax, ebx);
326 break;
327
328 case Instruction::FlowControlType::And:
329 mov(eax, COND0);
330 mov(ebx, COND1);
331 xor(eax, (instr.flow_control.refx.Value() ^ 1));
332 xor(ebx, (instr.flow_control.refy.Value() ^ 1));
333 and(eax, ebx);
334 break;
335
336 case Instruction::FlowControlType::JustX:
337 mov(eax, COND0);
338 xor(eax, (instr.flow_control.refx.Value() ^ 1));
339 break;
340
341 case Instruction::FlowControlType::JustY:
342 mov(eax, COND1);
343 xor(eax, (instr.flow_control.refy.Value() ^ 1));
344 break;
345 }
346}
347 42
348void JitShader::Compile_UniformCondition(Instruction instr) { 43 const JitShader* shader = static_cast<const JitShader*>(setup.engine_data.cached_shader);
349 size_t offset = ShaderSetup::GetBoolUniformOffset(instr.flow_control.bool_uniform_id); 44 shader->Run(setup, state, setup.engine_data.entry_point);
350 cmp(byte[SETUP + offset], 0);
351} 45}
352 46
353BitSet32 JitShader::PersistentCallerSavedRegs() {
354 return persistent_regs & ABI_ALL_CALLER_SAVED;
355}
356
357void JitShader::Compile_ADD(Instruction instr) {
358 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
359 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
360 addps(SRC1, SRC2);
361 Compile_DestEnable(instr, SRC1);
362}
363
364void JitShader::Compile_DP3(Instruction instr) {
365 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
366 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
367
368 Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
369
370 movaps(SRC2, SRC1);
371 shufps(SRC2, SRC2, _MM_SHUFFLE(1, 1, 1, 1));
372
373 movaps(SRC3, SRC1);
374 shufps(SRC3, SRC3, _MM_SHUFFLE(2, 2, 2, 2));
375
376 shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0));
377 addps(SRC1, SRC2);
378 addps(SRC1, SRC3);
379
380 Compile_DestEnable(instr, SRC1);
381}
382
383void JitShader::Compile_DP4(Instruction instr) {
384 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
385 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
386
387 Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
388
389 movaps(SRC2, SRC1);
390 shufps(SRC1, SRC1, _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
391 addps(SRC1, SRC2);
392
393 movaps(SRC2, SRC1);
394 shufps(SRC1, SRC1, _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
395 addps(SRC1, SRC2);
396
397 Compile_DestEnable(instr, SRC1);
398}
399
400void JitShader::Compile_DPH(Instruction instr) {
401 if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::DPHI) {
402 Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
403 Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2);
404 } else {
405 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
406 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
407 }
408
409 if (Common::GetCPUCaps().sse4_1) {
410 // Set 4th component to 1.0
411 blendps(SRC1, ONE, 0b1000);
412 } else {
413 // Set 4th component to 1.0
414 movaps(SCRATCH, SRC1);
415 unpckhps(SCRATCH, ONE); // XYZW, 1111 -> Z1__
416 unpcklpd(SRC1, SCRATCH); // XYZW, Z1__ -> XYZ1
417 }
418
419 Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
420
421 movaps(SRC2, SRC1);
422 shufps(SRC1, SRC1, _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
423 addps(SRC1, SRC2);
424
425 movaps(SRC2, SRC1);
426 shufps(SRC1, SRC1, _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
427 addps(SRC1, SRC2);
428
429 Compile_DestEnable(instr, SRC1);
430}
431
432void JitShader::Compile_EX2(Instruction instr) {
433 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
434 movss(xmm0, SRC1); // ABI_PARAM1
435
436 ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
437 CallFarFunction(*this, exp2f);
438 ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
439
440 shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); // ABI_RETURN
441 movaps(SRC1, xmm0);
442 Compile_DestEnable(instr, SRC1);
443}
444
445void JitShader::Compile_LG2(Instruction instr) {
446 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
447 movss(xmm0, SRC1); // ABI_PARAM1
448
449 ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
450 CallFarFunction(*this, log2f);
451 ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
452
453 shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); // ABI_RETURN
454 movaps(SRC1, xmm0);
455 Compile_DestEnable(instr, SRC1);
456}
457
458void JitShader::Compile_MUL(Instruction instr) {
459 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
460 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
461 Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
462 Compile_DestEnable(instr, SRC1);
463}
464
465void JitShader::Compile_SGE(Instruction instr) {
466 if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SGEI) {
467 Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
468 Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2);
469 } else {
470 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
471 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
472 }
473
474 cmpleps(SRC2, SRC1);
475 andps(SRC2, ONE);
476
477 Compile_DestEnable(instr, SRC2);
478}
479
480void JitShader::Compile_SLT(Instruction instr) {
481 if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SLTI) {
482 Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
483 Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2);
484 } else {
485 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
486 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
487 }
488
489 cmpltps(SRC1, SRC2);
490 andps(SRC1, ONE);
491
492 Compile_DestEnable(instr, SRC1);
493}
494
495void JitShader::Compile_FLR(Instruction instr) {
496 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
497
498 if (Common::GetCPUCaps().sse4_1) {
499 roundps(SRC1, SRC1, _MM_FROUND_FLOOR);
500 } else {
501 cvttps2dq(SRC1, SRC1);
502 cvtdq2ps(SRC1, SRC1);
503 }
504
505 Compile_DestEnable(instr, SRC1);
506}
507
508void JitShader::Compile_MAX(Instruction instr) {
509 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
510 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
511 // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.
512 maxps(SRC1, SRC2);
513 Compile_DestEnable(instr, SRC1);
514}
515
516void JitShader::Compile_MIN(Instruction instr) {
517 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
518 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
519 // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.
520 minps(SRC1, SRC2);
521 Compile_DestEnable(instr, SRC1);
522}
523
524void JitShader::Compile_MOVA(Instruction instr) {
525 SwizzlePattern swiz = {g_state.vs.swizzle_data[instr.common.operand_desc_id]};
526
527 if (!swiz.DestComponentEnabled(0) && !swiz.DestComponentEnabled(1)) {
528 return; // NoOp
529 }
530
531 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
532
533 // Convert floats to integers using truncation (only care about X and Y components)
534 cvttps2dq(SRC1, SRC1);
535
536 // Get result
537 movq(rax, SRC1);
538
539 // Handle destination enable
540 if (swiz.DestComponentEnabled(0) && swiz.DestComponentEnabled(1)) {
541 // Move and sign-extend low 32 bits
542 movsxd(ADDROFFS_REG_0, eax);
543
544 // Move and sign-extend high 32 bits
545 shr(rax, 32);
546 movsxd(ADDROFFS_REG_1, eax);
547
548 // Multiply by 16 to be used as an offset later
549 shl(ADDROFFS_REG_0, 4);
550 shl(ADDROFFS_REG_1, 4);
551 } else {
552 if (swiz.DestComponentEnabled(0)) {
553 // Move and sign-extend low 32 bits
554 movsxd(ADDROFFS_REG_0, eax);
555
556 // Multiply by 16 to be used as an offset later
557 shl(ADDROFFS_REG_0, 4);
558 } else if (swiz.DestComponentEnabled(1)) {
559 // Move and sign-extend high 32 bits
560 shr(rax, 32);
561 movsxd(ADDROFFS_REG_1, eax);
562
563 // Multiply by 16 to be used as an offset later
564 shl(ADDROFFS_REG_1, 4);
565 }
566 }
567}
568
569void JitShader::Compile_MOV(Instruction instr) {
570 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
571 Compile_DestEnable(instr, SRC1);
572}
573
574void JitShader::Compile_RCP(Instruction instr) {
575 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
576
577 // TODO(bunnei): RCPSS is a pretty rough approximation, this might cause problems if Pica
578 // performs this operation more accurately. This should be checked on hardware.
579 rcpss(SRC1, SRC1);
580 shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYWZ -> XXXX
581
582 Compile_DestEnable(instr, SRC1);
583}
584
585void JitShader::Compile_RSQ(Instruction instr) {
586 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
587
588 // TODO(bunnei): RSQRTSS is a pretty rough approximation, this might cause problems if Pica
589 // performs this operation more accurately. This should be checked on hardware.
590 rsqrtss(SRC1, SRC1);
591 shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYWZ -> XXXX
592
593 Compile_DestEnable(instr, SRC1);
594}
595
596void JitShader::Compile_NOP(Instruction instr) {}
597
598void JitShader::Compile_END(Instruction instr) {
599 ABI_PopRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8);
600 ret();
601}
602
603void JitShader::Compile_CALL(Instruction instr) {
604 // Push offset of the return
605 push(qword, (instr.flow_control.dest_offset + instr.flow_control.num_instructions));
606
607 // Call the subroutine
608 call(instruction_labels[instr.flow_control.dest_offset]);
609
610 // Skip over the return offset that's on the stack
611 add(rsp, 8);
612}
613
614void JitShader::Compile_CALLC(Instruction instr) {
615 Compile_EvaluateCondition(instr);
616 Label b;
617 jz(b);
618 Compile_CALL(instr);
619 L(b);
620}
621
622void JitShader::Compile_CALLU(Instruction instr) {
623 Compile_UniformCondition(instr);
624 Label b;
625 jz(b);
626 Compile_CALL(instr);
627 L(b);
628}
629
630void JitShader::Compile_CMP(Instruction instr) {
631 using Op = Instruction::Common::CompareOpType::Op;
632 Op op_x = instr.common.compare_op.x;
633 Op op_y = instr.common.compare_op.y;
634
635 Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
636 Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
637
638 // SSE doesn't have greater-than (GT) or greater-equal (GE) comparison operators. You need to
639 // emulate them by swapping the lhs and rhs and using LT and LE. NLT and NLE can't be used here
640 // because they don't match when used with NaNs.
641 static const u8 cmp[] = {CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_LT, CMP_LE};
642
643 bool invert_op_x = (op_x == Op::GreaterThan || op_x == Op::GreaterEqual);
644 Xmm lhs_x = invert_op_x ? SRC2 : SRC1;
645 Xmm rhs_x = invert_op_x ? SRC1 : SRC2;
646
647 if (op_x == op_y) {
648 // Compare X-component and Y-component together
649 cmpps(lhs_x, rhs_x, cmp[op_x]);
650 movq(COND0, lhs_x);
651
652 mov(COND1, COND0);
653 } else {
654 bool invert_op_y = (op_y == Op::GreaterThan || op_y == Op::GreaterEqual);
655 Xmm lhs_y = invert_op_y ? SRC2 : SRC1;
656 Xmm rhs_y = invert_op_y ? SRC1 : SRC2;
657
658 // Compare X-component
659 movaps(SCRATCH, lhs_x);
660 cmpss(SCRATCH, rhs_x, cmp[op_x]);
661
662 // Compare Y-component
663 cmpps(lhs_y, rhs_y, cmp[op_y]);
664
665 movq(COND0, SCRATCH);
666 movq(COND1, lhs_y);
667 }
668
669 shr(COND0.cvt32(), 31); // ignores upper 32 bits in source
670 shr(COND1, 63);
671}
672
673void JitShader::Compile_MAD(Instruction instr) {
674 Compile_SwizzleSrc(instr, 1, instr.mad.src1, SRC1);
675
676 if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) {
677 Compile_SwizzleSrc(instr, 2, instr.mad.src2i, SRC2);
678 Compile_SwizzleSrc(instr, 3, instr.mad.src3i, SRC3);
679 } else {
680 Compile_SwizzleSrc(instr, 2, instr.mad.src2, SRC2);
681 Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3);
682 }
683
684 Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
685 addps(SRC1, SRC3);
686
687 Compile_DestEnable(instr, SRC1);
688}
689
690void JitShader::Compile_IF(Instruction instr) {
691 Compile_Assert(instr.flow_control.dest_offset >= program_counter,
692 "Backwards if-statements not supported");
693 Label l_else, l_endif;
694
695 // Evaluate the "IF" condition
696 if (instr.opcode.Value() == OpCode::Id::IFU) {
697 Compile_UniformCondition(instr);
698 } else if (instr.opcode.Value() == OpCode::Id::IFC) {
699 Compile_EvaluateCondition(instr);
700 }
701 jz(l_else, T_NEAR);
702
703 // Compile the code that corresponds to the condition evaluating as true
704 Compile_Block(instr.flow_control.dest_offset);
705
706 // If there isn't an "ELSE" condition, we are done here
707 if (instr.flow_control.num_instructions == 0) {
708 L(l_else);
709 return;
710 }
711
712 jmp(l_endif, T_NEAR);
713
714 L(l_else);
715 // This code corresponds to the "ELSE" condition
716 // Comple the code that corresponds to the condition evaluating as false
717 Compile_Block(instr.flow_control.dest_offset + instr.flow_control.num_instructions);
718
719 L(l_endif);
720}
721
722void JitShader::Compile_LOOP(Instruction instr) {
723 Compile_Assert(instr.flow_control.dest_offset >= program_counter,
724 "Backwards loops not supported");
725 Compile_Assert(!looping, "Nested loops not supported");
726
727 looping = true;
728
729 // This decodes the fields from the integer uniform at index instr.flow_control.int_uniform_id.
730 // The Y (LOOPCOUNT_REG) and Z (LOOPINC) component are kept multiplied by 16 (Left shifted by
731 // 4 bits) to be used as an offset into the 16-byte vector registers later
732 size_t offset = ShaderSetup::GetIntUniformOffset(instr.flow_control.int_uniform_id);
733 mov(LOOPCOUNT, dword[SETUP + offset]);
734 mov(LOOPCOUNT_REG, LOOPCOUNT);
735 shr(LOOPCOUNT_REG, 4);
736 and(LOOPCOUNT_REG, 0xFF0); // Y-component is the start
737 mov(LOOPINC, LOOPCOUNT);
738 shr(LOOPINC, 12);
739 and(LOOPINC, 0xFF0); // Z-component is the incrementer
740 movzx(LOOPCOUNT, LOOPCOUNT.cvt8()); // X-component is iteration count
741 add(LOOPCOUNT, 1); // Iteration count is X-component + 1
742
743 Label l_loop_start;
744 L(l_loop_start);
745
746 Compile_Block(instr.flow_control.dest_offset + 1);
747
748 add(LOOPCOUNT_REG, LOOPINC); // Increment LOOPCOUNT_REG by Z-component
749 sub(LOOPCOUNT, 1); // Increment loop count by 1
750 jnz(l_loop_start); // Loop if not equal
751
752 looping = false;
753}
754
755void JitShader::Compile_JMP(Instruction instr) {
756 if (instr.opcode.Value() == OpCode::Id::JMPC)
757 Compile_EvaluateCondition(instr);
758 else if (instr.opcode.Value() == OpCode::Id::JMPU)
759 Compile_UniformCondition(instr);
760 else
761 UNREACHABLE();
762
763 bool inverted_condition =
764 (instr.opcode.Value() == OpCode::Id::JMPU) && (instr.flow_control.num_instructions & 1);
765
766 Label& b = instruction_labels[instr.flow_control.dest_offset];
767 if (inverted_condition) {
768 jz(b, T_NEAR);
769 } else {
770 jnz(b, T_NEAR);
771 }
772}
773
774void JitShader::Compile_Block(unsigned end) {
775 while (program_counter < end) {
776 Compile_NextInstr();
777 }
778}
779
780void JitShader::Compile_Return() {
781 // Peek return offset on the stack and check if we're at that offset
782 mov(rax, qword[rsp + 8]);
783 cmp(eax, (program_counter));
784
785 // If so, jump back to before CALL
786 Label b;
787 jnz(b);
788 ret();
789 L(b);
790}
791
792void JitShader::Compile_NextInstr() {
793 if (std::binary_search(return_offsets.begin(), return_offsets.end(), program_counter)) {
794 Compile_Return();
795 }
796
797 L(instruction_labels[program_counter]);
798
799 Instruction instr = GetVertexShaderInstruction(program_counter++);
800
801 OpCode::Id opcode = instr.opcode.Value();
802 auto instr_func = instr_table[static_cast<unsigned>(opcode)];
803
804 if (instr_func) {
805 // JIT the instruction!
806 ((*this).*instr_func)(instr);
807 } else {
808 // Unhandled instruction
809 LOG_CRITICAL(HW_GPU, "Unhandled instruction: 0x%02x (0x%08x)",
810 instr.opcode.Value().EffectiveOpCode(), instr.hex);
811 }
812}
813
814void JitShader::FindReturnOffsets() {
815 return_offsets.clear();
816
817 for (size_t offset = 0; offset < g_state.vs.program_code.size(); ++offset) {
818 Instruction instr = GetVertexShaderInstruction(offset);
819
820 switch (instr.opcode.Value()) {
821 case OpCode::Id::CALL:
822 case OpCode::Id::CALLC:
823 case OpCode::Id::CALLU:
824 return_offsets.push_back(instr.flow_control.dest_offset +
825 instr.flow_control.num_instructions);
826 break;
827 default:
828 break;
829 }
830 }
831
832 // Sort for efficient binary search later
833 std::sort(return_offsets.begin(), return_offsets.end());
834}
835
836void JitShader::Compile() {
837 // Reset flow control state
838 program = (CompiledShader*)getCurr();
839 program_counter = 0;
840 looping = false;
841 instruction_labels.fill(Xbyak::Label());
842
843 // Find all `CALL` instructions and identify return locations
844 FindReturnOffsets();
845
846 // The stack pointer is 8 modulo 16 at the entry of a procedure
847 ABI_PushRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8);
848
849 mov(SETUP, ABI_PARAM1);
850 mov(STATE, ABI_PARAM2);
851
852 // Zero address/loop registers
853 xor(ADDROFFS_REG_0.cvt32(), ADDROFFS_REG_0.cvt32());
854 xor(ADDROFFS_REG_1.cvt32(), ADDROFFS_REG_1.cvt32());
855 xor(LOOPCOUNT_REG, LOOPCOUNT_REG);
856
857 // Used to set a register to one
858 static const __m128 one = {1.f, 1.f, 1.f, 1.f};
859 mov(rax, reinterpret_cast<size_t>(&one));
860 movaps(ONE, xword[rax]);
861
862 // Used to negate registers
863 static const __m128 neg = {-0.f, -0.f, -0.f, -0.f};
864 mov(rax, reinterpret_cast<size_t>(&neg));
865 movaps(NEGBIT, xword[rax]);
866
867 // Jump to start of the shader program
868 jmp(ABI_PARAM3);
869
870 // Compile entire program
871 Compile_Block(static_cast<unsigned>(g_state.vs.program_code.size()));
872
873 // Free memory that's no longer needed
874 return_offsets.clear();
875 return_offsets.shrink_to_fit();
876
877 ready();
878
879 uintptr_t size = reinterpret_cast<uintptr_t>(getCurr()) - reinterpret_cast<uintptr_t>(program);
880 ASSERT_MSG(size <= MAX_SHADER_SIZE, "Compiled a shader that exceeds the allocated size!");
881 LOG_DEBUG(HW_GPU, "Compiled shader size=%lu", size);
882}
883
884JitShader::JitShader() : Xbyak::CodeGenerator(MAX_SHADER_SIZE) {}
885
886} // namespace Shader 47} // namespace Shader
887
888} // namespace Pica 48} // namespace Pica
diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h
index f37548306..078b2cba5 100644
--- a/src/video_core/shader/shader_jit_x64.h
+++ b/src/video_core/shader/shader_jit_x64.h
@@ -1,121 +1,30 @@
1// Copyright 2015 Citra Emulator Project 1// Copyright 2016 Citra Emulator Project
2// Licensed under GPLv2 or any later version 2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included. 3// Refer to the license.txt file included.
4 4
5#pragma once 5#pragma once
6 6
7#include <array> 7#include <memory>
8#include <cstddef> 8#include <unordered_map>
9#include <utility>
10#include <vector>
11#include <nihstro/shader_bytecode.h>
12#include <xbyak.h>
13#include "common/bit_set.h"
14#include "common/common_types.h" 9#include "common/common_types.h"
15#include "common/x64/emitter.h"
16#include "video_core/shader/shader.h" 10#include "video_core/shader/shader.h"
17 11
18using nihstro::Instruction;
19using nihstro::OpCode;
20using nihstro::SwizzlePattern;
21
22namespace Pica { 12namespace Pica {
23
24namespace Shader { 13namespace Shader {
25 14
26/// Memory allocated for each compiled shader (64Kb) 15class JitShader;
27constexpr size_t MAX_SHADER_SIZE = 1024 * 64;
28 16
29/** 17class JitX64Engine final : public ShaderEngine {
30 * This class implements the shader JIT compiler. It recompiles a Pica shader program into x86_64
31 * code that can be executed on the host machine directly.
32 */
33class JitShader : public Xbyak::CodeGenerator {
34public: 18public:
35 JitShader(); 19 JitX64Engine();
36 20 ~JitX64Engine() override;
37 void Run(const ShaderSetup& setup, UnitState& state, unsigned offset) const {
38 program(&setup, &state, instruction_labels[offset].getAddress());
39 }
40
41 void Compile();
42 21
43 void Compile_ADD(Instruction instr); 22 void SetupBatch(ShaderSetup& setup, unsigned int entry_point) override;
44 void Compile_DP3(Instruction instr); 23 void Run(const ShaderSetup& setup, UnitState& state) const override;
45 void Compile_DP4(Instruction instr);
46 void Compile_DPH(Instruction instr);
47 void Compile_EX2(Instruction instr);
48 void Compile_LG2(Instruction instr);
49 void Compile_MUL(Instruction instr);
50 void Compile_SGE(Instruction instr);
51 void Compile_SLT(Instruction instr);
52 void Compile_FLR(Instruction instr);
53 void Compile_MAX(Instruction instr);
54 void Compile_MIN(Instruction instr);
55 void Compile_RCP(Instruction instr);
56 void Compile_RSQ(Instruction instr);
57 void Compile_MOVA(Instruction instr);
58 void Compile_MOV(Instruction instr);
59 void Compile_NOP(Instruction instr);
60 void Compile_END(Instruction instr);
61 void Compile_CALL(Instruction instr);
62 void Compile_CALLC(Instruction instr);
63 void Compile_CALLU(Instruction instr);
64 void Compile_IF(Instruction instr);
65 void Compile_LOOP(Instruction instr);
66 void Compile_JMP(Instruction instr);
67 void Compile_CMP(Instruction instr);
68 void Compile_MAD(Instruction instr);
69 24
70private: 25private:
71 void Compile_Block(unsigned end); 26 std::unordered_map<u64, std::unique_ptr<JitShader>> cache;
72 void Compile_NextInstr();
73
74 void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg,
75 Xbyak::Xmm dest);
76 void Compile_DestEnable(Instruction instr, Xbyak::Xmm dest);
77
78 /**
79 * Compiles a `MUL src1, src2` operation, properly handling the PICA semantics when multiplying
80 * zero by inf. Clobbers `src2` and `scratch`.
81 */
82 void Compile_SanitizedMul(Xbyak::Xmm src1, Xbyak::Xmm src2, Xbyak::Xmm scratch);
83
84 void Compile_EvaluateCondition(Instruction instr);
85 void Compile_UniformCondition(Instruction instr);
86
87 /**
88 * Emits the code to conditionally return from a subroutine invoked by the `CALL` instruction.
89 */
90 void Compile_Return();
91
92 BitSet32 PersistentCallerSavedRegs();
93
94 /**
95 * Assertion evaluated at compile-time, but only triggered if executed at runtime.
96 * @param msg Message to be logged if the assertion fails.
97 */
98 void Compile_Assert(bool condition, const char* msg);
99
100 /**
101 * Analyzes the entire shader program for `CALL` instructions before emitting any code,
102 * identifying the locations where a return needs to be inserted.
103 */
104 void FindReturnOffsets();
105
106 /// Mapping of Pica VS instructions to pointers in the emitted code
107 std::array<Xbyak::Label, 1024> instruction_labels;
108
109 /// Offsets in code where a return needs to be inserted
110 std::vector<unsigned> return_offsets;
111
112 unsigned program_counter = 0; ///< Offset of the next instruction to decode
113 bool looping = false; ///< True if compiling a loop, used to check for nested loops
114
115 using CompiledShader = void(const void* setup, void* state, const u8* start_addr);
116 CompiledShader* program = nullptr;
117}; 27};
118 28
119} // Shader 29} // namespace Shader
120 30} // namespace Pica
121} // Pica
diff --git a/src/video_core/shader/shader_jit_x64_compiler.cpp b/src/video_core/shader/shader_jit_x64_compiler.cpp
new file mode 100644
index 000000000..49806e8c9
--- /dev/null
+++ b/src/video_core/shader/shader_jit_x64_compiler.cpp
@@ -0,0 +1,884 @@
1// Copyright 2015 Citra Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include <algorithm>
6#include <cmath>
7#include <cstdint>
8#include <nihstro/shader_bytecode.h>
9#include <smmintrin.h>
10#include <xmmintrin.h>
11#include "common/assert.h"
12#include "common/logging/log.h"
13#include "common/vector_math.h"
14#include "common/x64/cpu_detect.h"
15#include "common/x64/xbyak_abi.h"
16#include "common/x64/xbyak_util.h"
17#include "video_core/pica_state.h"
18#include "video_core/pica_types.h"
19#include "video_core/shader/shader.h"
20#include "video_core/shader/shader_jit_x64_compiler.h"
21
22using namespace Common::X64;
23using namespace Xbyak::util;
24using Xbyak::Label;
25using Xbyak::Reg32;
26using Xbyak::Reg64;
27using Xbyak::Xmm;
28
29namespace Pica {
30
31namespace Shader {
32
33typedef void (JitShader::*JitFunction)(Instruction instr); // Per-opcode compile handler
34
35const JitFunction instr_table[64] = {
36    &JitShader::Compile_ADD, // add
37    &JitShader::Compile_DP3, // dp3
38    &JitShader::Compile_DP4, // dp4
39    &JitShader::Compile_DPH, // dph
40    nullptr, // unknown
41    &JitShader::Compile_EX2, // ex2
42    &JitShader::Compile_LG2, // lg2
43    nullptr, // unknown
44    &JitShader::Compile_MUL, // mul
45    &JitShader::Compile_SGE, // sge
46    &JitShader::Compile_SLT, // slt
47    &JitShader::Compile_FLR, // flr
48    &JitShader::Compile_MAX, // max
49    &JitShader::Compile_MIN, // min
50    &JitShader::Compile_RCP, // rcp
51    &JitShader::Compile_RSQ, // rsq
52    nullptr, // unknown
53    nullptr, // unknown
54    &JitShader::Compile_MOVA, // mova
55    &JitShader::Compile_MOV, // mov
56    nullptr, // unknown
57    nullptr, // unknown
58    nullptr, // unknown
59    nullptr, // unknown
60    &JitShader::Compile_DPH, // dphi
61    nullptr, // unknown
62    &JitShader::Compile_SGE, // sgei
63    &JitShader::Compile_SLT, // slti
64    nullptr, // unknown
65    nullptr, // unknown
66    nullptr, // unknown
67    nullptr, // unknown
68    nullptr, // unknown
69    &JitShader::Compile_NOP, // nop
70    &JitShader::Compile_END, // end
71    nullptr, // break
72    &JitShader::Compile_CALL, // call
73    &JitShader::Compile_CALLC, // callc
74    &JitShader::Compile_CALLU, // callu
75    &JitShader::Compile_IF, // ifu
76    &JitShader::Compile_IF, // ifc
77    &JitShader::Compile_LOOP, // loop
78    nullptr, // emit
79    nullptr, // sete
80    &JitShader::Compile_JMP, // jmpc
81    &JitShader::Compile_JMP, // jmpu
82    &JitShader::Compile_CMP, // cmp
83    &JitShader::Compile_CMP, // cmp
84    &JitShader::Compile_MAD, // madi
85    &JitShader::Compile_MAD, // madi
86    &JitShader::Compile_MAD, // madi
87    &JitShader::Compile_MAD, // madi
88    &JitShader::Compile_MAD, // madi
89    &JitShader::Compile_MAD, // madi
90    &JitShader::Compile_MAD, // madi
91    &JitShader::Compile_MAD, // madi
92    &JitShader::Compile_MAD, // mad
93    &JitShader::Compile_MAD, // mad
94    &JitShader::Compile_MAD, // mad
95    &JitShader::Compile_MAD, // mad
96    &JitShader::Compile_MAD, // mad
97    &JitShader::Compile_MAD, // mad
98    &JitShader::Compile_MAD, // mad
99    &JitShader::Compile_MAD, // mad
100};
101
102// The following is used to alias some commonly used registers. Generally, RAX-RDX and XMM0-XMM3 can
103// be used as scratch registers within a compiler function. The other registers have designated
104// purposes, as documented below:
105
106/// Pointer to the uniform memory
107static const Reg64 SETUP = r9;
108/// The two 32-bit VS address offset registers set by the MOVA instruction
109static const Reg64 ADDROFFS_REG_0 = r10;
110static const Reg64 ADDROFFS_REG_1 = r11;
111/// VS loop count register (Multiplied by 16)
112static const Reg32 LOOPCOUNT_REG = r12d;
113/// Current VS loop iteration number (we could probably use LOOPCOUNT_REG, but this is quicker)
114static const Reg32 LOOPCOUNT = esi;
115/// Number to increment LOOPCOUNT_REG by on each loop iteration (Multiplied by 16)
116static const Reg32 LOOPINC = edi;
117/// Result of the previous CMP instruction for the X-component comparison
118static const Reg64 COND0 = r13;
119/// Result of the previous CMP instruction for the Y-component comparison
120static const Reg64 COND1 = r14;
121/// Pointer to the UnitState instance for the current VS unit
122static const Reg64 STATE = r15;
123/// SIMD scratch register
124static const Xmm SCRATCH = xmm0;
125/// Loaded with the first swizzled source register, otherwise can be used as a scratch register
126static const Xmm SRC1 = xmm1;
127/// Loaded with the second swizzled source register, otherwise can be used as a scratch register
128static const Xmm SRC2 = xmm2;
129/// Loaded with the third swizzled source register, otherwise can be used as a scratch register
130static const Xmm SRC3 = xmm3;
131/// Additional scratch register
132static const Xmm SCRATCH2 = xmm4;
133/// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one
134static const Xmm ONE = xmm14;
135/// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR
136static const Xmm NEGBIT = xmm15;
137
138// State registers that must not be modified by external function calls
139// Scratch registers, e.g., SRC1 and SCRATCH, have to be saved on the side if needed
140static const BitSet32 persistent_regs = BuildRegSet({
141    // Pointers to register blocks
142    SETUP, STATE,
143    // Cached registers
144    ADDROFFS_REG_0, ADDROFFS_REG_1, LOOPCOUNT_REG, COND0, COND1,
145    // Constants
146    ONE, NEGBIT,
147});
148
149/// Raw constant for the source register selector that indicates no swizzling is performed
150static const u8 NO_SRC_REG_SWIZZLE = 0x1b;
151/// Raw constant for the destination register enable mask that indicates all components are enabled
152static const u8 NO_DEST_REG_MASK = 0xf;
153
154static void LogCritical(const char* msg) { // Host-side helper called from the emitted code
155    LOG_CRITICAL(HW_GPU, "%s", msg);
156}
157
158void JitShader::Compile_Assert(bool condition, const char* msg) {
159    if (!condition) { // condition is evaluated now, at JIT time
160        mov(ABI_PARAM1, reinterpret_cast<size_t>(msg)); // msg only logged if this path executes at runtime
161        CallFarFunction(*this, LogCritical);
162    }
163}
164
165/**
166 * Loads and swizzles a source register into the specified XMM register.
167 * @param instr VS instruction, used for determining how to load the source register
168 * @param src_num Number indicating which source register to load (1 = src1, 2 = src2, 3 = src3)
169 * @param src_reg SourceRegister object corresponding to the source register to load
170 * @param dest Destination XMM register to store the loaded, swizzled source register
171 */
172void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg,
173                                   Xmm dest) {
174    Reg64 src_ptr;
175    size_t src_offset;
176
177    if (src_reg.GetRegisterType() == RegisterType::FloatUniform) {
178        src_ptr = SETUP;
179        src_offset = ShaderSetup::GetFloatUniformOffset(src_reg.GetIndex());
180    } else {
181        src_ptr = STATE;
182        src_offset = UnitState::InputOffset(src_reg);
183    }
184
185    int src_offset_disp = (int)src_offset;
186    ASSERT_MSG(src_offset == src_offset_disp, "Source register offset too large for int type");
187
188    unsigned operand_desc_id;
189
190    const bool is_inverted =
191        (0 != (instr.opcode.Value().GetInfo().subtype & OpCode::Info::SrcInversed));
192
193    unsigned address_register_index;
194    unsigned offset_src;
195
196    if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD ||
197        instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) {
198        operand_desc_id = instr.mad.operand_desc_id;
199        offset_src = is_inverted ? 3 : 2;
200        address_register_index = instr.mad.address_register_index;
201    } else {
202        operand_desc_id = instr.common.operand_desc_id;
203        offset_src = is_inverted ? 2 : 1;
204        address_register_index = instr.common.address_register_index;
205    }
206
207    if (src_num == offset_src && address_register_index != 0) {
208        switch (address_register_index) {
209        case 1: // address offset 1 (ADDROFFS_REG_0 already holds index * 16)
210            movaps(dest, xword[src_ptr + ADDROFFS_REG_0 + src_offset_disp]);
211            break;
212        case 2: // address offset 2 (ADDROFFS_REG_1 already holds index * 16)
213            movaps(dest, xword[src_ptr + ADDROFFS_REG_1 + src_offset_disp]);
214            break;
215        case 3: // address offset 3 (loop counter, also premultiplied by 16)
216            movaps(dest, xword[src_ptr + LOOPCOUNT_REG.cvt64() + src_offset_disp]);
217            break;
218        default:
219            UNREACHABLE();
220            break;
221        }
222    } else {
223        // No dynamic indexing; load the source directly
224        movaps(dest, xword[src_ptr + src_offset_disp]);
225    }
226
227    SwizzlePattern swiz = {(*swizzle_data)[operand_desc_id]};
228
229    // Generate instructions for source register swizzling as needed
230    u8 sel = swiz.GetRawSelector(src_num);
231    if (sel != NO_SRC_REG_SWIZZLE) {
232        // Selector component order needs to be reversed for the SHUFPS instruction
233        sel = ((sel & 0xc0) >> 6) | ((sel & 3) << 6) | ((sel & 0xc) << 2) | ((sel & 0x30) >> 2);
234
235        // Shuffle inputs for swizzle
236        shufps(dest, dest, sel);
237    }
238
239    // If the source register should be negated, flip the negative bit using XOR
240    const bool negate[] = {swiz.negate_src1, swiz.negate_src2, swiz.negate_src3};
241    if (negate[src_num - 1]) {
242        xorps(dest, NEGBIT);
243    }
244}
245
246void JitShader::Compile_DestEnable(Instruction instr, Xmm src) {
247    DestRegister dest;
248    unsigned operand_desc_id;
249    if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD ||
250        instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) {
251        operand_desc_id = instr.mad.operand_desc_id;
252        dest = instr.mad.dest.Value();
253    } else {
254        operand_desc_id = instr.common.operand_desc_id;
255        dest = instr.common.dest.Value();
256    }
257
258    SwizzlePattern swiz = {(*swizzle_data)[operand_desc_id]};
259
260    size_t dest_offset_disp = UnitState::OutputOffset(dest);
261
262    // If all components are enabled, write the result to the destination register
263    if (swiz.dest_mask == NO_DEST_REG_MASK) {
264        // Store dest back to memory
265        movaps(xword[STATE + dest_offset_disp], src);
266
267    } else {
268        // Not all components are enabled, so mask the result when storing to the destination
269        // register...
270        movaps(SCRATCH, xword[STATE + dest_offset_disp]);
271
272        if (Common::GetCPUCaps().sse4_1) {
273            u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) |
274                      ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1);
275            blendps(SCRATCH, src, mask); // merge enabled components into the old destination value
276        } else {
277            movaps(SCRATCH2, src);
278            unpckhps(SCRATCH2, SCRATCH); // Unpack X/Y components of source and destination
279            unpcklps(SCRATCH, src); // Unpack Z/W components of source and destination
280
281            // Compute selector to selectively copy source components to destination for SHUFPS
282            // instruction
283            u8 sel = ((swiz.DestComponentEnabled(0) ? 1 : 0) << 0) |
284                     ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) |
285                     ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) |
286                     ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6);
287            shufps(SCRATCH, SCRATCH2, sel);
288        }
289
290        // Store dest back to memory
291        movaps(xword[STATE + dest_offset_disp], SCRATCH);
292    }
293}
294
295void JitShader::Compile_SanitizedMul(Xmm src1, Xmm src2, Xmm scratch) {
296    movaps(scratch, src1);
297    cmpordps(scratch, src2); // scratch = per-lane mask: both inputs non-NaN
298
299    mulps(src1, src2);
300
301    movaps(src2, src1);
302    cmpunordps(src2, src2); // src2 = per-lane mask: product is NaN
303
304    xorps(scratch, src2); // mask is cleared only where non-NaN inputs produced NaN (0 * inf)
305    andps(src1, scratch); // force those lanes to 0.0, matching PICA semantics; NaN inputs pass through
306}
307
308void JitShader::Compile_EvaluateCondition(Instruction instr) {
309    // Note: XOR against (ref ^ 1) below is effectively an NXOR, i.e. an equality check
310    switch (instr.flow_control.op) {
311    case Instruction::FlowControlType::Or:
312        mov(eax, COND0);
313        mov(ebx, COND1);
314        xor(eax, (instr.flow_control.refx.Value() ^ 1));
315        xor(ebx, (instr.flow_control.refy.Value() ^ 1));
316        or (eax, ebx);
317        break;
318
319    case Instruction::FlowControlType::And:
320        mov(eax, COND0);
321        mov(ebx, COND1);
322        xor(eax, (instr.flow_control.refx.Value() ^ 1));
323        xor(ebx, (instr.flow_control.refy.Value() ^ 1));
324        and(eax, ebx);
325        break;
326
327    case Instruction::FlowControlType::JustX:
328        mov(eax, COND0);
329        xor(eax, (instr.flow_control.refx.Value() ^ 1));
330        break;
331
332    case Instruction::FlowControlType::JustY:
333        mov(eax, COND1);
334        xor(eax, (instr.flow_control.refy.Value() ^ 1));
335        break;
336    }
337}
338
339void JitShader::Compile_UniformCondition(Instruction instr) {
340    size_t offset = ShaderSetup::GetBoolUniformOffset(instr.flow_control.bool_uniform_id);
341    cmp(byte[SETUP + offset], 0); // sets ZF when the bool uniform is false
342}
343
344BitSet32 JitShader::PersistentCallerSavedRegs() {
345    return persistent_regs & ABI_ALL_CALLER_SAVED; // persistent regs we must save across host calls
346}
347
348void JitShader::Compile_ADD(Instruction instr) {
349    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
350    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
351    addps(SRC1, SRC2);
352    Compile_DestEnable(instr, SRC1);
353}
354
355void JitShader::Compile_DP3(Instruction instr) {
356    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
357    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
358
359    Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
360
361    movaps(SRC2, SRC1);
362    shufps(SRC2, SRC2, _MM_SHUFFLE(1, 1, 1, 1)); // broadcast Y
363
364    movaps(SRC3, SRC1);
365    shufps(SRC3, SRC3, _MM_SHUFFLE(2, 2, 2, 2)); // broadcast Z
366
367    shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // broadcast X
368    addps(SRC1, SRC2);
369    addps(SRC1, SRC3); // all lanes now hold x+y+z
370
371    Compile_DestEnable(instr, SRC1);
372}
373
374void JitShader::Compile_DP4(Instruction instr) {
375    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
376    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
377
378    Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
379
380    movaps(SRC2, SRC1);
381    shufps(SRC1, SRC1, _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
382    addps(SRC1, SRC2);
383
384    movaps(SRC2, SRC1);
385    shufps(SRC1, SRC1, _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
386    addps(SRC1, SRC2); // horizontal sum of all four lanes
387
388    Compile_DestEnable(instr, SRC1);
389}
390
391void JitShader::Compile_DPH(Instruction instr) {
392    if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::DPHI) {
393        Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
394        Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2);
395    } else {
396        Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
397        Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
398    }
399
400    if (Common::GetCPUCaps().sse4_1) {
401        // Set 4th component to 1.0
402        blendps(SRC1, ONE, 0b1000);
403    } else {
404        // Set 4th component to 1.0
405        movaps(SCRATCH, SRC1);
406        unpckhps(SCRATCH, ONE); // XYZW, 1111 -> Z1__
407        unpcklpd(SRC1, SCRATCH); // XYZW, Z1__ -> XYZ1
408    }
409
410    Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
411
412    movaps(SRC2, SRC1);
413    shufps(SRC1, SRC1, _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
414    addps(SRC1, SRC2);
415
416    movaps(SRC2, SRC1);
417    shufps(SRC1, SRC1, _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
418    addps(SRC1, SRC2); // horizontal sum of all four lanes
419
420    Compile_DestEnable(instr, SRC1);
421}
422
423void JitShader::Compile_EX2(Instruction instr) {
424    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
425    movss(xmm0, SRC1); // ABI_PARAM1 (first float argument register)
426
427    ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
428    CallFarFunction(*this, exp2f); // compute 2^x on the host
429    ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
430
431    shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); // ABI_RETURN, broadcast to all lanes
432    movaps(SRC1, xmm0);
433    Compile_DestEnable(instr, SRC1);
434}
435
436void JitShader::Compile_LG2(Instruction instr) {
437    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
438    movss(xmm0, SRC1); // ABI_PARAM1 (first float argument register)
439
440    ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
441    CallFarFunction(*this, log2f); // compute log2(x) on the host
442    ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
443
444    shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); // ABI_RETURN, broadcast to all lanes
445    movaps(SRC1, xmm0);
446    Compile_DestEnable(instr, SRC1);
447}
448
449void JitShader::Compile_MUL(Instruction instr) {
450    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
451    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
452    Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
453    Compile_DestEnable(instr, SRC1);
454}
455
456void JitShader::Compile_SGE(Instruction instr) {
457    if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SGEI) {
458        Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
459        Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2);
460    } else {
461        Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
462        Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
463    }
464
465    cmpleps(SRC2, SRC1); // per-lane mask of (src2 <= src1)
466    andps(SRC2, ONE); // turn all-ones mask lanes into 1.0f
467
468    Compile_DestEnable(instr, SRC2);
469}
470
471void JitShader::Compile_SLT(Instruction instr) {
472    if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SLTI) {
473        Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
474        Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2);
475    } else {
476        Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
477        Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
478    }
479
480    cmpltps(SRC1, SRC2); // per-lane mask of (src1 < src2)
481    andps(SRC1, ONE); // turn all-ones mask lanes into 1.0f
482
483    Compile_DestEnable(instr, SRC1);
484}
485
486void JitShader::Compile_FLR(Instruction instr) {
487    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
488
489    if (Common::GetCPUCaps().sse4_1) {
490        roundps(SRC1, SRC1, _MM_FROUND_FLOOR);
491    } else {
492        cvttps2dq(SRC1, SRC1); // NOTE(review): truncates toward zero, not -inf — differs from floor for negative inputs
493        cvtdq2ps(SRC1, SRC1);
494    }
495
496    Compile_DestEnable(instr, SRC1);
497}
498
499void JitShader::Compile_MAX(Instruction instr) {
500    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
501    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
502    // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.
503    maxps(SRC1, SRC2);
504    Compile_DestEnable(instr, SRC1);
505}
506
507void JitShader::Compile_MIN(Instruction instr) {
508    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
509    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
510    // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.
511    minps(SRC1, SRC2);
512    Compile_DestEnable(instr, SRC1);
513}
514
515void JitShader::Compile_MOVA(Instruction instr) {
516    SwizzlePattern swiz = {(*swizzle_data)[instr.common.operand_desc_id]};
517
518    if (!swiz.DestComponentEnabled(0) && !swiz.DestComponentEnabled(1)) {
519        return; // NoOp
520    }
521
522    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
523
524    // Convert floats to integers using truncation (only care about X and Y components)
525    cvttps2dq(SRC1, SRC1);
526
527    // Get result (X lane in the low 32 bits of rax, Y lane in the high 32 bits)
528    movq(rax, SRC1);
529
530    // Handle destination enable
531    if (swiz.DestComponentEnabled(0) && swiz.DestComponentEnabled(1)) {
532        // Move and sign-extend low 32 bits
533        movsxd(ADDROFFS_REG_0, eax);
534
535        // Move and sign-extend high 32 bits
536        shr(rax, 32);
537        movsxd(ADDROFFS_REG_1, eax);
538
539        // Multiply by 16 to be used as an offset later
540        shl(ADDROFFS_REG_0, 4);
541        shl(ADDROFFS_REG_1, 4);
542    } else {
543        if (swiz.DestComponentEnabled(0)) {
544            // Move and sign-extend low 32 bits
545            movsxd(ADDROFFS_REG_0, eax);
546
547            // Multiply by 16 to be used as an offset later
548            shl(ADDROFFS_REG_0, 4);
549        } else if (swiz.DestComponentEnabled(1)) {
550            // Move and sign-extend high 32 bits
551            shr(rax, 32);
552            movsxd(ADDROFFS_REG_1, eax);
553
554            // Multiply by 16 to be used as an offset later
555            shl(ADDROFFS_REG_1, 4);
556        }
557    }
558}
559
560void JitShader::Compile_MOV(Instruction instr) {
561    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
562    Compile_DestEnable(instr, SRC1);
563}
564
565void JitShader::Compile_RCP(Instruction instr) {
566    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
567
568    // TODO(bunnei): RCPSS is a pretty rough approximation, this might cause problems if Pica
569    // performs this operation more accurately. This should be checked on hardware.
570    rcpss(SRC1, SRC1);
571    shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYZW -> XXXX (broadcast the scalar result)
572
573    Compile_DestEnable(instr, SRC1);
574}
575
576void JitShader::Compile_RSQ(Instruction instr) {
577    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
578
579    // TODO(bunnei): RSQRTSS is a pretty rough approximation, this might cause problems if Pica
580    // performs this operation more accurately. This should be checked on hardware.
581    rsqrtss(SRC1, SRC1);
582    shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYZW -> XXXX (broadcast the scalar result)
583
584    Compile_DestEnable(instr, SRC1);
585}
586
587void JitShader::Compile_NOP(Instruction instr) {}
588
589void JitShader::Compile_END(Instruction instr) {
590    ABI_PopRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8); // undo the prologue push in Compile()
591    ret();
592}
593
594void JitShader::Compile_CALL(Instruction instr) {
595    // Push the PICA return offset (dest_offset + num_instructions) for Compile_Return to check
596    push(qword, (instr.flow_control.dest_offset + instr.flow_control.num_instructions));
597
598    // Call the subroutine
599    call(instruction_labels[instr.flow_control.dest_offset]);
600
601    // Skip over the return offset that's on the stack
602    add(rsp, 8);
603}
604
605void JitShader::Compile_CALLC(Instruction instr) {
606    Compile_EvaluateCondition(instr); // skip the call when the condition is false
607    Label b;
608    jz(b);
609    Compile_CALL(instr);
610    L(b);
611}
612
613void JitShader::Compile_CALLU(Instruction instr) {
614    Compile_UniformCondition(instr); // skip the call when the bool uniform is false
615    Label b;
616    jz(b);
617    Compile_CALL(instr);
618    L(b);
619}
620
621void JitShader::Compile_CMP(Instruction instr) {
622    using Op = Instruction::Common::CompareOpType::Op;
623    Op op_x = instr.common.compare_op.x;
624    Op op_y = instr.common.compare_op.y;
625
626    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
627    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
628
629    // SSE doesn't have greater-than (GT) or greater-equal (GE) comparison operators. You need to
630    // emulate them by swapping the lhs and rhs and using LT and LE. NLT and NLE can't be used here
631    // because they don't match when used with NaNs.
632    static const u8 cmp[] = {CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_LT, CMP_LE};
633
634    bool invert_op_x = (op_x == Op::GreaterThan || op_x == Op::GreaterEqual);
635    Xmm lhs_x = invert_op_x ? SRC2 : SRC1;
636    Xmm rhs_x = invert_op_x ? SRC1 : SRC2;
637
638    if (op_x == op_y) {
639        // Compare X-component and Y-component together
640        cmpps(lhs_x, rhs_x, cmp[op_x]);
641        movq(COND0, lhs_x);
642
643        mov(COND1, COND0); // Y result lives in the upper half of the same quadword
644    } else {
645        bool invert_op_y = (op_y == Op::GreaterThan || op_y == Op::GreaterEqual);
646        Xmm lhs_y = invert_op_y ? SRC2 : SRC1;
647        Xmm rhs_y = invert_op_y ? SRC1 : SRC2;
648
649        // Compare X-component
650        movaps(SCRATCH, lhs_x);
651        cmpss(SCRATCH, rhs_x, cmp[op_x]);
652
653        // Compare Y-component
654        cmpps(lhs_y, rhs_y, cmp[op_y]);
655
656        movq(COND0, SCRATCH);
657        movq(COND1, lhs_y);
658    }
659
660    shr(COND0.cvt32(), 31); // ignores upper 32 bits in source
661    shr(COND1, 63); // extract the sign bit of the Y-lane mask
662}
663
664void JitShader::Compile_MAD(Instruction instr) {
665    Compile_SwizzleSrc(instr, 1, instr.mad.src1, SRC1);
666
667    if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) {
668        Compile_SwizzleSrc(instr, 2, instr.mad.src2i, SRC2);
669        Compile_SwizzleSrc(instr, 3, instr.mad.src3i, SRC3);
670    } else {
671        Compile_SwizzleSrc(instr, 2, instr.mad.src2, SRC2);
672        Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3);
673    }
674
675    Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
676    addps(SRC1, SRC3); // dest = src1 * src2 + src3
677
678    Compile_DestEnable(instr, SRC1);
679}
680
681void JitShader::Compile_IF(Instruction instr) {
682    Compile_Assert(instr.flow_control.dest_offset >= program_counter,
683                   "Backwards if-statements not supported");
684    Label l_else, l_endif;
685
686    // Evaluate the "IF" condition
687    if (instr.opcode.Value() == OpCode::Id::IFU) {
688        Compile_UniformCondition(instr);
689    } else if (instr.opcode.Value() == OpCode::Id::IFC) {
690        Compile_EvaluateCondition(instr);
691    }
692    jz(l_else, T_NEAR);
693
694    // Compile the code that corresponds to the condition evaluating as true
695    Compile_Block(instr.flow_control.dest_offset);
696
697    // If there isn't an "ELSE" condition, we are done here
698    if (instr.flow_control.num_instructions == 0) {
699        L(l_else);
700        return;
701    }
702
703    jmp(l_endif, T_NEAR);
704
705    L(l_else);
706    // This code corresponds to the "ELSE" condition
707    // Compile the code that corresponds to the condition evaluating as false
708    Compile_Block(instr.flow_control.dest_offset + instr.flow_control.num_instructions);
709
710    L(l_endif);
711}
712
713void JitShader::Compile_LOOP(Instruction instr) {
714    Compile_Assert(instr.flow_control.dest_offset >= program_counter,
715                   "Backwards loops not supported");
716    Compile_Assert(!looping, "Nested loops not supported");
717
718    looping = true;
719
720    // This decodes the fields from the integer uniform at index instr.flow_control.int_uniform_id.
721    // The Y (LOOPCOUNT_REG) and Z (LOOPINC) component are kept multiplied by 16 (Left shifted by
722    // 4 bits) to be used as an offset into the 16-byte vector registers later
723    size_t offset = ShaderSetup::GetIntUniformOffset(instr.flow_control.int_uniform_id);
724    mov(LOOPCOUNT, dword[SETUP + offset]);
725    mov(LOOPCOUNT_REG, LOOPCOUNT);
726    shr(LOOPCOUNT_REG, 4);
727    and(LOOPCOUNT_REG, 0xFF0); // Y-component is the start
728    mov(LOOPINC, LOOPCOUNT);
729    shr(LOOPINC, 12);
730    and(LOOPINC, 0xFF0); // Z-component is the incrementer
731    movzx(LOOPCOUNT, LOOPCOUNT.cvt8()); // X-component is iteration count
732    add(LOOPCOUNT, 1); // Iteration count is X-component + 1
733
734    Label l_loop_start;
735    L(l_loop_start);
736
737    Compile_Block(instr.flow_control.dest_offset + 1);
738
739    add(LOOPCOUNT_REG, LOOPINC); // Increment LOOPCOUNT_REG by Z-component
740    sub(LOOPCOUNT, 1); // Decrement the remaining iteration count
741    jnz(l_loop_start); // Repeat while iterations remain
742
743    looping = false;
744}
745
746void JitShader::Compile_JMP(Instruction instr) {
747    if (instr.opcode.Value() == OpCode::Id::JMPC)
748        Compile_EvaluateCondition(instr);
749    else if (instr.opcode.Value() == OpCode::Id::JMPU)
750        Compile_UniformCondition(instr);
751    else
752        UNREACHABLE();
753
754    bool inverted_condition =
755        (instr.opcode.Value() == OpCode::Id::JMPU) && (instr.flow_control.num_instructions & 1);
756
757    Label& b = instruction_labels[instr.flow_control.dest_offset];
758    if (inverted_condition) {
759        jz(b, T_NEAR); // JMPU with bit 0 of num_instructions set: jump when the uniform is false
760    } else {
761        jnz(b, T_NEAR);
762    }
763}
764
765void JitShader::Compile_Block(unsigned end) {
766    while (program_counter < end) {
767        Compile_NextInstr();
768    }
769}
770
771void JitShader::Compile_Return() {
772    // Peek return offset on the stack and check if we're at that offset
773    mov(rax, qword[rsp + 8]); // +8 skips the host return address pushed by `call`
774    cmp(eax, (program_counter));
775
776    // If so, return to just after the CALL; otherwise fall through
777    Label b;
778    jnz(b);
779    ret();
780    L(b);
781}
782
783void JitShader::Compile_NextInstr() {
784    if (std::binary_search(return_offsets.begin(), return_offsets.end(), program_counter)) {
785        Compile_Return();
786    }
787
788    L(instruction_labels[program_counter]);
789
790    Instruction instr = {(*program_code)[program_counter++]};
791
792    OpCode::Id opcode = instr.opcode.Value();
793    auto instr_func = instr_table[static_cast<unsigned>(opcode)];
794
795    if (instr_func) {
796        // JIT the instruction!
797        ((*this).*instr_func)(instr);
798    } else {
799        // Unhandled instruction
800        LOG_CRITICAL(HW_GPU, "Unhandled instruction: 0x%02x (0x%08x)",
801                     instr.opcode.Value().EffectiveOpCode(), instr.hex);
802    }
803}
804
805void JitShader::FindReturnOffsets() {
806    return_offsets.clear();
807
808    for (size_t offset = 0; offset < program_code->size(); ++offset) {
809        Instruction instr = {(*program_code)[offset]};
810
811        switch (instr.opcode.Value()) {
812        case OpCode::Id::CALL:
813        case OpCode::Id::CALLC:
814        case OpCode::Id::CALLU:
815            return_offsets.push_back(instr.flow_control.dest_offset +
816                                     instr.flow_control.num_instructions);
817            break;
818        default:
819            break;
820        }
821    }
822
823    // Sort for efficient binary search later
824    std::sort(return_offsets.begin(), return_offsets.end());
825}
826
827void JitShader::Compile(const std::array<u32, 1024>* program_code_,
828                        const std::array<u32, 1024>* swizzle_data_) {
829    program_code = program_code_;
830    swizzle_data = swizzle_data_;
831
832    // Reset flow control state
833    program = (CompiledShader*)getCurr();
834    program_counter = 0;
835    looping = false;
836    instruction_labels.fill(Xbyak::Label());
837
838    // Find all `CALL` instructions and identify return locations
839    FindReturnOffsets();
840
841    // The stack pointer is 8 modulo 16 at the entry of a procedure
842    ABI_PushRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8);
843
844    mov(SETUP, ABI_PARAM1);
845    mov(STATE, ABI_PARAM2);
846
847    // Zero address/loop registers
848    xor(ADDROFFS_REG_0.cvt32(), ADDROFFS_REG_0.cvt32());
849    xor(ADDROFFS_REG_1.cvt32(), ADDROFFS_REG_1.cvt32());
850    xor(LOOPCOUNT_REG, LOOPCOUNT_REG);
851
852    // Constant vector of ones, used to set register components to 1.0f
853    static const __m128 one = {1.f, 1.f, 1.f, 1.f};
854    mov(rax, reinterpret_cast<size_t>(&one));
855    movaps(ONE, xword[rax]);
856
857    // Constant sign-bit vector, used to negate registers via XOR
858    static const __m128 neg = {-0.f, -0.f, -0.f, -0.f};
859    mov(rax, reinterpret_cast<size_t>(&neg));
860    movaps(NEGBIT, xword[rax]);
861
862    // Jump to start of the shader program
863    jmp(ABI_PARAM3); // ABI_PARAM3 holds the entry-point address (see Run())
864
865    // Compile entire program
866    Compile_Block(static_cast<unsigned>(program_code->size()));
867
868    // Free memory that's no longer needed
869    program_code = nullptr;
870    swizzle_data = nullptr;
871    return_offsets.clear();
872    return_offsets.shrink_to_fit();
873
874    ready();
875
876    ASSERT_MSG(getSize() <= MAX_SHADER_SIZE, "Compiled a shader that exceeds the allocated size!");
877    LOG_DEBUG(HW_GPU, "Compiled shader size=%lu", getSize()); // NOTE(review): %lu vs size_t — prefer %zu
878}
879
880JitShader::JitShader() : Xbyak::CodeGenerator(MAX_SHADER_SIZE) {}
881
882} // namespace Shader
883
884} // namespace Pica
diff --git a/src/video_core/shader/shader_jit_x64_compiler.h b/src/video_core/shader/shader_jit_x64_compiler.h
new file mode 100644
index 000000000..29e9875ea
--- /dev/null
+++ b/src/video_core/shader/shader_jit_x64_compiler.h
@@ -0,0 +1,125 @@
1// Copyright 2015 Citra Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <array>
8#include <cstddef>
9#include <utility>
10#include <vector>
11#include <nihstro/shader_bytecode.h>
12#include <xbyak.h>
13#include "common/bit_set.h"
14#include "common/common_types.h"
15#include "common/x64/emitter.h"
16#include "video_core/shader/shader.h"
17
18using nihstro::Instruction;
19using nihstro::OpCode;
20using nihstro::SwizzlePattern;
21
22namespace Pica {
23
24namespace Shader {
25
/// Memory allocated for each compiled shader (64 KiB)
27constexpr size_t MAX_SHADER_SIZE = 1024 * 64;
28
29/**
30 * This class implements the shader JIT compiler. It recompiles a Pica shader program into x86_64
31 * code that can be executed on the host machine directly.
32 */
class JitShader : public Xbyak::CodeGenerator {
public:
    JitShader();

    /**
     * Executes the compiled shader by calling into the emitted code.
     * @param setup  Shader uniforms/configuration passed to the generated code
     * @param state  Per-unit register state mutated by the generated code
     * @param offset Pica instruction offset to begin execution at; translated to a host
     *               address via instruction_labels
     */
    void Run(const ShaderSetup& setup, UnitState& state, unsigned offset) const {
        program(&setup, &state, instruction_labels[offset].getAddress());
    }

    /// Recompiles the given Pica program/swizzle data into host code stored in this object.
    void Compile(const std::array<u32, 1024>* program_code,
                 const std::array<u32, 1024>* swizzle_data);

    // One emitter per Pica shader instruction; each appends the host code for `instr`.
    void Compile_ADD(Instruction instr);
    void Compile_DP3(Instruction instr);
    void Compile_DP4(Instruction instr);
    void Compile_DPH(Instruction instr);
    void Compile_EX2(Instruction instr);
    void Compile_LG2(Instruction instr);
    void Compile_MUL(Instruction instr);
    void Compile_SGE(Instruction instr);
    void Compile_SLT(Instruction instr);
    void Compile_FLR(Instruction instr);
    void Compile_MAX(Instruction instr);
    void Compile_MIN(Instruction instr);
    void Compile_RCP(Instruction instr);
    void Compile_RSQ(Instruction instr);
    void Compile_MOVA(Instruction instr);
    void Compile_MOV(Instruction instr);
    void Compile_NOP(Instruction instr);
    void Compile_END(Instruction instr);
    void Compile_CALL(Instruction instr);
    void Compile_CALLC(Instruction instr);
    void Compile_CALLU(Instruction instr);
    void Compile_IF(Instruction instr);
    void Compile_LOOP(Instruction instr);
    void Compile_JMP(Instruction instr);
    void Compile_CMP(Instruction instr);
    void Compile_MAD(Instruction instr);

private:
    /// Compiles instructions sequentially from the current program_counter up to `end`.
    void Compile_Block(unsigned end);
    /// Decodes and compiles the single instruction at program_counter.
    void Compile_NextInstr();

    /// Emits code that loads source register `src_num` of `instr` (swizzled/negated) into `dest`.
    void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg,
                            Xbyak::Xmm dest);
    /// Emits code that writes back `dest` honoring the instruction's destination component mask.
    void Compile_DestEnable(Instruction instr, Xbyak::Xmm dest);

    /**
     * Compiles a `MUL src1, src2` operation, properly handling the PICA semantics when multiplying
     * zero by inf. Clobbers `src2` and `scratch`.
     */
    void Compile_SanitizedMul(Xbyak::Xmm src1, Xbyak::Xmm src2, Xbyak::Xmm scratch);

    /// Emits code evaluating the conditional-code condition encoded in `instr`.
    void Compile_EvaluateCondition(Instruction instr);
    /// Emits code testing the boolean uniform referenced by `instr`.
    void Compile_UniformCondition(Instruction instr);

    /**
     * Emits the code to conditionally return from a subroutine invoked by the `CALL` instruction.
     */
    void Compile_Return();

    /// Caller-saved registers that must be preserved across calls emitted into the shader.
    BitSet32 PersistentCallerSavedRegs();

    /**
     * Assertion evaluated at compile-time, but only triggered if executed at runtime.
     * @param msg Message to be logged if the assertion fails.
     */
    void Compile_Assert(bool condition, const char* msg);

    /**
     * Analyzes the entire shader program for `CALL` instructions before emitting any code,
     * identifying the locations where a return needs to be inserted.
     */
    void FindReturnOffsets();

    // Only valid while Compile() is running; reset to nullptr once code emission finishes.
    const std::array<u32, 1024>* program_code = nullptr;
    const std::array<u32, 1024>* swizzle_data = nullptr;

    /// Mapping of Pica VS instructions to pointers in the emitted code
    std::array<Xbyak::Label, 1024> instruction_labels;

    /// Offsets in code where a return needs to be inserted
    std::vector<unsigned> return_offsets;

    unsigned program_counter = 0; ///< Offset of the next instruction to decode
    bool looping = false;         ///< True if compiling a loop, used to check for nested loops

    /// Signature of the emitted entry point: (setup, state, host address to start execution at).
    using CompiledShader = void(const void* setup, void* state, const u8* start_addr);
    CompiledShader* program = nullptr;
};
122
} // namespace Shader
124
} // namespace Pica