summaryrefslogtreecommitdiff
path: root/src/video_core/shader
diff options
context:
space:
mode:
authorGravatar James Rowe2018-01-11 20:07:44 -0700
committerGravatar James Rowe2018-01-12 19:11:03 -0700
commit1d28b2e142f845773e2b90e267d9632e196a99b9 (patch)
tree027a3586a0fc927731afb3711c328c6dafc8551f /src/video_core/shader
parentMassive removal of unused modules (diff)
downloadyuzu-1d28b2e142f845773e2b90e267d9632e196a99b9.tar.gz
yuzu-1d28b2e142f845773e2b90e267d9632e196a99b9.tar.xz
yuzu-1d28b2e142f845773e2b90e267d9632e196a99b9.zip
Remove references to PICA and rasterizers in video_core
Diffstat (limited to 'src/video_core/shader')
-rw-r--r--src/video_core/shader/debug_data.h186
-rw-r--r--src/video_core/shader/shader.cpp154
-rw-r--r--src/video_core/shader/shader.h233
-rw-r--r--src/video_core/shader/shader_interpreter.cpp701
-rw-r--r--src/video_core/shader/shader_interpreter.h32
-rw-r--r--src/video_core/shader/shader_jit_x64.cpp48
-rw-r--r--src/video_core/shader/shader_jit_x64.h30
-rw-r--r--src/video_core/shader/shader_jit_x64_compiler.cpp942
-rw-r--r--src/video_core/shader/shader_jit_x64_compiler.h127
9 files changed, 0 insertions, 2453 deletions
diff --git a/src/video_core/shader/debug_data.h b/src/video_core/shader/debug_data.h
deleted file mode 100644
index 9e82122e1..000000000
--- a/src/video_core/shader/debug_data.h
+++ /dev/null
@@ -1,186 +0,0 @@
1// Copyright 2016 Citra Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <vector>
8#include "common/common_types.h"
9#include "common/vector_math.h"
10#include "video_core/pica_types.h"
11
12namespace Pica {
13namespace Shader {
14
15/// Helper structure used to keep track of data useful for inspection of shader emulation
16template <bool full_debugging>
17struct DebugData;
18
19template <>
20struct DebugData<false> {
21 // TODO: Hide these behind and interface and move them to DebugData<true>
22 u32 max_offset = 0; ///< maximum program counter ever reached
23 u32 max_opdesc_id = 0; ///< maximum swizzle pattern index ever used
24};
25
26template <>
27struct DebugData<true> {
28 /// Records store the input and output operands of a particular instruction.
29 struct Record {
30 enum Type {
31 // Floating point arithmetic operands
32 SRC1 = 0x1,
33 SRC2 = 0x2,
34 SRC3 = 0x4,
35
36 // Initial and final output operand value
37 DEST_IN = 0x8,
38 DEST_OUT = 0x10,
39
40 // Current and next instruction offset (in words)
41 CUR_INSTR = 0x20,
42 NEXT_INSTR = 0x40,
43
44 // Output address register value
45 ADDR_REG_OUT = 0x80,
46
47 // Result of a comparison instruction
48 CMP_RESULT = 0x100,
49
50 // Input values for conditional flow control instructions
51 COND_BOOL_IN = 0x200,
52 COND_CMP_IN = 0x400,
53
54 // Input values for a loop
55 LOOP_INT_IN = 0x800,
56 };
57
58 Math::Vec4<float24> src1;
59 Math::Vec4<float24> src2;
60 Math::Vec4<float24> src3;
61
62 Math::Vec4<float24> dest_in;
63 Math::Vec4<float24> dest_out;
64
65 s32 address_registers[2];
66 bool conditional_code[2];
67 bool cond_bool;
68 bool cond_cmp[2];
69 Math::Vec4<u8> loop_int;
70
71 u32 instruction_offset;
72 u32 next_instruction;
73
74 /// set of enabled fields (as a combination of Type flags)
75 unsigned mask = 0;
76 };
77
78 u32 max_offset = 0; ///< maximum program counter ever reached
79 u32 max_opdesc_id = 0; ///< maximum swizzle pattern index ever used
80
81 /// List of records for each executed shader instruction
82 std::vector<DebugData<true>::Record> records;
83};
84
85/// Type alias for better readability
86using DebugDataRecord = DebugData<true>::Record;
87
88/// Helper function to set a DebugData<true>::Record field based on the template enum parameter.
89template <DebugDataRecord::Type type, typename ValueType>
90inline void SetField(DebugDataRecord& record, ValueType value);
91
92template <>
93inline void SetField<DebugDataRecord::SRC1>(DebugDataRecord& record, float24* value) {
94 record.src1.x = value[0];
95 record.src1.y = value[1];
96 record.src1.z = value[2];
97 record.src1.w = value[3];
98}
99
100template <>
101inline void SetField<DebugDataRecord::SRC2>(DebugDataRecord& record, float24* value) {
102 record.src2.x = value[0];
103 record.src2.y = value[1];
104 record.src2.z = value[2];
105 record.src2.w = value[3];
106}
107
108template <>
109inline void SetField<DebugDataRecord::SRC3>(DebugDataRecord& record, float24* value) {
110 record.src3.x = value[0];
111 record.src3.y = value[1];
112 record.src3.z = value[2];
113 record.src3.w = value[3];
114}
115
116template <>
117inline void SetField<DebugDataRecord::DEST_IN>(DebugDataRecord& record, float24* value) {
118 record.dest_in.x = value[0];
119 record.dest_in.y = value[1];
120 record.dest_in.z = value[2];
121 record.dest_in.w = value[3];
122}
123
124template <>
125inline void SetField<DebugDataRecord::DEST_OUT>(DebugDataRecord& record, float24* value) {
126 record.dest_out.x = value[0];
127 record.dest_out.y = value[1];
128 record.dest_out.z = value[2];
129 record.dest_out.w = value[3];
130}
131
132template <>
133inline void SetField<DebugDataRecord::ADDR_REG_OUT>(DebugDataRecord& record, s32* value) {
134 record.address_registers[0] = value[0];
135 record.address_registers[1] = value[1];
136}
137
138template <>
139inline void SetField<DebugDataRecord::CMP_RESULT>(DebugDataRecord& record, bool* value) {
140 record.conditional_code[0] = value[0];
141 record.conditional_code[1] = value[1];
142}
143
144template <>
145inline void SetField<DebugDataRecord::COND_BOOL_IN>(DebugDataRecord& record, bool value) {
146 record.cond_bool = value;
147}
148
149template <>
150inline void SetField<DebugDataRecord::COND_CMP_IN>(DebugDataRecord& record, bool* value) {
151 record.cond_cmp[0] = value[0];
152 record.cond_cmp[1] = value[1];
153}
154
155template <>
156inline void SetField<DebugDataRecord::LOOP_INT_IN>(DebugDataRecord& record, Math::Vec4<u8> value) {
157 record.loop_int = value;
158}
159
160template <>
161inline void SetField<DebugDataRecord::CUR_INSTR>(DebugDataRecord& record, u32 value) {
162 record.instruction_offset = value;
163}
164
165template <>
166inline void SetField<DebugDataRecord::NEXT_INSTR>(DebugDataRecord& record, u32 value) {
167 record.next_instruction = value;
168}
169
170/// Helper function to set debug information on the current shader iteration.
171template <DebugDataRecord::Type type, typename ValueType>
172inline void Record(DebugData<false>& debug_data, u32 offset, ValueType value) {
173 // Debugging disabled => nothing to do
174}
175
176template <DebugDataRecord::Type type, typename ValueType>
177inline void Record(DebugData<true>& debug_data, u32 offset, ValueType value) {
178 if (offset >= debug_data.records.size())
179 debug_data.records.resize(offset + 1);
180
181 SetField<type, ValueType>(debug_data.records[offset], value);
182 debug_data.records[offset].mask |= type;
183}
184
185} // namespace Shader
186} // namespace Pica
diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp
deleted file mode 100644
index 2857d2829..000000000
--- a/src/video_core/shader/shader.cpp
+++ /dev/null
@@ -1,154 +0,0 @@
1// Copyright 2015 Citra Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include <cmath>
6#include <cstring>
7#include "common/bit_set.h"
8#include "common/logging/log.h"
9#include "common/microprofile.h"
10#include "video_core/pica_state.h"
11#include "video_core/regs_rasterizer.h"
12#include "video_core/regs_shader.h"
13#include "video_core/shader/shader.h"
14#include "video_core/shader/shader_interpreter.h"
15#ifdef ARCHITECTURE_x86_64
16#include "video_core/shader/shader_jit_x64.h"
17#endif // ARCHITECTURE_x86_64
18#include "video_core/video_core.h"
19
20namespace Pica {
21
22namespace Shader {
23
24OutputVertex OutputVertex::FromAttributeBuffer(const RasterizerRegs& regs,
25 const AttributeBuffer& input) {
26 // Setup output data
27 union {
28 OutputVertex ret{};
29 std::array<float24, 24> vertex_slots;
30 };
31 static_assert(sizeof(vertex_slots) == sizeof(ret), "Struct and array have different sizes.");
32
33 unsigned int num_attributes = regs.vs_output_total;
34 ASSERT(num_attributes <= 7);
35 for (unsigned int i = 0; i < num_attributes; ++i) {
36 const auto& output_register_map = regs.vs_output_attributes[i];
37
38 RasterizerRegs::VSOutputAttributes::Semantic semantics[4] = {
39 output_register_map.map_x, output_register_map.map_y, output_register_map.map_z,
40 output_register_map.map_w};
41
42 for (unsigned comp = 0; comp < 4; ++comp) {
43 RasterizerRegs::VSOutputAttributes::Semantic semantic = semantics[comp];
44 if (semantic < vertex_slots.size()) {
45 vertex_slots[semantic] = input.attr[i][comp];
46 } else if (semantic != RasterizerRegs::VSOutputAttributes::INVALID) {
47 LOG_ERROR(HW_GPU, "Invalid/unknown semantic id: %u", (unsigned int)semantic);
48 }
49 }
50 }
51
52 // The hardware takes the absolute and saturates vertex colors like this, *before* doing
53 // interpolation
54 for (unsigned i = 0; i < 4; ++i) {
55 float c = std::fabs(ret.color[i].ToFloat32());
56 ret.color[i] = float24::FromFloat32(c < 1.0f ? c : 1.0f);
57 }
58
59 LOG_TRACE(HW_GPU, "Output vertex: pos(%.2f, %.2f, %.2f, %.2f), quat(%.2f, %.2f, %.2f, %.2f), "
60 "col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f), view(%.2f, %.2f, %.2f)",
61 ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(),
62 ret.pos.w.ToFloat32(), ret.quat.x.ToFloat32(), ret.quat.y.ToFloat32(),
63 ret.quat.z.ToFloat32(), ret.quat.w.ToFloat32(), ret.color.x.ToFloat32(),
64 ret.color.y.ToFloat32(), ret.color.z.ToFloat32(), ret.color.w.ToFloat32(),
65 ret.tc0.u().ToFloat32(), ret.tc0.v().ToFloat32(), ret.view.x.ToFloat32(),
66 ret.view.y.ToFloat32(), ret.view.z.ToFloat32());
67
68 return ret;
69}
70
71void UnitState::LoadInput(const ShaderRegs& config, const AttributeBuffer& input) {
72 const unsigned max_attribute = config.max_input_attribute_index;
73
74 for (unsigned attr = 0; attr <= max_attribute; ++attr) {
75 unsigned reg = config.GetRegisterForAttribute(attr);
76 registers.input[reg] = input.attr[attr];
77 }
78}
79
80void UnitState::WriteOutput(const ShaderRegs& config, AttributeBuffer& output) {
81 unsigned int output_i = 0;
82 for (unsigned int reg : Common::BitSet<u32>(config.output_mask)) {
83 output.attr[output_i++] = registers.output[reg];
84 }
85}
86
87UnitState::UnitState(GSEmitter* emitter) : emitter_ptr(emitter) {}
88
89GSEmitter::GSEmitter() {
90 handlers = new Handlers;
91}
92
93GSEmitter::~GSEmitter() {
94 delete handlers;
95}
96
97void GSEmitter::Emit(Math::Vec4<float24> (&vertex)[16]) {
98 ASSERT(vertex_id < 3);
99 std::copy(std::begin(vertex), std::end(vertex), buffer[vertex_id].begin());
100 if (prim_emit) {
101 if (winding)
102 handlers->winding_setter();
103 for (size_t i = 0; i < buffer.size(); ++i) {
104 AttributeBuffer output;
105 unsigned int output_i = 0;
106 for (unsigned int reg : Common::BitSet<u32>(output_mask)) {
107 output.attr[output_i++] = buffer[i][reg];
108 }
109 handlers->vertex_handler(output);
110 }
111 }
112}
113
114GSUnitState::GSUnitState() : UnitState(&emitter) {}
115
116void GSUnitState::SetVertexHandler(VertexHandler vertex_handler, WindingSetter winding_setter) {
117 emitter.handlers->vertex_handler = std::move(vertex_handler);
118 emitter.handlers->winding_setter = std::move(winding_setter);
119}
120
121void GSUnitState::ConfigOutput(const ShaderRegs& config) {
122 emitter.output_mask = config.output_mask;
123}
124
125MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240));
126
127#ifdef ARCHITECTURE_x86_64
128static std::unique_ptr<JitX64Engine> jit_engine;
129#endif // ARCHITECTURE_x86_64
130static InterpreterEngine interpreter_engine;
131
132ShaderEngine* GetEngine() {
133#ifdef ARCHITECTURE_x86_64
134 // TODO(yuriks): Re-initialize on each change rather than being persistent
135 if (VideoCore::g_shader_jit_enabled) {
136 if (jit_engine == nullptr) {
137 jit_engine = std::make_unique<JitX64Engine>();
138 }
139 return jit_engine.get();
140 }
141#endif // ARCHITECTURE_x86_64
142
143 return &interpreter_engine;
144}
145
146void Shutdown() {
147#ifdef ARCHITECTURE_x86_64
148 jit_engine = nullptr;
149#endif // ARCHITECTURE_x86_64
150}
151
152} // namespace Shader
153
154} // namespace Pica
diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h
deleted file mode 100644
index a3789da01..000000000
--- a/src/video_core/shader/shader.h
+++ /dev/null
@@ -1,233 +0,0 @@
1// Copyright 2015 Citra Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <array>
8#include <cstddef>
9#include <functional>
10#include <type_traits>
11#include <nihstro/shader_bytecode.h>
12#include "common/assert.h"
13#include "common/common_funcs.h"
14#include "common/common_types.h"
15#include "common/vector_math.h"
16#include "video_core/pica_types.h"
17#include "video_core/regs_rasterizer.h"
18#include "video_core/regs_shader.h"
19
20using nihstro::RegisterType;
21using nihstro::SourceRegister;
22using nihstro::DestRegister;
23
24namespace Pica {
25
26namespace Shader {
27
28constexpr unsigned MAX_PROGRAM_CODE_LENGTH = 4096;
29constexpr unsigned MAX_SWIZZLE_DATA_LENGTH = 4096;
30
31struct AttributeBuffer {
32 alignas(16) Math::Vec4<float24> attr[16];
33};
34
35/// Handler type for receiving vertex outputs from vertex shader or geometry shader
36using VertexHandler = std::function<void(const AttributeBuffer&)>;
37
38/// Handler type for signaling to invert the vertex order of the next triangle
39using WindingSetter = std::function<void()>;
40
41struct OutputVertex {
42 Math::Vec4<float24> pos;
43 Math::Vec4<float24> quat;
44 Math::Vec4<float24> color;
45 Math::Vec2<float24> tc0;
46 Math::Vec2<float24> tc1;
47 float24 tc0_w;
48 INSERT_PADDING_WORDS(1);
49 Math::Vec3<float24> view;
50 INSERT_PADDING_WORDS(1);
51 Math::Vec2<float24> tc2;
52
53 static OutputVertex FromAttributeBuffer(const RasterizerRegs& regs,
54 const AttributeBuffer& output);
55};
56#define ASSERT_POS(var, pos) \
57 static_assert(offsetof(OutputVertex, var) == pos * sizeof(float24), "Semantic at wrong " \
58 "offset.")
59ASSERT_POS(pos, RasterizerRegs::VSOutputAttributes::POSITION_X);
60ASSERT_POS(quat, RasterizerRegs::VSOutputAttributes::QUATERNION_X);
61ASSERT_POS(color, RasterizerRegs::VSOutputAttributes::COLOR_R);
62ASSERT_POS(tc0, RasterizerRegs::VSOutputAttributes::TEXCOORD0_U);
63ASSERT_POS(tc1, RasterizerRegs::VSOutputAttributes::TEXCOORD1_U);
64ASSERT_POS(tc0_w, RasterizerRegs::VSOutputAttributes::TEXCOORD0_W);
65ASSERT_POS(view, RasterizerRegs::VSOutputAttributes::VIEW_X);
66ASSERT_POS(tc2, RasterizerRegs::VSOutputAttributes::TEXCOORD2_U);
67#undef ASSERT_POS
68static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD");
69static_assert(sizeof(OutputVertex) == 24 * sizeof(float), "OutputVertex has invalid size");
70
71/**
72 * This structure contains state information for primitive emitting in geometry shader.
73 */
74struct GSEmitter {
75 std::array<std::array<Math::Vec4<float24>, 16>, 3> buffer;
76 u8 vertex_id;
77 bool prim_emit;
78 bool winding;
79 u32 output_mask;
80
81 // Function objects are hidden behind a raw pointer to make the structure standard layout type,
82 // for JIT to use offsetof to access other members.
83 struct Handlers {
84 VertexHandler vertex_handler;
85 WindingSetter winding_setter;
86 } * handlers;
87
88 GSEmitter();
89 ~GSEmitter();
90 void Emit(Math::Vec4<float24> (&vertex)[16]);
91};
92static_assert(std::is_standard_layout<GSEmitter>::value, "GSEmitter is not standard layout type");
93
94/**
95 * This structure contains the state information that needs to be unique for a shader unit. The 3DS
96 * has four shader units that process shaders in parallel. At the present, Citra only implements a
97 * single shader unit that processes all shaders serially. Putting the state information in a struct
98 * here will make it easier for us to parallelize the shader processing later.
99 */
100struct UnitState {
101 explicit UnitState(GSEmitter* emitter = nullptr);
102 struct Registers {
103 // The registers are accessed by the shader JIT using SSE instructions, and are therefore
104 // required to be 16-byte aligned.
105 alignas(16) Math::Vec4<float24> input[16];
106 alignas(16) Math::Vec4<float24> temporary[16];
107 alignas(16) Math::Vec4<float24> output[16];
108 } registers;
109 static_assert(std::is_pod<Registers>::value, "Structure is not POD");
110
111 bool conditional_code[2];
112
113 // Two Address registers and one loop counter
114 // TODO: How many bits do these actually have?
115 s32 address_registers[3];
116
117 GSEmitter* emitter_ptr;
118
119 static size_t InputOffset(const SourceRegister& reg) {
120 switch (reg.GetRegisterType()) {
121 case RegisterType::Input:
122 return offsetof(UnitState, registers.input) +
123 reg.GetIndex() * sizeof(Math::Vec4<float24>);
124
125 case RegisterType::Temporary:
126 return offsetof(UnitState, registers.temporary) +
127 reg.GetIndex() * sizeof(Math::Vec4<float24>);
128
129 default:
130 UNREACHABLE();
131 return 0;
132 }
133 }
134
135 static size_t OutputOffset(const DestRegister& reg) {
136 switch (reg.GetRegisterType()) {
137 case RegisterType::Output:
138 return offsetof(UnitState, registers.output) +
139 reg.GetIndex() * sizeof(Math::Vec4<float24>);
140
141 case RegisterType::Temporary:
142 return offsetof(UnitState, registers.temporary) +
143 reg.GetIndex() * sizeof(Math::Vec4<float24>);
144
145 default:
146 UNREACHABLE();
147 return 0;
148 }
149 }
150
151 /**
152 * Loads the unit state with an input vertex.
153 *
154 * @param config Shader configuration registers corresponding to the unit.
155 * @param input Attribute buffer to load into the input registers.
156 */
157 void LoadInput(const ShaderRegs& config, const AttributeBuffer& input);
158
159 void WriteOutput(const ShaderRegs& config, AttributeBuffer& output);
160};
161
162/**
163 * This is an extended shader unit state that represents the special unit that can run both vertex
164 * shader and geometry shader. It contains an additional primitive emitter and utilities for
165 * geometry shader.
166 */
167struct GSUnitState : public UnitState {
168 GSUnitState();
169 void SetVertexHandler(VertexHandler vertex_handler, WindingSetter winding_setter);
170 void ConfigOutput(const ShaderRegs& config);
171
172 GSEmitter emitter;
173};
174
175struct ShaderSetup {
176 struct {
177 // The float uniforms are accessed by the shader JIT using SSE instructions, and are
178 // therefore required to be 16-byte aligned.
179 alignas(16) Math::Vec4<float24> f[96];
180
181 std::array<bool, 16> b;
182 std::array<Math::Vec4<u8>, 4> i;
183 } uniforms;
184
185 static size_t GetFloatUniformOffset(unsigned index) {
186 return offsetof(ShaderSetup, uniforms.f) + index * sizeof(Math::Vec4<float24>);
187 }
188
189 static size_t GetBoolUniformOffset(unsigned index) {
190 return offsetof(ShaderSetup, uniforms.b) + index * sizeof(bool);
191 }
192
193 static size_t GetIntUniformOffset(unsigned index) {
194 return offsetof(ShaderSetup, uniforms.i) + index * sizeof(Math::Vec4<u8>);
195 }
196
197 std::array<u32, MAX_PROGRAM_CODE_LENGTH> program_code;
198 std::array<u32, MAX_SWIZZLE_DATA_LENGTH> swizzle_data;
199
200 /// Data private to ShaderEngines
201 struct EngineData {
202 unsigned int entry_point;
203 /// Used by the JIT, points to a compiled shader object.
204 const void* cached_shader = nullptr;
205 } engine_data;
206};
207
208class ShaderEngine {
209public:
210 virtual ~ShaderEngine() = default;
211
212 /**
213 * Performs any shader unit setup that only needs to happen once per shader (as opposed to once
214 * per vertex, which would happen within the `Run` function).
215 */
216 virtual void SetupBatch(ShaderSetup& setup, unsigned int entry_point) = 0;
217
218 /**
219 * Runs the currently setup shader.
220 *
221 * @param setup Shader engine state, must be setup with SetupBatch on each shader change.
222 * @param state Shader unit state, must be setup with input data before each shader invocation.
223 */
224 virtual void Run(const ShaderSetup& setup, UnitState& state) const = 0;
225};
226
227// TODO(yuriks): Remove and make it non-global state somewhere
228ShaderEngine* GetEngine();
229void Shutdown();
230
231} // namespace Shader
232
233} // namespace Pica
diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp
deleted file mode 100644
index 9d4da4904..000000000
--- a/src/video_core/shader/shader_interpreter.cpp
+++ /dev/null
@@ -1,701 +0,0 @@
1// Copyright 2014 Citra Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include <algorithm>
6#include <array>
7#include <cmath>
8#include <numeric>
9#include <boost/container/static_vector.hpp>
10#include <boost/range/algorithm/fill.hpp>
11#include <nihstro/shader_bytecode.h>
12#include "common/assert.h"
13#include "common/common_types.h"
14#include "common/logging/log.h"
15#include "common/microprofile.h"
16#include "common/vector_math.h"
17#include "video_core/pica_state.h"
18#include "video_core/pica_types.h"
19#include "video_core/shader/shader.h"
20#include "video_core/shader/shader_interpreter.h"
21
22using nihstro::OpCode;
23using nihstro::Instruction;
24using nihstro::RegisterType;
25using nihstro::SourceRegister;
26using nihstro::SwizzlePattern;
27
28namespace Pica {
29
30namespace Shader {
31
32struct CallStackElement {
33 u32 final_address; // Address upon which we jump to return_address
34 u32 return_address; // Where to jump when leaving scope
35 u8 repeat_counter; // How often to repeat until this call stack element is removed
36 u8 loop_increment; // Which value to add to the loop counter after an iteration
37 // TODO: Should this be a signed value? Does it even matter?
38 u32 loop_address; // The address where we'll return to after each loop iteration
39};
40
41template <bool Debug>
42static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug>& debug_data,
43 unsigned offset) {
44 // TODO: Is there a maximal size for this?
45 boost::container::static_vector<CallStackElement, 16> call_stack;
46 u32 program_counter = offset;
47
48 state.conditional_code[0] = false;
49 state.conditional_code[1] = false;
50
51 auto call = [&program_counter, &call_stack](u32 offset, u32 num_instructions, u32 return_offset,
52 u8 repeat_count, u8 loop_increment) {
53 // -1 to make sure when incrementing the PC we end up at the correct offset
54 program_counter = offset - 1;
55 ASSERT(call_stack.size() < call_stack.capacity());
56 call_stack.push_back(
57 {offset + num_instructions, return_offset, repeat_count, loop_increment, offset});
58 };
59
60 auto evaluate_condition = [&state](Instruction::FlowControlType flow_control) {
61 using Op = Instruction::FlowControlType::Op;
62
63 bool result_x = flow_control.refx.Value() == state.conditional_code[0];
64 bool result_y = flow_control.refy.Value() == state.conditional_code[1];
65
66 switch (flow_control.op) {
67 case Op::Or:
68 return result_x || result_y;
69 case Op::And:
70 return result_x && result_y;
71 case Op::JustX:
72 return result_x;
73 case Op::JustY:
74 return result_y;
75 default:
76 UNREACHABLE();
77 return false;
78 }
79 };
80
81 const auto& uniforms = setup.uniforms;
82 const auto& swizzle_data = setup.swizzle_data;
83 const auto& program_code = setup.program_code;
84
85 // Placeholder for invalid inputs
86 static float24 dummy_vec4_float24[4];
87
88 unsigned iteration = 0;
89 bool exit_loop = false;
90 while (!exit_loop) {
91 if (!call_stack.empty()) {
92 auto& top = call_stack.back();
93 if (program_counter == top.final_address) {
94 state.address_registers[2] += top.loop_increment;
95
96 if (top.repeat_counter-- == 0) {
97 program_counter = top.return_address;
98 call_stack.pop_back();
99 } else {
100 program_counter = top.loop_address;
101 }
102
103 // TODO: Is "trying again" accurate to hardware?
104 continue;
105 }
106 }
107
108 const Instruction instr = {program_code[program_counter]};
109 const SwizzlePattern swizzle = {swizzle_data[instr.common.operand_desc_id]};
110
111 Record<DebugDataRecord::CUR_INSTR>(debug_data, iteration, program_counter);
112 if (iteration > 0)
113 Record<DebugDataRecord::NEXT_INSTR>(debug_data, iteration - 1, program_counter);
114
115 debug_data.max_offset = std::max<u32>(debug_data.max_offset, 1 + program_counter);
116
117 auto LookupSourceRegister = [&](const SourceRegister& source_reg) -> const float24* {
118 switch (source_reg.GetRegisterType()) {
119 case RegisterType::Input:
120 return &state.registers.input[source_reg.GetIndex()].x;
121
122 case RegisterType::Temporary:
123 return &state.registers.temporary[source_reg.GetIndex()].x;
124
125 case RegisterType::FloatUniform:
126 return &uniforms.f[source_reg.GetIndex()].x;
127
128 default:
129 return dummy_vec4_float24;
130 }
131 };
132
133 switch (instr.opcode.Value().GetInfo().type) {
134 case OpCode::Type::Arithmetic: {
135 const bool is_inverted =
136 (0 != (instr.opcode.Value().GetInfo().subtype & OpCode::Info::SrcInversed));
137
138 const int address_offset =
139 (instr.common.address_register_index == 0)
140 ? 0
141 : state.address_registers[instr.common.address_register_index - 1];
142
143 const float24* src1_ = LookupSourceRegister(instr.common.GetSrc1(is_inverted) +
144 (is_inverted ? 0 : address_offset));
145 const float24* src2_ = LookupSourceRegister(instr.common.GetSrc2(is_inverted) +
146 (is_inverted ? address_offset : 0));
147
148 const bool negate_src1 = ((bool)swizzle.negate_src1 != false);
149 const bool negate_src2 = ((bool)swizzle.negate_src2 != false);
150
151 float24 src1[4] = {
152 src1_[(int)swizzle.src1_selector_0.Value()],
153 src1_[(int)swizzle.src1_selector_1.Value()],
154 src1_[(int)swizzle.src1_selector_2.Value()],
155 src1_[(int)swizzle.src1_selector_3.Value()],
156 };
157 if (negate_src1) {
158 src1[0] = -src1[0];
159 src1[1] = -src1[1];
160 src1[2] = -src1[2];
161 src1[3] = -src1[3];
162 }
163 float24 src2[4] = {
164 src2_[(int)swizzle.src2_selector_0.Value()],
165 src2_[(int)swizzle.src2_selector_1.Value()],
166 src2_[(int)swizzle.src2_selector_2.Value()],
167 src2_[(int)swizzle.src2_selector_3.Value()],
168 };
169 if (negate_src2) {
170 src2[0] = -src2[0];
171 src2[1] = -src2[1];
172 src2[2] = -src2[2];
173 src2[3] = -src2[3];
174 }
175
176 float24* dest =
177 (instr.common.dest.Value() < 0x10)
178 ? &state.registers.output[instr.common.dest.Value().GetIndex()][0]
179 : (instr.common.dest.Value() < 0x20)
180 ? &state.registers.temporary[instr.common.dest.Value().GetIndex()][0]
181 : dummy_vec4_float24;
182
183 debug_data.max_opdesc_id =
184 std::max<u32>(debug_data.max_opdesc_id, 1 + instr.common.operand_desc_id);
185
186 switch (instr.opcode.Value().EffectiveOpCode()) {
187 case OpCode::Id::ADD: {
188 Record<DebugDataRecord::SRC1>(debug_data, iteration, src1);
189 Record<DebugDataRecord::SRC2>(debug_data, iteration, src2);
190 Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest);
191 for (int i = 0; i < 4; ++i) {
192 if (!swizzle.DestComponentEnabled(i))
193 continue;
194
195 dest[i] = src1[i] + src2[i];
196 }
197 Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest);
198 break;
199 }
200
201 case OpCode::Id::MUL: {
202 Record<DebugDataRecord::SRC1>(debug_data, iteration, src1);
203 Record<DebugDataRecord::SRC2>(debug_data, iteration, src2);
204 Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest);
205 for (int i = 0; i < 4; ++i) {
206 if (!swizzle.DestComponentEnabled(i))
207 continue;
208
209 dest[i] = src1[i] * src2[i];
210 }
211 Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest);
212 break;
213 }
214
215 case OpCode::Id::FLR:
216 Record<DebugDataRecord::SRC1>(debug_data, iteration, src1);
217 Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest);
218 for (int i = 0; i < 4; ++i) {
219 if (!swizzle.DestComponentEnabled(i))
220 continue;
221
222 dest[i] = float24::FromFloat32(std::floor(src1[i].ToFloat32()));
223 }
224 Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest);
225 break;
226
227 case OpCode::Id::MAX:
228 Record<DebugDataRecord::SRC1>(debug_data, iteration, src1);
229 Record<DebugDataRecord::SRC2>(debug_data, iteration, src2);
230 Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest);
231 for (int i = 0; i < 4; ++i) {
232 if (!swizzle.DestComponentEnabled(i))
233 continue;
234
235 // NOTE: Exact form required to match NaN semantics to hardware:
236 // max(0, NaN) -> NaN
237 // max(NaN, 0) -> 0
238 dest[i] = (src1[i] > src2[i]) ? src1[i] : src2[i];
239 }
240 Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest);
241 break;
242
243 case OpCode::Id::MIN:
244 Record<DebugDataRecord::SRC1>(debug_data, iteration, src1);
245 Record<DebugDataRecord::SRC2>(debug_data, iteration, src2);
246 Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest);
247 for (int i = 0; i < 4; ++i) {
248 if (!swizzle.DestComponentEnabled(i))
249 continue;
250
251 // NOTE: Exact form required to match NaN semantics to hardware:
252 // min(0, NaN) -> NaN
253 // min(NaN, 0) -> 0
254 dest[i] = (src1[i] < src2[i]) ? src1[i] : src2[i];
255 }
256 Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest);
257 break;
258
259 case OpCode::Id::DP3:
260 case OpCode::Id::DP4:
261 case OpCode::Id::DPH:
262 case OpCode::Id::DPHI: {
263 Record<DebugDataRecord::SRC1>(debug_data, iteration, src1);
264 Record<DebugDataRecord::SRC2>(debug_data, iteration, src2);
265 Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest);
266
267 OpCode::Id opcode = instr.opcode.Value().EffectiveOpCode();
268 if (opcode == OpCode::Id::DPH || opcode == OpCode::Id::DPHI)
269 src1[3] = float24::FromFloat32(1.0f);
270
271 int num_components = (opcode == OpCode::Id::DP3) ? 3 : 4;
272 float24 dot = std::inner_product(src1, src1 + num_components, src2,
273 float24::FromFloat32(0.f));
274
275 for (int i = 0; i < 4; ++i) {
276 if (!swizzle.DestComponentEnabled(i))
277 continue;
278
279 dest[i] = dot;
280 }
281 Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest);
282 break;
283 }
284
285 // Reciprocal
286 case OpCode::Id::RCP: {
287 Record<DebugDataRecord::SRC1>(debug_data, iteration, src1);
288 Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest);
289 float24 rcp_res = float24::FromFloat32(1.0f / src1[0].ToFloat32());
290 for (int i = 0; i < 4; ++i) {
291 if (!swizzle.DestComponentEnabled(i))
292 continue;
293
294 dest[i] = rcp_res;
295 }
296 Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest);
297 break;
298 }
299
300 // Reciprocal Square Root
301 case OpCode::Id::RSQ: {
302 Record<DebugDataRecord::SRC1>(debug_data, iteration, src1);
303 Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest);
304 float24 rsq_res = float24::FromFloat32(1.0f / std::sqrt(src1[0].ToFloat32()));
305 for (int i = 0; i < 4; ++i) {
306 if (!swizzle.DestComponentEnabled(i))
307 continue;
308
309 dest[i] = rsq_res;
310 }
311 Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest);
312 break;
313 }
314
315 case OpCode::Id::MOVA: {
316 Record<DebugDataRecord::SRC1>(debug_data, iteration, src1);
317 for (int i = 0; i < 2; ++i) {
318 if (!swizzle.DestComponentEnabled(i))
319 continue;
320
321 // TODO: Figure out how the rounding is done on hardware
322 state.address_registers[i] = static_cast<s32>(src1[i].ToFloat32());
323 }
324 Record<DebugDataRecord::ADDR_REG_OUT>(debug_data, iteration,
325 state.address_registers);
326 break;
327 }
328
329 case OpCode::Id::MOV: {
330 Record<DebugDataRecord::SRC1>(debug_data, iteration, src1);
331 Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest);
332 for (int i = 0; i < 4; ++i) {
333 if (!swizzle.DestComponentEnabled(i))
334 continue;
335
336 dest[i] = src1[i];
337 }
338 Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest);
339 break;
340 }
341
342 case OpCode::Id::SGE:
343 case OpCode::Id::SGEI:
344 Record<DebugDataRecord::SRC1>(debug_data, iteration, src1);
345 Record<DebugDataRecord::SRC2>(debug_data, iteration, src2);
346 Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest);
347 for (int i = 0; i < 4; ++i) {
348 if (!swizzle.DestComponentEnabled(i))
349 continue;
350
351 dest[i] = (src1[i] >= src2[i]) ? float24::FromFloat32(1.0f)
352 : float24::FromFloat32(0.0f);
353 }
354 Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest);
355 break;
356
357 case OpCode::Id::SLT:
358 case OpCode::Id::SLTI:
359 Record<DebugDataRecord::SRC1>(debug_data, iteration, src1);
360 Record<DebugDataRecord::SRC2>(debug_data, iteration, src2);
361 Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest);
362 for (int i = 0; i < 4; ++i) {
363 if (!swizzle.DestComponentEnabled(i))
364 continue;
365
366 dest[i] = (src1[i] < src2[i]) ? float24::FromFloat32(1.0f)
367 : float24::FromFloat32(0.0f);
368 }
369 Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest);
370 break;
371
372 case OpCode::Id::CMP:
373 Record<DebugDataRecord::SRC1>(debug_data, iteration, src1);
374 Record<DebugDataRecord::SRC2>(debug_data, iteration, src2);
375 for (int i = 0; i < 2; ++i) {
376 // TODO: Can you restrict to one compare via dest masking?
377
378 auto compare_op = instr.common.compare_op;
379 auto op = (i == 0) ? compare_op.x.Value() : compare_op.y.Value();
380
381 switch (op) {
382 case Instruction::Common::CompareOpType::Equal:
383 state.conditional_code[i] = (src1[i] == src2[i]);
384 break;
385
386 case Instruction::Common::CompareOpType::NotEqual:
387 state.conditional_code[i] = (src1[i] != src2[i]);
388 break;
389
390 case Instruction::Common::CompareOpType::LessThan:
391 state.conditional_code[i] = (src1[i] < src2[i]);
392 break;
393
394 case Instruction::Common::CompareOpType::LessEqual:
395 state.conditional_code[i] = (src1[i] <= src2[i]);
396 break;
397
398 case Instruction::Common::CompareOpType::GreaterThan:
399 state.conditional_code[i] = (src1[i] > src2[i]);
400 break;
401
402 case Instruction::Common::CompareOpType::GreaterEqual:
403 state.conditional_code[i] = (src1[i] >= src2[i]);
404 break;
405
406 default:
407 LOG_ERROR(HW_GPU, "Unknown compare mode %x", static_cast<int>(op));
408 break;
409 }
410 }
411 Record<DebugDataRecord::CMP_RESULT>(debug_data, iteration, state.conditional_code);
412 break;
413
414 case OpCode::Id::EX2: {
415 Record<DebugDataRecord::SRC1>(debug_data, iteration, src1);
416 Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest);
417
418 // EX2 only takes first component exp2 and writes it to all dest components
419 float24 ex2_res = float24::FromFloat32(std::exp2(src1[0].ToFloat32()));
420 for (int i = 0; i < 4; ++i) {
421 if (!swizzle.DestComponentEnabled(i))
422 continue;
423
424 dest[i] = ex2_res;
425 }
426
427 Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest);
428 break;
429 }
430
431 case OpCode::Id::LG2: {
432 Record<DebugDataRecord::SRC1>(debug_data, iteration, src1);
433 Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest);
434
435 // LG2 only takes the first component log2 and writes it to all dest components
436 float24 lg2_res = float24::FromFloat32(std::log2(src1[0].ToFloat32()));
437 for (int i = 0; i < 4; ++i) {
438 if (!swizzle.DestComponentEnabled(i))
439 continue;
440
441 dest[i] = lg2_res;
442 }
443
444 Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest);
445 break;
446 }
447
448 default:
449 LOG_ERROR(HW_GPU, "Unhandled arithmetic instruction: 0x%02x (%s): 0x%08x",
450 (int)instr.opcode.Value().EffectiveOpCode(),
451 instr.opcode.Value().GetInfo().name, instr.hex);
452 DEBUG_ASSERT(false);
453 break;
454 }
455
456 break;
457 }
458
459 case OpCode::Type::MultiplyAdd: {
460 if ((instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD) ||
461 (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI)) {
462 const SwizzlePattern& swizzle = *reinterpret_cast<const SwizzlePattern*>(
463 &swizzle_data[instr.mad.operand_desc_id]);
464
465 bool is_inverted = (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI);
466
467 const int address_offset =
468 (instr.mad.address_register_index == 0)
469 ? 0
470 : state.address_registers[instr.mad.address_register_index - 1];
471
472 const float24* src1_ = LookupSourceRegister(instr.mad.GetSrc1(is_inverted));
473 const float24* src2_ = LookupSourceRegister(instr.mad.GetSrc2(is_inverted) +
474 (!is_inverted * address_offset));
475 const float24* src3_ = LookupSourceRegister(instr.mad.GetSrc3(is_inverted) +
476 (is_inverted * address_offset));
477
478 const bool negate_src1 = ((bool)swizzle.negate_src1 != false);
479 const bool negate_src2 = ((bool)swizzle.negate_src2 != false);
480 const bool negate_src3 = ((bool)swizzle.negate_src3 != false);
481
482 float24 src1[4] = {
483 src1_[(int)swizzle.src1_selector_0.Value()],
484 src1_[(int)swizzle.src1_selector_1.Value()],
485 src1_[(int)swizzle.src1_selector_2.Value()],
486 src1_[(int)swizzle.src1_selector_3.Value()],
487 };
488 if (negate_src1) {
489 src1[0] = -src1[0];
490 src1[1] = -src1[1];
491 src1[2] = -src1[2];
492 src1[3] = -src1[3];
493 }
494 float24 src2[4] = {
495 src2_[(int)swizzle.src2_selector_0.Value()],
496 src2_[(int)swizzle.src2_selector_1.Value()],
497 src2_[(int)swizzle.src2_selector_2.Value()],
498 src2_[(int)swizzle.src2_selector_3.Value()],
499 };
500 if (negate_src2) {
501 src2[0] = -src2[0];
502 src2[1] = -src2[1];
503 src2[2] = -src2[2];
504 src2[3] = -src2[3];
505 }
506 float24 src3[4] = {
507 src3_[(int)swizzle.src3_selector_0.Value()],
508 src3_[(int)swizzle.src3_selector_1.Value()],
509 src3_[(int)swizzle.src3_selector_2.Value()],
510 src3_[(int)swizzle.src3_selector_3.Value()],
511 };
512 if (negate_src3) {
513 src3[0] = -src3[0];
514 src3[1] = -src3[1];
515 src3[2] = -src3[2];
516 src3[3] = -src3[3];
517 }
518
519 float24* dest =
520 (instr.mad.dest.Value() < 0x10)
521 ? &state.registers.output[instr.mad.dest.Value().GetIndex()][0]
522 : (instr.mad.dest.Value() < 0x20)
523 ? &state.registers.temporary[instr.mad.dest.Value().GetIndex()][0]
524 : dummy_vec4_float24;
525
526 Record<DebugDataRecord::SRC1>(debug_data, iteration, src1);
527 Record<DebugDataRecord::SRC2>(debug_data, iteration, src2);
528 Record<DebugDataRecord::SRC3>(debug_data, iteration, src3);
529 Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest);
530 for (int i = 0; i < 4; ++i) {
531 if (!swizzle.DestComponentEnabled(i))
532 continue;
533
534 dest[i] = src1[i] * src2[i] + src3[i];
535 }
536 Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest);
537 } else {
538 LOG_ERROR(HW_GPU, "Unhandled multiply-add instruction: 0x%02x (%s): 0x%08x",
539 (int)instr.opcode.Value().EffectiveOpCode(),
540 instr.opcode.Value().GetInfo().name, instr.hex);
541 }
542 break;
543 }
544
545 default: {
546 // Handle each instruction on its own
547 switch (instr.opcode.Value()) {
548 case OpCode::Id::END:
549 exit_loop = true;
550 break;
551
552 case OpCode::Id::JMPC:
553 Record<DebugDataRecord::COND_CMP_IN>(debug_data, iteration, state.conditional_code);
554 if (evaluate_condition(instr.flow_control)) {
555 program_counter = instr.flow_control.dest_offset - 1;
556 }
557 break;
558
559 case OpCode::Id::JMPU:
560 Record<DebugDataRecord::COND_BOOL_IN>(
561 debug_data, iteration, uniforms.b[instr.flow_control.bool_uniform_id]);
562
563 if (uniforms.b[instr.flow_control.bool_uniform_id] ==
564 !(instr.flow_control.num_instructions & 1)) {
565 program_counter = instr.flow_control.dest_offset - 1;
566 }
567 break;
568
569 case OpCode::Id::CALL:
570 call(instr.flow_control.dest_offset, instr.flow_control.num_instructions,
571 program_counter + 1, 0, 0);
572 break;
573
574 case OpCode::Id::CALLU:
575 Record<DebugDataRecord::COND_BOOL_IN>(
576 debug_data, iteration, uniforms.b[instr.flow_control.bool_uniform_id]);
577 if (uniforms.b[instr.flow_control.bool_uniform_id]) {
578 call(instr.flow_control.dest_offset, instr.flow_control.num_instructions,
579 program_counter + 1, 0, 0);
580 }
581 break;
582
583 case OpCode::Id::CALLC:
584 Record<DebugDataRecord::COND_CMP_IN>(debug_data, iteration, state.conditional_code);
585 if (evaluate_condition(instr.flow_control)) {
586 call(instr.flow_control.dest_offset, instr.flow_control.num_instructions,
587 program_counter + 1, 0, 0);
588 }
589 break;
590
591 case OpCode::Id::NOP:
592 break;
593
594 case OpCode::Id::IFU:
595 Record<DebugDataRecord::COND_BOOL_IN>(
596 debug_data, iteration, uniforms.b[instr.flow_control.bool_uniform_id]);
597 if (uniforms.b[instr.flow_control.bool_uniform_id]) {
598 call(program_counter + 1, instr.flow_control.dest_offset - program_counter - 1,
599 instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0,
600 0);
601 } else {
602 call(instr.flow_control.dest_offset, instr.flow_control.num_instructions,
603 instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0,
604 0);
605 }
606
607 break;
608
609 case OpCode::Id::IFC: {
610 // TODO: Do we need to consider swizzlers here?
611
612 Record<DebugDataRecord::COND_CMP_IN>(debug_data, iteration, state.conditional_code);
613 if (evaluate_condition(instr.flow_control)) {
614 call(program_counter + 1, instr.flow_control.dest_offset - program_counter - 1,
615 instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0,
616 0);
617 } else {
618 call(instr.flow_control.dest_offset, instr.flow_control.num_instructions,
619 instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0,
620 0);
621 }
622
623 break;
624 }
625
626 case OpCode::Id::LOOP: {
627 Math::Vec4<u8> loop_param(uniforms.i[instr.flow_control.int_uniform_id].x,
628 uniforms.i[instr.flow_control.int_uniform_id].y,
629 uniforms.i[instr.flow_control.int_uniform_id].z,
630 uniforms.i[instr.flow_control.int_uniform_id].w);
631 state.address_registers[2] = loop_param.y;
632
633 Record<DebugDataRecord::LOOP_INT_IN>(debug_data, iteration, loop_param);
634 call(program_counter + 1, instr.flow_control.dest_offset - program_counter,
635 instr.flow_control.dest_offset + 1, loop_param.x, loop_param.z);
636 break;
637 }
638
639 case OpCode::Id::EMIT: {
640 GSEmitter* emitter = state.emitter_ptr;
641 ASSERT_MSG(emitter, "Execute EMIT on VS");
642 emitter->Emit(state.registers.output);
643 break;
644 }
645
646 case OpCode::Id::SETEMIT: {
647 GSEmitter* emitter = state.emitter_ptr;
648 ASSERT_MSG(emitter, "Execute SETEMIT on VS");
649 emitter->vertex_id = instr.setemit.vertex_id;
650 emitter->prim_emit = instr.setemit.prim_emit != 0;
651 emitter->winding = instr.setemit.winding != 0;
652 break;
653 }
654
655 default:
656 LOG_ERROR(HW_GPU, "Unhandled instruction: 0x%02x (%s): 0x%08x",
657 (int)instr.opcode.Value().EffectiveOpCode(),
658 instr.opcode.Value().GetInfo().name, instr.hex);
659 break;
660 }
661
662 break;
663 }
664 }
665
666 ++program_counter;
667 ++iteration;
668 }
669}
670
void InterpreterEngine::SetupBatch(ShaderSetup& setup, unsigned int entry_point) {
    // The interpreter needs no per-batch compilation; just validate the entry
    // point and record where execution should start for this batch.
    ASSERT(entry_point < MAX_PROGRAM_CODE_LENGTH);
    setup.engine_data.entry_point = entry_point;
}
675
676MICROPROFILE_DECLARE(GPU_Shader);
677
678void InterpreterEngine::Run(const ShaderSetup& setup, UnitState& state) const {
679
680 MICROPROFILE_SCOPE(GPU_Shader);
681
682 DebugData<false> dummy_debug_data;
683 RunInterpreter(setup, state, dummy_debug_data, setup.engine_data.entry_point);
684}
685
686DebugData<true> InterpreterEngine::ProduceDebugInfo(const ShaderSetup& setup,
687 const AttributeBuffer& input,
688 const ShaderRegs& config) const {
689 UnitState state;
690 DebugData<true> debug_data;
691
692 // Setup input register table
693 boost::fill(state.registers.input, Math::Vec4<float24>::AssignToAll(float24::Zero()));
694 state.LoadInput(config, input);
695 RunInterpreter(setup, state, debug_data, setup.engine_data.entry_point);
696 return debug_data;
697}
698
699} // namespace
700
701} // namespace
diff --git a/src/video_core/shader/shader_interpreter.h b/src/video_core/shader/shader_interpreter.h
deleted file mode 100644
index 50fd7c69d..000000000
--- a/src/video_core/shader/shader_interpreter.h
+++ /dev/null
@@ -1,32 +0,0 @@
1// Copyright 2014 Citra Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include "video_core/shader/debug_data.h"
8#include "video_core/shader/shader.h"
9
10namespace Pica {
11
12namespace Shader {
13
/// Software (interpreting) implementation of the PICA shader engine.
class InterpreterEngine final : public ShaderEngine {
public:
    /// Records the entry point for the batch; the interpreter needs no compilation step.
    void SetupBatch(ShaderSetup& setup, unsigned int entry_point) override;

    /// Runs the shader program on the given unit state without collecting debug data.
    void Run(const ShaderSetup& setup, UnitState& state) const override;

    /**
     * Produce debug information based on the given shader and input vertex
     * @param setup Shader engine state
     * @param input Input vertex into the shader
     * @param config Configuration object for the shader pipeline
     * @return Debug information for this shader with regards to the given vertex
     */
    DebugData<true> ProduceDebugInfo(const ShaderSetup& setup, const AttributeBuffer& input,
                                     const ShaderRegs& config) const;
};
29
30} // namespace
31
32} // namespace
diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
deleted file mode 100644
index 73c21871c..000000000
--- a/src/video_core/shader/shader_jit_x64.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
1// Copyright 2016 Citra Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include "common/hash.h"
6#include "common/microprofile.h"
7#include "video_core/shader/shader.h"
8#include "video_core/shader/shader_jit_x64.h"
9#include "video_core/shader/shader_jit_x64_compiler.h"
10
11namespace Pica {
12namespace Shader {
13
// Defaulted out-of-line: JitShader is only forward-declared in the header, so the
// destructors of the unique_ptr cache entries must be instantiated in this file.
JitX64Engine::JitX64Engine() = default;
JitX64Engine::~JitX64Engine() = default;
16
17void JitX64Engine::SetupBatch(ShaderSetup& setup, unsigned int entry_point) {
18 ASSERT(entry_point < MAX_PROGRAM_CODE_LENGTH);
19 setup.engine_data.entry_point = entry_point;
20
21 u64 code_hash = Common::ComputeHash64(&setup.program_code, sizeof(setup.program_code));
22 u64 swizzle_hash = Common::ComputeHash64(&setup.swizzle_data, sizeof(setup.swizzle_data));
23
24 u64 cache_key = code_hash ^ swizzle_hash;
25 auto iter = cache.find(cache_key);
26 if (iter != cache.end()) {
27 setup.engine_data.cached_shader = iter->second.get();
28 } else {
29 auto shader = std::make_unique<JitShader>();
30 shader->Compile(&setup.program_code, &setup.swizzle_data);
31 setup.engine_data.cached_shader = shader.get();
32 cache.emplace_hint(iter, cache_key, std::move(shader));
33 }
34}
35
36MICROPROFILE_DECLARE(GPU_Shader);
37
38void JitX64Engine::Run(const ShaderSetup& setup, UnitState& state) const {
39 ASSERT(setup.engine_data.cached_shader != nullptr);
40
41 MICROPROFILE_SCOPE(GPU_Shader);
42
43 const JitShader* shader = static_cast<const JitShader*>(setup.engine_data.cached_shader);
44 shader->Run(setup, state, setup.engine_data.entry_point);
45}
46
47} // namespace Shader
48} // namespace Pica
diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h
deleted file mode 100644
index 078b2cba5..000000000
--- a/src/video_core/shader/shader_jit_x64.h
+++ /dev/null
@@ -1,30 +0,0 @@
1// Copyright 2016 Citra Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <memory>
8#include <unordered_map>
9#include "common/common_types.h"
10#include "video_core/shader/shader.h"
11
12namespace Pica {
13namespace Shader {
14
15class JitShader;
16
/// x86-64 JIT implementation of the PICA shader engine.
class JitX64Engine final : public ShaderEngine {
public:
    JitX64Engine();
    ~JitX64Engine() override;

    /// Records the entry point and compiles (or fetches from cache) machine code
    /// for the setup's program/swizzle data.
    void SetupBatch(ShaderSetup& setup, unsigned int entry_point) override;

    /// Executes the compiled shader cached by SetupBatch().
    void Run(const ShaderSetup& setup, UnitState& state) const override;

private:
    /// Compiled shaders, keyed by a hash of program code and swizzle data.
    std::unordered_map<u64, std::unique_ptr<JitShader>> cache;
};
28
29} // namespace Shader
30} // namespace Pica
diff --git a/src/video_core/shader/shader_jit_x64_compiler.cpp b/src/video_core/shader/shader_jit_x64_compiler.cpp
deleted file mode 100644
index 1b31623bd..000000000
--- a/src/video_core/shader/shader_jit_x64_compiler.cpp
+++ /dev/null
@@ -1,942 +0,0 @@
1// Copyright 2015 Citra Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include <algorithm>
6#include <cmath>
7#include <cstdint>
8#include <nihstro/shader_bytecode.h>
9#include <smmintrin.h>
10#include <xmmintrin.h>
11#include "common/assert.h"
12#include "common/logging/log.h"
13#include "common/vector_math.h"
14#include "common/x64/cpu_detect.h"
15#include "common/x64/xbyak_abi.h"
16#include "common/x64/xbyak_util.h"
17#include "video_core/pica_state.h"
18#include "video_core/pica_types.h"
19#include "video_core/shader/shader.h"
20#include "video_core/shader/shader_jit_x64_compiler.h"
21
22using namespace Common::X64;
23using namespace Xbyak::util;
24using Xbyak::Label;
25using Xbyak::Reg32;
26using Xbyak::Reg64;
27using Xbyak::Xmm;
28
29namespace Pica {
30
31namespace Shader {
32
33typedef void (JitShader::*JitFunction)(Instruction instr);
34
35const JitFunction instr_table[64] = {
36 &JitShader::Compile_ADD, // add
37 &JitShader::Compile_DP3, // dp3
38 &JitShader::Compile_DP4, // dp4
39 &JitShader::Compile_DPH, // dph
40 nullptr, // unknown
41 &JitShader::Compile_EX2, // ex2
42 &JitShader::Compile_LG2, // lg2
43 nullptr, // unknown
44 &JitShader::Compile_MUL, // mul
45 &JitShader::Compile_SGE, // sge
46 &JitShader::Compile_SLT, // slt
47 &JitShader::Compile_FLR, // flr
48 &JitShader::Compile_MAX, // max
49 &JitShader::Compile_MIN, // min
50 &JitShader::Compile_RCP, // rcp
51 &JitShader::Compile_RSQ, // rsq
52 nullptr, // unknown
53 nullptr, // unknown
54 &JitShader::Compile_MOVA, // mova
55 &JitShader::Compile_MOV, // mov
56 nullptr, // unknown
57 nullptr, // unknown
58 nullptr, // unknown
59 nullptr, // unknown
60 &JitShader::Compile_DPH, // dphi
61 nullptr, // unknown
62 &JitShader::Compile_SGE, // sgei
63 &JitShader::Compile_SLT, // slti
64 nullptr, // unknown
65 nullptr, // unknown
66 nullptr, // unknown
67 nullptr, // unknown
68 nullptr, // unknown
69 &JitShader::Compile_NOP, // nop
70 &JitShader::Compile_END, // end
71 nullptr, // break
72 &JitShader::Compile_CALL, // call
73 &JitShader::Compile_CALLC, // callc
74 &JitShader::Compile_CALLU, // callu
75 &JitShader::Compile_IF, // ifu
76 &JitShader::Compile_IF, // ifc
77 &JitShader::Compile_LOOP, // loop
78 &JitShader::Compile_EMIT, // emit
79 &JitShader::Compile_SETE, // sete
80 &JitShader::Compile_JMP, // jmpc
81 &JitShader::Compile_JMP, // jmpu
82 &JitShader::Compile_CMP, // cmp
83 &JitShader::Compile_CMP, // cmp
84 &JitShader::Compile_MAD, // madi
85 &JitShader::Compile_MAD, // madi
86 &JitShader::Compile_MAD, // madi
87 &JitShader::Compile_MAD, // madi
88 &JitShader::Compile_MAD, // madi
89 &JitShader::Compile_MAD, // madi
90 &JitShader::Compile_MAD, // madi
91 &JitShader::Compile_MAD, // madi
92 &JitShader::Compile_MAD, // mad
93 &JitShader::Compile_MAD, // mad
94 &JitShader::Compile_MAD, // mad
95 &JitShader::Compile_MAD, // mad
96 &JitShader::Compile_MAD, // mad
97 &JitShader::Compile_MAD, // mad
98 &JitShader::Compile_MAD, // mad
99 &JitShader::Compile_MAD, // mad
100};
101
// The following is used to alias some commonly used registers. Generally, RAX-RDX and XMM0-XMM3 can
// be used as scratch registers within a compiler function. The other registers have designated
// purposes, as documented below:

/// Pointer to the ShaderSetup block (uniform memory)
static const Reg64 SETUP = r9;
/// The two 32-bit VS address offset registers set by the MOVA instruction
static const Reg64 ADDROFFS_REG_0 = r10;
static const Reg64 ADDROFFS_REG_1 = r11;
/// VS loop count register (Multiplied by 16)
static const Reg32 LOOPCOUNT_REG = r12d;
/// Current VS loop iteration number (we could probably use LOOPCOUNT_REG, but this is quicker)
static const Reg32 LOOPCOUNT = esi;
/// Number to increment LOOPCOUNT_REG by on each loop iteration (Multiplied by 16)
static const Reg32 LOOPINC = edi;
/// Result of the previous CMP instruction for the X-component comparison
static const Reg64 COND0 = r13;
/// Result of the previous CMP instruction for the Y-component comparison
static const Reg64 COND1 = r14;
/// Pointer to the UnitState instance for the current VS unit
static const Reg64 STATE = r15;
/// SIMD scratch register
static const Xmm SCRATCH = xmm0;
/// Loaded with the first swizzled source register, otherwise can be used as a scratch register
static const Xmm SRC1 = xmm1;
/// Loaded with the second swizzled source register, otherwise can be used as a scratch register
static const Xmm SRC2 = xmm2;
/// Loaded with the third swizzled source register, otherwise can be used as a scratch register
static const Xmm SRC3 = xmm3;
/// Additional scratch register
static const Xmm SCRATCH2 = xmm4;
/// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one
static const Xmm ONE = xmm14;
/// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR
static const Xmm NEGBIT = xmm15;

// State registers that must not be modified by external function calls: they are
// saved/restored around any far call emitted by the generated code.
// Scratch registers, e.g., SRC1 and SCRATCH, have to be saved on the side if needed
static const BitSet32 persistent_regs = BuildRegSet({
    // Pointers to register blocks
    SETUP, STATE,
    // Cached registers
    ADDROFFS_REG_0, ADDROFFS_REG_1, LOOPCOUNT_REG, COND0, COND1,
    // Constants
    ONE, NEGBIT,
    // Loop variables
    LOOPCOUNT, LOOPINC,
});

/// Raw constant for the source register selector that indicates no swizzling is performed
static const u8 NO_SRC_REG_SWIZZLE = 0x1b;
/// Raw constant for the destination register enable mask that indicates all components are enabled
static const u8 NO_DEST_REG_MASK = 0xf;
155
/// Helper called from generated code (via CallFarFunction) to report a failure message.
static void LogCritical(const char* msg) {
    LOG_CRITICAL(HW_GPU, "%s", msg);
}
159
/// If the compile-time condition does not hold, emits a call to LogCritical into the
/// generated code so the failure is reported whenever this shader runs.
void JitShader::Compile_Assert(bool condition, const char* msg) {
    if (!condition) {
        mov(ABI_PARAM1, reinterpret_cast<size_t>(msg));
        CallFarFunction(*this, LogCritical);
    }
}
166
/**
 * Loads and swizzles a source register into the specified XMM register.
 * @param instr VS instruction, used for determining how to load the source register
 * @param src_num Number indicating which source register to load (1 = src1, 2 = src2, 3 = src3)
 * @param src_reg SourceRegister object corresponding to the source register to load
 * @param dest Destination XMM register to store the loaded, swizzled source register
 */
void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg,
                                   Xmm dest) {
    Reg64 src_ptr;
    size_t src_offset;

    // Float uniforms live in the ShaderSetup block (SETUP); inputs/temporaries are
    // addressed relative to the UnitState instance (STATE).
    if (src_reg.GetRegisterType() == RegisterType::FloatUniform) {
        src_ptr = SETUP;
        src_offset = ShaderSetup::GetFloatUniformOffset(src_reg.GetIndex());
    } else {
        src_ptr = STATE;
        src_offset = UnitState::InputOffset(src_reg);
    }

    // The offset is embedded as a 32-bit displacement in the loads below.
    int src_offset_disp = (int)src_offset;
    ASSERT_MSG(src_offset == src_offset_disp, "Source register offset too large for int type");

    unsigned operand_desc_id;

    const bool is_inverted =
        (0 != (instr.opcode.Value().GetInfo().subtype & OpCode::Info::SrcInversed));

    unsigned address_register_index;
    unsigned offset_src;

    // MAD/MADI use a different encoding for the operand descriptor and address
    // register than the common format. Only one source operand (offset_src) may be
    // dynamically addressed; which one it is depends on the "inverted" encoding.
    if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD ||
        instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) {
        operand_desc_id = instr.mad.operand_desc_id;
        offset_src = is_inverted ? 3 : 2;
        address_register_index = instr.mad.address_register_index;
    } else {
        operand_desc_id = instr.common.operand_desc_id;
        offset_src = is_inverted ? 2 : 1;
        address_register_index = instr.common.address_register_index;
    }

    if (src_num == offset_src && address_register_index != 0) {
        // Dynamic addressing: add the selected cached address register to the base
        // displacement when loading.
        switch (address_register_index) {
        case 1: // address offset 1
            movaps(dest, xword[src_ptr + ADDROFFS_REG_0 + src_offset_disp]);
            break;
        case 2: // address offset 2
            movaps(dest, xword[src_ptr + ADDROFFS_REG_1 + src_offset_disp]);
            break;
        case 3: // address offset 3
            movaps(dest, xword[src_ptr + LOOPCOUNT_REG.cvt64() + src_offset_disp]);
            break;
        default:
            UNREACHABLE();
            break;
        }
    } else {
        // Load the source
        movaps(dest, xword[src_ptr + src_offset_disp]);
    }

    SwizzlePattern swiz = {(*swizzle_data)[operand_desc_id]};

    // Generate instructions for source register swizzling as needed
    u8 sel = swiz.GetRawSelector(src_num);
    if (sel != NO_SRC_REG_SWIZZLE) {
        // Selector component order needs to be reversed for the SHUFPS instruction
        sel = ((sel & 0xc0) >> 6) | ((sel & 3) << 6) | ((sel & 0xc) << 2) | ((sel & 0x30) >> 2);

        // Shuffle inputs for swizzle
        shufps(dest, dest, sel);
    }

    // If the source register should be negated, flip the negative bit using XOR
    const bool negate[] = {swiz.negate_src1, swiz.negate_src2, swiz.negate_src3};
    if (negate[src_num - 1]) {
        xorps(dest, NEGBIT);
    }
}
247
/// Stores `src` to the instruction's destination register, honoring the
/// per-component destination write mask from the swizzle pattern.
void JitShader::Compile_DestEnable(Instruction instr, Xmm src) {
    DestRegister dest;
    unsigned operand_desc_id;
    // MAD/MADI encode their destination and operand descriptor differently.
    if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD ||
        instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) {
        operand_desc_id = instr.mad.operand_desc_id;
        dest = instr.mad.dest.Value();
    } else {
        operand_desc_id = instr.common.operand_desc_id;
        dest = instr.common.dest.Value();
    }

    SwizzlePattern swiz = {(*swizzle_data)[operand_desc_id]};

    size_t dest_offset_disp = UnitState::OutputOffset(dest);

    // If all components are enabled, write the result to the destination register
    if (swiz.dest_mask == NO_DEST_REG_MASK) {
        // Store dest back to memory
        movaps(xword[STATE + dest_offset_disp], src);

    } else {
        // Not all components are enabled, so mask the result when storing to the destination
        // register...
        movaps(SCRATCH, xword[STATE + dest_offset_disp]);

        if (Common::GetCPUCaps().sse4_1) {
            // BLENDPS takes the mask in X-lowest-bit order; rearrange the dest_mask bits.
            u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) |
                      ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1);
            blendps(SCRATCH, src, mask);
        } else {
            movaps(SCRATCH2, src);
            unpckhps(SCRATCH2, SCRATCH); // Unpack Z/W components of source and destination
            unpcklps(SCRATCH, src); // Unpack X/Y components of source and destination

            // Compute selector to selectively copy source components to destination for SHUFPS
            // instruction
            u8 sel = ((swiz.DestComponentEnabled(0) ? 1 : 0) << 0) |
                     ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) |
                     ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) |
                     ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6);
            shufps(SCRATCH, SCRATCH2, sel);
        }

        // Store dest back to memory
        movaps(xword[STATE + dest_offset_disp], SCRATCH);
    }
}
296
/// Emits a PICA-conformant multiply of src1 by src2 (result in src1).
/// Clobbers src2 and scratch in the process.
void JitShader::Compile_SanitizedMul(Xmm src1, Xmm src2, Xmm scratch) {
    // 0 * inf and inf * 0 in the PICA should return 0 instead of NaN. This can be implemented by
    // checking for NaNs before and after the multiplication. If the multiplication result is NaN
    // where neither source was, this NaN was generated by a 0 * inf multiplication, and so the
    // result should be transformed to 0 to match PICA fp rules.

    // Set scratch to mask of (src1 != NaN and src2 != NaN)
    movaps(scratch, src1);
    cmpordps(scratch, src2);

    mulps(src1, src2);

    // Set src2 to mask of (result == NaN)
    movaps(src2, src1);
    cmpunordps(src2, src2);

    // Clear components where scratch != src2 (i.e. if result is NaN where neither source was NaN)
    xorps(scratch, src2);
    andps(src1, scratch);
}
317
/// Emits code evaluating the instruction's flow-control condition against the
/// cached CMP results (COND0/COND1); leaves a nonzero value in eax iff the
/// condition holds. Clobbers eax and (for Or/And) ebx.
/// NOTE(review): ebx is callee-saved in both x86-64 ABIs — presumably the
/// generated prologue preserves it; confirm against the Compile() epilogue.
void JitShader::Compile_EvaluateCondition(Instruction instr) {
    // Note: NXOR is used below to check for equality
    switch (instr.flow_control.op) {
    case Instruction::FlowControlType::Or:
        mov(eax, COND0);
        mov(ebx, COND1);
        xor_(eax, (instr.flow_control.refx.Value() ^ 1));
        xor_(ebx, (instr.flow_control.refy.Value() ^ 1));
        or_(eax, ebx);
        break;

    case Instruction::FlowControlType::And:
        mov(eax, COND0);
        mov(ebx, COND1);
        xor_(eax, (instr.flow_control.refx.Value() ^ 1));
        xor_(ebx, (instr.flow_control.refy.Value() ^ 1));
        and_(eax, ebx);
        break;

    case Instruction::FlowControlType::JustX:
        mov(eax, COND0);
        xor_(eax, (instr.flow_control.refx.Value() ^ 1));
        break;

    case Instruction::FlowControlType::JustY:
        mov(eax, COND1);
        xor_(eax, (instr.flow_control.refy.Value() ^ 1));
        break;
    }
}
348
/// Emits a test of the instruction's boolean uniform: sets ZF in EFLAGS when the
/// uniform is false, for consumption by a following conditional jump.
void JitShader::Compile_UniformCondition(Instruction instr) {
    size_t offset = ShaderSetup::GetBoolUniformOffset(instr.flow_control.bool_uniform_id);
    cmp(byte[SETUP + offset], 0);
}
353
/// Returns the persistent registers that are caller-saved under the host ABI,
/// i.e. those that must be pushed/popped around far function calls.
BitSet32 JitShader::PersistentCallerSavedRegs() {
    return persistent_regs & ABI_ALL_CALLER_SAVED;
}
357
/// ADD: component-wise addition of src1 and src2.
void JitShader::Compile_ADD(Instruction instr) {
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
    addps(SRC1, SRC2);
    Compile_DestEnable(instr, SRC1);
}
364
/// DP3: 3-component dot product; the scalar result is broadcast to all lanes
/// before the destination write mask is applied.
void JitShader::Compile_DP3(Instruction instr) {
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);

    Compile_SanitizedMul(SRC1, SRC2, SCRATCH);

    // Broadcast the y and z products into SRC2/SRC3 ...
    movaps(SRC2, SRC1);
    shufps(SRC2, SRC2, _MM_SHUFFLE(1, 1, 1, 1));

    movaps(SRC3, SRC1);
    shufps(SRC3, SRC3, _MM_SHUFFLE(2, 2, 2, 2));

    // ... broadcast the x product, then sum: every lane holds x*x' + y*y' + z*z'.
    shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0));
    addps(SRC1, SRC2);
    addps(SRC1, SRC3);

    Compile_DestEnable(instr, SRC1);
}
383
/// DP4: 4-component dot product via two shuffle+add horizontal reductions;
/// the scalar result ends up in all lanes.
void JitShader::Compile_DP4(Instruction instr) {
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);

    Compile_SanitizedMul(SRC1, SRC2, SCRATCH);

    movaps(SRC2, SRC1);
    shufps(SRC1, SRC1, _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
    addps(SRC1, SRC2);

    movaps(SRC2, SRC1);
    shufps(SRC1, SRC1, _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
    addps(SRC1, SRC2);

    Compile_DestEnable(instr, SRC1);
}
400
/// DPH/DPHI: homogeneous dot product — src1.w is forced to 1.0 before a
/// 4-component dot product. DPHI reads the inverted source field encoding.
void JitShader::Compile_DPH(Instruction instr) {
    if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::DPHI) {
        Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
        Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2);
    } else {
        Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
        Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
    }

    if (Common::GetCPUCaps().sse4_1) {
        // Set 4th component to 1.0
        blendps(SRC1, ONE, 0b1000);
    } else {
        // Set 4th component to 1.0
        movaps(SCRATCH, SRC1);
        unpckhps(SCRATCH, ONE);  // XYZW, 1111 -> Z1__
        unpcklpd(SRC1, SCRATCH); // XYZW, Z1__ -> XYZ1
    }

    // Standard 4-component horizontal dot product (see Compile_DP4).
    Compile_SanitizedMul(SRC1, SRC2, SCRATCH);

    movaps(SRC2, SRC1);
    shufps(SRC1, SRC1, _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
    addps(SRC1, SRC2);

    movaps(SRC2, SRC1);
    shufps(SRC1, SRC1, _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
    addps(SRC1, SRC2);

    Compile_DestEnable(instr, SRC1);
}
432
/// EX2: computes exp2 of src1.x by calling out to the C runtime's exp2f and
/// broadcasts the result to all destination components.
void JitShader::Compile_EX2(Instruction instr) {
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
    movss(xmm0, SRC1); // ABI_PARAM1

    // Persistent caller-saved registers must survive the far call.
    ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
    CallFarFunction(*this, exp2f);
    ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);

    shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); // ABI_RETURN
    movaps(SRC1, xmm0);
    Compile_DestEnable(instr, SRC1);
}
445
/// LG2: computes log2 of src1.x by calling out to the C runtime's log2f and
/// broadcasts the result to all destination components.
void JitShader::Compile_LG2(Instruction instr) {
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
    movss(xmm0, SRC1); // ABI_PARAM1

    // Persistent caller-saved registers must survive the far call.
    ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
    CallFarFunction(*this, log2f);
    ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);

    shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); // ABI_RETURN
    movaps(SRC1, xmm0);
    Compile_DestEnable(instr, SRC1);
}
458
void JitShader::Compile_MUL(Instruction instr) {
    // MUL: component-wise multiply, using the sanitized multiply so that
    // 0 * inf yields 0 as on PICA200 hardware (plain mulps would give NaN).
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
    Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
    Compile_DestEnable(instr, SRC1);
}
465
void JitShader::Compile_SGE(Instruction instr) {
    // SGE/SGEI: per-component "set if greater-or-equal" — each destination
    // component becomes 1.0 if src1 >= src2, else 0.0.
    if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SGEI) {
        // "Inverted" encoding: the wide (src1i/src2i) operand fields are used.
        Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
        Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2);
    } else {
        Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
        Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
    }

    // src1 >= src2  <=>  src2 <= src1; cmpleps leaves an all-ones mask per
    // matching lane, which ANDing with 1.0f turns into 1.0/0.0.
    cmpleps(SRC2, SRC1);
    andps(SRC2, ONE);

    Compile_DestEnable(instr, SRC2);
}
480
void JitShader::Compile_SLT(Instruction instr) {
    // SLT/SLTI: per-component "set if less-than" — each destination
    // component becomes 1.0 if src1 < src2, else 0.0.
    if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SLTI) {
        // "Inverted" encoding: the wide (src1i/src2i) operand fields are used.
        Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1);
        Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2);
    } else {
        Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
        Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
    }

    // cmpltps leaves an all-ones mask per matching lane; AND with 1.0f
    // converts that mask into 1.0/0.0.
    cmpltps(SRC1, SRC2);
    andps(SRC1, ONE);

    Compile_DestEnable(instr, SRC1);
}
495
void JitShader::Compile_FLR(Instruction instr) {
    // FLR: component-wise floor.
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);

    if (Common::GetCPUCaps().sse4_1) {
        roundps(SRC1, SRC1, _MM_FROUND_FLOOR);
    } else {
        // Fallback: truncate via int round-trip. NOTE(review): truncation
        // rounds toward zero, so this differs from floor for negative
        // non-integer inputs — presumably accepted as a pre-SSE4.1 tradeoff.
        cvttps2dq(SRC1, SRC1);
        cvtdq2ps(SRC1, SRC1);
    }

    Compile_DestEnable(instr, SRC1);
}
508
void JitShader::Compile_MAX(Instruction instr) {
    // MAX: component-wise maximum of src1 and src2.
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
    // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.
    maxps(SRC1, SRC2);
    Compile_DestEnable(instr, SRC1);
}
516
void JitShader::Compile_MIN(Instruction instr) {
    // MIN: component-wise minimum of src1 and src2.
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
    // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.
    minps(SRC1, SRC2);
    Compile_DestEnable(instr, SRC1);
}
524
void JitShader::Compile_MOVA(Instruction instr) {
    // MOVA: move the x/y components of the source into the two address
    // registers (ADDROFFS_REG_0/1), truncating float -> int. The resulting
    // values are pre-scaled by 16 so they can be used directly as byte
    // offsets into the 16-byte vector register file.
    SwizzlePattern swiz = {(*swizzle_data)[instr.common.operand_desc_id]};

    if (!swiz.DestComponentEnabled(0) && !swiz.DestComponentEnabled(1)) {
        return; // NoOp
    }

    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);

    // Convert floats to integers using truncation (only care about X and Y components)
    cvttps2dq(SRC1, SRC1);

    // Get result: rax = (y << 32) | x as packed 32-bit integers
    movq(rax, SRC1);

    // Handle destination enable
    if (swiz.DestComponentEnabled(0) && swiz.DestComponentEnabled(1)) {
        // Move and sign-extend low 32 bits
        movsxd(ADDROFFS_REG_0, eax);

        // Move and sign-extend high 32 bits
        shr(rax, 32);
        movsxd(ADDROFFS_REG_1, eax);

        // Multiply by 16 to be used as an offset later
        shl(ADDROFFS_REG_0, 4);
        shl(ADDROFFS_REG_1, 4);
    } else {
        if (swiz.DestComponentEnabled(0)) {
            // Move and sign-extend low 32 bits
            movsxd(ADDROFFS_REG_0, eax);

            // Multiply by 16 to be used as an offset later
            shl(ADDROFFS_REG_0, 4);
        } else if (swiz.DestComponentEnabled(1)) {
            // Move and sign-extend high 32 bits
            shr(rax, 32);
            movsxd(ADDROFFS_REG_1, eax);

            // Multiply by 16 to be used as an offset later
            shl(ADDROFFS_REG_1, 4);
        }
    }
}
569
void JitShader::Compile_MOV(Instruction instr) {
    // MOV: copy the swizzled source into the destination components.
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
    Compile_DestEnable(instr, SRC1);
}
574
void JitShader::Compile_RCP(Instruction instr) {
    // RCP: reciprocal of the x-component, broadcast to all lanes.
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);

    // TODO(bunnei): RCPSS is a pretty rough approximation, this might cause problems if Pica
    // performs this operation more accurately. This should be checked on hardware.
    rcpss(SRC1, SRC1);
    shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYZW -> XXXX

    Compile_DestEnable(instr, SRC1);
}
585
void JitShader::Compile_RSQ(Instruction instr) {
    // RSQ: reciprocal square root of the x-component, broadcast to all lanes.
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);

    // TODO(bunnei): RSQRTSS is a pretty rough approximation, this might cause problems if Pica
    // performs this operation more accurately. This should be checked on hardware.
    rsqrtss(SRC1, SRC1);
    shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYZW -> XXXX

    Compile_DestEnable(instr, SRC1);
}
596
597void JitShader::Compile_NOP(Instruction instr) {}
598
void JitShader::Compile_END(Instruction instr) {
    // END: tear down the prologue set up in Compile() (callee-saved registers
    // plus the 16-byte reserved stack area) and return to the host caller.
    ABI_PopRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8, 16);
    ret();
}
603
void JitShader::Compile_CALL(Instruction instr) {
    // CALL: invoke a shader subroutine. The PICA return address (the
    // instruction offset just past the called range) is kept on the host
    // stack so Compile_Return can compare against it.
    // Push offset of the return
    push(qword, (instr.flow_control.dest_offset + instr.flow_control.num_instructions));

    // Call the subroutine
    call(instruction_labels[instr.flow_control.dest_offset]);

    // Skip over the return offset that's on the stack
    add(rsp, 8);
}
614
void JitShader::Compile_CALLC(Instruction instr) {
    // CALLC: CALL taken only when the condition-code test passes.
    // Compile_EvaluateCondition leaves the host ZF clear when the PICA
    // condition holds, so jz skips the call otherwise.
    Compile_EvaluateCondition(instr);
    Label b;
    jz(b);
    Compile_CALL(instr);
    L(b);
}
622
void JitShader::Compile_CALLU(Instruction instr) {
    // CALLU: CALL taken only when the selected boolean uniform is true.
    Compile_UniformCondition(instr);
    Label b;
    jz(b);
    Compile_CALL(instr);
    L(b);
}
630
void JitShader::Compile_CMP(Instruction instr) {
    // CMP: compare the x and y components of src1/src2 (each with its own
    // comparison op) and latch the two boolean results into COND0/COND1 for
    // later conditional flow-control instructions.
    using Op = Instruction::Common::CompareOpType::Op;
    Op op_x = instr.common.compare_op.x;
    Op op_y = instr.common.compare_op.y;

    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);

    // SSE doesn't have greater-than (GT) or greater-equal (GE) comparison operators. You need to
    // emulate them by swapping the lhs and rhs and using LT and LE. NLT and NLE can't be used here
    // because they don't match when used with NaNs.
    // Indexed by Op: Equal, NotEqual, LessThan, LessEqual, GreaterThan, GreaterEqual.
    static const u8 cmp[] = {CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_LT, CMP_LE};

    bool invert_op_x = (op_x == Op::GreaterThan || op_x == Op::GreaterEqual);
    Xmm lhs_x = invert_op_x ? SRC2 : SRC1;
    Xmm rhs_x = invert_op_x ? SRC1 : SRC2;

    if (op_x == op_y) {
        // Compare X-component and Y-component together
        cmpps(lhs_x, rhs_x, cmp[op_x]);
        movq(COND0, lhs_x);

        mov(COND1, COND0);
    } else {
        bool invert_op_y = (op_y == Op::GreaterThan || op_y == Op::GreaterEqual);
        Xmm lhs_y = invert_op_y ? SRC2 : SRC1;
        Xmm rhs_y = invert_op_y ? SRC1 : SRC2;

        // Compare X-component (scalar compare preserves the y lane needed below)
        movaps(SCRATCH, lhs_x);
        cmpss(SCRATCH, rhs_x, cmp[op_x]);

        // Compare Y-component
        cmpps(lhs_y, rhs_y, cmp[op_y]);

        movq(COND0, SCRATCH);
        movq(COND1, lhs_y);
    }

    // Reduce each lane's all-ones/all-zeros mask to a 0/1 flag by extracting
    // the sign bit of the x lane (bit 31) and the y lane (bit 63).
    shr(COND0.cvt32(), 31); // ignores upper 32 bits in source
    shr(COND1, 63);
}
673
void JitShader::Compile_MAD(Instruction instr) {
    // MAD/MADI: fused multiply-add, dest = src1 * src2 + src3, using the
    // sanitized multiply for PICA's 0*inf==0 semantics.
    Compile_SwizzleSrc(instr, 1, instr.mad.src1, SRC1);

    if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) {
        // "Inverted" encoding: the wide (src2i/src3i) operand fields are used.
        Compile_SwizzleSrc(instr, 2, instr.mad.src2i, SRC2);
        Compile_SwizzleSrc(instr, 3, instr.mad.src3i, SRC3);
    } else {
        Compile_SwizzleSrc(instr, 2, instr.mad.src2, SRC2);
        Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3);
    }

    Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
    addps(SRC1, SRC3);

    Compile_DestEnable(instr, SRC1);
}
690
void JitShader::Compile_IF(Instruction instr) {
    // IFU/IFC: structured if/else. The "true" body runs from the current
    // offset up to dest_offset; an optional "else" body of num_instructions
    // instructions follows it.
    Compile_Assert(instr.flow_control.dest_offset >= program_counter,
                   "Backwards if-statements not supported");
    Label l_else, l_endif;

    // Evaluate the "IF" condition
    if (instr.opcode.Value() == OpCode::Id::IFU) {
        Compile_UniformCondition(instr);
    } else if (instr.opcode.Value() == OpCode::Id::IFC) {
        Compile_EvaluateCondition(instr);
    }
    jz(l_else, T_NEAR);

    // Compile the code that corresponds to the condition evaluating as true
    Compile_Block(instr.flow_control.dest_offset);

    // If there isn't an "ELSE" condition, we are done here
    if (instr.flow_control.num_instructions == 0) {
        L(l_else);
        return;
    }

    jmp(l_endif, T_NEAR);

    L(l_else);
    // This code corresponds to the "ELSE" condition
    // Compile the code that corresponds to the condition evaluating as false
    Compile_Block(instr.flow_control.dest_offset + instr.flow_control.num_instructions);

    L(l_endif);
}
722
void JitShader::Compile_LOOP(Instruction instr) {
    // LOOP: repeat the block ending at dest_offset. Iteration count, loop
    // register start value, and its per-iteration increment come from the
    // x/y/z bytes of an integer uniform.
    Compile_Assert(instr.flow_control.dest_offset >= program_counter,
                   "Backwards loops not supported");
    Compile_Assert(!looping, "Nested loops not supported");

    looping = true;

    // This decodes the fields from the integer uniform at index instr.flow_control.int_uniform_id.
    // The Y (LOOPCOUNT_REG) and Z (LOOPINC) component are kept multiplied by 16 (Left shifted by
    // 4 bits) to be used as an offset into the 16-byte vector registers later
    size_t offset = ShaderSetup::GetIntUniformOffset(instr.flow_control.int_uniform_id);
    mov(LOOPCOUNT, dword[SETUP + offset]);
    mov(LOOPCOUNT_REG, LOOPCOUNT);
    shr(LOOPCOUNT_REG, 4);
    and_(LOOPCOUNT_REG, 0xFF0); // Y-component is the start
    mov(LOOPINC, LOOPCOUNT);
    shr(LOOPINC, 12);
    and_(LOOPINC, 0xFF0);               // Z-component is the incrementer
    movzx(LOOPCOUNT, LOOPCOUNT.cvt8()); // X-component is iteration count
    add(LOOPCOUNT, 1);                  // Iteration count is X-component + 1

    Label l_loop_start;
    L(l_loop_start);

    Compile_Block(instr.flow_control.dest_offset + 1);

    add(LOOPCOUNT_REG, LOOPINC); // Increment LOOPCOUNT_REG by Z-component
    sub(LOOPCOUNT, 1);           // Decrement the remaining iteration count
    jnz(l_loop_start);           // Loop while iterations remain

    looping = false;
}
755
void JitShader::Compile_JMP(Instruction instr) {
    // JMPC/JMPU: conditional jump to dest_offset. For JMPU, bit 0 of
    // num_instructions inverts the sense of the condition.
    if (instr.opcode.Value() == OpCode::Id::JMPC)
        Compile_EvaluateCondition(instr);
    else if (instr.opcode.Value() == OpCode::Id::JMPU)
        Compile_UniformCondition(instr);
    else
        UNREACHABLE();

    bool inverted_condition =
        (instr.opcode.Value() == OpCode::Id::JMPU) && (instr.flow_control.num_instructions & 1);

    Label& b = instruction_labels[instr.flow_control.dest_offset];
    if (inverted_condition) {
        jz(b, T_NEAR); // jump when the condition is false
    } else {
        jnz(b, T_NEAR); // jump when the condition is true
    }
}
774
/// Host-side trampoline invoked from JITted EMIT code: forwards the shader
/// unit's output registers to the geometry-shader emitter.
static void Emit(GSEmitter* emitter, Math::Vec4<float24> (*output)[16]) {
    emitter->Emit(*output);
}
778
void JitShader::Compile_EMIT(Instruction instr) {
    // EMIT: emit a vertex through the geometry-shader emitter attached to the
    // shader unit. If no emitter is attached (i.e. the instruction is executed
    // on a vertex shader), log a critical error instead of crashing.
    Label have_emitter, end;
    mov(rax, qword[STATE + offsetof(UnitState, emitter_ptr)]);
    test(rax, rax);
    jnz(have_emitter);

    // No emitter: report and skip.
    ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
    mov(ABI_PARAM1, reinterpret_cast<size_t>("Execute EMIT on VS"));
    CallFarFunction(*this, LogCritical);
    ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
    jmp(end);

    L(have_emitter);
    // Call the static Emit() trampoline with (emitter, &registers.output).
    ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
    mov(ABI_PARAM1, rax);
    mov(ABI_PARAM2, STATE);
    add(ABI_PARAM2, static_cast<Xbyak::uint32>(offsetof(UnitState, registers.output)));
    CallFarFunction(*this, Emit);
    ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
    L(end);
}
800
void JitShader::Compile_SETE(Instruction instr) {
    // SETEMIT: configure the geometry-shader emitter (vertex id, primitive
    // emit flag, winding) ahead of a subsequent EMIT. As with EMIT, executing
    // this without an attached emitter only logs a critical error.
    Label have_emitter, end;
    mov(rax, qword[STATE + offsetof(UnitState, emitter_ptr)]);
    test(rax, rax);
    jnz(have_emitter);

    // No emitter: report and skip.
    ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
    mov(ABI_PARAM1, reinterpret_cast<size_t>("Execute SETEMIT on VS"));
    CallFarFunction(*this, LogCritical);
    ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
    jmp(end);

    L(have_emitter);
    // Store the instruction's immediate fields directly into the emitter.
    mov(byte[rax + offsetof(GSEmitter, vertex_id)], instr.setemit.vertex_id);
    mov(byte[rax + offsetof(GSEmitter, prim_emit)], instr.setemit.prim_emit);
    mov(byte[rax + offsetof(GSEmitter, winding)], instr.setemit.winding);
    L(end);
}
819
/// Compiles every instruction from the current program_counter up to (but not
/// including) `end`; program_counter is advanced by Compile_NextInstr.
void JitShader::Compile_Block(unsigned end) {
    while (program_counter < end) {
        Compile_NextInstr();
    }
}
825
void JitShader::Compile_Return() {
    // Emitted at every offset where a CALL's range ends: if the return offset
    // pushed by Compile_CALL matches the current offset, return to the caller;
    // otherwise execution just falls through.
    // Peek return offset on the stack and check if we're at that offset
    mov(rax, qword[rsp + 8]);
    cmp(eax, (program_counter));

    // If so, jump back to before CALL
    Label b;
    jnz(b);
    ret();
    L(b);
}
837
void JitShader::Compile_NextInstr() {
    // Insert a conditional return before any offset that some CALL returns to
    // (return_offsets is kept sorted by FindReturnOffsets).
    if (std::binary_search(return_offsets.begin(), return_offsets.end(), program_counter)) {
        Compile_Return();
    }

    // Bind this offset's label so jumps/calls can target it.
    L(instruction_labels[program_counter]);

    Instruction instr = {(*program_code)[program_counter++]};

    OpCode::Id opcode = instr.opcode.Value();
    auto instr_func = instr_table[static_cast<unsigned>(opcode)];

    if (instr_func) {
        // JIT the instruction!
        ((*this).*instr_func)(instr);
    } else {
        // Unhandled instruction
        LOG_CRITICAL(HW_GPU, "Unhandled instruction: 0x%02x (0x%08x)",
                     instr.opcode.Value().EffectiveOpCode(), instr.hex);
    }
}
859
860void JitShader::FindReturnOffsets() {
861 return_offsets.clear();
862
863 for (size_t offset = 0; offset < program_code->size(); ++offset) {
864 Instruction instr = {(*program_code)[offset]};
865
866 switch (instr.opcode.Value()) {
867 case OpCode::Id::CALL:
868 case OpCode::Id::CALLC:
869 case OpCode::Id::CALLU:
870 return_offsets.push_back(instr.flow_control.dest_offset +
871 instr.flow_control.num_instructions);
872 break;
873 default:
874 break;
875 }
876 }
877
878 // Sort for efficient binary search later
879 std::sort(return_offsets.begin(), return_offsets.end());
880}
881
/// Compiles the given PICA shader program into host x86-64 code.
/// @param program_code_ Full program word array to compile.
/// @param swizzle_data_ Operand-descriptor (swizzle) table referenced by the program.
void JitShader::Compile(const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_code_,
                        const std::array<u32, MAX_SWIZZLE_DATA_LENGTH>* swizzle_data_) {
    program_code = program_code_;
    swizzle_data = swizzle_data_;

    // Reset flow control state
    program = (CompiledShader*)getCurr();
    program_counter = 0;
    looping = false;
    instruction_labels.fill(Xbyak::Label());

    // Find all `CALL` instructions and identify return locations
    FindReturnOffsets();

    // The stack pointer is 8 modulo 16 at the entry of a procedure
    // We reserve 16 bytes and assign a dummy value to the first 8 bytes, to catch any potential
    // return checks (see Compile_Return) that happen in shader main routine.
    ABI_PushRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8, 16);
    mov(qword[rsp + 8], 0xFFFFFFFFFFFFFFFFULL);

    // Pin the setup/state pointers passed by the host caller in registers.
    mov(SETUP, ABI_PARAM1);
    mov(STATE, ABI_PARAM2);

    // Zero address/loop registers
    xor_(ADDROFFS_REG_0.cvt32(), ADDROFFS_REG_0.cvt32());
    xor_(ADDROFFS_REG_1.cvt32(), ADDROFFS_REG_1.cvt32());
    xor_(LOOPCOUNT_REG, LOOPCOUNT_REG);

    // Used to set a register to one
    static const __m128 one = {1.f, 1.f, 1.f, 1.f};
    mov(rax, reinterpret_cast<size_t>(&one));
    movaps(ONE, xword[rax]);

    // Used to negate registers
    static const __m128 neg = {-0.f, -0.f, -0.f, -0.f};
    mov(rax, reinterpret_cast<size_t>(&neg));
    movaps(NEGBIT, xword[rax]);

    // Jump to start of the shader program (ABI_PARAM3 holds the entry address,
    // looked up from instruction_labels by JitShader::Run)
    jmp(ABI_PARAM3);

    // Compile entire program
    Compile_Block(static_cast<unsigned>(program_code->size()));

    // Free memory that's no longer needed
    program_code = nullptr;
    swizzle_data = nullptr;
    return_offsets.clear();
    return_offsets.shrink_to_fit();

    // Finalize the code buffer (flushes/protects the emitted code).
    ready();

    ASSERT_MSG(getSize() <= MAX_SHADER_SIZE, "Compiled a shader that exceeds the allocated size!");
    LOG_DEBUG(HW_GPU, "Compiled shader size=%lu", getSize());
}
937
938JitShader::JitShader() : Xbyak::CodeGenerator(MAX_SHADER_SIZE) {}
939
940} // namespace Shader
941
942} // namespace Pica
diff --git a/src/video_core/shader/shader_jit_x64_compiler.h b/src/video_core/shader/shader_jit_x64_compiler.h
deleted file mode 100644
index 4aee56b1d..000000000
--- a/src/video_core/shader/shader_jit_x64_compiler.h
+++ /dev/null
@@ -1,127 +0,0 @@
1// Copyright 2015 Citra Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <array>
8#include <cstddef>
9#include <utility>
10#include <vector>
11#include <nihstro/shader_bytecode.h>
12#include <xbyak.h>
13#include "common/bit_set.h"
14#include "common/common_types.h"
15#include "video_core/shader/shader.h"
16
17using nihstro::Instruction;
18using nihstro::OpCode;
19using nihstro::SwizzlePattern;
20
21namespace Pica {
22
23namespace Shader {
24
25/// Memory allocated for each compiled shader
26constexpr size_t MAX_SHADER_SIZE = MAX_PROGRAM_CODE_LENGTH * 64;
27
28/**
29 * This class implements the shader JIT compiler. It recompiles a Pica shader program into x86_64
30 * code that can be executed on the host machine directly.
31 */
class JitShader : public Xbyak::CodeGenerator {
public:
    JitShader();

    /// Executes the compiled shader, starting at the given instruction offset.
    void Run(const ShaderSetup& setup, UnitState& state, unsigned offset) const {
        program(&setup, &state, instruction_labels[offset].getAddress());
    }

    /// Compiles the given program/swizzle data into host code stored in this object.
    void Compile(const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_code,
                 const std::array<u32, MAX_SWIZZLE_DATA_LENGTH>* swizzle_data);

    // Per-opcode emitters, dispatched via instr_table in Compile_NextInstr.
    void Compile_ADD(Instruction instr);
    void Compile_DP3(Instruction instr);
    void Compile_DP4(Instruction instr);
    void Compile_DPH(Instruction instr);
    void Compile_EX2(Instruction instr);
    void Compile_LG2(Instruction instr);
    void Compile_MUL(Instruction instr);
    void Compile_SGE(Instruction instr);
    void Compile_SLT(Instruction instr);
    void Compile_FLR(Instruction instr);
    void Compile_MAX(Instruction instr);
    void Compile_MIN(Instruction instr);
    void Compile_RCP(Instruction instr);
    void Compile_RSQ(Instruction instr);
    void Compile_MOVA(Instruction instr);
    void Compile_MOV(Instruction instr);
    void Compile_NOP(Instruction instr);
    void Compile_END(Instruction instr);
    void Compile_CALL(Instruction instr);
    void Compile_CALLC(Instruction instr);
    void Compile_CALLU(Instruction instr);
    void Compile_IF(Instruction instr);
    void Compile_LOOP(Instruction instr);
    void Compile_JMP(Instruction instr);
    void Compile_CMP(Instruction instr);
    void Compile_MAD(Instruction instr);
    void Compile_EMIT(Instruction instr);
    void Compile_SETE(Instruction instr);

private:
    /// Compiles instructions from the current program counter up to `end`.
    void Compile_Block(unsigned end);
    /// Compiles the single instruction at the current program counter.
    void Compile_NextInstr();

    /// Loads a swizzled source operand into `dest`.
    void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg,
                            Xbyak::Xmm dest);
    /// Writes `dest` back to the instruction's destination, honoring the write mask.
    void Compile_DestEnable(Instruction instr, Xbyak::Xmm dest);

    /**
     * Compiles a `MUL src1, src2` operation, properly handling the PICA semantics when multiplying
     * zero by inf. Clobbers `src2` and `scratch`.
     */
    void Compile_SanitizedMul(Xbyak::Xmm src1, Xbyak::Xmm src2, Xbyak::Xmm scratch);

    /// Emits code that sets the host flags from the instruction's condition-code test.
    void Compile_EvaluateCondition(Instruction instr);
    /// Emits code that sets the host flags from the selected boolean uniform.
    void Compile_UniformCondition(Instruction instr);

    /**
     * Emits the code to conditionally return from a subroutine invoked by the `CALL` instruction.
     */
    void Compile_Return();

    /// Caller-saved registers whose values must survive far calls to host functions.
    BitSet32 PersistentCallerSavedRegs();

    /**
     * Assertion evaluated at compile-time, but only triggered if executed at runtime.
     * @param condition Condition to be evaluated.
     * @param msg Message to be logged if the assertion fails.
     */
    void Compile_Assert(bool condition, const char* msg);

    /**
     * Analyzes the entire shader program for `CALL` instructions before emitting any code,
     * identifying the locations where a return needs to be inserted.
     */
    void FindReturnOffsets();

    // Input program data; only valid for the duration of Compile().
    const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_code = nullptr;
    const std::array<u32, MAX_SWIZZLE_DATA_LENGTH>* swizzle_data = nullptr;

    /// Mapping of Pica VS instructions to pointers in the emitted code
    std::array<Xbyak::Label, MAX_PROGRAM_CODE_LENGTH> instruction_labels;

    /// Offsets in code where a return needs to be inserted
    std::vector<unsigned> return_offsets;

    unsigned program_counter = 0; ///< Offset of the next instruction to decode
    bool looping = false; ///< True if compiling a loop, used to check for nested loops

    // Signature of the emitted entry point: (setup, state, start address).
    using CompiledShader = void(const void* setup, void* state, const u8* start_addr);
    CompiledShader* program = nullptr;
};
124
125} // Shader
126
127} // Pica