path: root/src/video_core/macro
author      VolcaEM     2020-06-25 23:34:37 +0200
committer   GitHub      2020-06-25 23:34:37 +0200
commit      0f4512291a0504b32fac248f73a68fec34f657fe (patch)
tree        3c69736a2ac82a9a0076ec3b79673c814e5f1abd /src/video_core/macro
parent      Fix typo 2: electric boogaloo (diff)
parent      Merge pull request #4136 from VolcaEM/mods (diff)
download    yuzu-0f4512291a0504b32fac248f73a68fec34f657fe.tar.gz
            yuzu-0f4512291a0504b32fac248f73a68fec34f657fe.tar.xz
            yuzu-0f4512291a0504b32fac248f73a68fec34f657fe.zip
Merge branch 'master' into quickstart-faq
Diffstat (limited to 'src/video_core/macro')
-rw-r--r--   src/video_core/macro/macro.cpp               72
-rw-r--r--   src/video_core/macro/macro.h                 141
-rw-r--r--   src/video_core/macro/macro_hle.cpp           113
-rw-r--r--   src/video_core/macro/macro_hle.h             44
-rw-r--r--   src/video_core/macro/macro_interpreter.cpp   289
-rw-r--r--   src/video_core/macro/macro_interpreter.h     102
-rw-r--r--   src/video_core/macro/macro_jit_x64.cpp       621
-rw-r--r--   src/video_core/macro/macro_jit_x64.h         98
8 files changed, 1480 insertions, 0 deletions
diff --git a/src/video_core/macro/macro.cpp b/src/video_core/macro/macro.cpp
new file mode 100644
index 000000000..ef7dad349
--- /dev/null
+++ b/src/video_core/macro/macro.cpp
@@ -0,0 +1,72 @@
1// Copyright 2020 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include <boost/container_hash/hash.hpp>
6#include "common/assert.h"
7#include "common/logging/log.h"
8#include "core/settings.h"
9#include "video_core/engines/maxwell_3d.h"
10#include "video_core/macro/macro.h"
11#include "video_core/macro/macro_hle.h"
12#include "video_core/macro/macro_interpreter.h"
13#include "video_core/macro/macro_jit_x64.h"
14
15namespace Tegra {
16
17MacroEngine::MacroEngine(Engines::Maxwell3D& maxwell3d)
18 : hle_macros{std::make_unique<Tegra::HLEMacro>(maxwell3d)} {}
19
20MacroEngine::~MacroEngine() = default;
21
22void MacroEngine::AddCode(u32 method, u32 data) {
23 uploaded_macro_code[method].push_back(data);
24}
25
26void MacroEngine::Execute(Engines::Maxwell3D& maxwell3d, u32 method,
27 const std::vector<u32>& parameters) {
28 auto compiled_macro = macro_cache.find(method);
29 if (compiled_macro != macro_cache.end()) {
30 const auto& cache_info = compiled_macro->second;
31 if (cache_info.has_hle_program) {
32 cache_info.hle_program->Execute(parameters, method);
33 } else {
34 cache_info.lle_program->Execute(parameters, method);
35 }
36 } else {
37 // Macro not compiled, check if it's uploaded and if so, compile it
38 auto macro_code = uploaded_macro_code.find(method);
39 if (macro_code == uploaded_macro_code.end()) {
40 UNREACHABLE_MSG("Macro 0x{0:x} was not uploaded", method);
41 return;
42 }
43 auto& cache_info = macro_cache[method];
44 cache_info.hash = boost::hash_value(macro_code->second);
45 cache_info.lle_program = Compile(macro_code->second);
46
47 auto hle_program = hle_macros->GetHLEProgram(cache_info.hash);
48 if (hle_program.has_value()) {
49 cache_info.has_hle_program = true;
50 cache_info.hle_program = std::move(hle_program.value());
51 }
52
53 if (cache_info.has_hle_program) {
54 cache_info.hle_program->Execute(parameters, method);
55 } else {
56 cache_info.lle_program->Execute(parameters, method);
57 }
58 }
59}
60
61std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d) {
62 if (Settings::values.disable_macro_jit) {
63 return std::make_unique<MacroInterpreter>(maxwell3d);
64 }
65#ifdef ARCHITECTURE_x86_64
66 return std::make_unique<MacroJITx64>(maxwell3d);
67#else
68 return std::make_unique<MacroInterpreter>(maxwell3d);
69#endif
70}
71
72} // namespace Tegra
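
For reference, the lookup-and-compile flow in MacroEngine::Execute above can be summarised with a small self-contained sketch. The names below (ToyProgram, ToyCacheEntry, the method number and parameter values) are invented for illustration and are not part of the yuzu API; only the control flow mirrors the real code.

// Sketch of the compile-on-first-use cache in MacroEngine::Execute (illustrative only).
#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <unordered_map>
#include <vector>

using ToyProgram = std::function<void(const std::vector<std::uint32_t>&)>;

struct ToyCacheEntry {
    ToyProgram lle_program;
    ToyProgram hle_program;
    bool has_hle_program = false;
};

int main() {
    // Pretend these words were previously uploaded through AddCode().
    std::unordered_map<std::uint32_t, std::vector<std::uint32_t>> uploaded_code{
        {0x10, {0x00000301, 0x00000091}},
    };
    std::unordered_map<std::uint32_t, ToyCacheEntry> cache;

    const std::uint32_t method = 0x10;
    const std::vector<std::uint32_t> parameters{42};

    auto it = cache.find(method);
    if (it == cache.end()) {
        // Macro not compiled yet: check that it was uploaded, then "compile" it.
        const auto code_it = uploaded_code.find(method);
        if (code_it == uploaded_code.end()) {
            std::cerr << "macro was never uploaded\n";
            return 1;
        }
        ToyCacheEntry entry;
        const std::size_t num_words = code_it->second.size();
        entry.lle_program = [num_words](const std::vector<std::uint32_t>& params) {
            std::cout << "interpreting " << num_words
                      << " macro words, first parameter = " << params[0] << '\n';
        };
        // The real engine also hashes code_it->second here and, if the hash matches a
        // known macro, installs a hand-written HLE replacement (see macro_hle.cpp).
        it = cache.emplace(method, std::move(entry)).first;
    }
    const ToyCacheEntry& entry = it->second;
    (entry.has_hle_program ? entry.hle_program : entry.lle_program)(parameters);
}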
diff --git a/src/video_core/macro/macro.h b/src/video_core/macro/macro.h
new file mode 100644
index 000000000..4d00b84b0
--- /dev/null
+++ b/src/video_core/macro/macro.h
@@ -0,0 +1,141 @@
1// Copyright 2020 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <memory>
8#include <unordered_map>
9#include <vector>
10#include "common/bit_field.h"
11#include "common/common_types.h"
12
13namespace Tegra {
14
15namespace Engines {
16class Maxwell3D;
17}
18
19namespace Macro {
20constexpr std::size_t NUM_MACRO_REGISTERS = 8;
21enum class Operation : u32 {
22 ALU = 0,
23 AddImmediate = 1,
24 ExtractInsert = 2,
25 ExtractShiftLeftImmediate = 3,
26 ExtractShiftLeftRegister = 4,
27 Read = 5,
28 Unused = 6, // This operation doesn't seem to be a valid encoding.
29 Branch = 7,
30};
31
32enum class ALUOperation : u32 {
33 Add = 0,
34 AddWithCarry = 1,
35 Subtract = 2,
36 SubtractWithBorrow = 3,
37 // Operations 4-7 don't seem to be valid encodings.
38 Xor = 8,
39 Or = 9,
40 And = 10,
41 AndNot = 11,
42 Nand = 12
43};
44
45enum class ResultOperation : u32 {
46 IgnoreAndFetch = 0,
47 Move = 1,
48 MoveAndSetMethod = 2,
49 FetchAndSend = 3,
50 MoveAndSend = 4,
51 FetchAndSetMethod = 5,
52 MoveAndSetMethodFetchAndSend = 6,
53 MoveAndSetMethodSend = 7
54};
55
56enum class BranchCondition : u32 {
57 Zero = 0,
58 NotZero = 1,
59};
60
61union Opcode {
62 u32 raw;
63 BitField<0, 3, Operation> operation;
64 BitField<4, 3, ResultOperation> result_operation;
65 BitField<4, 1, BranchCondition> branch_condition;
66 // If set on a branch, then the branch doesn't have a delay slot.
67 BitField<5, 1, u32> branch_annul;
68 BitField<7, 1, u32> is_exit;
69 BitField<8, 3, u32> dst;
70 BitField<11, 3, u32> src_a;
71 BitField<14, 3, u32> src_b;
72 // The signed immediate overlaps the second source operand and the alu operation.
73 BitField<14, 18, s32> immediate;
74
75 BitField<17, 5, ALUOperation> alu_operation;
76
77 // Bitfield instructions data
78 BitField<17, 5, u32> bf_src_bit;
79 BitField<22, 5, u32> bf_size;
80 BitField<27, 5, u32> bf_dst_bit;
81
82 u32 GetBitfieldMask() const {
83 return (1 << bf_size) - 1;
84 }
85
86 s32 GetBranchTarget() const {
87 return static_cast<s32>(immediate * sizeof(u32));
88 }
89};
90
91union MethodAddress {
92 u32 raw;
93 BitField<0, 12, u32> address;
94 BitField<12, 6, u32> increment;
95};
96
97} // namespace Macro
98
99class HLEMacro;
100
101class CachedMacro {
102public:
103 virtual ~CachedMacro() = default;
104 /**
105 * Executes the macro code with the specified input parameters.
106 * @param parameters The input parameters of the macro
107 * @param method The method to execute
108 */
109 virtual void Execute(const std::vector<u32>& parameters, u32 method) = 0;
110};
111
112class MacroEngine {
113public:
114 explicit MacroEngine(Engines::Maxwell3D& maxwell3d);
115 virtual ~MacroEngine();
116
117 // Stores uploaded macro code so it can be compiled when the macro is called.
118 void AddCode(u32 method, u32 data);
119
120 // Compiles the macro if it's not in the cache, and executes the compiled macro
121 void Execute(Engines::Maxwell3D& maxwell3d, u32 method, const std::vector<u32>& parameters);
122
123protected:
124 virtual std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) = 0;
125
126private:
127 struct CacheInfo {
128 std::unique_ptr<CachedMacro> lle_program{};
129 std::unique_ptr<CachedMacro> hle_program{};
130 u64 hash{};
131 bool has_hle_program{};
132 };
133
134 std::unordered_map<u32, CacheInfo> macro_cache;
135 std::unordered_map<u32, std::vector<u32>> uploaded_macro_code;
136 std::unique_ptr<HLEMacro> hle_macros;
137};
138
139std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d);
140
141} // namespace Tegra
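
As a worked illustration of the Opcode union above, the following standalone snippet decodes the same fields with explicit shifts and masks instead of BitField. The sample word is made up; the bit positions are the ones declared in macro.h.

// Decode Macro::Opcode fields with plain shifts/masks (sample word is arbitrary).
#include <cstdint>
#include <iostream>

int main() {
    const std::uint32_t raw = 0x00022201;

    const std::uint32_t operation = raw & 0x7;               // bits 0-2
    const std::uint32_t result_operation = (raw >> 4) & 0x7; // bits 4-6
    const std::uint32_t is_exit = (raw >> 7) & 0x1;          // bit 7
    const std::uint32_t dst = (raw >> 8) & 0x7;              // bits 8-10
    const std::uint32_t src_a = (raw >> 11) & 0x7;           // bits 11-13
    const std::uint32_t src_b = (raw >> 14) & 0x7;           // bits 14-16
    const std::uint32_t alu_operation = (raw >> 17) & 0x1f;  // bits 17-21

    // The immediate is an 18-bit signed field in bits 14-31, so shifting the word as a
    // signed value sign-extends it in one step (two's-complement targets assumed).
    const std::int32_t immediate = static_cast<std::int32_t>(raw) >> 14;

    std::cout << "operation=" << operation << " result_op=" << result_operation
              << " is_exit=" << is_exit << " dst=" << dst << " src_a=" << src_a
              << " src_b=" << src_b << " alu_op=" << alu_operation
              << " immediate=" << immediate << '\n';
}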
diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp
new file mode 100644
index 000000000..410f99018
--- /dev/null
+++ b/src/video_core/macro/macro_hle.cpp
@@ -0,0 +1,113 @@
1// Copyright 2020 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include <array>
6#include <vector>
7#include "video_core/engines/maxwell_3d.h"
8#include "video_core/macro/macro_hle.h"
9#include "video_core/rasterizer_interface.h"
10
11namespace Tegra {
12
13namespace {
14// HLE'd functions
15static void HLE_771BB18C62444DA0(Engines::Maxwell3D& maxwell3d,
16 const std::vector<u32>& parameters) {
17 const u32 instance_count = parameters[2] & maxwell3d.GetRegisterValue(0xD1B);
18
19 maxwell3d.regs.draw.topology.Assign(
20 static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0] &
21 ~(0x3ffffff << 26)));
22 maxwell3d.regs.vb_base_instance = parameters[5];
23 maxwell3d.mme_draw.instance_count = instance_count;
24 maxwell3d.regs.vb_element_base = parameters[3];
25 maxwell3d.regs.index_array.count = parameters[1];
26 maxwell3d.regs.index_array.first = parameters[4];
27
28 if (maxwell3d.ShouldExecute()) {
29 maxwell3d.GetRasterizer().Draw(true, true);
30 }
31 maxwell3d.regs.index_array.count = 0;
32 maxwell3d.mme_draw.instance_count = 0;
33 maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined;
34}
35
36static void HLE_0D61FC9FAAC9FCAD(Engines::Maxwell3D& maxwell3d,
37 const std::vector<u32>& parameters) {
38 const u32 count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]);
39
40 maxwell3d.regs.vertex_buffer.first = parameters[3];
41 maxwell3d.regs.vertex_buffer.count = parameters[1];
42 maxwell3d.regs.vb_base_instance = parameters[4];
43 maxwell3d.regs.draw.topology.Assign(
44 static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0]));
45 maxwell3d.mme_draw.instance_count = count;
46
47 if (maxwell3d.ShouldExecute()) {
48 maxwell3d.GetRasterizer().Draw(false, true);
49 }
50 maxwell3d.regs.vertex_buffer.count = 0;
51 maxwell3d.mme_draw.instance_count = 0;
52 maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined;
53}
54
55static void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d,
56 const std::vector<u32>& parameters) {
57 const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]);
58 const u32 element_base = parameters[4];
59 const u32 base_instance = parameters[5];
60 maxwell3d.regs.index_array.first = parameters[3];
61 maxwell3d.regs.reg_array[0x446] = element_base; // vertex id base?
62 maxwell3d.regs.index_array.count = parameters[1];
63 maxwell3d.regs.vb_element_base = element_base;
64 maxwell3d.regs.vb_base_instance = base_instance;
65 maxwell3d.mme_draw.instance_count = instance_count;
66 maxwell3d.CallMethodFromMME(0x8e3, 0x640);
67 maxwell3d.CallMethodFromMME(0x8e4, element_base);
68 maxwell3d.CallMethodFromMME(0x8e5, base_instance);
69 maxwell3d.regs.draw.topology.Assign(
70 static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0]));
71 if (maxwell3d.ShouldExecute()) {
72 maxwell3d.GetRasterizer().Draw(true, true);
73 }
74 maxwell3d.regs.reg_array[0x446] = 0x0; // vertex id base?
75 maxwell3d.regs.index_array.count = 0;
76 maxwell3d.regs.vb_element_base = 0x0;
77 maxwell3d.regs.vb_base_instance = 0x0;
78 maxwell3d.mme_draw.instance_count = 0;
79 maxwell3d.CallMethodFromMME(0x8e3, 0x640);
80 maxwell3d.CallMethodFromMME(0x8e4, 0x0);
81 maxwell3d.CallMethodFromMME(0x8e5, 0x0);
82 maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined;
83}
84} // namespace
85
86constexpr std::array<std::pair<u64, HLEFunction>, 3> hle_funcs{{
87 std::make_pair<u64, HLEFunction>(0x771BB18C62444DA0, &HLE_771BB18C62444DA0),
88 std::make_pair<u64, HLEFunction>(0x0D61FC9FAAC9FCAD, &HLE_0D61FC9FAAC9FCAD),
89 std::make_pair<u64, HLEFunction>(0x0217920100488FF7, &HLE_0217920100488FF7),
90}};
91
92HLEMacro::HLEMacro(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
93HLEMacro::~HLEMacro() = default;
94
95std::optional<std::unique_ptr<CachedMacro>> HLEMacro::GetHLEProgram(u64 hash) const {
96 const auto it = std::find_if(hle_funcs.cbegin(), hle_funcs.cend(),
97 [hash](const auto& pair) { return pair.first == hash; });
98 if (it == hle_funcs.end()) {
99 return std::nullopt;
100 }
101 return std::make_unique<HLEMacroImpl>(maxwell3d, it->second);
102}
103
104HLEMacroImpl::~HLEMacroImpl() = default;
105
106HLEMacroImpl::HLEMacroImpl(Engines::Maxwell3D& maxwell3d, HLEFunction func)
107 : maxwell3d(maxwell3d), func(func) {}
108
109void HLEMacroImpl::Execute(const std::vector<u32>& parameters, u32 method) {
110 func(maxwell3d, parameters);
111}
112
113} // namespace Tegra
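
The hle_funcs table above is keyed on a Boost hash of the uploaded macro words (computed in MacroEngine::Execute via boost::hash_value). A minimal sketch of producing such a key, using placeholder macro words rather than a real game macro:

// Compute the hash style used to match an uploaded macro against hle_funcs.
#include <cstdint>
#include <iostream>
#include <vector>
#include <boost/container_hash/hash.hpp>

int main() {
    // Placeholder words; real entries such as 0x771BB18C62444DA0 come from dumping
    // the code of actual game macros.
    const std::vector<std::uint32_t> macro_words{0x00000301, 0x00000011, 0x00000091};
    const std::uint64_t hash = boost::hash_value(macro_words);
    std::cout << std::hex << "macro hash = 0x" << hash << '\n';
}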
diff --git a/src/video_core/macro/macro_hle.h b/src/video_core/macro/macro_hle.h
new file mode 100644
index 000000000..37af875a0
--- /dev/null
+++ b/src/video_core/macro/macro_hle.h
@@ -0,0 +1,44 @@
1// Copyright 2020 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <memory>
8#include <optional>
9#include <vector>
10#include "common/common_types.h"
11#include "video_core/macro/macro.h"
12
13namespace Tegra {
14
15namespace Engines {
16class Maxwell3D;
17}
18
19using HLEFunction = void (*)(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters);
20
21class HLEMacro {
22public:
23 explicit HLEMacro(Engines::Maxwell3D& maxwell3d);
24 ~HLEMacro();
25
26 std::optional<std::unique_ptr<CachedMacro>> GetHLEProgram(u64 hash) const;
27
28private:
29 Engines::Maxwell3D& maxwell3d;
30};
31
32class HLEMacroImpl : public CachedMacro {
33public:
34 explicit HLEMacroImpl(Engines::Maxwell3D& maxwell3d, HLEFunction func);
35 ~HLEMacroImpl();
36
37 void Execute(const std::vector<u32>& parameters, u32 method) override;
38
39private:
40 Engines::Maxwell3D& maxwell3d;
41 HLEFunction func;
42};
43
44} // namespace Tegra
diff --git a/src/video_core/macro/macro_interpreter.cpp b/src/video_core/macro/macro_interpreter.cpp
new file mode 100644
index 000000000..aa5256419
--- /dev/null
+++ b/src/video_core/macro/macro_interpreter.cpp
@@ -0,0 +1,289 @@
1// Copyright 2020 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include "common/assert.h"
6#include "common/logging/log.h"
7#include "common/microprofile.h"
8#include "video_core/engines/maxwell_3d.h"
9#include "video_core/macro/macro_interpreter.h"
10
11MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192));
12
13namespace Tegra {
14MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d)
15 : MacroEngine::MacroEngine(maxwell3d), maxwell3d(maxwell3d) {}
16
17std::unique_ptr<CachedMacro> MacroInterpreter::Compile(const std::vector<u32>& code) {
18 return std::make_unique<MacroInterpreterImpl>(maxwell3d, code);
19}
20
21MacroInterpreterImpl::MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d,
22 const std::vector<u32>& code)
23 : maxwell3d(maxwell3d), code(code) {}
24
25void MacroInterpreterImpl::Execute(const std::vector<u32>& parameters, u32 method) {
26 MICROPROFILE_SCOPE(MacroInterp);
27 Reset();
28
29 registers[1] = parameters[0];
30 num_parameters = parameters.size();
31
32 if (num_parameters > parameters_capacity) {
33 parameters_capacity = num_parameters;
34 this->parameters = std::make_unique<u32[]>(num_parameters);
35 }
36 std::memcpy(this->parameters.get(), parameters.data(), num_parameters * sizeof(u32));
37 this->num_parameters = num_parameters;
38
39 // Execute the code until we hit an exit condition.
40 bool keep_executing = true;
41 while (keep_executing) {
42 keep_executing = Step(false);
43 }
44
45 // Assert that the macro used all the input parameters
46 ASSERT(next_parameter_index == num_parameters);
47}
48
49void MacroInterpreterImpl::Reset() {
50 registers = {};
51 pc = 0;
52 delayed_pc = {};
53 method_address.raw = 0;
54 num_parameters = 0;
55 // The next parameter index starts at 1, because $r1 already has the value of the first
56 // parameter.
57 next_parameter_index = 1;
58 carry_flag = false;
59}
60
61bool MacroInterpreterImpl::Step(bool is_delay_slot) {
62 u32 base_address = pc;
63
64 Macro::Opcode opcode = GetOpcode();
65 pc += 4;
66
67 // Update the program counter if we were delayed
68 if (delayed_pc) {
69 ASSERT(is_delay_slot);
70 pc = *delayed_pc;
71 delayed_pc = {};
72 }
73
74 switch (opcode.operation) {
75 case Macro::Operation::ALU: {
76 u32 result = GetALUResult(opcode.alu_operation, GetRegister(opcode.src_a),
77 GetRegister(opcode.src_b));
78 ProcessResult(opcode.result_operation, opcode.dst, result);
79 break;
80 }
81 case Macro::Operation::AddImmediate: {
82 ProcessResult(opcode.result_operation, opcode.dst,
83 GetRegister(opcode.src_a) + opcode.immediate);
84 break;
85 }
86 case Macro::Operation::ExtractInsert: {
87 u32 dst = GetRegister(opcode.src_a);
88 u32 src = GetRegister(opcode.src_b);
89
90 src = (src >> opcode.bf_src_bit) & opcode.GetBitfieldMask();
91 dst &= ~(opcode.GetBitfieldMask() << opcode.bf_dst_bit);
92 dst |= src << opcode.bf_dst_bit;
93 ProcessResult(opcode.result_operation, opcode.dst, dst);
94 break;
95 }
96 case Macro::Operation::ExtractShiftLeftImmediate: {
97 u32 dst = GetRegister(opcode.src_a);
98 u32 src = GetRegister(opcode.src_b);
99
100 u32 result = ((src >> dst) & opcode.GetBitfieldMask()) << opcode.bf_dst_bit;
101
102 ProcessResult(opcode.result_operation, opcode.dst, result);
103 break;
104 }
105 case Macro::Operation::ExtractShiftLeftRegister: {
106 u32 dst = GetRegister(opcode.src_a);
107 u32 src = GetRegister(opcode.src_b);
108
109 u32 result = ((src >> opcode.bf_src_bit) & opcode.GetBitfieldMask()) << dst;
110
111 ProcessResult(opcode.result_operation, opcode.dst, result);
112 break;
113 }
114 case Macro::Operation::Read: {
115 u32 result = Read(GetRegister(opcode.src_a) + opcode.immediate);
116 ProcessResult(opcode.result_operation, opcode.dst, result);
117 break;
118 }
119 case Macro::Operation::Branch: {
120 ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid");
121 u32 value = GetRegister(opcode.src_a);
122 bool taken = EvaluateBranchCondition(opcode.branch_condition, value);
123 if (taken) {
124 // Ignore the delay slot if the branch has the annul bit.
125 if (opcode.branch_annul) {
126 pc = base_address + opcode.GetBranchTarget();
127 return true;
128 }
129
130 delayed_pc = base_address + opcode.GetBranchTarget();
131 // Execute one more instruction due to the delay slot.
132 return Step(true);
133 }
134 break;
135 }
136 default:
137 UNIMPLEMENTED_MSG("Unimplemented macro operation {}",
138 static_cast<u32>(opcode.operation.Value()));
139 }
140
141 // An instruction with the Exit flag will not actually
142 // cause an exit if it's executed inside a delay slot.
143 if (opcode.is_exit && !is_delay_slot) {
144 // Exit has a delay slot, execute the next instruction
145 Step(true);
146 return false;
147 }
148
149 return true;
150}
151
152u32 MacroInterpreterImpl::GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b) {
153 switch (operation) {
154 case Macro::ALUOperation::Add: {
155 const u64 result{static_cast<u64>(src_a) + src_b};
156 carry_flag = result > 0xffffffff;
157 return static_cast<u32>(result);
158 }
159 case Macro::ALUOperation::AddWithCarry: {
160 const u64 result{static_cast<u64>(src_a) + src_b + (carry_flag ? 1ULL : 0ULL)};
161 carry_flag = result > 0xffffffff;
162 return static_cast<u32>(result);
163 }
164 case Macro::ALUOperation::Subtract: {
165 const u64 result{static_cast<u64>(src_a) - src_b};
166 carry_flag = result < 0x100000000;
167 return static_cast<u32>(result);
168 }
169 case Macro::ALUOperation::SubtractWithBorrow: {
170 const u64 result{static_cast<u64>(src_a) - src_b - (carry_flag ? 0ULL : 1ULL)};
171 carry_flag = result < 0x100000000;
172 return static_cast<u32>(result);
173 }
174 case Macro::ALUOperation::Xor:
175 return src_a ^ src_b;
176 case Macro::ALUOperation::Or:
177 return src_a | src_b;
178 case Macro::ALUOperation::And:
179 return src_a & src_b;
180 case Macro::ALUOperation::AndNot:
181 return src_a & ~src_b;
182 case Macro::ALUOperation::Nand:
183 return ~(src_a & src_b);
184
185 default:
186 UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", static_cast<u32>(operation));
187 return 0;
188 }
189}
190
191void MacroInterpreterImpl::ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result) {
192 switch (operation) {
193 case Macro::ResultOperation::IgnoreAndFetch:
194 // Fetch parameter and ignore result.
195 SetRegister(reg, FetchParameter());
196 break;
197 case Macro::ResultOperation::Move:
198 // Move result.
199 SetRegister(reg, result);
200 break;
201 case Macro::ResultOperation::MoveAndSetMethod:
202 // Move result and use as Method Address.
203 SetRegister(reg, result);
204 SetMethodAddress(result);
205 break;
206 case Macro::ResultOperation::FetchAndSend:
207 // Fetch parameter and send result.
208 SetRegister(reg, FetchParameter());
209 Send(result);
210 break;
211 case Macro::ResultOperation::MoveAndSend:
212 // Move and send result.
213 SetRegister(reg, result);
214 Send(result);
215 break;
216 case Macro::ResultOperation::FetchAndSetMethod:
217 // Fetch parameter and use result as Method Address.
218 SetRegister(reg, FetchParameter());
219 SetMethodAddress(result);
220 break;
221 case Macro::ResultOperation::MoveAndSetMethodFetchAndSend:
222 // Move result and use as Method Address, then fetch and send parameter.
223 SetRegister(reg, result);
224 SetMethodAddress(result);
225 Send(FetchParameter());
226 break;
227 case Macro::ResultOperation::MoveAndSetMethodSend:
228 // Move result and use as Method Address, then send bits 12:17 of result.
229 SetRegister(reg, result);
230 SetMethodAddress(result);
231 Send((result >> 12) & 0b111111);
232 break;
233 default:
234 UNIMPLEMENTED_MSG("Unimplemented result operation {}", static_cast<u32>(operation));
235 }
236}
237
238bool MacroInterpreterImpl::EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const {
239 switch (cond) {
240 case Macro::BranchCondition::Zero:
241 return value == 0;
242 case Macro::BranchCondition::NotZero:
243 return value != 0;
244 }
245 UNREACHABLE();
246 return true;
247}
248
249Macro::Opcode MacroInterpreterImpl::GetOpcode() const {
250 ASSERT((pc % sizeof(u32)) == 0);
251 ASSERT(pc < code.size() * sizeof(u32));
252 return {code[pc / sizeof(u32)]};
253}
254
255u32 MacroInterpreterImpl::GetRegister(u32 register_id) const {
256 return registers.at(register_id);
257}
258
259void MacroInterpreterImpl::SetRegister(u32 register_id, u32 value) {
260 // Register 0 is hardwired as the zero register.
261 // Ensure no writes to it actually occur.
262 if (register_id == 0) {
263 return;
264 }
265
266 registers.at(register_id) = value;
267}
268
269void MacroInterpreterImpl::SetMethodAddress(u32 address) {
270 method_address.raw = address;
271}
272
273void MacroInterpreterImpl::Send(u32 value) {
274 maxwell3d.CallMethodFromMME(method_address.address, value);
275 // Increment the method address by the method increment.
276 method_address.address.Assign(method_address.address.Value() +
277 method_address.increment.Value());
278}
279
280u32 MacroInterpreterImpl::Read(u32 method) const {
281 return maxwell3d.GetRegisterValue(method);
282}
283
284u32 MacroInterpreterImpl::FetchParameter() {
285 ASSERT(next_parameter_index < num_parameters);
286 return parameters[next_parameter_index++];
287}
288
289} // namespace Tegra
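
The ExtractInsert case in Step() above is the densest piece of bit manipulation in the interpreter. The standalone snippet below reproduces the same mask/shift/merge on concrete values; the field positions and register contents are arbitrary examples, not taken from a real macro.

// Reproduce the Macro::Operation::ExtractInsert math from Step() on fixed values.
#include <cstdint>
#include <iostream>

int main() {
    // Copy a 4-bit field starting at bit 8 of src into bit 16 of dst.
    const std::uint32_t bf_src_bit = 8;
    const std::uint32_t bf_dst_bit = 16;
    const std::uint32_t bf_size = 4;
    const std::uint32_t mask = (1u << bf_size) - 1; // Opcode::GetBitfieldMask()

    std::uint32_t dst = 0xAABBCCDD; // value read from register src_a
    std::uint32_t src = 0x00000F00; // value read from register src_b

    src = (src >> bf_src_bit) & mask; // extract the field from src
    dst &= ~(mask << bf_dst_bit);     // clear the destination field
    dst |= src << bf_dst_bit;         // insert the extracted bits

    std::cout << std::hex << "result = 0x" << dst << '\n'; // prints 0xaabfccdd
}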
diff --git a/src/video_core/macro/macro_interpreter.h b/src/video_core/macro/macro_interpreter.h
new file mode 100644
index 000000000..90217fc89
--- /dev/null
+++ b/src/video_core/macro/macro_interpreter.h
@@ -0,0 +1,102 @@
1// Copyright 2020 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6#include <array>
7#include <optional>
8#include <vector>
9#include "common/bit_field.h"
10#include "common/common_types.h"
11#include "video_core/macro/macro.h"
12
13namespace Tegra {
14namespace Engines {
15class Maxwell3D;
16}
17
18class MacroInterpreter final : public MacroEngine {
19public:
20 explicit MacroInterpreter(Engines::Maxwell3D& maxwell3d);
21
22protected:
23 std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override;
24
25private:
26 Engines::Maxwell3D& maxwell3d;
27};
28
29class MacroInterpreterImpl : public CachedMacro {
30public:
31 MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code);
32 void Execute(const std::vector<u32>& parameters, u32 method) override;
33
34private:
35 /// Resets the execution engine state, zeroing registers, etc.
36 void Reset();
37
38 /**
39 * Executes a single macro instruction located at the current program counter. Returns whether
40 * the interpreter should keep running.
41 *
42 * @param is_delay_slot Whether the current step is being executed due to a delay slot in a
43 * previous instruction.
44 */
45 bool Step(bool is_delay_slot);
46
47 /// Calculates the result of an ALU operation. src_a OP src_b;
48 u32 GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b);
49
50 /// Performs the result operation on the input result and stores it in the specified register
51 /// (if necessary).
52 void ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result);
53
54 /// Evaluates the branch condition and returns whether the branch should be taken or not.
55 bool EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const;
56
57 /// Reads an opcode at the current program counter location.
58 Macro::Opcode GetOpcode() const;
59
60 /// Returns the specified register's value. Register 0 is hardcoded to always return 0.
61 u32 GetRegister(u32 register_id) const;
62
63 /// Sets the register to the input value.
64 void SetRegister(u32 register_id, u32 value);
65
66 /// Sets the method address to use for the next Send instruction.
67 void SetMethodAddress(u32 address);
68
69 /// Calls a GPU Engine method with the input parameter.
70 void Send(u32 value);
71
72 /// Reads a GPU register located at the method address.
73 u32 Read(u32 method) const;
74
75 /// Returns the next parameter in the parameter queue.
76 u32 FetchParameter();
77
78 Engines::Maxwell3D& maxwell3d;
79
80 /// Current program counter
81 u32 pc;
82 /// Program counter to execute at after the delay slot is executed.
83 std::optional<u32> delayed_pc;
84
85 /// General purpose macro registers.
86 std::array<u32, Macro::NUM_MACRO_REGISTERS> registers = {};
87
88 /// Method address to use for the next Send instruction.
89 Macro::MethodAddress method_address = {};
90
91 /// Input parameters of the current macro.
92 std::unique_ptr<u32[]> parameters;
93 std::size_t num_parameters = 0;
94 std::size_t parameters_capacity = 0;
95 /// Index of the next parameter that will be fetched by the 'parm' instruction.
96 u32 next_parameter_index = 0;
97
98 bool carry_flag = false;
99 const std::vector<u32>& code;
100};
101
102} // namespace Tegra
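
GetALUResult (declared above, implemented in macro_interpreter.cpp) models the macro ALU's carry flag with 64-bit arithmetic: the flag is set when an addition overflows 32 bits, and AddWithCarry folds it back in. The sketch below shows that convention by splitting one 64-bit addition into two 32-bit macro-style operations; the operand values are arbitrary.

// Add / AddWithCarry carry convention from MacroInterpreterImpl::GetALUResult.
#include <cstdint>
#include <iostream>

int main() {
    bool carry_flag = false;

    const auto alu_add = [&](std::uint32_t a, std::uint32_t b) {
        const std::uint64_t result = std::uint64_t{a} + b;
        carry_flag = result > 0xffffffff;
        return static_cast<std::uint32_t>(result);
    };
    const auto alu_adc = [&](std::uint32_t a, std::uint32_t b) {
        const std::uint64_t result = std::uint64_t{a} + b + (carry_flag ? 1 : 0);
        carry_flag = result > 0xffffffff;
        return static_cast<std::uint32_t>(result);
    };

    // 0x180000000 + 0x80000001 computed 32 bits at a time: the low add overflows,
    // so the high add picks up the carry.
    const std::uint32_t lo = alu_add(0x80000000u, 0x80000001u); // 0x00000001, carry set
    const std::uint32_t hi = alu_adc(0x00000001u, 0x00000000u); // 0x00000002
    std::cout << std::hex << "hi=0x" << hi << " lo=0x" << lo << '\n';
}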
diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp
new file mode 100644
index 000000000..07292702f
--- /dev/null
+++ b/src/video_core/macro/macro_jit_x64.cpp
@@ -0,0 +1,621 @@
1// Copyright 2020 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include "common/assert.h"
6#include "common/logging/log.h"
7#include "common/microprofile.h"
8#include "common/x64/xbyak_util.h"
9#include "video_core/engines/maxwell_3d.h"
10#include "video_core/macro/macro_interpreter.h"
11#include "video_core/macro/macro_jit_x64.h"
12
13MICROPROFILE_DEFINE(MacroJitCompile, "GPU", "Compile macro JIT", MP_RGB(173, 255, 47));
14MICROPROFILE_DEFINE(MacroJitExecute, "GPU", "Execute macro JIT", MP_RGB(255, 255, 0));
15
16namespace Tegra {
17static const Xbyak::Reg64 STATE = Xbyak::util::rbx;
18static const Xbyak::Reg32 RESULT = Xbyak::util::ebp;
19static const Xbyak::Reg64 PARAMETERS = Xbyak::util::r12;
20static const Xbyak::Reg32 METHOD_ADDRESS = Xbyak::util::r14d;
21static const Xbyak::Reg64 BRANCH_HOLDER = Xbyak::util::r15;
22
23static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({
24 STATE,
25 RESULT,
26 PARAMETERS,
27 METHOD_ADDRESS,
28 BRANCH_HOLDER,
29});
30
31MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d)
32 : MacroEngine::MacroEngine(maxwell3d), maxwell3d(maxwell3d) {}
33
34std::unique_ptr<CachedMacro> MacroJITx64::Compile(const std::vector<u32>& code) {
35 return std::make_unique<MacroJITx64Impl>(maxwell3d, code);
36}
37
38MacroJITx64Impl::MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code)
39 : Xbyak::CodeGenerator(MAX_CODE_SIZE), code(code), maxwell3d(maxwell3d) {
40 Compile();
41}
42
43MacroJITx64Impl::~MacroJITx64Impl() = default;
44
45void MacroJITx64Impl::Execute(const std::vector<u32>& parameters, u32 method) {
46 MICROPROFILE_SCOPE(MacroJitExecute);
47 ASSERT_OR_EXECUTE(program != nullptr, { return; });
48 JITState state{};
49 state.maxwell3d = &maxwell3d;
50 state.registers = {};
51 program(&state, parameters.data());
52}
53
54void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) {
55 const bool is_a_zero = opcode.src_a == 0;
56 const bool is_b_zero = opcode.src_b == 0;
57 const bool valid_operation = !is_a_zero && !is_b_zero;
58 [[maybe_unused]] const bool is_move_operation = !is_a_zero && is_b_zero;
59 const bool has_zero_register = is_a_zero || is_b_zero;
60 const bool no_zero_reg_skip = opcode.alu_operation == Macro::ALUOperation::AddWithCarry ||
61 opcode.alu_operation == Macro::ALUOperation::SubtractWithBorrow;
62
63 Xbyak::Reg32 src_a;
64 Xbyak::Reg32 src_b;
65
66 if (!optimizer.zero_reg_skip || no_zero_reg_skip) {
67 src_a = Compile_GetRegister(opcode.src_a, RESULT);
68 src_b = Compile_GetRegister(opcode.src_b, eax);
69 } else {
70 if (!is_a_zero) {
71 src_a = Compile_GetRegister(opcode.src_a, RESULT);
72 }
73 if (!is_b_zero) {
74 src_b = Compile_GetRegister(opcode.src_b, eax);
75 }
76 }
77
78 bool has_emitted = false;
79
80 switch (opcode.alu_operation) {
81 case Macro::ALUOperation::Add:
82 if (optimizer.zero_reg_skip) {
83 if (valid_operation) {
84 add(src_a, src_b);
85 }
86 } else {
87 add(src_a, src_b);
88 }
89
90 if (!optimizer.can_skip_carry) {
91 setc(byte[STATE + offsetof(JITState, carry_flag)]);
92 }
93 break;
94 case Macro::ALUOperation::AddWithCarry:
95 bt(dword[STATE + offsetof(JITState, carry_flag)], 0);
96 adc(src_a, src_b);
97 setc(byte[STATE + offsetof(JITState, carry_flag)]);
98 break;
99 case Macro::ALUOperation::Subtract:
100 if (optimizer.zero_reg_skip) {
101 if (valid_operation) {
102 sub(src_a, src_b);
103 has_emitted = true;
104 }
105 } else {
106 sub(src_a, src_b);
107 has_emitted = true;
108 }
109 if (!optimizer.can_skip_carry && has_emitted) {
110 setc(byte[STATE + offsetof(JITState, carry_flag)]);
111 }
112 break;
113 case Macro::ALUOperation::SubtractWithBorrow:
114 bt(dword[STATE + offsetof(JITState, carry_flag)], 0);
115 sbb(src_a, src_b);
116 setc(byte[STATE + offsetof(JITState, carry_flag)]);
117 break;
118 case Macro::ALUOperation::Xor:
119 if (optimizer.zero_reg_skip) {
120 if (valid_operation) {
121 xor_(src_a, src_b);
122 }
123 } else {
124 xor_(src_a, src_b);
125 }
126 break;
127 case Macro::ALUOperation::Or:
128 if (optimizer.zero_reg_skip) {
129 if (valid_operation) {
130 or_(src_a, src_b);
131 }
132 } else {
133 or_(src_a, src_b);
134 }
135 break;
136 case Macro::ALUOperation::And:
137 if (optimizer.zero_reg_skip) {
138 if (!has_zero_register) {
139 and_(src_a, src_b);
140 }
141 } else {
142 and_(src_a, src_b);
143 }
144 break;
145 case Macro::ALUOperation::AndNot:
146 if (optimizer.zero_reg_skip) {
147 if (!is_a_zero) {
148 not_(src_b);
149 and_(src_a, src_b);
150 }
151 } else {
152 not_(src_b);
153 and_(src_a, src_b);
154 }
155 break;
156 case Macro::ALUOperation::Nand:
157 if (optimizer.zero_reg_skip) {
158 if (!is_a_zero) {
159 and_(src_a, src_b);
160 not_(src_a);
161 }
162 } else {
163 and_(src_a, src_b);
164 not_(src_a);
165 }
166 break;
167 default:
168 UNIMPLEMENTED_MSG("Unimplemented ALU operation {}",
169 static_cast<std::size_t>(opcode.alu_operation.Value()));
170 break;
171 }
172 Compile_ProcessResult(opcode.result_operation, opcode.dst);
173}
174
175void MacroJITx64Impl::Compile_AddImmediate(Macro::Opcode opcode) {
176 if (optimizer.skip_dummy_addimmediate) {
177 // Games tend to use this as an exit instruction placeholder. It encodes an instruction
178 // that does nothing. In our case we can simply not emit anything.
179 if (opcode.result_operation == Macro::ResultOperation::Move && opcode.dst == 0) {
180 return;
181 }
182 }
183 // Check for redundant moves
184 if (optimizer.optimize_for_method_move &&
185 opcode.result_operation == Macro::ResultOperation::MoveAndSetMethod) {
186 if (next_opcode.has_value()) {
187 const auto next = *next_opcode;
188 if (next.result_operation == Macro::ResultOperation::MoveAndSetMethod &&
189 opcode.dst == next.dst) {
190 return;
191 }
192 }
193 }
194 if (optimizer.zero_reg_skip && opcode.src_a == 0) {
195 if (opcode.immediate == 0) {
196 xor_(RESULT, RESULT);
197 } else {
198 mov(RESULT, opcode.immediate);
199 }
200 } else {
201 auto result = Compile_GetRegister(opcode.src_a, RESULT);
202 if (opcode.immediate > 1) {
203 add(result, opcode.immediate);
204 } else if (opcode.immediate == 1) {
205 inc(result);
206 } else if (opcode.immediate < 0) {
207 sub(result, opcode.immediate * -1);
208 }
209 }
210 Compile_ProcessResult(opcode.result_operation, opcode.dst);
211}
212
213void MacroJITx64Impl::Compile_ExtractInsert(Macro::Opcode opcode) {
214 auto dst = Compile_GetRegister(opcode.src_a, RESULT);
215 auto src = Compile_GetRegister(opcode.src_b, eax);
216
217 if (opcode.bf_src_bit != 0 && opcode.bf_src_bit != 31) {
218 shr(src, opcode.bf_src_bit);
219 } else if (opcode.bf_src_bit == 31) {
220 xor_(src, src);
221 }
222 // Don't bother masking the whole register since we're using a 32 bit register
223 if (opcode.bf_size != 31 && opcode.bf_size != 0) {
224 and_(src, opcode.GetBitfieldMask());
225 } else if (opcode.bf_size == 0) {
226 xor_(src, src);
227 }
228 if (opcode.bf_dst_bit != 31 && opcode.bf_dst_bit != 0) {
229 shl(src, opcode.bf_dst_bit);
230 } else if (opcode.bf_dst_bit == 31) {
231 xor_(src, src);
232 }
233
234 const u32 mask = ~(opcode.GetBitfieldMask() << opcode.bf_dst_bit);
235 if (mask != 0xffffffff) {
236 and_(dst, mask);
237 }
238 or_(dst, src);
239 Compile_ProcessResult(opcode.result_operation, opcode.dst);
240}
241
242void MacroJITx64Impl::Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode) {
243 const auto dst = Compile_GetRegister(opcode.src_a, ecx);
244 const auto src = Compile_GetRegister(opcode.src_b, RESULT);
245
246 shr(src, dst.cvt8());
247 if (opcode.bf_size != 0 && opcode.bf_size != 31) {
248 and_(src, opcode.GetBitfieldMask());
249 } else if (opcode.bf_size == 0) {
250 xor_(src, src);
251 }
252
253 if (opcode.bf_dst_bit != 0 && opcode.bf_dst_bit != 31) {
254 shl(src, opcode.bf_dst_bit);
255 } else if (opcode.bf_dst_bit == 31) {
256 xor_(src, src);
257 }
258 Compile_ProcessResult(opcode.result_operation, opcode.dst);
259}
260
261void MacroJITx64Impl::Compile_ExtractShiftLeftRegister(Macro::Opcode opcode) {
262 const auto dst = Compile_GetRegister(opcode.src_a, ecx);
263 const auto src = Compile_GetRegister(opcode.src_b, RESULT);
264
265 if (opcode.bf_src_bit != 0) {
266 shr(src, opcode.bf_src_bit);
267 }
268
269 if (opcode.bf_size != 31) {
270 and_(src, opcode.GetBitfieldMask());
271 }
272 shl(src, dst.cvt8());
273
274 Compile_ProcessResult(opcode.result_operation, opcode.dst);
275}
276
277void MacroJITx64Impl::Compile_Read(Macro::Opcode opcode) {
278 if (optimizer.zero_reg_skip && opcode.src_a == 0) {
279 if (opcode.immediate == 0) {
280 xor_(RESULT, RESULT);
281 } else {
282 mov(RESULT, opcode.immediate);
283 }
284 } else {
285 auto result = Compile_GetRegister(opcode.src_a, RESULT);
286 if (opcode.immediate > 1) {
287 add(result, opcode.immediate);
288 } else if (opcode.immediate == 1) {
289 inc(result);
290 } else if (opcode.immediate < 0) {
291 sub(result, opcode.immediate * -1);
292 }
293 }
294
295 // Equivalent to Engines::Maxwell3D::GetRegisterValue:
296 if (optimizer.enable_asserts) {
297 Xbyak::Label pass_range_check;
298 cmp(RESULT, static_cast<u32>(Engines::Maxwell3D::Regs::NUM_REGS));
299 jb(pass_range_check);
300 int3();
301 L(pass_range_check);
302 }
303 mov(rax, qword[STATE]);
304 mov(RESULT,
305 dword[rax + offsetof(Engines::Maxwell3D, regs) +
306 offsetof(Engines::Maxwell3D::Regs, reg_array) + RESULT.cvt64() * sizeof(u32)]);
307
308 Compile_ProcessResult(opcode.result_operation, opcode.dst);
309}
310
311static void Send(Engines::Maxwell3D* maxwell3d, Macro::MethodAddress method_address, u32 value) {
312 maxwell3d->CallMethodFromMME(method_address.address, value);
313}
314
315void Tegra::MacroJITx64Impl::Compile_Send(Xbyak::Reg32 value) {
316 Common::X64::ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
317 mov(Common::X64::ABI_PARAM1, qword[STATE]);
318 mov(Common::X64::ABI_PARAM2, METHOD_ADDRESS);
319 mov(Common::X64::ABI_PARAM3, value);
320 Common::X64::CallFarFunction(*this, &Send);
321 Common::X64::ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
322
323 Xbyak::Label dont_process{};
324 // Get increment
325 test(METHOD_ADDRESS, 0x3f000);
326 // If zero, method address doesn't update
327 je(dont_process);
328
329 mov(ecx, METHOD_ADDRESS);
330 and_(METHOD_ADDRESS, 0xfff);
331 shr(ecx, 12);
332 and_(ecx, 0x3f);
333 lea(eax, ptr[rcx + METHOD_ADDRESS.cvt64()]);
334 sal(ecx, 12);
335 or_(eax, ecx);
336
337 mov(METHOD_ADDRESS, eax);
338
339 L(dont_process);
340}
341
342void Tegra::MacroJITx64Impl::Compile_Branch(Macro::Opcode opcode) {
343 ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid");
344 const s32 jump_address =
345 static_cast<s32>(pc) + static_cast<s32>(opcode.GetBranchTarget() / sizeof(s32));
346
347 Xbyak::Label end;
348 auto value = Compile_GetRegister(opcode.src_a, eax);
349 test(value, value);
350 if (optimizer.has_delayed_pc) {
351 switch (opcode.branch_condition) {
352 case Macro::BranchCondition::Zero:
353 jne(end, T_NEAR);
354 break;
355 case Macro::BranchCondition::NotZero:
356 je(end, T_NEAR);
357 break;
358 }
359
360 if (opcode.branch_annul) {
361 xor_(BRANCH_HOLDER, BRANCH_HOLDER);
362 jmp(labels[jump_address], T_NEAR);
363 } else {
364 Xbyak::Label handle_post_exit{};
365 Xbyak::Label skip{};
366 jmp(skip, T_NEAR);
367 if (opcode.is_exit) {
368 L(handle_post_exit);
369 // Execute 1 instruction
370 mov(BRANCH_HOLDER, end_of_code);
371 // Jump to next instruction to skip delay slot check
372 jmp(labels[jump_address], T_NEAR);
373 } else {
374 L(handle_post_exit);
375 xor_(BRANCH_HOLDER, BRANCH_HOLDER);
376 jmp(labels[jump_address], T_NEAR);
377 }
378 L(skip);
379 mov(BRANCH_HOLDER, handle_post_exit);
380 jmp(delay_skip[pc], T_NEAR);
381 }
382 } else {
383 switch (opcode.branch_condition) {
384 case Macro::BranchCondition::Zero:
385 je(labels[jump_address], T_NEAR);
386 break;
387 case Macro::BranchCondition::NotZero:
388 jne(labels[jump_address], T_NEAR);
389 break;
390 }
391 }
392
393 L(end);
394}
395
396void Tegra::MacroJITx64Impl::Optimizer_ScanFlags() {
397 optimizer.can_skip_carry = true;
398 optimizer.has_delayed_pc = false;
399 for (auto raw_op : code) {
400 Macro::Opcode op{};
401 op.raw = raw_op;
402
403 if (op.operation == Macro::Operation::ALU) {
404 // Scan for any ALU operations which actually use the carry flag. If none exist in
405 // the current code, we can skip emitting the carry flag handling operations.
406 if (op.alu_operation == Macro::ALUOperation::AddWithCarry ||
407 op.alu_operation == Macro::ALUOperation::SubtractWithBorrow) {
408 optimizer.can_skip_carry = false;
409 }
410 }
411
412 if (op.operation == Macro::Operation::Branch) {
413 if (!op.branch_annul) {
414 optimizer.has_delayed_pc = true;
415 }
416 }
417 }
418}
419
420void MacroJITx64Impl::Compile() {
421 MICROPROFILE_SCOPE(MacroJitCompile);
422 bool keep_executing = true;
423 labels.fill(Xbyak::Label());
424
425 Common::X64::ABI_PushRegistersAndAdjustStack(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
426 // JIT state
427 mov(STATE, Common::X64::ABI_PARAM1);
428 mov(PARAMETERS, Common::X64::ABI_PARAM2);
429 xor_(RESULT, RESULT);
430 xor_(METHOD_ADDRESS, METHOD_ADDRESS);
431 xor_(BRANCH_HOLDER, BRANCH_HOLDER);
432
433 mov(dword[STATE + offsetof(JITState, registers) + 4], Compile_FetchParameter());
434
435 // Track reads of the zero register so they can be treated as constant zero and skipped
436 optimizer.zero_reg_skip = true;
437
438 // AddImmediate tends to be used as a NOP instruction. If we detect this, we can
439 // skip the entire code path and not emit anything.
440 optimizer.skip_dummy_addimmediate = true;
441
442 // SMO (Super Mario Odyssey) tends to emit a lot of unnecessary method moves; we can
443 // mitigate this by only emitting one if our register isn't "dirty"
444 optimizer.optimize_for_method_move = true;
445
446 // Enable run-time assertions in JITted code
447 optimizer.enable_asserts = false;
448
449 // Check to see if we can skip emitting certain instructions
450 Optimizer_ScanFlags();
451
452 const u32 op_count = static_cast<u32>(code.size());
453 for (u32 i = 0; i < op_count; i++) {
454 if (i < op_count - 1) {
455 pc = i + 1;
456 next_opcode = GetOpCode();
457 } else {
458 next_opcode = {};
459 }
460 pc = i;
461 Compile_NextInstruction();
462 }
463
464 L(end_of_code);
465
466 Common::X64::ABI_PopRegistersAndAdjustStack(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
467 ret();
468 ready();
469 program = getCode<ProgramType>();
470}
471
472bool MacroJITx64Impl::Compile_NextInstruction() {
473 const auto opcode = GetOpCode();
474 if (labels[pc].getAddress()) {
475 return false;
476 }
477
478 L(labels[pc]);
479
480 switch (opcode.operation) {
481 case Macro::Operation::ALU:
482 Compile_ALU(opcode);
483 break;
484 case Macro::Operation::AddImmediate:
485 Compile_AddImmediate(opcode);
486 break;
487 case Macro::Operation::ExtractInsert:
488 Compile_ExtractInsert(opcode);
489 break;
490 case Macro::Operation::ExtractShiftLeftImmediate:
491 Compile_ExtractShiftLeftImmediate(opcode);
492 break;
493 case Macro::Operation::ExtractShiftLeftRegister:
494 Compile_ExtractShiftLeftRegister(opcode);
495 break;
496 case Macro::Operation::Read:
497 Compile_Read(opcode);
498 break;
499 case Macro::Operation::Branch:
500 Compile_Branch(opcode);
501 break;
502 default:
503 UNIMPLEMENTED_MSG("Unimplemented opcode {}", opcode.operation.Value());
504 break;
505 }
506
507 if (optimizer.has_delayed_pc) {
508 if (opcode.is_exit) {
509 mov(rax, end_of_code);
510 test(BRANCH_HOLDER, BRANCH_HOLDER);
511 cmove(BRANCH_HOLDER, rax);
512 // Jump to next instruction to skip delay slot check
513 je(labels[pc + 1], T_NEAR);
514 } else {
515 // TODO(ogniK): Optimize delay slot branching
516 Xbyak::Label no_delay_slot{};
517 test(BRANCH_HOLDER, BRANCH_HOLDER);
518 je(no_delay_slot, T_NEAR);
519 mov(rax, BRANCH_HOLDER);
520 xor_(BRANCH_HOLDER, BRANCH_HOLDER);
521 jmp(rax);
522 L(no_delay_slot);
523 }
524 L(delay_skip[pc]);
525 if (opcode.is_exit) {
526 return false;
527 }
528 } else {
529 test(BRANCH_HOLDER, BRANCH_HOLDER);
530 jne(end_of_code, T_NEAR);
531 if (opcode.is_exit) {
532 inc(BRANCH_HOLDER);
533 return false;
534 }
535 }
536 return true;
537}
538
539Xbyak::Reg32 Tegra::MacroJITx64Impl::Compile_FetchParameter() {
540 mov(eax, dword[PARAMETERS]);
541 add(PARAMETERS, sizeof(u32));
542 return eax;
543}
544
545Xbyak::Reg32 MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg32 dst) {
546 if (index == 0) {
547 // Register 0 is always zero
548 xor_(dst, dst);
549 } else {
550 mov(dst, dword[STATE + offsetof(JITState, registers) + index * sizeof(u32)]);
551 }
552
553 return dst;
554}
555
556void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u32 reg) {
557 const auto SetRegister = [this](u32 reg, const Xbyak::Reg32& result) {
558 // Register 0 is supposed to always return 0. NOP is implemented as a store to the zero
559 // register.
560 if (reg == 0) {
561 return;
562 }
563 mov(dword[STATE + offsetof(JITState, registers) + reg * sizeof(u32)], result);
564 };
565 const auto SetMethodAddress = [this](const Xbyak::Reg32& reg) { mov(METHOD_ADDRESS, reg); };
566
567 switch (operation) {
568 case Macro::ResultOperation::IgnoreAndFetch:
569 SetRegister(reg, Compile_FetchParameter());
570 break;
571 case Macro::ResultOperation::Move:
572 SetRegister(reg, RESULT);
573 break;
574 case Macro::ResultOperation::MoveAndSetMethod:
575 SetRegister(reg, RESULT);
576 SetMethodAddress(RESULT);
577 break;
578 case Macro::ResultOperation::FetchAndSend:
579 // Fetch parameter and send result.
580 SetRegister(reg, Compile_FetchParameter());
581 Compile_Send(RESULT);
582 break;
583 case Macro::ResultOperation::MoveAndSend:
584 // Move and send result.
585 SetRegister(reg, RESULT);
586 Compile_Send(RESULT);
587 break;
588 case Macro::ResultOperation::FetchAndSetMethod:
589 // Fetch parameter and use result as Method Address.
590 SetRegister(reg, Compile_FetchParameter());
591 SetMethodAddress(RESULT);
592 break;
593 case Macro::ResultOperation::MoveAndSetMethodFetchAndSend:
594 // Move result and use as Method Address, then fetch and send parameter.
595 SetRegister(reg, RESULT);
596 SetMethodAddress(RESULT);
597 Compile_Send(Compile_FetchParameter());
598 break;
599 case Macro::ResultOperation::MoveAndSetMethodSend:
600 // Move result and use as Method Address, then send bits 12:17 of result.
601 SetRegister(reg, RESULT);
602 SetMethodAddress(RESULT);
603 shr(RESULT, 12);
604 and_(RESULT, 0b111111);
605 Compile_Send(RESULT);
606 break;
607 default:
608 UNIMPLEMENTED_MSG("Unimplemented result operation {}", static_cast<std::size_t>(operation));
609 }
610}
611
612Macro::Opcode MacroJITx64Impl::GetOpCode() const {
613 ASSERT(pc < code.size());
614 return {code[pc]};
615}
616
617std::bitset<32> MacroJITx64Impl::PersistentCallerSavedRegs() const {
618 return PERSISTENT_REGISTERS & Common::X64::ABI_ALL_CALLER_SAVED;
619}
620
621} // namespace Tegra
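
The register juggling in Compile_Send above implements the MethodAddress update described in macro.h: a 12-bit method address plus a 6-bit increment packed at bits 12-17, with the address advancing by the increment after every send. A plain C++ rendition of the same update, using an arbitrary starting address:

// What the generated code in Compile_Send computes after calling Send().
#include <cstdint>
#include <iostream>

std::uint32_t AdvanceMethodAddress(std::uint32_t raw) {
    const std::uint32_t increment = (raw >> 12) & 0x3f;
    if (increment == 0) {
        return raw; // the JIT skips the update entirely when the increment is zero
    }
    const std::uint32_t address = raw & 0xfff;
    return (increment << 12) | (address + increment);
}

int main() {
    // Arbitrary example: address 0x6c0 with an increment of 1.
    std::uint32_t method_address = (1u << 12) | 0x6c0;
    for (int i = 0; i < 3; ++i) {
        std::cout << std::hex << "send to method 0x" << (method_address & 0xfff) << '\n';
        method_address = AdvanceMethodAddress(method_address);
    }
}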
diff --git a/src/video_core/macro/macro_jit_x64.h b/src/video_core/macro/macro_jit_x64.h
new file mode 100644
index 000000000..a180e7428
--- /dev/null
+++ b/src/video_core/macro/macro_jit_x64.h
@@ -0,0 +1,98 @@
1// Copyright 2020 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <array>
8#include <bitset>
9#include <xbyak.h>
10#include "common/bit_field.h"
11#include "common/common_types.h"
12#include "common/x64/xbyak_abi.h"
13#include "video_core/macro/macro.h"
14
15namespace Tegra {
16
17namespace Engines {
18class Maxwell3D;
19}
20
21 /// MAX_CODE_SIZE is arbitrarily chosen based on the games currently known to boot
22constexpr size_t MAX_CODE_SIZE = 0x10000;
23
24class MacroJITx64 final : public MacroEngine {
25public:
26 explicit MacroJITx64(Engines::Maxwell3D& maxwell3d);
27
28protected:
29 std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override;
30
31private:
32 Engines::Maxwell3D& maxwell3d;
33};
34
35class MacroJITx64Impl : public Xbyak::CodeGenerator, public CachedMacro {
36public:
37 MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code);
38 ~MacroJITx64Impl();
39
40 void Execute(const std::vector<u32>& parameters, u32 method) override;
41
42 void Compile_ALU(Macro::Opcode opcode);
43 void Compile_AddImmediate(Macro::Opcode opcode);
44 void Compile_ExtractInsert(Macro::Opcode opcode);
45 void Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode);
46 void Compile_ExtractShiftLeftRegister(Macro::Opcode opcode);
47 void Compile_Read(Macro::Opcode opcode);
48 void Compile_Branch(Macro::Opcode opcode);
49
50private:
51 void Optimizer_ScanFlags();
52
53 void Compile();
54 bool Compile_NextInstruction();
55
56 Xbyak::Reg32 Compile_FetchParameter();
57 Xbyak::Reg32 Compile_GetRegister(u32 index, Xbyak::Reg32 dst);
58
59 void Compile_ProcessResult(Macro::ResultOperation operation, u32 reg);
60 void Compile_Send(Xbyak::Reg32 value);
61
62 Macro::Opcode GetOpCode() const;
63 std::bitset<32> PersistentCallerSavedRegs() const;
64
65 struct JITState {
66 Engines::Maxwell3D* maxwell3d{};
67 std::array<u32, Macro::NUM_MACRO_REGISTERS> registers{};
68 u32 carry_flag{};
69 };
70 static_assert(offsetof(JITState, maxwell3d) == 0, "Maxwell3D is not at 0x0");
71 using ProgramType = void (*)(JITState*, const u32*);
72
73 struct OptimizerState {
74 bool can_skip_carry{};
75 bool has_delayed_pc{};
76 bool zero_reg_skip{};
77 bool skip_dummy_addimmediate{};
78 bool optimize_for_method_move{};
79 bool enable_asserts{};
80 };
81 OptimizerState optimizer{};
82
83 std::optional<Macro::Opcode> next_opcode{};
84 ProgramType program{nullptr};
85
86 std::array<Xbyak::Label, MAX_CODE_SIZE> labels;
87 std::array<Xbyak::Label, MAX_CODE_SIZE> delay_skip;
88 Xbyak::Label end_of_code{};
89
90 bool is_delay_slot{};
91 u32 pc{};
92 std::optional<u32> delayed_pc;
93
94 const std::vector<u32>& code;
95 Engines::Maxwell3D& maxwell3d;
96};
97
98} // namespace Tegra
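
The static_assert on offsetof(JITState, maxwell3d) above exists because the generated code loads the engine pointer with a bare mov(rax, qword[STATE]), i.e. it assumes the pointer is the first member of the state struct. The standalone snippet below illustrates that assumption with a stand-in struct, not the real JITState.

// Why the Maxwell3D pointer must sit at offset 0 of the JIT state struct.
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iostream>

struct Engine {
    int dummy = 123;
};

struct State {
    Engine* engine = nullptr; // must remain the first member, as in JITState
    std::uint32_t registers[8]{};
};
static_assert(offsetof(State, engine) == 0, "engine pointer must be at offset 0");

int main() {
    Engine engine;
    State state;
    state.engine = &engine;

    // Equivalent of `mov rax, qword [STATE]`: read the first pointer-sized chunk of the
    // state object and treat it as the engine pointer.
    Engine* loaded = nullptr;
    std::memcpy(&loaded, &state, sizeof(loaded));
    std::cout << "loaded->dummy = " << loaded->dummy << '\n';
}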