summaryrefslogtreecommitdiff
path: root/src/core/arm/nce/patcher.cpp
diff options
context:
space:
mode:
authorGravatar liamwhite2023-11-30 09:20:55 -0500
committerGravatar GitHub2023-11-30 09:20:55 -0500
commit57a391e71db13ade7a3d96f59d53781eff18d2ac (patch)
tree0b4223de40a2d77598ac9095b1374353c2e9da7c /src/core/arm/nce/patcher.cpp
parentMerge pull request #12223 from liamwhite/fruit-company (diff)
parentcore: Rename patcher file (diff)
downloadyuzu-57a391e71db13ade7a3d96f59d53781eff18d2ac.tar.gz
yuzu-57a391e71db13ade7a3d96f59d53781eff18d2ac.tar.xz
yuzu-57a391e71db13ade7a3d96f59d53781eff18d2ac.zip
Merge pull request #12074 from GPUCode/yuwu-on-the-metal
Implement Native Code Execution (NCE)
Diffstat (limited to 'src/core/arm/nce/patcher.cpp')
-rw-r--r--src/core/arm/nce/patcher.cpp474
1 files changed, 474 insertions, 0 deletions
diff --git a/src/core/arm/nce/patcher.cpp b/src/core/arm/nce/patcher.cpp
new file mode 100644
index 000000000..ec8527224
--- /dev/null
+++ b/src/core/arm/nce/patcher.cpp
@@ -0,0 +1,474 @@
1// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
2// SPDX-License-Identifier: GPL-2.0-or-later
3
4#include "common/arm64/native_clock.h"
5#include "common/bit_cast.h"
6#include "common/literals.h"
7#include "core/arm/nce/arm_nce.h"
8#include "core/arm/nce/guest_context.h"
9#include "core/arm/nce/instructions.h"
10#include "core/arm/nce/patcher.h"
11#include "core/core.h"
12#include "core/core_timing.h"
13#include "core/hle/kernel/svc.h"
14
15namespace Core::NCE {
16
17using namespace Common::Literals;
18using namespace oaknut::util;
19
20using NativeExecutionParameters = Kernel::KThread::NativeExecutionParameters;
21
22constexpr size_t MaxRelativeBranch = 128_MiB;
23constexpr u32 ModuleCodeIndex = 0x24 / sizeof(u32);
24
25Patcher::Patcher() : c(m_patch_instructions) {}
26
27Patcher::~Patcher() = default;
28
29void Patcher::PatchText(const Kernel::PhysicalMemory& program_image,
30 const Kernel::CodeSet::Segment& code) {
31
32 // Write save context helper function.
33 c.l(m_save_context);
34 WriteSaveContext();
35
36 // Write load context helper function.
37 c.l(m_load_context);
38 WriteLoadContext();
39
40 // Retrieve text segment data.
41 const auto text = std::span{program_image}.subspan(code.offset, code.size);
42 const auto text_words =
43 std::span<const u32>{reinterpret_cast<const u32*>(text.data()), text.size() / sizeof(u32)};
44
45 // Loop through instructions, patching as needed.
46 for (u32 i = ModuleCodeIndex; i < static_cast<u32>(text_words.size()); i++) {
47 const u32 inst = text_words[i];
48
49 const auto AddRelocations = [&] {
50 const uintptr_t this_offset = i * sizeof(u32);
51 const uintptr_t next_offset = this_offset + sizeof(u32);
52
53 // Relocate from here to patch.
54 this->BranchToPatch(this_offset);
55
56 // Relocate from patch to next instruction.
57 return next_offset;
58 };
59
60 // SVC
61 if (auto svc = SVC{inst}; svc.Verify()) {
62 WriteSvcTrampoline(AddRelocations(), svc.GetValue());
63 continue;
64 }
65
66 // MRS Xn, TPIDR_EL0
67 // MRS Xn, TPIDRRO_EL0
68 if (auto mrs = MRS{inst};
69 mrs.Verify() && (mrs.GetSystemReg() == TpidrroEl0 || mrs.GetSystemReg() == TpidrEl0)) {
70 const auto src_reg = mrs.GetSystemReg() == TpidrroEl0 ? oaknut::SystemReg::TPIDRRO_EL0
71 : oaknut::SystemReg::TPIDR_EL0;
72 const auto dest_reg = oaknut::XReg{static_cast<int>(mrs.GetRt())};
73 WriteMrsHandler(AddRelocations(), dest_reg, src_reg);
74 continue;
75 }
76
77 // MRS Xn, CNTPCT_EL0
78 if (auto mrs = MRS{inst}; mrs.Verify() && mrs.GetSystemReg() == CntpctEl0) {
79 WriteCntpctHandler(AddRelocations(), oaknut::XReg{static_cast<int>(mrs.GetRt())});
80 continue;
81 }
82
83 // MRS Xn, CNTFRQ_EL0
84 if (auto mrs = MRS{inst}; mrs.Verify() && mrs.GetSystemReg() == CntfrqEl0) {
85 UNREACHABLE();
86 }
87
88 // MSR TPIDR_EL0, Xn
89 if (auto msr = MSR{inst}; msr.Verify() && msr.GetSystemReg() == TpidrEl0) {
90 WriteMsrHandler(AddRelocations(), oaknut::XReg{static_cast<int>(msr.GetRt())});
91 continue;
92 }
93
94 if (auto exclusive = Exclusive{inst}; exclusive.Verify()) {
95 m_exclusives.push_back(i);
96 }
97 }
98
99 // Determine patching mode for the final relocation step
100 const size_t image_size = program_image.size();
101 this->mode = image_size > MaxRelativeBranch ? PatchMode::PreText : PatchMode::PostData;
102}
103
104void Patcher::RelocateAndCopy(Common::ProcessAddress load_base,
105 const Kernel::CodeSet::Segment& code,
106 Kernel::PhysicalMemory& program_image,
107 EntryTrampolines* out_trampolines) {
108 const size_t patch_size = GetSectionSize();
109 const size_t image_size = program_image.size();
110
111 // Retrieve text segment data.
112 const auto text = std::span{program_image}.subspan(code.offset, code.size);
113 const auto text_words =
114 std::span<u32>{reinterpret_cast<u32*>(text.data()), text.size() / sizeof(u32)};
115
116 const auto ApplyBranchToPatchRelocation = [&](u32* target, const Relocation& rel) {
117 oaknut::CodeGenerator rc{target};
118 if (mode == PatchMode::PreText) {
119 rc.B(rel.patch_offset - patch_size - rel.module_offset);
120 } else {
121 rc.B(image_size - rel.module_offset + rel.patch_offset);
122 }
123 };
124
125 const auto ApplyBranchToModuleRelocation = [&](u32* target, const Relocation& rel) {
126 oaknut::CodeGenerator rc{target};
127 if (mode == PatchMode::PreText) {
128 rc.B(patch_size - rel.patch_offset + rel.module_offset);
129 } else {
130 rc.B(rel.module_offset - image_size - rel.patch_offset);
131 }
132 };
133
134 const auto RebasePatch = [&](ptrdiff_t patch_offset) {
135 if (mode == PatchMode::PreText) {
136 return GetInteger(load_base) + patch_offset;
137 } else {
138 return GetInteger(load_base) + image_size + patch_offset;
139 }
140 };
141
142 const auto RebasePc = [&](uintptr_t module_offset) {
143 if (mode == PatchMode::PreText) {
144 return GetInteger(load_base) + patch_size + module_offset;
145 } else {
146 return GetInteger(load_base) + module_offset;
147 }
148 };
149
150 // We are now ready to relocate!
151 for (const Relocation& rel : m_branch_to_patch_relocations) {
152 ApplyBranchToPatchRelocation(text_words.data() + rel.module_offset / sizeof(u32), rel);
153 }
154 for (const Relocation& rel : m_branch_to_module_relocations) {
155 ApplyBranchToModuleRelocation(m_patch_instructions.data() + rel.patch_offset / sizeof(u32),
156 rel);
157 }
158
159 // Rewrite PC constants and record post trampolines
160 for (const Relocation& rel : m_write_module_pc_relocations) {
161 oaknut::CodeGenerator rc{m_patch_instructions.data() + rel.patch_offset / sizeof(u32)};
162 rc.dx(RebasePc(rel.module_offset));
163 }
164 for (const Trampoline& rel : m_trampolines) {
165 out_trampolines->insert({RebasePc(rel.module_offset), RebasePatch(rel.patch_offset)});
166 }
167
168 // Cortex-A57 seems to treat all exclusives as ordered, but newer processors do not.
169 // Convert to ordered to preserve this assumption.
170 for (const ModuleTextAddress i : m_exclusives) {
171 auto exclusive = Exclusive{text_words[i]};
172 text_words[i] = exclusive.AsOrdered();
173 }
174
175 // Copy to program image
176 if (this->mode == PatchMode::PreText) {
177 std::memcpy(program_image.data(), m_patch_instructions.data(),
178 m_patch_instructions.size() * sizeof(u32));
179 } else {
180 program_image.resize(image_size + patch_size);
181 std::memcpy(program_image.data() + image_size, m_patch_instructions.data(),
182 m_patch_instructions.size() * sizeof(u32));
183 }
184}
185
186size_t Patcher::GetSectionSize() const noexcept {
187 return Common::AlignUp(m_patch_instructions.size() * sizeof(u32), Core::Memory::YUZU_PAGESIZE);
188}
189
190void Patcher::WriteLoadContext() {
191 // This function was called, which modifies X30, so use that as a scratch register.
192 // SP contains the guest X30, so save our return X30 to SP + 8, since we have allocated 16 bytes
193 // of stack.
194 c.STR(X30, SP, 8);
195 c.MRS(X30, oaknut::SystemReg::TPIDR_EL0);
196 c.LDR(X30, X30, offsetof(NativeExecutionParameters, native_context));
197
198 // Load system registers.
199 c.LDR(W0, X30, offsetof(GuestContext, fpsr));
200 c.MSR(oaknut::SystemReg::FPSR, X0);
201 c.LDR(W0, X30, offsetof(GuestContext, fpcr));
202 c.MSR(oaknut::SystemReg::FPCR, X0);
203 c.LDR(W0, X30, offsetof(GuestContext, nzcv));
204 c.MSR(oaknut::SystemReg::NZCV, X0);
205
206 // Load all vector registers.
207 static constexpr size_t VEC_OFF = offsetof(GuestContext, vector_registers);
208 for (int i = 0; i <= 30; i += 2) {
209 c.LDP(oaknut::QReg{i}, oaknut::QReg{i + 1}, X30, VEC_OFF + 16 * i);
210 }
211
212 // Load all general-purpose registers except X30.
213 for (int i = 0; i <= 28; i += 2) {
214 c.LDP(oaknut::XReg{i}, oaknut::XReg{i + 1}, X30, 8 * i);
215 }
216
217 // Reload our return X30 from the stack and return.
218 // The patch code will reload the guest X30 for us.
219 c.LDR(X30, SP, 8);
220 c.RET();
221}
222
223void Patcher::WriteSaveContext() {
224 // This function was called, which modifies X30, so use that as a scratch register.
225 // SP contains the guest X30, so save our X30 to SP + 8, since we have allocated 16 bytes of
226 // stack.
227 c.STR(X30, SP, 8);
228 c.MRS(X30, oaknut::SystemReg::TPIDR_EL0);
229 c.LDR(X30, X30, offsetof(NativeExecutionParameters, native_context));
230
231 // Store all general-purpose registers except X30.
232 for (int i = 0; i <= 28; i += 2) {
233 c.STP(oaknut::XReg{i}, oaknut::XReg{i + 1}, X30, 8 * i);
234 }
235
236 // Store all vector registers.
237 static constexpr size_t VEC_OFF = offsetof(GuestContext, vector_registers);
238 for (int i = 0; i <= 30; i += 2) {
239 c.STP(oaknut::QReg{i}, oaknut::QReg{i + 1}, X30, VEC_OFF + 16 * i);
240 }
241
242 // Store guest system registers, X30 and SP, using X0 as a scratch register.
243 c.STR(X0, SP, PRE_INDEXED, -16);
244 c.LDR(X0, SP, 16);
245 c.STR(X0, X30, 8 * 30);
246 c.ADD(X0, SP, 32);
247 c.STR(X0, X30, offsetof(GuestContext, sp));
248 c.MRS(X0, oaknut::SystemReg::FPSR);
249 c.STR(W0, X30, offsetof(GuestContext, fpsr));
250 c.MRS(X0, oaknut::SystemReg::FPCR);
251 c.STR(W0, X30, offsetof(GuestContext, fpcr));
252 c.MRS(X0, oaknut::SystemReg::NZCV);
253 c.STR(W0, X30, offsetof(GuestContext, nzcv));
254 c.LDR(X0, SP, POST_INDEXED, 16);
255
256 // Reload our return X30 from the stack, and return.
257 c.LDR(X30, SP, 8);
258 c.RET();
259}
260
261void Patcher::WriteSvcTrampoline(ModuleDestLabel module_dest, u32 svc_id) {
262 // We are about to start saving state, so we need to lock the context.
263 this->LockContext();
264
265 // Store guest X30 to the stack. Then, save the context and restore the stack.
266 // This will save all registers except PC, but we know PC at patch time.
267 c.STR(X30, SP, PRE_INDEXED, -16);
268 c.BL(m_save_context);
269 c.LDR(X30, SP, POST_INDEXED, 16);
270
271 // Now that we've saved all registers, we can use any registers as scratch.
272 // Store PC + 4 to arm interface, since we know the instruction offset from the entry point.
273 oaknut::Label pc_after_svc;
274 c.MRS(X1, oaknut::SystemReg::TPIDR_EL0);
275 c.LDR(X1, X1, offsetof(NativeExecutionParameters, native_context));
276 c.LDR(X2, pc_after_svc);
277 c.STR(X2, X1, offsetof(GuestContext, pc));
278
279 // Store SVC number to execute when we return
280 c.MOV(X2, svc_id);
281 c.STR(W2, X1, offsetof(GuestContext, svc_swi));
282
283 // We are calling a SVC. Clear esr_el1 and return it.
284 static_assert(std::is_same_v<std::underlying_type_t<HaltReason>, u64>);
285 oaknut::Label retry;
286 c.ADD(X2, X1, offsetof(GuestContext, esr_el1));
287 c.l(retry);
288 c.LDAXR(X0, X2);
289 c.STLXR(W3, XZR, X2);
290 c.CBNZ(W3, retry);
291
292 // Add "calling SVC" flag. Since this is X0, this is now our return value.
293 c.ORR(X0, X0, static_cast<u64>(HaltReason::SupervisorCall));
294
295 // Offset the GuestContext pointer to the HostContext member.
296 // STP has limited range of [-512, 504] which we can't reach otherwise
297 // NB: Due to this all offsets below are from the start of HostContext.
298 c.ADD(X1, X1, offsetof(GuestContext, host_ctx));
299
300 // Reload host TPIDR_EL0 and SP.
301 static_assert(offsetof(HostContext, host_sp) + 8 == offsetof(HostContext, host_tpidr_el0));
302 c.LDP(X2, X3, X1, offsetof(HostContext, host_sp));
303 c.MOV(SP, X2);
304 c.MSR(oaknut::SystemReg::TPIDR_EL0, X3);
305
306 // Load callee-saved host registers and return to host.
307 static constexpr size_t HOST_REGS_OFF = offsetof(HostContext, host_saved_regs);
308 static constexpr size_t HOST_VREGS_OFF = offsetof(HostContext, host_saved_vregs);
309 c.LDP(X19, X20, X1, HOST_REGS_OFF);
310 c.LDP(X21, X22, X1, HOST_REGS_OFF + 2 * sizeof(u64));
311 c.LDP(X23, X24, X1, HOST_REGS_OFF + 4 * sizeof(u64));
312 c.LDP(X25, X26, X1, HOST_REGS_OFF + 6 * sizeof(u64));
313 c.LDP(X27, X28, X1, HOST_REGS_OFF + 8 * sizeof(u64));
314 c.LDP(X29, X30, X1, HOST_REGS_OFF + 10 * sizeof(u64));
315 c.LDP(Q8, Q9, X1, HOST_VREGS_OFF);
316 c.LDP(Q10, Q11, X1, HOST_VREGS_OFF + 2 * sizeof(u128));
317 c.LDP(Q12, Q13, X1, HOST_VREGS_OFF + 4 * sizeof(u128));
318 c.LDP(Q14, Q15, X1, HOST_VREGS_OFF + 6 * sizeof(u128));
319 c.RET();
320
321 // Write the post-SVC trampoline address, which will jump back to the guest after restoring its
322 // state.
323 m_trampolines.push_back({c.offset(), module_dest});
324
325 // Host called this location. Save the return address so we can
326 // unwind the stack properly when jumping back.
327 c.MRS(X2, oaknut::SystemReg::TPIDR_EL0);
328 c.LDR(X2, X2, offsetof(NativeExecutionParameters, native_context));
329 c.ADD(X0, X2, offsetof(GuestContext, host_ctx));
330 c.STR(X30, X0, offsetof(HostContext, host_saved_regs) + 11 * sizeof(u64));
331
332 // Reload all guest registers except X30 and PC.
333 // The function also expects 16 bytes of stack already allocated.
334 c.STR(X30, SP, PRE_INDEXED, -16);
335 c.BL(m_load_context);
336 c.LDR(X30, SP, POST_INDEXED, 16);
337
338 // Use X1 as a scratch register to restore X30.
339 c.STR(X1, SP, PRE_INDEXED, -16);
340 c.MRS(X1, oaknut::SystemReg::TPIDR_EL0);
341 c.LDR(X1, X1, offsetof(NativeExecutionParameters, native_context));
342 c.LDR(X30, X1, offsetof(GuestContext, cpu_registers) + sizeof(u64) * 30);
343 c.LDR(X1, SP, POST_INDEXED, 16);
344
345 // Unlock the context.
346 this->UnlockContext();
347
348 // Jump back to the instruction after the emulated SVC.
349 this->BranchToModule(module_dest);
350
351 // Store PC after call.
352 c.l(pc_after_svc);
353 this->WriteModulePc(module_dest);
354}
355
356void Patcher::WriteMrsHandler(ModuleDestLabel module_dest, oaknut::XReg dest_reg,
357 oaknut::SystemReg src_reg) {
358 // Retrieve emulated TLS register from GuestContext.
359 c.MRS(dest_reg, oaknut::SystemReg::TPIDR_EL0);
360 if (src_reg == oaknut::SystemReg::TPIDRRO_EL0) {
361 c.LDR(dest_reg, dest_reg, offsetof(NativeExecutionParameters, tpidrro_el0));
362 } else {
363 c.LDR(dest_reg, dest_reg, offsetof(NativeExecutionParameters, tpidr_el0));
364 }
365
366 // Jump back to the instruction after the emulated MRS.
367 this->BranchToModule(module_dest);
368}
369
370void Patcher::WriteMsrHandler(ModuleDestLabel module_dest, oaknut::XReg src_reg) {
371 const auto scratch_reg = src_reg.index() == 0 ? X1 : X0;
372 c.STR(scratch_reg, SP, PRE_INDEXED, -16);
373
374 // Save guest value to NativeExecutionParameters::tpidr_el0.
375 c.MRS(scratch_reg, oaknut::SystemReg::TPIDR_EL0);
376 c.STR(src_reg, scratch_reg, offsetof(NativeExecutionParameters, tpidr_el0));
377
378 // Restore scratch register.
379 c.LDR(scratch_reg, SP, POST_INDEXED, 16);
380
381 // Jump back to the instruction after the emulated MSR.
382 this->BranchToModule(module_dest);
383}
384
385void Patcher::WriteCntpctHandler(ModuleDestLabel module_dest, oaknut::XReg dest_reg) {
386 static Common::Arm64::NativeClock clock{};
387 const auto factor = clock.GetGuestCNTFRQFactor();
388 const auto raw_factor = Common::BitCast<std::array<u64, 2>>(factor);
389
390 const auto use_x2_x3 = dest_reg.index() == 0 || dest_reg.index() == 1;
391 oaknut::XReg scratch0 = use_x2_x3 ? X2 : X0;
392 oaknut::XReg scratch1 = use_x2_x3 ? X3 : X1;
393
394 oaknut::Label factorlo;
395 oaknut::Label factorhi;
396
397 // Save scratches.
398 c.STP(scratch0, scratch1, SP, PRE_INDEXED, -16);
399
400 // Load counter value.
401 c.MRS(dest_reg, oaknut::SystemReg::CNTVCT_EL0);
402
403 // Load scaling factor.
404 c.LDR(scratch0, factorlo);
405 c.LDR(scratch1, factorhi);
406
407 // Multiply low bits and get result.
408 c.UMULH(scratch0, dest_reg, scratch0);
409
410 // Multiply high bits and add low bit result.
411 c.MADD(dest_reg, dest_reg, scratch1, scratch0);
412
413 // Reload scratches.
414 c.LDP(scratch0, scratch1, SP, POST_INDEXED, 16);
415
416 // Jump back to the instruction after the emulated MRS.
417 this->BranchToModule(module_dest);
418
419 // Scaling factor constant values.
420 c.l(factorlo);
421 c.dx(raw_factor[0]);
422 c.l(factorhi);
423 c.dx(raw_factor[1]);
424}
425
426void Patcher::LockContext() {
427 oaknut::Label retry;
428
429 // Save scratches.
430 c.STP(X0, X1, SP, PRE_INDEXED, -16);
431
432 // Reload lock pointer.
433 c.l(retry);
434 c.CLREX();
435 c.MRS(X0, oaknut::SystemReg::TPIDR_EL0);
436 c.ADD(X0, X0, offsetof(NativeExecutionParameters, lock));
437
438 static_assert(SpinLockLocked == 0);
439
440 // Load-linked with acquire ordering.
441 c.LDAXR(W1, X0);
442
443 // If the value was SpinLockLocked, clear monitor and retry.
444 c.CBZ(W1, retry);
445
446 // Store-conditional SpinLockLocked with relaxed ordering.
447 c.STXR(W1, WZR, X0);
448
449 // If we failed to store, retry.
450 c.CBNZ(W1, retry);
451
452 // We succeeded! Reload scratches.
453 c.LDP(X0, X1, SP, POST_INDEXED, 16);
454}
455
456void Patcher::UnlockContext() {
457 // Save scratches.
458 c.STP(X0, X1, SP, PRE_INDEXED, -16);
459
460 // Load lock pointer.
461 c.MRS(X0, oaknut::SystemReg::TPIDR_EL0);
462 c.ADD(X0, X0, offsetof(NativeExecutionParameters, lock));
463
464 // Load SpinLockUnlocked.
465 c.MOV(W1, SpinLockUnlocked);
466
467 // Store value with release ordering.
468 c.STLR(W1, X0);
469
470 // Load scratches.
471 c.LDP(X0, X1, SP, POST_INDEXED, 16);
472}
473
474} // namespace Core::NCE