Diffstat (limited to 'src/core/arm/nce/patch.cpp')
-rw-r--r--  src/core/arm/nce/patch.cpp  472
1 file changed, 472 insertions, 0 deletions
diff --git a/src/core/arm/nce/patch.cpp b/src/core/arm/nce/patch.cpp
new file mode 100644
index 000000000..c79399c2b
--- /dev/null
+++ b/src/core/arm/nce/patch.cpp
@@ -0,0 +1,472 @@
// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later

#include "common/arm64/native_clock.h"
#include "common/bit_cast.h"
#include "common/literals.h"
#include "core/arm/nce/arm_nce.h"
#include "core/arm/nce/guest_context.h"
#include "core/arm/nce/instructions.h"
#include "core/arm/nce/patch.h"
#include "core/core.h"
#include "core/core_timing.h"
#include "core/hle/kernel/svc.h"

namespace Core::NCE {

using namespace Common::Literals;
using namespace oaknut::util;

using NativeExecutionParameters = Kernel::KThread::NativeExecutionParameters;

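// Maximum displacement reachable by an unconditional branch (B), which has a signed 26-bit
// word offset, i.e. +-128 MiB.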
constexpr size_t MaxRelativeBranch = 128_MiB;

Patcher::Patcher() : c(m_patch_instructions) {}

Patcher::~Patcher() = default;

void Patcher::PatchText(const Kernel::PhysicalMemory& program_image,
                        const Kernel::CodeSet::Segment& code) {

    // Write save context helper function.
    c.l(m_save_context);
    WriteSaveContext();

    // Write load context helper function.
    c.l(m_load_context);
    WriteLoadContext();

    // Retrieve text segment data.
    const auto text = std::span{program_image}.subspan(code.offset, code.size);
    const auto text_words =
        std::span<const u32>{reinterpret_cast<const u32*>(text.data()), text.size() / sizeof(u32)};

    // Loop through instructions, patching as needed.
    for (u32 i = 0; i < static_cast<u32>(text_words.size()); i++) {
        const u32 inst = text_words[i];

        const auto AddRelocations = [&] {
            const uintptr_t this_offset = i * sizeof(u32);
            const uintptr_t next_offset = this_offset + sizeof(u32);

            // Relocate from here to patch.
            this->BranchToPatch(this_offset);

            // Relocate from patch to next instruction.
            return next_offset;
        };
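        // Each patched instruction site is overwritten with a branch into the patch section;
        // the handler emitted there branches back to next_offset, the instruction immediately
        // following the patched one.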

        // SVC
        if (auto svc = SVC{inst}; svc.Verify()) {
            WriteSvcTrampoline(AddRelocations(), svc.GetValue());
            continue;
        }

        // MRS Xn, TPIDR_EL0
        // MRS Xn, TPIDRRO_EL0
        if (auto mrs = MRS{inst};
            mrs.Verify() && (mrs.GetSystemReg() == TpidrroEl0 || mrs.GetSystemReg() == TpidrEl0)) {
            const auto src_reg = mrs.GetSystemReg() == TpidrroEl0 ? oaknut::SystemReg::TPIDRRO_EL0
                                                                  : oaknut::SystemReg::TPIDR_EL0;
            const auto dest_reg = oaknut::XReg{static_cast<int>(mrs.GetRt())};
            WriteMrsHandler(AddRelocations(), dest_reg, src_reg);
            continue;
        }

        // MRS Xn, CNTPCT_EL0
        if (auto mrs = MRS{inst}; mrs.Verify() && mrs.GetSystemReg() == CntpctEl0) {
            WriteCntpctHandler(AddRelocations(), oaknut::XReg{static_cast<int>(mrs.GetRt())});
            continue;
        }

        // MRS Xn, CNTFRQ_EL0
        if (auto mrs = MRS{inst}; mrs.Verify() && mrs.GetSystemReg() == CntfrqEl0) {
            UNREACHABLE();
        }

        // MSR TPIDR_EL0, Xn
        if (auto msr = MSR{inst}; msr.Verify() && msr.GetSystemReg() == TpidrEl0) {
            WriteMsrHandler(AddRelocations(), oaknut::XReg{static_cast<int>(msr.GetRt())});
            continue;
        }
    }

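    // An unconditional B can only reach +-128 MiB. For images larger than that, a patch
    // section appended after the data could be out of range of the text at the start of the
    // image, so it is placed before the text instead.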
    // Determine patching mode for the final relocation step
    const size_t image_size = program_image.size();
    this->mode = image_size > MaxRelativeBranch ? PatchMode::PreText : PatchMode::PostData;
}

void Patcher::RelocateAndCopy(Common::ProcessAddress load_base,
                              const Kernel::CodeSet::Segment& code,
                              Kernel::PhysicalMemory& program_image,
                              EntryTrampolines* out_trampolines) {
    const size_t patch_size = SectionSize();
    const size_t image_size = program_image.size();

    // Retrieve text segment data.
    const auto text = std::span{program_image}.subspan(code.offset, code.size);
    const auto text_words =
        std::span<u32>{reinterpret_cast<u32*>(text.data()), text.size() / sizeof(u32)};

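    // In PreText mode the patch section is mapped at load_base and the module text follows at
    // load_base + patch_size; in PostData mode the module stays at load_base and the patch
    // section is appended at load_base + image_size. The branch relocations below are plain
    // (target - source) displacements under that layout.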
    const auto ApplyBranchToPatchRelocation = [&](u32* target, const Relocation& rel) {
        oaknut::CodeGenerator rc{target};
        if (mode == PatchMode::PreText) {
            rc.B(rel.patch_offset - patch_size - rel.module_offset);
        } else {
            rc.B(image_size - rel.module_offset + rel.patch_offset);
        }
    };

    const auto ApplyBranchToModuleRelocation = [&](u32* target, const Relocation& rel) {
        oaknut::CodeGenerator rc{target};
        if (mode == PatchMode::PreText) {
            rc.B(patch_size - rel.patch_offset + rel.module_offset);
        } else {
            rc.B(rel.module_offset - image_size - rel.patch_offset);
        }
    };

    const auto RebasePatch = [&](ptrdiff_t patch_offset) {
        if (mode == PatchMode::PreText) {
            return GetInteger(load_base) + patch_offset;
        } else {
            return GetInteger(load_base) + image_size + patch_offset;
        }
    };

    const auto RebasePc = [&](uintptr_t module_offset) {
        if (mode == PatchMode::PreText) {
            return GetInteger(load_base) + patch_size + module_offset;
        } else {
            return GetInteger(load_base) + module_offset;
        }
    };

    // We are now ready to relocate!
    for (const Relocation& rel : m_branch_to_patch_relocations) {
        ApplyBranchToPatchRelocation(text_words.data() + rel.module_offset / sizeof(u32), rel);
    }
    for (const Relocation& rel : m_branch_to_module_relocations) {
        ApplyBranchToModuleRelocation(m_patch_instructions.data() + rel.patch_offset / sizeof(u32),
                                      rel);
    }

    // Rewrite PC constants and record post trampolines
    for (const Relocation& rel : m_write_module_pc_relocations) {
        oaknut::CodeGenerator rc{m_patch_instructions.data() + rel.patch_offset / sizeof(u32)};
        rc.dx(RebasePc(rel.module_offset));
    }
    for (const Trampoline& rel : m_trampolines) {
        out_trampolines->insert({RebasePc(rel.module_offset), RebasePatch(rel.patch_offset)});
    }

    // Cortex-A57 seems to treat all exclusives as ordered, but newer processors do not.
    // Convert to ordered to preserve this assumption.
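    // (i.e. exclusives are rewritten to their acquire/release forms, e.g. LDXR/STXR -> LDAXR/STLXR)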
    for (u32 i = 0; i < static_cast<u32>(text_words.size()); i++) {
        const u32 inst = text_words[i];
        if (auto exclusive = Exclusive{inst}; exclusive.Verify()) {
            text_words[i] = exclusive.AsOrdered();
        }
    }

    // Copy to program image
    if (this->mode == PatchMode::PreText) {
        std::memcpy(program_image.data(), m_patch_instructions.data(),
                    m_patch_instructions.size() * sizeof(u32));
    } else {
        program_image.resize(image_size + patch_size);
        std::memcpy(program_image.data() + image_size, m_patch_instructions.data(),
                    m_patch_instructions.size() * sizeof(u32));
    }
}

size_t Patcher::SectionSize() const noexcept {
    return Common::AlignUp(m_patch_instructions.size() * sizeof(u32), Core::Memory::YUZU_PAGESIZE);
}

void Patcher::WriteLoadContext() {
    // This function was called, which modifies X30, so use that as a scratch register.
    // SP contains the guest X30, so save our return X30 to SP + 8, since we have allocated 16 bytes
    // of stack.
    c.STR(X30, SP, 8);
    c.MRS(X30, oaknut::SystemReg::TPIDR_EL0);
    c.LDR(X30, X30, offsetof(NativeExecutionParameters, native_context));

    // Load system registers.
    c.LDR(W0, X30, offsetof(GuestContext, fpsr));
    c.MSR(oaknut::SystemReg::FPSR, X0);
    c.LDR(W0, X30, offsetof(GuestContext, fpcr));
    c.MSR(oaknut::SystemReg::FPCR, X0);
    c.LDR(W0, X30, offsetof(GuestContext, nzcv));
    c.MSR(oaknut::SystemReg::NZCV, X0);

    // Load all vector registers.
    static constexpr size_t VEC_OFF = offsetof(GuestContext, vector_registers);
    for (int i = 0; i <= 30; i += 2) {
        c.LDP(oaknut::QReg{i}, oaknut::QReg{i + 1}, X30, VEC_OFF + 16 * i);
    }

    // Load all general-purpose registers except X30.
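    // (The bare 8 * i offsets assume GuestContext::cpu_registers is the struct's first member,
    // at offset 0; WriteSaveContext addresses the X30 slot the same way with 8 * 30.)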
    for (int i = 0; i <= 28; i += 2) {
        c.LDP(oaknut::XReg{i}, oaknut::XReg{i + 1}, X30, 8 * i);
    }

    // Reload our return X30 from the stack and return.
    // The patch code will reload the guest X30 for us.
    c.LDR(X30, SP, 8);
    c.RET();
}

void Patcher::WriteSaveContext() {
    // This function was called, which modifies X30, so use that as a scratch register.
    // SP contains the guest X30, so save our X30 to SP + 8, since we have allocated 16 bytes of
    // stack.
    c.STR(X30, SP, 8);
    c.MRS(X30, oaknut::SystemReg::TPIDR_EL0);
    c.LDR(X30, X30, offsetof(NativeExecutionParameters, native_context));

    // Store all general-purpose registers except X30.
    for (int i = 0; i <= 28; i += 2) {
        c.STP(oaknut::XReg{i}, oaknut::XReg{i + 1}, X30, 8 * i);
    }

    // Store all vector registers.
    static constexpr size_t VEC_OFF = offsetof(GuestContext, vector_registers);
    for (int i = 0; i <= 30; i += 2) {
        c.STP(oaknut::QReg{i}, oaknut::QReg{i + 1}, X30, VEC_OFF + 16 * i);
    }

    // Store guest system registers, X30 and SP, using X0 as a scratch register.
    c.STR(X0, SP, PRE_INDEXED, -16);
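    // After the push above, SP + 16 holds the guest X30 that the patch trampoline saved,
    // and SP + 32 is the guest SP from before the two 16-byte stack frames were allocated.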
    c.LDR(X0, SP, 16);
    c.STR(X0, X30, 8 * 30);
    c.ADD(X0, SP, 32);
    c.STR(X0, X30, offsetof(GuestContext, sp));
    c.MRS(X0, oaknut::SystemReg::FPSR);
    c.STR(W0, X30, offsetof(GuestContext, fpsr));
    c.MRS(X0, oaknut::SystemReg::FPCR);
    c.STR(W0, X30, offsetof(GuestContext, fpcr));
    c.MRS(X0, oaknut::SystemReg::NZCV);
    c.STR(W0, X30, offsetof(GuestContext, nzcv));
    c.LDR(X0, SP, POST_INDEXED, 16);

    // Reload our return X30 from the stack, and return.
    c.LDR(X30, SP, 8);
    c.RET();
}

void Patcher::WriteSvcTrampoline(ModuleDestLabel module_dest, u32 svc_id) {
    LOG_DEBUG(Core_ARM, "Patching SVC {:#x} at module offset {:#x}", svc_id, module_dest - 4);
    // We are about to start saving state, so we need to lock the context.
    this->LockContext();

    // Store guest X30 to the stack. Then, save the context and restore the stack.
    // This will save all registers except PC, but we know PC at patch time.
    c.STR(X30, SP, PRE_INDEXED, -16);
    c.BL(m_save_context);
    c.LDR(X30, SP, POST_INDEXED, 16);

    // Now that we've saved all registers, we can use any registers as scratch.
    // Store PC + 4 to arm interface, since we know the instruction offset from the entry point.
    oaknut::Label pc_after_svc;
    c.MRS(X1, oaknut::SystemReg::TPIDR_EL0);
    c.LDR(X1, X1, offsetof(NativeExecutionParameters, native_context));
    c.LDR(X2, pc_after_svc);
    c.STR(X2, X1, offsetof(GuestContext, pc));

    // Store SVC number to execute when we return
    c.MOV(X2, svc_id);
    c.STR(W2, X1, offsetof(GuestContext, svc_swi));

    // We are calling a SVC. Clear esr_el1 and return it.
    static_assert(std::is_same_v<std::underlying_type_t<HaltReason>, u64>);
    oaknut::Label retry;
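    // Atomically exchange esr_el1 with zero: LDAXR reads the pending halt flags and STLXR
    // publishes the cleared value, retrying if the exclusive store fails.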
    c.ADD(X2, X1, offsetof(GuestContext, esr_el1));
    c.l(retry);
    c.LDAXR(X0, X2);
    c.STLXR(W3, XZR, X2);
    c.CBNZ(W3, retry);

    // Add "calling SVC" flag. Since this is X0, this is now our return value.
    c.ORR(X0, X0, static_cast<u64>(HaltReason::SupervisorCall));

    // Offset the GuestContext pointer to the HostContext member.
    // STP has limited range of [-512, 504] which we can't reach otherwise
    // NB: Due to this all offsets below are from the start of HostContext.
    c.ADD(X1, X1, offsetof(GuestContext, host_ctx));

    // Reload host TPIDR_EL0 and SP.
    static_assert(offsetof(HostContext, host_sp) + 8 == offsetof(HostContext, host_tpidr_el0));
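    // (This guarantees the single LDP below loads host_sp into X2 and host_tpidr_el0 into X3.)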
    c.LDP(X2, X3, X1, offsetof(HostContext, host_sp));
    c.MOV(SP, X2);
    c.MSR(oaknut::SystemReg::TPIDR_EL0, X3);

    // Load callee-saved host registers and return to host.
    static constexpr size_t HOST_REGS_OFF = offsetof(HostContext, host_saved_regs);
    static constexpr size_t HOST_VREGS_OFF = offsetof(HostContext, host_saved_vregs);
    c.LDP(X19, X20, X1, HOST_REGS_OFF);
    c.LDP(X21, X22, X1, HOST_REGS_OFF + 2 * sizeof(u64));
    c.LDP(X23, X24, X1, HOST_REGS_OFF + 4 * sizeof(u64));
    c.LDP(X25, X26, X1, HOST_REGS_OFF + 6 * sizeof(u64));
    c.LDP(X27, X28, X1, HOST_REGS_OFF + 8 * sizeof(u64));
    c.LDP(X29, X30, X1, HOST_REGS_OFF + 10 * sizeof(u64));
    c.LDP(Q8, Q9, X1, HOST_VREGS_OFF);
    c.LDP(Q10, Q11, X1, HOST_VREGS_OFF + 2 * sizeof(u128));
    c.LDP(Q12, Q13, X1, HOST_VREGS_OFF + 4 * sizeof(u128));
    c.LDP(Q14, Q15, X1, HOST_VREGS_OFF + 6 * sizeof(u128));
    c.RET();

    // Write the post-SVC trampoline address, which will jump back to the guest after restoring its
    // state.
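    // (RelocateAndCopy later rebases this {patch offset, module offset} pair into an absolute
    // guest PC -> patch address entry in the entry trampoline map.)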
    m_trampolines.push_back({c.offset(), module_dest});

    // Host called this location. Save the return address so we can
    // unwind the stack properly when jumping back.
    c.MRS(X2, oaknut::SystemReg::TPIDR_EL0);
    c.LDR(X2, X2, offsetof(NativeExecutionParameters, native_context));
    c.ADD(X0, X2, offsetof(GuestContext, host_ctx));
    c.STR(X30, X0, offsetof(HostContext, host_saved_regs) + 11 * sizeof(u64));

    // Reload all guest registers except X30 and PC.
    // The function also expects 16 bytes of stack already allocated.
    c.STR(X30, SP, PRE_INDEXED, -16);
    c.BL(m_load_context);
    c.LDR(X30, SP, POST_INDEXED, 16);

    // Use X1 as a scratch register to restore X30.
    c.STR(X1, SP, PRE_INDEXED, -16);
    c.MRS(X1, oaknut::SystemReg::TPIDR_EL0);
    c.LDR(X1, X1, offsetof(NativeExecutionParameters, native_context));
    c.LDR(X30, X1, offsetof(GuestContext, cpu_registers) + sizeof(u64) * 30);
    c.LDR(X1, SP, POST_INDEXED, 16);

    // Unlock the context.
    this->UnlockContext();

    // Jump back to the instruction after the emulated SVC.
    this->BranchToModule(module_dest);

    // Store PC after call.
    c.l(pc_after_svc);
    this->WriteModulePc(module_dest);
}

void Patcher::WriteMrsHandler(ModuleDestLabel module_dest, oaknut::XReg dest_reg,
                              oaknut::SystemReg src_reg) {
    // Retrieve the emulated TLS register value from NativeExecutionParameters.
    c.MRS(dest_reg, oaknut::SystemReg::TPIDR_EL0);
    if (src_reg == oaknut::SystemReg::TPIDRRO_EL0) {
        c.LDR(dest_reg, dest_reg, offsetof(NativeExecutionParameters, tpidrro_el0));
    } else {
        c.LDR(dest_reg, dest_reg, offsetof(NativeExecutionParameters, tpidr_el0));
    }

    // Jump back to the instruction after the emulated MRS.
    this->BranchToModule(module_dest);
}

void Patcher::WriteMsrHandler(ModuleDestLabel module_dest, oaknut::XReg src_reg) {
    const auto scratch_reg = src_reg.index() == 0 ? X1 : X0;
    c.STR(scratch_reg, SP, PRE_INDEXED, -16);

    // Save guest value to NativeExecutionParameters::tpidr_el0.
    c.MRS(scratch_reg, oaknut::SystemReg::TPIDR_EL0);
    c.STR(src_reg, scratch_reg, offsetof(NativeExecutionParameters, tpidr_el0));

    // Restore scratch register.
    c.LDR(scratch_reg, SP, POST_INDEXED, 16);

    // Jump back to the instruction after the emulated MSR.
    this->BranchToModule(module_dest);
}

void Patcher::WriteCntpctHandler(ModuleDestLabel module_dest, oaknut::XReg dest_reg) {
    static Common::Arm64::NativeClock clock{};
    const auto factor = clock.GetGuestCNTFRQFactor();
    const auto raw_factor = Common::BitCast<std::array<u64, 2>>(factor);

    const auto use_x2_x3 = dest_reg.index() == 0 || dest_reg.index() == 1;
    oaknut::XReg scratch0 = use_x2_x3 ? X2 : X0;
    oaknut::XReg scratch1 = use_x2_x3 ? X3 : X1;

    oaknut::Label factorlo;
    oaknut::Label factorhi;

    // Save scratches.
    c.STP(scratch0, scratch1, SP, PRE_INDEXED, -16);

    // Load counter value.
    c.MRS(dest_reg, oaknut::SystemReg::CNTVCT_EL0);

    // Load scaling factor.
    c.LDR(scratch0, factorlo);
    c.LDR(scratch1, factorhi);

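    // The two instructions below compute the upper 64 bits of the 128-bit product
    // counter * factor: UMULH gives high64(counter * factor_lo), and MADD adds
    // counter * factor_hi, scaling the host counter into the guest CNTPCT value.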
    // Multiply low bits and get result.
    c.UMULH(scratch0, dest_reg, scratch0);

    // Multiply high bits and add low bit result.
    c.MADD(dest_reg, dest_reg, scratch1, scratch0);

    // Reload scratches.
    c.LDP(scratch0, scratch1, SP, POST_INDEXED, 16);

    // Jump back to the instruction after the emulated MRS.
    this->BranchToModule(module_dest);

    // Scaling factor constant values.
    c.l(factorlo);
    c.dx(raw_factor[0]);
    c.l(factorhi);
    c.dx(raw_factor[1]);
}

void Patcher::LockContext() {
    oaknut::Label retry;

    // Save scratches.
    c.STP(X0, X1, SP, PRE_INDEXED, -16);

    // Reload lock pointer.
    c.l(retry);
    c.CLREX();
    c.MRS(X0, oaknut::SystemReg::TPIDR_EL0);
    c.ADD(X0, X0, offsetof(NativeExecutionParameters, lock));

    static_assert(SpinLockLocked == 0);

    // Load-linked with acquire ordering.
    c.LDAXR(W1, X0);

    // If the value was SpinLockLocked, clear monitor and retry.
    c.CBZ(W1, retry);

    // Store-conditional SpinLockLocked with relaxed ordering.
    c.STXR(W1, WZR, X0);

    // If we failed to store, retry.
    c.CBNZ(W1, retry);

    // We succeeded! Reload scratches.
    c.LDP(X0, X1, SP, POST_INDEXED, 16);
}

void Patcher::UnlockContext() {
    // Save scratches.
    c.STP(X0, X1, SP, PRE_INDEXED, -16);

    // Load lock pointer.
    c.MRS(X0, oaknut::SystemReg::TPIDR_EL0);
    c.ADD(X0, X0, offsetof(NativeExecutionParameters, lock));

    // Load SpinLockUnlocked.
    c.MOV(W1, SpinLockUnlocked);

    // Store value with release ordering.
    c.STLR(W1, X0);

    // Load scratches.
    c.LDP(X0, X1, SP, POST_INDEXED, 16);
}

} // namespace Core::NCE