WIP: DO-NOT-MERGE: NCE experiments
botw boots but with poor performance
parent 4957950b55
commit d621707001

2 changed files with 47 additions and 13 deletions
@@ -196,16 +196,23 @@ HaltReason ArmNce::RunThread(Kernel::KThread* thread) {
         return hr;
     }
 
-    // Get the thread context.
+    // Pre-fetch thread context data to improve cache locality
     auto* thread_params = &thread->GetNativeExecutionParameters();
     auto* process = thread->GetOwnerProcess();
 
-    // Assign current members.
+    // Move non-critical operations outside the locked section
+    const u64 tpidr_el0_cache = m_guest_ctx.tpidr_el0;
+    const u64 tpidrro_el0_cache = m_guest_ctx.tpidrro_el0;
+
+    // Critical section begins - minimize operations here
     m_running_thread = thread;
     m_guest_ctx.parent = this;
     thread_params->native_context = &m_guest_ctx;
-    thread_params->tpidr_el0 = m_guest_ctx.tpidr_el0;
-    thread_params->tpidrro_el0 = m_guest_ctx.tpidrro_el0;
+    thread_params->tpidr_el0 = tpidr_el0_cache;
+    thread_params->tpidrro_el0 = tpidrro_el0_cache;
+
+    // Memory barrier to ensure visibility of changes
+    std::atomic_thread_fence(std::memory_order_release);
     thread_params->is_running = true;
 
     // TODO: finding and creating the post handler needs to be locked
@@ -217,12 +224,19 @@ HaltReason ArmNce::RunThread(Kernel::KThread* thread) {
         hr = ReturnToRunCodeByExceptionLevelChange(m_thread_id, thread_params);
     }
 
-    // Unload members.
-    // The thread does not change, so we can persist the old reference.
-    m_running_thread = nullptr;
-    m_guest_ctx.tpidr_el0 = thread_params->tpidr_el0;
-    thread_params->native_context = nullptr;
+    // Critical section for thread cleanup
+    std::atomic_thread_fence(std::memory_order_acquire);
+
+    // Cache values before releasing thread
+    const u64 final_tpidr_el0 = thread_params->tpidr_el0;
+
+    // Minimize critical section
     thread_params->is_running = false;
+    thread_params->native_context = nullptr;
+    m_running_thread = nullptr;
+
+    // Non-critical updates can happen after releasing the thread
+    m_guest_ctx.tpidr_el0 = final_tpidr_el0;
 
     // Return the halt reason.
     return hr;
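Note: the two RunThread hunks above pair a release fence (issued before `thread_params->is_running = true`) with an acquire fence at the start of cleanup. A minimal sketch of that fence-pairing pattern, using hypothetical stand-in types (`is_running` is modelled here as a `std::atomic<bool>`; the real NCE parameter block may use a plain field):

    #include <atomic>
    #include <cstdint>
    #include <thread>

    struct ExecutionParams {                 // stand-in for the NCE thread parameter block
        std::uint64_t tpidr_el0 = 0;
        std::atomic<bool> is_running{false};
    };

    int main() {
        ExecutionParams params;

        std::thread publisher([&] {
            params.tpidr_el0 = 0x1234;                                 // plain write
            std::atomic_thread_fence(std::memory_order_release);       // publish it
            params.is_running.store(true, std::memory_order_relaxed);  // flag set after the fence
        });

        while (!params.is_running.load(std::memory_order_relaxed)) {
            std::this_thread::yield();
        }
        std::atomic_thread_fence(std::memory_order_acquire);  // pairs with the release fence
        // Here the earlier write to tpidr_el0 is guaranteed to be visible.
        publisher.join();
    }
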
@@ -366,11 +380,21 @@ void ArmNce::SignalInterrupt(Kernel::KThread* thread) {
 }
 
 void ArmNce::ClearInstructionCache() {
-    // TODO: This is not possible to implement correctly on Linux because
-    // we do not have any access to ic iallu.
+    // Implement efficient cache clearing using compiler built-ins
+#if defined(__GNUC__) || defined(__clang__)
+    // Get current program counter
+    void* start = (void*)((uintptr_t)__builtin_return_address(0) & ~(uintptr_t)0xFFF);
+    void* end = (void*)((uintptr_t)start + 0x1000); // Clear one page
+    __builtin___clear_cache(static_cast<char*>(start), static_cast<char*>(end));
+#endif
 
-    // Require accesses to complete.
-    std::atomic_thread_fence(std::memory_order_seq_cst);
+    // Ensure memory accesses are complete before clearing cache
+    std::atomic_thread_fence(std::memory_order_release);
+
+#ifdef __aarch64__
+    asm volatile("dsb ish");
+    asm volatile("isb");
+#endif
 }
 
 void ArmNce::InvalidateCacheRange(u64 addr, std::size_t size) {
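Note: `__builtin___clear_cache(begin, end)` is the GCC/Clang builtin that performs the data-cache clean and instruction-cache invalidate needed after writing code into a virtual address range, and `dsb ish` / `isb` complete the maintenance and resynchronize the instruction stream. The hunk above only clears the single page around `__builtin_return_address(0)`; a sketch of flushing an explicit range instead (the function and parameter names here are hypothetical, not part of the diff):

    #include <cstddef>

    void FlushCodeRange(void* begin, std::size_t length) {
    #if defined(__GNUC__) || defined(__clang__)
        char* start = static_cast<char*>(begin);
        __builtin___clear_cache(start, start + length);
    #endif
    #ifdef __aarch64__
        asm volatile("dsb ish" ::: "memory");  // wait for cache maintenance to finish
        asm volatile("isb" ::: "memory");      // refetch instructions after invalidation
    #endif
    }
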
@@ -441,6 +441,11 @@ bool InterpreterVisitor::RegisterImmediate(bool wback, bool postindex, size_t scale,
         address += offset;
     }
 
+    // Add early prefetch hint for loads
+    if (memop == MemOp::Load && (address % 8) == 0) {
+        __builtin_prefetch((void*)address, 0, 3);
+    }
+
     const size_t datasize = 8 << scale;
     switch (memop) {
     case MemOp::Store: {
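Note: `__builtin_prefetch(addr, 0, 3)` requests a read prefetch (second argument 0) with high temporal locality (third argument 3); on AArch64 it typically lowers to a `prfm` instruction and compiles to nothing where unsupported. A small self-contained illustration of the same builtin with a hypothetical lookahead loop (not code from this diff):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    std::uint64_t SumWithPrefetch(const std::vector<std::uint64_t>& data) {
        constexpr std::size_t kLookahead = 8;  // assumed prefetch distance, in elements
        std::uint64_t sum = 0;
        for (std::size_t i = 0; i < data.size(); ++i) {
            if (i + kLookahead < data.size()) {
                __builtin_prefetch(&data[i + kLookahead], /*rw=*/0, /*locality=*/3);
            }
            sum += data[i];
        }
        return sum;
    }
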
@@ -521,6 +526,11 @@ bool InterpreterVisitor::SIMDImmediate(bool wback, bool postindex, size_t scale,
         address += offset;
     }
 
+    // Prefetch for SIMD loads
+    if (memop == MemOp::Load && (address % 16) == 0) {
+        __builtin_prefetch((void*)(address + datasize), 0, 3);
+    }
+
     switch (memop) {
     case MemOp::Store: {
         u128 data = VectorGetElement(this->GetVec(Vt), datasize);