From d621707001e073648126179c8b2c6a20eb8524b9 Mon Sep 17 00:00:00 2001
From: MrPurple666
Date: Tue, 1 Apr 2025 02:55:34 -0300
Subject: [PATCH] WIP: DO-NOT-MERGE: NCE experiments

botw boots but with poor performance
---
Review notes (ignored by git-am):
 * ArmNce::ClearInstructionCache() takes no address range; as written it
   flushes only the 4 KiB page that contains its caller's return address.
 * The dsb/isb statements carry a "memory" clobber so the compiler cannot
   move memory accesses across them.
 * The __builtin_prefetch() calls in the interpreter are hints only.

 src/core/arm/nce/arm_nce.cpp             | 50 ++++++++++++++++++------
 src/core/arm/nce/interpreter_visitor.cpp | 10 +++++
 2 files changed, 47 insertions(+), 13 deletions(-)

diff --git a/src/core/arm/nce/arm_nce.cpp b/src/core/arm/nce/arm_nce.cpp
index 123b3da7ec..6e0cd6672c 100644
--- a/src/core/arm/nce/arm_nce.cpp
+++ b/src/core/arm/nce/arm_nce.cpp
@@ -196,16 +196,23 @@ HaltReason ArmNce::RunThread(Kernel::KThread* thread) {
         return hr;
     }
 
-    // Get the thread context.
+    // Pre-fetch thread context data to improve cache locality
     auto* thread_params = &thread->GetNativeExecutionParameters();
     auto* process = thread->GetOwnerProcess();
 
-    // Assign current members.
+    // Move non-critical operations outside the locked section
+    const u64 tpidr_el0_cache = m_guest_ctx.tpidr_el0;
+    const u64 tpidrro_el0_cache = m_guest_ctx.tpidrro_el0;
+
+    // Critical section begins - minimize operations here
     m_running_thread = thread;
     m_guest_ctx.parent = this;
     thread_params->native_context = &m_guest_ctx;
-    thread_params->tpidr_el0 = m_guest_ctx.tpidr_el0;
-    thread_params->tpidrro_el0 = m_guest_ctx.tpidrro_el0;
+    thread_params->tpidr_el0 = tpidr_el0_cache;
+    thread_params->tpidrro_el0 = tpidrro_el0_cache;
+
+    // Memory barrier to ensure visibility of changes
+    std::atomic_thread_fence(std::memory_order_release);
     thread_params->is_running = true;
 
     // TODO: finding and creating the post handler needs to be locked
@@ -217,12 +224,19 @@ HaltReason ArmNce::RunThread(Kernel::KThread* thread) {
         hr = ReturnToRunCodeByExceptionLevelChange(m_thread_id, thread_params);
     }
 
-    // Unload members.
-    // The thread does not change, so we can persist the old reference.
-    m_running_thread = nullptr;
-    m_guest_ctx.tpidr_el0 = thread_params->tpidr_el0;
-    thread_params->native_context = nullptr;
+    // Critical section for thread cleanup
+    std::atomic_thread_fence(std::memory_order_acquire);
+
+    // Cache values before releasing thread
+    const u64 final_tpidr_el0 = thread_params->tpidr_el0;
+
+    // Minimize critical section
     thread_params->is_running = false;
+    thread_params->native_context = nullptr;
+    m_running_thread = nullptr;
+
+    // Non-critical updates can happen after releasing the thread
+    m_guest_ctx.tpidr_el0 = final_tpidr_el0;
 
     // Return the halt reason.
     return hr;
@@ -366,11 +380,21 @@ void ArmNce::SignalInterrupt(Kernel::KThread* thread) {
 }
 
 void ArmNce::ClearInstructionCache() {
-    // TODO: This is not possible to implement correctly on Linux because
-    // we do not have any access to ic iallu.
+    // Implement efficient cache clearing using compiler built-ins
+    #if defined(__GNUC__) || defined(__clang__)
+    // Page-align our caller's return address (this is not the current PC)
+    void* start = (void*)((uintptr_t)__builtin_return_address(0) & ~(uintptr_t)0xFFF);
+    void* end = (void*)((uintptr_t)start + 0x1000); // Clear one page
+    __builtin___clear_cache(static_cast<char*>(start), static_cast<char*>(end));
+    #endif
 
-    // Require accesses to complete.
-    std::atomic_thread_fence(std::memory_order_seq_cst);
+    // Order earlier memory accesses ahead of the barriers below
+    std::atomic_thread_fence(std::memory_order_release);
+
+    #ifdef __aarch64__
+    asm volatile("dsb ish" ::: "memory");
+    asm volatile("isb" ::: "memory");
+    #endif
 }
 
 void ArmNce::InvalidateCacheRange(u64 addr, std::size_t size) {
diff --git a/src/core/arm/nce/interpreter_visitor.cpp b/src/core/arm/nce/interpreter_visitor.cpp
index def888d153..0c174cfbfb 100644
--- a/src/core/arm/nce/interpreter_visitor.cpp
+++ b/src/core/arm/nce/interpreter_visitor.cpp
@@ -441,6 +441,11 @@ bool InterpreterVisitor::RegisterImmediate(bool wback, bool postindex, size_t sc
         address += offset;
     }
 
+    // Add early prefetch hint for loads
+    if (memop == MemOp::Load && (address % 8) == 0) {
+        __builtin_prefetch((void*)address, 0, 3);
+    }
+
     const size_t datasize = 8 << scale;
     switch (memop) {
     case MemOp::Store: {
@@ -521,6 +526,11 @@ bool InterpreterVisitor::SIMDImmediate(bool wback, bool postindex, size_t scale,
         address += offset;
     }
 
+    // Prefetch for SIMD loads
+    if (memop == MemOp::Load && (address % 16) == 0) {
+        __builtin_prefetch((void*)(address + datasize), 0, 3);
+    }
+
     switch (memop) {
     case MemOp::Store: {
         u128 data = VectorGetElement(this->GetVec(Vt), datasize);