From 777b674edfe317965018bfb837ce76fee0378667 Mon Sep 17 00:00:00 2001
From: MrPurple666
Date: Tue, 1 Apr 2025 03:42:09 -0300
Subject: [PATCH] WIP: DO-NOT-MERGE: NCE experiments: Some optimizations on pre-fetch and cache

---
 src/core/arm/nce/arm_nce.cpp             | 24 +++++++++++--
 src/core/arm/nce/interpreter_visitor.cpp | 43 +++++++++++++++++-------
 2 files changed, 53 insertions(+), 14 deletions(-)

diff --git a/src/core/arm/nce/arm_nce.cpp b/src/core/arm/nce/arm_nce.cpp
index c9bcf37642..76b476e672 100644
--- a/src/core/arm/nce/arm_nce.cpp
+++ b/src/core/arm/nce/arm_nce.cpp
@@ -384,18 +384,38 @@ void ArmNce::SignalInterrupt(Kernel::KThread* thread) {
 
 void ArmNce::ClearInstructionCache() {
 #if defined(__GNUC__) || defined(__clang__)
-    void* start = (void*)((uintptr_t)__builtin_return_address(0) & ~(uintptr_t)0xFFF);
-    void* end = (void*)((uintptr_t)start + 0x1000);
+    const size_t PAGE_SIZE = 4096;
+    void* start = (void*)((uintptr_t)__builtin_return_address(0) & ~(PAGE_SIZE - 1));
+    void* end = (void*)((uintptr_t)start + PAGE_SIZE * 2); // Clear two pages for better coverage
+
+    // Prefetch the start of the page following the cleared range
+    __builtin_prefetch((void*)((uintptr_t)end), 1, 3);
     __builtin___clear_cache(static_cast<char*>(start), static_cast<char*>(end));
 #endif
 #ifdef __aarch64__
+    // Ensure all previous memory operations complete
+    asm volatile("dmb ish" ::: "memory");
     asm volatile("dsb ish" ::: "memory");
     asm volatile("isb" ::: "memory");
 #endif
 }
 
 void ArmNce::InvalidateCacheRange(u64 addr, std::size_t size) {
+#if defined(__GNUC__) || defined(__clang__)
+    // Align the start address down to a cache-line boundary for better performance
+    const size_t CACHE_LINE_SIZE = 64;
+    addr &= ~(CACHE_LINE_SIZE - 1);
+
+    // Round the size up to the nearest whole cache line
+    size = (size + CACHE_LINE_SIZE - 1) & ~(CACHE_LINE_SIZE - 1);
+
+    // Prefetch the range to be invalidated
+    for (size_t offset = 0; offset < size; offset += CACHE_LINE_SIZE) {
+        __builtin_prefetch((void*)(addr + offset), 1, 3);
+    }
+#endif
+
     this->ClearInstructionCache();
 }
 
diff --git a/src/core/arm/nce/interpreter_visitor.cpp b/src/core/arm/nce/interpreter_visitor.cpp
index 0c174cfbfb..31ab7735d2 100644
--- a/src/core/arm/nce/interpreter_visitor.cpp
+++ b/src/core/arm/nce/interpreter_visitor.cpp
@@ -422,28 +422,39 @@ bool InterpreterVisitor::RegisterImmediate(bool wback, bool postindex, size_t sc
         signed_ = true;
     }
 
-    if (memop == MemOp::Load && wback && Rn == Rt && Rn != Reg::R31) {
-        // Unpredictable instruction
-        return false;
-    }
-    if (memop == MemOp::Store && wback && Rn == Rt && Rn != Reg::R31) {
+    if ((memop == MemOp::Load || memop == MemOp::Store) && wback && Rn == Rt && Rn != Reg::R31) {
         // Unpredictable instruction
         return false;
     }
 
-    u64 address;
+    // alignas(8) is redundant for u64 (already 8-byte aligned) but documents the intent
+    alignas(8) u64 address;
     if (Rn == Reg::SP) {
         address = this->GetSp();
     } else {
         address = this->GetReg(Rn);
     }
+
+    // Pre-index addressing
     if (!postindex) {
         address += offset;
    }
 
-    // Add early prefetch hint for loads
-    if (memop == MemOp::Load && (address % 8) == 0) {
-        __builtin_prefetch((void*)address, 0, 3);
+    // Alignment check shared by the prefetch tiers below
+    const bool is_aligned = (address % 8) == 0;
+
+    // Enhanced prefetching for loads with aligned addresses
+    if (memop == MemOp::Load) {
+        const size_t CACHE_LINE_SIZE = 64;
+        if ((address % 16) == 0) {
+            __builtin_prefetch((void*)address, 0, 3);
+            __builtin_prefetch((void*)(address + CACHE_LINE_SIZE), 0, 3);
+            if ((8 << scale) >= 32) { // datasize; it is not declared until after this block
+                __builtin_prefetch((void*)(address + CACHE_LINE_SIZE * 2), 0, 2);
+            }
+        } else if (is_aligned) {
+            __builtin_prefetch((void*)address, 0, 2);
+        }
     }
 
     const size_t datasize = 8 << scale;
@@ -526,9 +537,17 @@ bool InterpreterVisitor::SIMDImmediate(bool wback, bool postindex, size_t scale,
         address += offset;
     }
 
-    // Prefetch for SIMD loads
-    if (memop == MemOp::Load && (address % 16) == 0) {
-        __builtin_prefetch((void*)(address + datasize), 0, 3);
+    // Enhanced prefetching for SIMD loads
+    if (memop == MemOp::Load) {
+        if ((address % 32) == 0) {
+            // Aggressive prefetch for well-aligned SIMD operations
+            __builtin_prefetch((void*)address, 0, 3);
+            __builtin_prefetch((void*)(address + 32), 0, 3);
+            __builtin_prefetch((void*)(address + 64), 0, 2);
+        } else if ((address % 16) == 0) {
+            __builtin_prefetch((void*)address, 0, 3);
+            __builtin_prefetch((void*)(address + datasize), 0, 2);
+        }
     }
 
     switch (memop) {
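-- 
Reviewer notes, not part of the patch. Everything below the signature
separator is ignored by git am, so it does not affect applying the patch,
and all code here is illustrative sketch, not the project's API.

1) InvalidateCacheRange relies on the standard power-of-two mask trick:
clearing the low bits aligns an address down, and adding (line - 1) before
masking rounds a size up. A minimal standalone sketch with hypothetical
helper names; the 64-byte line size is the patch's hard-coded assumption,
and real line sizes vary by CPU:

    #include <cstdint>
    #include <cstdio>

    constexpr std::uint64_t kCacheLine = 64; // must be a power of two

    // Align down: clear the low log2(64) = 6 bits.
    constexpr std::uint64_t AlignDown(std::uint64_t addr) {
        return addr & ~(kCacheLine - 1);
    }

    // Round up: bump just past the boundary, then align down.
    constexpr std::uint64_t RoundUp(std::uint64_t size) {
        return (size + kCacheLine - 1) & ~(kCacheLine - 1);
    }

    int main() {
        // 0x1234 aligns down to 0x1200; a 100-byte range grows to 128 bytes.
        std::printf("%llx %llu\n",
                    static_cast<unsigned long long>(AlignDown(0x1234)),
                    static_cast<unsigned long long>(RoundUp(100)));
    }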
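2) Both interpreter hunks use __builtin_prefetch(addr, rw, locality), a
GCC/Clang builtin: rw is 0 for an expected read and 1 for a write, and
locality 0-3 ranks expected temporal reuse (3 = keep in all cache levels,
0 = no reuse). The patch scales prefetch depth with alignment, on the
theory that more-aligned guest addresses tend to begin larger accesses.
A hedged sketch of that tiering outside the emulator, using host pointers
and an invented function name:

    #include <cstdint>

    // Tiered read-prefetch mirroring the patch's heuristic: stronger
    // alignment triggers deeper prefetch with a higher locality hint.
    inline void PrefetchForLoad(std::uintptr_t address) {
        constexpr std::uintptr_t kCacheLine = 64;
        if (address % 16 == 0) {
            __builtin_prefetch(reinterpret_cast<const void*>(address), 0, 3);
            __builtin_prefetch(reinterpret_cast<const void*>(address + kCacheLine), 0, 3);
        } else if (address % 8 == 0) {
            __builtin_prefetch(reinterpret_cast<const void*>(address), 0, 2);
        }
        // Unaligned addresses get no hint: the access is likely small or rare.
    }

Prefetches are hints only; a wrong guess costs bandwidth, never
correctness, which is why a WIP patch can afford to be speculative here.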
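3) On the cache-maintenance side: on AArch64, __builtin___clear_cache
already performs the required dc/ic maintenance and ends with dsb ish and
isb, so the explicit barriers in ClearInstructionCache are defensive
rather than required, and the added dmb ish in particular is subsumed by
the dsb ish that follows it. A minimal sketch of the canonical
"make written code executable" sequence, as a hypothetical wrapper:

    // Canonical AArch64 flush for a JIT-written region (sketch).
    inline void FlushICache(char* begin, char* end) {
    #if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__))
        __builtin___clear_cache(begin, end);   // dc cvau / ic ivau per line
        asm volatile("dsb ish" ::: "memory");  // wait for maintenance to finish
        asm volatile("isb" ::: "memory");      // refetch instructions
    #endif
    }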