WIP: DO-NOT-MERGE: NCE experiments: Some optimizations on pre-fetch and cache

MrPurple666 2025-04-01 03:42:09 -03:00
parent 6e6fae364f
commit 3cca503f7a
2 changed files with 53 additions and 14 deletions


@@ -384,18 +384,38 @@ void ArmNce::SignalInterrupt(Kernel::KThread* thread) {
void ArmNce::ClearInstructionCache() {
#if defined(__GNUC__) || defined(__clang__)
void* start = (void*)((uintptr_t)__builtin_return_address(0) & ~(uintptr_t)0xFFF);
void* end = (void*)((uintptr_t)start + 0x1000);
const size_t PAGE_SIZE = 4096;
void* start = (void*)((uintptr_t)__builtin_return_address(0) & ~(PAGE_SIZE - 1));
void* end = (void*)((uintptr_t)start + PAGE_SIZE * 2); // Clear two pages for better coverage
// Prefetch next likely pages
__builtin_prefetch((void*)((uintptr_t)end), 1, 3);
__builtin___clear_cache(static_cast<char*>(start), static_cast<char*>(end));
#endif
#ifdef __aarch64__
// Ensure all previous memory operations complete
asm volatile("dmb ish" ::: "memory");
asm volatile("dsb ish" ::: "memory");
asm volatile("isb" ::: "memory");
#endif
}
void ArmNce::InvalidateCacheRange(u64 addr, std::size_t size) {
#if defined(__GNUC__) || defined(__clang__)
// Align the start address to cache line boundary for better performance
const size_t CACHE_LINE_SIZE = 64;
addr &= ~(CACHE_LINE_SIZE - 1);
// Round up size to nearest cache line
size = (size + CACHE_LINE_SIZE - 1) & ~(CACHE_LINE_SIZE - 1);
// Prefetch the range to be invalidated
for (size_t offset = 0; offset < size; offset += CACHE_LINE_SIZE) {
__builtin_prefetch((void*)(addr + offset), 1, 3);
}
#endif
this->ClearInstructionCache();
}
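For context, the hunk above leans on the GCC/Clang __builtin___clear_cache builtin followed by AArch64 barriers. A minimal, self-contained sketch of that pattern (hypothetical helper name, for illustration only, not part of this commit) that flushes a freshly written code buffer the same way:

#include <cstddef>

// Hypothetical helper, illustration only: make newly written
// instructions visible to the instruction stream on AArch64.
inline void FlushCodeRange(void* start, std::size_t size) {
#if defined(__GNUC__) || defined(__clang__)
    // Cleans the data cache and invalidates the instruction cache
    // over [start, start + size).
    __builtin___clear_cache(static_cast<char*>(start),
                            static_cast<char*>(start) + size);
#endif
#ifdef __aarch64__
    // Same ordering idea as in ClearInstructionCache above: wait for
    // the maintenance to complete, then flush the pipeline so stale
    // instructions are not executed.
    asm volatile("dsb ish" ::: "memory");
    asm volatile("isb" ::: "memory");
#endif
}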


@@ -422,28 +422,39 @@ bool InterpreterVisitor::RegisterImmediate(bool wback, bool postindex, size_t scale,
signed_ = true;
}
if (memop == MemOp::Load && wback && Rn == Rt && Rn != Reg::R31) {
// Unpredictable instruction
return false;
}
if (memop == MemOp::Store && wback && Rn == Rt && Rn != Reg::R31) {
if ((memop == MemOp::Load || memop == MemOp::Store) && wback && Rn == Rt && Rn != Reg::R31) {
// Unpredictable instruction
return false;
}
u64 address;
// Use aligned access where possible
alignas(8) u64 address;
if (Rn == Reg::SP) {
address = this->GetSp();
} else {
address = this->GetReg(Rn);
}
// Pre-index addressing
if (!postindex) {
address += offset;
}
// Add early prefetch hint for loads
if (memop == MemOp::Load && (address % 8) == 0) {
// Alignment optimization for common cases
const bool is_aligned = (address % 8) == 0;
// Enhanced prefetching for loads with aligned addresses
if (memop == MemOp::Load) {
const size_t CACHE_LINE_SIZE = 64;
if ((address % 16) == 0) {
__builtin_prefetch((void*)address, 0, 3);
__builtin_prefetch((void*)(address + CACHE_LINE_SIZE), 0, 3);
if (datasize >= 32) {
__builtin_prefetch((void*)(address + CACHE_LINE_SIZE * 2), 0, 2);
}
} else if ((address % 8) == 0) {
__builtin_prefetch((void*)address, 0, 2);
}
}
const size_t datasize = 8 << scale;
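The prefetch hints added in this hunk all go through __builtin_prefetch(addr, rw, locality), where rw is 0 for an expected read or 1 for a write, and locality ranges from 0 (no temporal locality) to 3 (keep in all cache levels). A small stand-alone sketch of the same idea (hypothetical function name, assuming the 64-byte line size the hunk assumes):

#include <cstddef>
#include <cstdint>

// Illustration only: prefetch a buffer for reading one cache line at
// a time before it is consumed, mirroring the hints used above.
void PrefetchForRead(const std::uint8_t* data, std::size_t size) {
    constexpr std::size_t kCacheLine = 64; // assumed line size, as in the hunk
    for (std::size_t offset = 0; offset < size; offset += kCacheLine) {
        // rw = 0 (read), locality = 3 (expected to be reused soon)
        __builtin_prefetch(data + offset, 0, 3);
    }
}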
@@ -526,9 +537,17 @@ bool InterpreterVisitor::SIMDImmediate(bool wback, bool postindex, size_t scale,
address += offset;
}
// Prefetch for SIMD loads
if (memop == MemOp::Load && (address % 16) == 0) {
__builtin_prefetch((void*)(address + datasize), 0, 3);
// Enhanced prefetching for SIMD loads
if (memop == MemOp::Load) {
if ((address % 32) == 0) {
// Aggressive prefetch for well-aligned SIMD operations
__builtin_prefetch((void*)address, 0, 3);
__builtin_prefetch((void*)(address + 32), 0, 3);
__builtin_prefetch((void*)(address + 64), 0, 2);
} else if ((address % 16) == 0) {
__builtin_prefetch((void*)address, 0, 3);
__builtin_prefetch((void*)(address + datasize), 0, 2);
}
}
switch (memop) {