WIP: DO-NOT-MERGE: NCE experiments: Some optimizations on pre-fetch and cache
parent 6e6fae364f
commit 3cca503f7a
2 changed files with 53 additions and 14 deletions
@@ -384,18 +384,38 @@ void ArmNce::SignalInterrupt(Kernel::KThread* thread) {
 void ArmNce::ClearInstructionCache() {
 #if defined(__GNUC__) || defined(__clang__)
-    void* start = (void*)((uintptr_t)__builtin_return_address(0) & ~(uintptr_t)0xFFF);
-    void* end = (void*)((uintptr_t)start + 0x1000);
+    const size_t PAGE_SIZE = 4096;
+    void* start = (void*)((uintptr_t)__builtin_return_address(0) & ~(PAGE_SIZE - 1));
+    void* end = (void*)((uintptr_t)start + PAGE_SIZE * 2); // Clear two pages for better coverage
+
+    // Prefetch next likely pages
+    __builtin_prefetch((void*)((uintptr_t)end), 1, 3);
     __builtin___clear_cache(static_cast<char*>(start), static_cast<char*>(end));
 #endif
+
 #ifdef __aarch64__
+    // Ensure all previous memory operations complete
+    asm volatile("dmb ish" ::: "memory");
     asm volatile("dsb ish" ::: "memory");
     asm volatile("isb" ::: "memory");
 #endif
 }
 
 void ArmNce::InvalidateCacheRange(u64 addr, std::size_t size) {
+#if defined(__GNUC__) || defined(__clang__)
+    // Align the start address to cache line boundary for better performance
+    const size_t CACHE_LINE_SIZE = 64;
+    addr &= ~(CACHE_LINE_SIZE - 1);
+
+    // Round up size to nearest cache line
+    size = (size + CACHE_LINE_SIZE - 1) & ~(CACHE_LINE_SIZE - 1);
+
+    // Prefetch the range to be invalidated
+    for (size_t offset = 0; offset < size; offset += CACHE_LINE_SIZE) {
+        __builtin_prefetch((void*)(addr + offset), 1, 3);
+    }
+#endif
+
     this->ClearInstructionCache();
 }
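
For context, a minimal standalone sketch of the flush pattern this hunk introduces: round the code address down to its containing page, cover two pages so code straddling a page boundary is also flushed, then resynchronize the instruction stream. flush_icache_window is a hypothetical name for illustration only, not part of ArmNce, and the 4 KiB page size is an assumption matching the constant above.

#include <cstddef>
#include <cstdint>

static void flush_icache_window(void* code_ptr) {
#if defined(__GNUC__) || defined(__clang__)
    const std::uintptr_t PAGE_SIZE = 4096; // must be a power of two for the mask below
    // Round down to the start of the containing page; clearing the low bits
    // with ~(PAGE_SIZE - 1) is equivalent to the old hard-coded ~0xFFF mask.
    auto start = reinterpret_cast<std::uintptr_t>(code_ptr) & ~(PAGE_SIZE - 1);
    // Cover two pages so code that straddles a page boundary is also flushed.
    auto end = start + PAGE_SIZE * 2;
    __builtin___clear_cache(reinterpret_cast<char*>(start), reinterpret_cast<char*>(end));
#endif
#ifdef __aarch64__
    // Publish the maintenance to the inner-shareable domain, then
    // resynchronize the instruction stream on this core.
    asm volatile("dsb ish" ::: "memory");
    asm volatile("isb" ::: "memory");
#endif
}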
@@ -422,28 +422,39 @@ bool InterpreterVisitor::RegisterImmediate(bool wback, bool postindex, size_t sc
         signed_ = true;
     }
 
-    if (memop == MemOp::Load && wback && Rn == Rt && Rn != Reg::R31) {
-        // Unpredictable instruction
-        return false;
-    }
-    if (memop == MemOp::Store && wback && Rn == Rt && Rn != Reg::R31) {
+    if ((memop == MemOp::Load || memop == MemOp::Store) && wback && Rn == Rt && Rn != Reg::R31) {
         // Unpredictable instruction
         return false;
     }
 
-    u64 address;
+    // Use aligned access where possible
+    alignas(8) u64 address;
     if (Rn == Reg::SP) {
         address = this->GetSp();
     } else {
         address = this->GetReg(Rn);
     }
 
+    // Pre-index addressing
     if (!postindex) {
         address += offset;
     }
 
-    // Add early prefetch hint for loads
-    if (memop == MemOp::Load && (address % 8) == 0) {
-        __builtin_prefetch((void*)address, 0, 3);
+    // Alignment optimization for common cases
+    const bool is_aligned = (address % 8) == 0;
+
+    // Enhanced prefetching for loads with aligned addresses
+    if (memop == MemOp::Load) {
+        const size_t CACHE_LINE_SIZE = 64;
+        if ((address % 16) == 0) {
+            __builtin_prefetch((void*)address, 0, 3);
+            __builtin_prefetch((void*)(address + CACHE_LINE_SIZE), 0, 3);
+            if (scale >= 2) { // i.e. datasize >= 32; datasize (= 8 << scale) is only declared below
+                __builtin_prefetch((void*)(address + CACHE_LINE_SIZE * 2), 0, 2);
+            }
+        } else if (is_aligned) {
+            __builtin_prefetch((void*)address, 0, 2);
+        }
    }
 
     const size_t datasize = 8 << scale;
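
The prefetch calls above all use the GCC/Clang builtin __builtin_prefetch(addr, rw, locality), where rw is 0 for an expected read and 1 for an expected write, and locality ranges from 0 (no temporal reuse) to 3 (keep in all cache levels). A sketch of the same tiered policy pulled out into a hypothetical helper, so the argument meanings are explicit; the helper name and the 64-byte cache line are assumptions, not repo code.

#include <cstddef>
#include <cstdint>

#if defined(__GNUC__) || defined(__clang__)
static void prefetch_for_load(std::uint64_t address, std::size_t scale) {
    const std::size_t CACHE_LINE_SIZE = 64;
    if ((address % 16) == 0) {
        // Well-aligned access: pull in the current and the next line at high locality.
        __builtin_prefetch(reinterpret_cast<void*>(address), 0, 3);
        __builtin_prefetch(reinterpret_cast<void*>(address + CACHE_LINE_SIZE), 0, 3);
        if (scale >= 2) { // 32-bit or wider accesses reach one line further ahead
            __builtin_prefetch(reinterpret_cast<void*>(address + CACHE_LINE_SIZE * 2), 0, 2);
        }
    } else if ((address % 8) == 0) {
        // Only 8-byte aligned: a single, lower-locality hint.
        __builtin_prefetch(reinterpret_cast<void*>(address), 0, 2);
    }
}
#endif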
@@ -526,9 +537,17 @@ bool InterpreterVisitor::SIMDImmediate(bool wback, bool postindex, size_t scale,
         address += offset;
     }
 
-    // Prefetch for SIMD loads
-    if (memop == MemOp::Load && (address % 16) == 0) {
-        __builtin_prefetch((void*)(address + datasize), 0, 3);
+    // Enhanced prefetching for SIMD loads
+    if (memop == MemOp::Load) {
+        if ((address % 32) == 0) {
+            // Aggressive prefetch for well-aligned SIMD operations
+            __builtin_prefetch((void*)address, 0, 3);
+            __builtin_prefetch((void*)(address + 32), 0, 3);
+            __builtin_prefetch((void*)(address + 64), 0, 2);
+        } else if ((address % 16) == 0) {
+            __builtin_prefetch((void*)address, 0, 3);
+            __builtin_prefetch((void*)(address + datasize), 0, 2);
+        }
     }
 
     switch (memop) {
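
All three hunks lean on the same power-of-two mask arithmetic: round down with x & ~(N - 1), round up with (x + N - 1) & ~(N - 1), valid only when N is a power of two. A throwaway self-check of those identities under that assumption, not repo code:

#include <cassert>
#include <cstdint>

int main() {
    const std::uint64_t CACHE_LINE_SIZE = 64;
    std::uint64_t addr = 0x1234'5678'9ABC'DE37ULL;
    std::uint64_t size = 100;

    // Round the start down to a cache-line boundary, as InvalidateCacheRange does.
    std::uint64_t aligned_addr = addr & ~(CACHE_LINE_SIZE - 1);
    assert(aligned_addr % CACHE_LINE_SIZE == 0 && aligned_addr <= addr);

    // Round the size up to whole cache lines.
    std::uint64_t rounded_size = (size + CACHE_LINE_SIZE - 1) & ~(CACHE_LINE_SIZE - 1);
    assert(rounded_size == 128); // 100 bytes -> two 64-byte lines

    // The SIMD branch picks its prefetch tier from address alignment.
    assert((aligned_addr % 32) == 0); // 64-byte alignment implies 32-byte alignment
    return 0;
}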