From 9efdf44a75d621f62295bf7f7c4ccfe09c8c360d Mon Sep 17 00:00:00 2001
From: MrPurple666
Date: Thu, 24 Apr 2025 01:46:22 -0300
Subject: [PATCH] Better FastMemcpy and FastMemset

Use 16-byte copy paths
---
 src/core/memory.cpp | 173 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 163 insertions(+), 10 deletions(-)

diff --git a/src/core/memory.cpp b/src/core/memory.cpp
index 0790887b1b..4ca1a72dd1 100644
--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@@ -33,6 +33,7 @@ namespace Core::Memory {
 namespace {
 
 inline void FastMemcpy(void* dst, const void* src, std::size_t size) {
+    // Fast path for small copies
     switch (size) {
     case 1:
         *static_cast<u8*>(dst) = *static_cast<const u8*>(src);
@@ -46,13 +47,23 @@ inline void FastMemcpy(void* dst, const void* src, std::size_t size) {
     case 8:
         *static_cast<u64*>(dst) = *static_cast<const u64*>(src);
         break;
+    case 16: {
+        // Optimize for 16-byte copy (common case for SIMD registers)
+        const u64* src_64 = static_cast<const u64*>(src);
+        u64* dst_64 = static_cast<u64*>(dst);
+        dst_64[0] = src_64[0];
+        dst_64[1] = src_64[1];
+        break;
+    }
     default:
+        // For larger sizes, use standard memcpy which is usually optimized by the compiler
         std::memcpy(dst, src, size);
         break;
     }
 }
 
 inline void FastMemset(void* dst, int value, std::size_t size) {
+    // Fast path for small fills
     switch (size) {
     case 1:
         *static_cast<u8*>(dst) = static_cast<u8>(value);
@@ -66,8 +77,32 @@ inline void FastMemset(void* dst, int value, std::size_t size) {
     case 8:
         *static_cast<u64*>(dst) = static_cast<u64>(value);
         break;
+    case 16: {
+        // Optimize for 16-byte fill (common case for SIMD registers)
+        u64* dst_64 = static_cast<u64*>(dst);
+        const u64 val64 = static_cast<u8>(value) * 0x0101010101010101ULL;
+        dst_64[0] = val64;
+        dst_64[1] = val64;
+        break;
+    }
     default:
-        std::memset(dst, value, size);
+        if (size <= 128 && value == 0) {
+            // Fast path for small zero-fills
+            u8* dst_bytes = static_cast<u8*>(dst);
+            for (std::size_t i = 0; i < size; i += 8) {
+                if (i + 8 <= size) {
+                    *reinterpret_cast<u64*>(dst_bytes + i) = 0;
+                } else {
+                    // Handle remaining bytes (less than 8)
+                    for (std::size_t j = i; j < size; j++) {
+                        dst_bytes[j] = 0;
+                    }
+                }
+            }
+        } else {
+            // For larger sizes, use standard memset which is usually optimized by the compiler
+            std::memset(dst, value, size);
+        }
         break;
     }
 }
@@ -773,14 +808,69 @@ struct Memory::Impl {
      */
     template <typename T>
     T Read(Common::ProcessAddress vaddr) {
+        // Fast path for aligned reads of common sizes
+        const u64 addr = GetInteger(vaddr);
+        if constexpr (std::is_same_v<T, u8> || std::is_same_v<T, s8>) {
+            // 8-bit reads are always aligned
+            const u8* const ptr = GetPointerImpl(
+                addr,
+                [addr]() {
+                    LOG_ERROR(HW_Memory, "Unmapped Read8 @ 0x{:016X}", addr);
+                },
+                [&]() { HandleRasterizerDownload(addr, sizeof(T)); });
+            if (ptr) {
+                return static_cast<T>(*ptr);
+            }
+            return 0;
+        } else if constexpr (std::is_same_v<T, u16> || std::is_same_v<T, s16>) {
+            // Check alignment for 16-bit reads
+            if ((addr & 1) == 0) {
+                const u8* const ptr = GetPointerImpl(
+                    addr,
+                    [addr]() {
+                        LOG_ERROR(HW_Memory, "Unmapped Read16 @ 0x{:016X}", addr);
+                    },
+                    [&]() { HandleRasterizerDownload(addr, sizeof(T)); });
+                if (ptr) {
+                    return static_cast<T>(*reinterpret_cast<const u16*>(ptr));
+                }
+            }
+        } else if constexpr (std::is_same_v<T, u32> || std::is_same_v<T, s32>) {
+            // Check alignment for 32-bit reads
+            if ((addr & 3) == 0) {
+                const u8* const ptr = GetPointerImpl(
+                    addr,
+                    [addr]() {
+                        LOG_ERROR(HW_Memory, "Unmapped Read32 @ 0x{:016X}", addr);
+                    },
+                    [&]() { HandleRasterizerDownload(addr, sizeof(T)); });
+                if (ptr) {
+                    return static_cast<T>(*reinterpret_cast<const u32*>(ptr));
+                }
+            }
+        } else if constexpr (std::is_same_v<T, u64> || std::is_same_v<T, s64>) {
+            // Check alignment for 64-bit reads
+            if ((addr & 7) == 0) {
+                const u8* const ptr = GetPointerImpl(
+                    addr,
+                    [addr]() {
+                        LOG_ERROR(HW_Memory, "Unmapped Read64 @ 0x{:016X}", addr);
+                    },
+                    [&]() { HandleRasterizerDownload(addr, sizeof(T)); });
+                if (ptr) {
+                    return static_cast<T>(*reinterpret_cast<const u64*>(ptr));
+                }
+            }
+        }
+
+        // Fall back to the general case for other types or unaligned access
         T result = 0;
         const u8* const ptr = GetPointerImpl(
-            GetInteger(vaddr),
-            [vaddr]() {
-                LOG_ERROR(HW_Memory, "Unmapped Read{} @ 0x{:016X}", sizeof(T) * 8,
-                          GetInteger(vaddr));
+            addr,
+            [addr]() {
+                LOG_ERROR(HW_Memory, "Unmapped Read{} @ 0x{:016X}", sizeof(T) * 8, addr);
             },
-            [&]() { HandleRasterizerDownload(GetInteger(vaddr), sizeof(T)); });
+            [&]() { HandleRasterizerDownload(addr, sizeof(T)); });
         if (ptr) {
             FastMemcpy(&result, ptr, sizeof(T));
         }
@@ -798,13 +888,76 @@ struct Memory::Impl {
      */
     template <typename T>
     void Write(Common::ProcessAddress vaddr, const T data) {
+        // Fast path for aligned writes of common sizes
+        const u64 addr = GetInteger(vaddr);
+        if constexpr (std::is_same_v<T, u8> || std::is_same_v<T, s8>) {
+            // 8-bit writes are always aligned
+            u8* const ptr = GetPointerImpl(
+                addr,
+                [addr, data]() {
+                    LOG_ERROR(HW_Memory, "Unmapped Write8 @ 0x{:016X} = 0x{:02X}", addr,
+                              static_cast<u8>(data));
+                },
+                [&]() { HandleRasterizerWrite(addr, sizeof(T)); });
+            if (ptr) {
+                *ptr = static_cast<u8>(data);
+            }
+            return;
+        } else if constexpr (std::is_same_v<T, u16> || std::is_same_v<T, s16>) {
+            // Check alignment for 16-bit writes
+            if ((addr & 1) == 0) {
+                u8* const ptr = GetPointerImpl(
+                    addr,
+                    [addr, data]() {
+                        LOG_ERROR(HW_Memory, "Unmapped Write16 @ 0x{:016X} = 0x{:04X}", addr,
+                                  static_cast<u16>(data));
+                    },
+                    [&]() { HandleRasterizerWrite(addr, sizeof(T)); });
+                if (ptr) {
+                    *reinterpret_cast<u16*>(ptr) = static_cast<u16>(data);
+                    return;
+                }
+            }
+        } else if constexpr (std::is_same_v<T, u32> || std::is_same_v<T, s32>) {
+            // Check alignment for 32-bit writes
+            if ((addr & 3) == 0) {
+                u8* const ptr = GetPointerImpl(
+                    addr,
+                    [addr, data]() {
+                        LOG_ERROR(HW_Memory, "Unmapped Write32 @ 0x{:016X} = 0x{:08X}", addr,
+                                  static_cast<u32>(data));
+                    },
+                    [&]() { HandleRasterizerWrite(addr, sizeof(T)); });
+                if (ptr) {
+                    *reinterpret_cast<u32*>(ptr) = static_cast<u32>(data);
+                    return;
+                }
+            }
+        } else if constexpr (std::is_same_v<T, u64> || std::is_same_v<T, s64>) {
+            // Check alignment for 64-bit writes
+            if ((addr & 7) == 0) {
+                u8* const ptr = GetPointerImpl(
+                    addr,
+                    [addr, data]() {
+                        LOG_ERROR(HW_Memory, "Unmapped Write64 @ 0x{:016X} = 0x{:016X}", addr,
+                                  static_cast<u64>(data));
+                    },
+                    [&]() { HandleRasterizerWrite(addr, sizeof(T)); });
+                if (ptr) {
+                    *reinterpret_cast<u64*>(ptr) = static_cast<u64>(data);
+                    return;
+                }
+            }
+        }
+
+        // Fall back to the general case for other types or unaligned access
         u8* const ptr = GetPointerImpl(
-            GetInteger(vaddr),
-            [vaddr, data]() {
+            addr,
+            [addr, data]() {
                 LOG_ERROR(HW_Memory, "Unmapped Write{} @ 0x{:016X} = 0x{:016X}", sizeof(T) * 8,
-                          GetInteger(vaddr), static_cast<u64>(data));
+                          addr, static_cast<u64>(data));
             },
-            [&]() { HandleRasterizerWrite(GetInteger(vaddr), sizeof(T)); });
+            [&]() { HandleRasterizerWrite(addr, sizeof(T)); });
         if (ptr) {
             FastMemcpy(ptr, &data, sizeof(T));
         }