From 7d22f79cf09f8601fe4680aeb39ef56bd42b397c Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow <fsahmkow27@gmail.com>
Date: Sun, 15 Aug 2021 15:35:53 +0200
Subject: [PATCH] VideoCore: Rework Garbage Collection.

---
 src/common/lru_cache.h                        | 141 ++++++++++++++++++
 src/video_core/buffer_cache/buffer_base.h     |  13 +-
 src/video_core/buffer_cache/buffer_cache.h    |  61 ++++----
 src/video_core/texture_cache/image_base.h     |   2 +-
 src/video_core/texture_cache/texture_cache.h  |  89 ++++-------
 .../texture_cache/texture_cache_base.h        |   8 +-
 6 files changed, 213 insertions(+), 101 deletions(-)
 create mode 100644 src/common/lru_cache.h

diff --git a/src/common/lru_cache.h b/src/common/lru_cache.h
new file mode 100644
index 0000000000..048e9c3daa
--- /dev/null
+++ b/src/common/lru_cache.h
@@ -0,0 +1,141 @@
+// Copyright 2021 yuzu Emulator Project
+// Licensed under GPLv2+ or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <deque>
+#include <memory>
+#include <type_traits>
+
+#include "common/common_types.h"
+
+namespace Common {
+
+template <class Traits>
+class LeastRecentlyUsedCache {
+    using ObjectType = typename Traits::ObjectType;
+    using TickType = typename Traits::TickType;
+
+    struct Item {
+        ObjectType obj;
+        TickType tick;
+        Item* next{};
+        Item* prev{};
+    };
+
+public:
+    LeastRecentlyUsedCache() : first_item{}, last_item{} {}
+    ~LeastRecentlyUsedCache() = default;
+
+    size_t Insert(ObjectType obj, TickType tick) {
+        const auto new_id = build();
+        auto& item = item_pool[new_id];
+        item.obj = obj;
+        item.tick = tick;
+        attach(item);
+        return new_id;
+    }
+
+    void Touch(size_t id, TickType tick) {
+        auto& item = item_pool[id];
+        if (item.tick >= tick) {
+            return;
+        }
+        item.tick = tick;
+        if (&item == last_item) {
+            return;
+        }
+        detach(item);
+        attach(item);
+    }
+
+    void Free(size_t id) {
+        auto& item = item_pool[id];
+        detach(item);
+        item.prev = nullptr;
+        item.next = nullptr;
+        free_items.push_back(id);
+    }
+
+    template <typename Func>
+    void ForEachItemBelow(TickType tick, Func&& func) {
+        static constexpr bool RETURNS_BOOL =
+            std::is_same_v<std::invoke_result<Func, ObjectType>, bool>;
+        Item* iterator = first_item;
+        while (iterator) {
+            if (static_cast<s64>(tick) - static_cast<s64>(iterator->tick) < 0) {
+                return;
+            }
+            Item* next = iterator->next;
+            if constexpr (RETURNS_BOOL) {
+                if (func(iterator->obj)) {
+                    return;
+                }
+            } else {
+                func(iterator->obj);
+            }
+            iterator = next;
+        }
+    }
+
+private:
+    size_t build() {
+        if (free_items.empty()) {
+            const size_t item_id = item_pool.size();
+            item_pool.emplace_back();
+            auto& item = item_pool[item_id];
+            item.next = nullptr;
+            item.prev = nullptr;
+            return item_id;
+        }
+        const size_t item_id = free_items.front();
+        free_items.pop_front();
+        auto& item = item_pool[item_id];
+        item.next = nullptr;
+        item.prev = nullptr;
+        return item_id;
+    }
+
+    void attach(Item& item) {
+        if (!first_item) {
+            first_item = &item;
+        }
+        if (!last_item) {
+            last_item = &item;
+        } else {
+            item.prev = last_item;
+            last_item->next = &item;
+            item.next = nullptr;
+            last_item = &item;
+        }
+    }
+
+    void detach(Item& item) {
+        if (item.prev) {
+            item.prev->next = item.next;
+        }
+        if (item.next) {
+            item.next->prev = item.prev;
+        }
+        if (&item == first_item) {
+            first_item = item.next;
+            if (first_item) {
+                first_item->prev = nullptr;
+            }
+        }
+        if (&item == last_item) {
+            last_item = item.prev;
+            if (last_item) {
+                last_item->next = nullptr;
+            }
+        }
+    }
+
+    std::deque<Item> item_pool;
+    std::deque<size_t> free_items;
+    Item* first_item;
+    Item* last_item;
+};
+
+} // namespace Common
diff --git a/src/video_core/buffer_cache/buffer_base.h b/src/video_core/buffer_cache/buffer_base.h
index c3318095c4..4b696a60ff 100644
--- a/src/video_core/buffer_cache/buffer_base.h
+++ b/src/video_core/buffer_cache/buffer_base.h
@@ -261,16 +261,6 @@ public:
         stream_score += score;
     }
 
-    /// Sets the new frame tick
-    void SetFrameTick(u64 new_frame_tick) noexcept {
-        frame_tick = new_frame_tick;
-    }
-
-    /// Returns the new frame tick
-    [[nodiscard]] u64 FrameTick() const noexcept {
-        return frame_tick;
-    }
-
     /// Returns the likeliness of this being a stream buffer
     [[nodiscard]] int StreamScore() const noexcept {
         return stream_score;
@@ -307,6 +297,8 @@ public:
         return words.size_bytes;
     }
 
+    size_t lru_id;
+
 private:
     template <Type type>
     u64* Array() noexcept {
@@ -603,7 +595,6 @@ private:
     RasterizerInterface* rasterizer = nullptr;
     VAddr cpu_addr = 0;
     Words words;
-    u64 frame_tick = 0;
     BufferFlagBits flags{};
     int stream_score = 0;
 };
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 3b43554f9a..a0217908ab 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -20,6 +20,7 @@
 #include "common/common_types.h"
 #include "common/div_ceil.h"
 #include "common/literals.h"
+#include "common/lru_cache.h"
 #include "common/microprofile.h"
 #include "common/scope_exit.h"
 #include "common/settings.h"
@@ -77,7 +78,7 @@ class BufferCache {
 
     static constexpr BufferId NULL_BUFFER_ID{0};
 
-    static constexpr u64 EXPECTED_MEMORY = 512_MiB;
+    static constexpr u64 EXPECTED_MEMORY = 256_MiB;
     static constexpr u64 CRITICAL_MEMORY = 1_GiB;
 
     using Maxwell = Tegra::Engines::Maxwell3D::Regs;
@@ -330,7 +331,7 @@ private:
     template <bool insert>
     void ChangeRegister(BufferId buffer_id);
 
-    void TouchBuffer(Buffer& buffer) const noexcept;
+    void TouchBuffer(Buffer& buffer, BufferId buffer_id) noexcept;
 
     bool SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size);
 
@@ -428,7 +429,11 @@ private:
     size_t immediate_buffer_capacity = 0;
     std::unique_ptr<u8[]> immediate_buffer_alloc;
 
-    typename SlotVector<Buffer>::Iterator deletion_iterator;
+    struct LRUItemParams {
+        using ObjectType = BufferId;
+        using TickType = u64;
+    };
+    Common::LeastRecentlyUsedCache<LRUItemParams> lru_cache;
     u64 frame_tick = 0;
     u64 total_used_memory = 0;
 
@@ -445,7 +450,6 @@ BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_,
       kepler_compute{kepler_compute_}, gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_} {
     // Ensure the first slot is used for the null buffer
     void(slot_buffers.insert(runtime, NullBufferParams{}));
-    deletion_iterator = slot_buffers.end();
     common_ranges.clear();
 }
 
@@ -454,20 +458,17 @@ void BufferCache<P>::RunGarbageCollector() {
     const bool aggressive_gc = total_used_memory >= CRITICAL_MEMORY;
     const u64 ticks_to_destroy = aggressive_gc ? 60 : 120;
     int num_iterations = aggressive_gc ? 64 : 32;
-    for (; num_iterations > 0; --num_iterations) {
-        if (deletion_iterator == slot_buffers.end()) {
-            deletion_iterator = slot_buffers.begin();
+    const auto clean_up = [this, &num_iterations](BufferId buffer_id) {
+        if (num_iterations == 0) {
+            return true;
         }
-        ++deletion_iterator;
-        if (deletion_iterator == slot_buffers.end()) {
-            break;
-        }
-        const auto [buffer_id, buffer] = *deletion_iterator;
-        if (buffer->FrameTick() + ticks_to_destroy < frame_tick) {
-            DownloadBufferMemory(*buffer);
-            DeleteBuffer(buffer_id);
-        }
-    }
+        --num_iterations;
+        auto& buffer = slot_buffers[buffer_id];
+        DownloadBufferMemory(buffer);
+        DeleteBuffer(buffer_id);
+        return false;
+    };
+    lru_cache.ForEachItemBelow(frame_tick - ticks_to_destroy, clean_up);
 }
 
 template <class P>
@@ -954,7 +955,7 @@ bool BufferCache<P>::IsRegionCpuModified(VAddr addr, size_t size) {
 template <class P>
 void BufferCache<P>::BindHostIndexBuffer() {
     Buffer& buffer = slot_buffers[index_buffer.buffer_id];
-    TouchBuffer(buffer);
+    TouchBuffer(buffer, index_buffer.buffer_id);
     const u32 offset = buffer.Offset(index_buffer.cpu_addr);
     const u32 size = index_buffer.size;
     SynchronizeBuffer(buffer, index_buffer.cpu_addr, size);
@@ -975,7 +976,7 @@ void BufferCache<P>::BindHostVertexBuffers() {
     for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) {
         const Binding& binding = vertex_buffers[index];
         Buffer& buffer = slot_buffers[binding.buffer_id];
-        TouchBuffer(buffer);
+        TouchBuffer(buffer, binding.buffer_id);
         SynchronizeBuffer(buffer, binding.cpu_addr, binding.size);
         if (!flags[Dirty::VertexBuffer0 + index]) {
             continue;
@@ -1011,7 +1012,7 @@ void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32
     const VAddr cpu_addr = binding.cpu_addr;
     const u32 size = std::min(binding.size, (*uniform_buffer_sizes)[stage][index]);
     Buffer& buffer = slot_buffers[binding.buffer_id];
-    TouchBuffer(buffer);
+    TouchBuffer(buffer, binding.buffer_id);
     const bool use_fast_buffer = binding.buffer_id != NULL_BUFFER_ID &&
                                  size <= uniform_buffer_skip_cache_size &&
                                  !buffer.IsRegionGpuModified(cpu_addr, size);
@@ -1083,7 +1084,7 @@ void BufferCache<P>::BindHostGraphicsStorageBuffers(size_t stage) {
     ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) {
         const Binding& binding = storage_buffers[stage][index];
         Buffer& buffer = slot_buffers[binding.buffer_id];
-        TouchBuffer(buffer);
+        TouchBuffer(buffer, binding.buffer_id);
         const u32 size = binding.size;
         SynchronizeBuffer(buffer, binding.cpu_addr, size);
 
@@ -1128,7 +1129,7 @@ void BufferCache<P>::BindHostTransformFeedbackBuffers() {
     for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) {
         const Binding& binding = transform_feedback_buffers[index];
         Buffer& buffer = slot_buffers[binding.buffer_id];
-        TouchBuffer(buffer);
+        TouchBuffer(buffer, binding.buffer_id);
         const u32 size = binding.size;
         SynchronizeBuffer(buffer, binding.cpu_addr, size);
 
@@ -1148,7 +1149,7 @@ void BufferCache<P>::BindHostComputeUniformBuffers() {
     ForEachEnabledBit(enabled_compute_uniform_buffer_mask, [&](u32 index) {
         const Binding& binding = compute_uniform_buffers[index];
         Buffer& buffer = slot_buffers[binding.buffer_id];
-        TouchBuffer(buffer);
+        TouchBuffer(buffer, binding.buffer_id);
         const u32 size = std::min(binding.size, (*compute_uniform_buffer_sizes)[index]);
         SynchronizeBuffer(buffer, binding.cpu_addr, size);
 
@@ -1168,7 +1169,7 @@ void BufferCache<P>::BindHostComputeStorageBuffers() {
     ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) {
         const Binding& binding = compute_storage_buffers[index];
         Buffer& buffer = slot_buffers[binding.buffer_id];
-        TouchBuffer(buffer);
+        TouchBuffer(buffer, binding.buffer_id);
         const u32 size = binding.size;
         SynchronizeBuffer(buffer, binding.cpu_addr, size);
 
@@ -1513,11 +1514,11 @@ BufferId BufferCache<P>::CreateBuffer(VAddr cpu_addr, u32 wanted_size) {
     const OverlapResult overlap = ResolveOverlaps(cpu_addr, wanted_size);
     const u32 size = static_cast<u32>(overlap.end - overlap.begin);
     const BufferId new_buffer_id = slot_buffers.insert(runtime, rasterizer, overlap.begin, size);
-    TouchBuffer(slot_buffers[new_buffer_id]);
     for (const BufferId overlap_id : overlap.ids) {
         JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap);
     }
     Register(new_buffer_id);
+    TouchBuffer(slot_buffers[new_buffer_id], new_buffer_id);
     return new_buffer_id;
 }
 
@@ -1534,12 +1535,14 @@ void BufferCache<P>::Unregister(BufferId buffer_id) {
 template <class P>
 template <bool insert>
 void BufferCache<P>::ChangeRegister(BufferId buffer_id) {
-    const Buffer& buffer = slot_buffers[buffer_id];
+    Buffer& buffer = slot_buffers[buffer_id];
     const auto size = buffer.SizeBytes();
     if (insert) {
         total_used_memory += Common::AlignUp(size, 1024);
+        buffer.lru_id = lru_cache.Insert(buffer_id, frame_tick);
     } else {
         total_used_memory -= Common::AlignUp(size, 1024);
+        lru_cache.Free(buffer.lru_id);
     }
     const VAddr cpu_addr_begin = buffer.CpuAddr();
     const VAddr cpu_addr_end = cpu_addr_begin + size;
@@ -1555,8 +1558,10 @@ void BufferCache<P>::ChangeRegister(BufferId buffer_id) {
 }
 
 template <class P>
-void BufferCache<P>::TouchBuffer(Buffer& buffer) const noexcept {
-    buffer.SetFrameTick(frame_tick);
+void BufferCache<P>::TouchBuffer(Buffer& buffer, BufferId buffer_id) noexcept {
+    if (buffer_id != NULL_BUFFER_ID) {
+        lru_cache.Touch(buffer.lru_id, frame_tick);
+    }
 }
 
 template <class P>
diff --git a/src/video_core/texture_cache/image_base.h b/src/video_core/texture_cache/image_base.h
index ff1feda9b3..662089e3de 100644
--- a/src/video_core/texture_cache/image_base.h
+++ b/src/video_core/texture_cache/image_base.h
@@ -80,7 +80,7 @@ struct ImageBase {
     VAddr cpu_addr_end = 0;
 
     u64 modification_tick = 0;
-    u64 frame_tick = 0;
+    size_t lru_index = ~0;
 
     std::array<u32, MAX_MIP_LEVELS> mip_level_offsets{};
 
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index a087498ff5..c16cc0838d 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -43,8 +43,6 @@ TextureCache<P>::TextureCache(Runtime& runtime_, VideoCore::RasterizerInterface&
     void(slot_image_views.insert(runtime, NullImageParams{}));
     void(slot_samplers.insert(runtime, sampler_descriptor));
 
-    deletion_iterator = slot_images.begin();
-
     if constexpr (HAS_DEVICE_MEMORY_INFO) {
         const auto device_memory = runtime.GetDeviceLocalMemory();
         const u64 possible_expected_memory = (device_memory * 3) / 10;
@@ -64,65 +62,33 @@ template <class P>
 void TextureCache<P>::RunGarbageCollector() {
     const bool high_priority_mode = total_used_memory >= expected_memory;
     const bool aggressive_mode = total_used_memory >= critical_memory;
-    const u64 ticks_to_destroy = high_priority_mode ? 60 : 100;
-    int num_iterations = aggressive_mode ? 256 : (high_priority_mode ? 128 : 64);
-    for (; num_iterations > 0; --num_iterations) {
-        if (deletion_iterator == slot_images.end()) {
-            deletion_iterator = slot_images.begin();
-            if (deletion_iterator == slot_images.end()) {
-                break;
-            }
+    const u64 ticks_to_destroy = aggressive_mode ? 10ULL : high_priority_mode ? 50ULL : 100ULL;
+    size_t num_iterations = aggressive_mode ? 10000 : (high_priority_mode ? 50 : 5);
+    const auto clean_up = [this, &num_iterations, high_priority_mode](ImageId image_id) {
+        if (num_iterations == 0) {
+            return true;
         }
-        auto [image_id, image_tmp] = *deletion_iterator;
-        Image* image = image_tmp; // fix clang error.
-        const bool is_alias = True(image->flags & ImageFlagBits::Alias);
-        const bool is_bad_overlap = True(image->flags & ImageFlagBits::BadOverlap);
-        const bool must_download = image->IsSafeDownload();
-        bool should_care = is_bad_overlap || is_alias || (high_priority_mode && !must_download);
-        const u64 ticks_needed =
-            is_bad_overlap
-                ? ticks_to_destroy >> 4
-                : ((should_care && aggressive_mode) ? ticks_to_destroy >> 1 : ticks_to_destroy);
-        should_care |= aggressive_mode;
-        if (should_care && image->frame_tick + ticks_needed < frame_tick) {
-            if (is_bad_overlap) {
-                const bool overlap_check = std::ranges::all_of(
-                    image->overlapping_images, [&, image](const ImageId& overlap_id) {
-                        auto& overlap = slot_images[overlap_id];
-                        return overlap.frame_tick >= image->frame_tick;
-                    });
-                if (!overlap_check) {
-                    ++deletion_iterator;
-                    continue;
-                }
-            }
-            if (!is_bad_overlap && must_download) {
-                const bool alias_check = std::ranges::none_of(
-                    image->aliased_images, [&, image](const AliasedImage& alias) {
-                        auto& alias_image = slot_images[alias.id];
-                        return (alias_image.frame_tick < image->frame_tick) ||
-                               (alias_image.modification_tick < image->modification_tick);
-                    });
-
-                if (alias_check) {
-                    auto map = runtime.DownloadStagingBuffer(image->unswizzled_size_bytes);
-                    const auto copies = FullDownloadCopies(image->info);
-                    image->DownloadMemory(map, copies);
-                    runtime.Finish();
-                    SwizzleImage(gpu_memory, image->gpu_addr, image->info, copies, map.mapped_span);
-                }
-            }
-            if (True(image->flags & ImageFlagBits::Tracked)) {
-                UntrackImage(*image, image_id);
-            }
-            UnregisterImage(image_id);
-            DeleteImage(image_id);
-            if (is_bad_overlap) {
-                ++num_iterations;
-            }
+        --num_iterations;
+        auto& image = slot_images[image_id];
+        const bool must_download = image.IsSafeDownload();
+        if (!high_priority_mode && must_download) {
+            return false;
         }
-        ++deletion_iterator;
-    }
+        if (must_download) {
+            auto map = runtime.DownloadStagingBuffer(image.unswizzled_size_bytes);
+            const auto copies = FullDownloadCopies(image.info);
+            image.DownloadMemory(map, copies);
+            runtime.Finish();
+            SwizzleImage(gpu_memory, image.gpu_addr, image.info, copies, map.mapped_span);
+        }
+        if (True(image.flags & ImageFlagBits::Tracked)) {
+            UntrackImage(image, image_id);
+        }
+        UnregisterImage(image_id);
+        DeleteImage(image_id);
+        return false;
+    };
+    lru_cache.ForEachItemBelow(frame_tick - ticks_to_destroy, clean_up);
 }
 
 template <class P>
@@ -1078,6 +1044,8 @@ void TextureCache<P>::RegisterImage(ImageId image_id) {
         tentative_size = EstimatedDecompressedSize(tentative_size, image.info.format);
     }
     total_used_memory += Common::AlignUp(tentative_size, 1024);
+    image.lru_index = lru_cache.Insert(image_id, frame_tick);
+
     ForEachGPUPage(image.gpu_addr, image.guest_size_bytes,
                    [this, image_id](u64 page) { gpu_page_table[page].push_back(image_id); });
     if (False(image.flags & ImageFlagBits::Sparse)) {
@@ -1115,6 +1083,7 @@ void TextureCache<P>::UnregisterImage(ImageId image_id) {
         tentative_size = EstimatedDecompressedSize(tentative_size, image.info.format);
     }
     total_used_memory -= Common::AlignUp(tentative_size, 1024);
+    lru_cache.Free(image.lru_index);
     const auto& clear_page_table =
         [this, image_id](
             u64 page,
@@ -1384,7 +1353,7 @@ void TextureCache<P>::PrepareImage(ImageId image_id, bool is_modification, bool
     if (is_modification) {
         MarkModification(image);
     }
-    image.frame_tick = frame_tick;
+    lru_cache.Touch(image.lru_index, frame_tick);
 }
 
 template <class P>
diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h
index e4ae351cb2..d7528ed243 100644
--- a/src/video_core/texture_cache/texture_cache_base.h
+++ b/src/video_core/texture_cache/texture_cache_base.h
@@ -14,6 +14,7 @@
 
 #include "common/common_types.h"
 #include "common/literals.h"
+#include "common/lru_cache.h"
 #include "video_core/compatible_formats.h"
 #include "video_core/delayed_destruction_ring.h"
 #include "video_core/engines/fermi_2d.h"
@@ -370,6 +371,12 @@ private:
     std::vector<ImageId> uncommitted_downloads;
     std::queue<std::vector<ImageId>> committed_downloads;
 
+    struct LRUItemParams {
+        using ObjectType = ImageId;
+        using TickType = u64;
+    };
+    Common::LeastRecentlyUsedCache<LRUItemParams> lru_cache;
+
     static constexpr size_t TICKS_TO_DESTROY = 6;
     DelayedDestructionRing<Image, TICKS_TO_DESTROY> sentenced_images;
     DelayedDestructionRing<ImageView, TICKS_TO_DESTROY> sentenced_image_view;
@@ -379,7 +386,6 @@ private:
 
     u64 modification_tick = 0;
     u64 frame_tick = 0;
-    typename SlotVector<Image>::Iterator deletion_iterator;
 };
 
 } // namespace VideoCommon