Prototype of a defragmentation interface that supports tiling optimal images

Merged #90, thanks @JustSid!
Adam Sawicki 2019-12-23 15:28:51 +01:00
parent c8eec757fd
commit a52012de37
2 changed files with 1056 additions and 51 deletions


@@ -688,6 +688,7 @@ struct AllocInfo
VmaAllocation m_Allocation = VK_NULL_HANDLE;
VkBuffer m_Buffer = VK_NULL_HANDLE;
VkImage m_Image = VK_NULL_HANDLE;
VkImageLayout m_ImageLayout = VK_IMAGE_LAYOUT_UNDEFINED;
uint32_t m_StartValue = 0;
union
{
@@ -698,6 +699,10 @@ struct AllocInfo
void CreateBuffer(
const VkBufferCreateInfo& bufCreateInfo,
const VmaAllocationCreateInfo& allocCreateInfo);
void CreateImage(
const VkImageCreateInfo& imageCreateInfo,
const VmaAllocationCreateInfo& allocCreateInfo,
VkImageLayout layout);
void Destroy();
};
@@ -709,6 +714,16 @@ void AllocInfo::CreateBuffer(
VkResult res = vmaCreateBuffer(g_hAllocator, &bufCreateInfo, &allocCreateInfo, &m_Buffer, &m_Allocation, nullptr);
TEST(res == VK_SUCCESS);
}
void AllocInfo::CreateImage(
const VkImageCreateInfo& imageCreateInfo,
const VmaAllocationCreateInfo& allocCreateInfo,
VkImageLayout layout)
{
m_ImageInfo = imageCreateInfo;
m_ImageLayout = layout;
VkResult res = vmaCreateImage(g_hAllocator, &imageCreateInfo, &allocCreateInfo, &m_Image, &m_Allocation, nullptr);
TEST(res == VK_SUCCESS);
}
void AllocInfo::Destroy()
{
@@ -904,7 +919,88 @@ static void UploadGpuData(const AllocInfo* allocInfo, size_t allocInfoCount)
}
else
{
TEST(0 && "Images not currently supported.");
TEST(currAllocInfo.m_ImageInfo.format == VK_FORMAT_R8G8B8A8_UNORM && "Only RGBA8 images are currently supported.");
TEST(currAllocInfo.m_ImageInfo.mipLevels == 1 && "Only single mip images are currently supported.");
const VkDeviceSize size = currAllocInfo.m_ImageInfo.extent.width * currAllocInfo.m_ImageInfo.extent.height * sizeof(uint32_t);
VkBuffer stagingBuf = VK_NULL_HANDLE;
void* stagingBufMappedPtr = nullptr;
if(!stagingBufs.AcquireBuffer(size, stagingBuf, stagingBufMappedPtr))
{
TEST(cmdBufferStarted);
EndSingleTimeCommands();
stagingBufs.ReleaseAllBuffers();
cmdBufferStarted = false;
bool ok = stagingBufs.AcquireBuffer(size, stagingBuf, stagingBufMappedPtr);
TEST(ok);
}
// Fill staging buffer.
{
assert(size % sizeof(uint32_t) == 0);
uint32_t *stagingValPtr = (uint32_t *)stagingBufMappedPtr;
uint32_t val = currAllocInfo.m_StartValue;
for(size_t i = 0; i < size / sizeof(uint32_t); ++i)
{
*stagingValPtr = val;
++stagingValPtr;
++val;
}
}
// Issue copy command from staging buffer to destination image.
if(!cmdBufferStarted)
{
cmdBufferStarted = true;
BeginSingleTimeCommands();
}
// Transition the image to TRANSFER_DST layout
VkImageSubresourceRange subresourceRange = {
VK_IMAGE_ASPECT_COLOR_BIT,
0, VK_REMAINING_MIP_LEVELS,
0, VK_REMAINING_ARRAY_LAYERS
};
VkImageMemoryBarrier barrier = { VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER };
barrier.srcAccessMask = 0;
barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barrier.image = currAllocInfo.m_Image;
barrier.subresourceRange = subresourceRange;
vkCmdPipelineBarrier(g_hTemporaryCommandBuffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0,
0, nullptr,
0, nullptr,
1, &barrier);
// Copy image data
VkBufferImageCopy copy = {};
copy.bufferOffset = 0;
copy.bufferRowLength = 0;
copy.bufferImageHeight = 0;
copy.imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
copy.imageSubresource.layerCount = 1;
copy.imageExtent = currAllocInfo.m_ImageInfo.extent;
vkCmdCopyBufferToImage(g_hTemporaryCommandBuffer, stagingBuf, currAllocInfo.m_Image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, &copy);
// Transition the image to its desired layout
barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
barrier.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT;
barrier.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
barrier.newLayout = currAllocInfo.m_ImageLayout;
vkCmdPipelineBarrier(g_hTemporaryCommandBuffer, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0,
0, nullptr,
0, nullptr,
1, &barrier);
}
}
@@ -1754,6 +1850,555 @@ static void TestDefragmentationGpu()
g_MemoryAliasingWarningEnabled = true;
}
static void ProcessDefragmentationStepInfo(VmaDefragmentationStepInfo &stepInfo)
{
std::vector<VkImageMemoryBarrier> beginImageBarriers;
std::vector<VkImageMemoryBarrier> finalizeImageBarriers;
VkPipelineStageFlags beginSrcStageMask = 0;
VkPipelineStageFlags beginDstStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT;
VkPipelineStageFlags finalizeSrcStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT;
VkPipelineStageFlags finalizeDstStageMask = 0;
bool wantsMemoryBarrier = false;
VkMemoryBarrier beginMemoryBarrier = { VK_STRUCTURE_TYPE_MEMORY_BARRIER };
VkMemoryBarrier finalizeMemoryBarrier = { VK_STRUCTURE_TYPE_MEMORY_BARRIER };
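// Buffers only need a global memory barrier around the copies, while images
// additionally need per-image layout transitions, collected in
// beginImageBarriers / finalizeImageBarriers.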
std::vector<void *> newHandles;
for(uint32_t i = 0; i < stepInfo.moveCount; ++ i)
{
VmaAllocationInfo info;
vmaGetAllocationInfo(g_hAllocator, stepInfo.pMoves[i].allocation, &info);
AllocInfo *allocInfo = (AllocInfo *)info.pUserData;
if(allocInfo->m_Image)
{
VkImage newImage;
const VkResult result = vkCreateImage(g_hDevice, &allocInfo->m_ImageInfo, g_Allocs, &newImage);
TEST(result >= VK_SUCCESS);
vkBindImageMemory(g_hDevice, newImage, stepInfo.pMoves[i].memory, stepInfo.pMoves[i].offset);
newHandles.push_back(newImage);
// Keep track of our pipeline stages that we need to wait/signal on
beginSrcStageMask |= VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
finalizeDstStageMask |= VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
// We need one pipeline barrier with two image layout transitions here:
// the first turns our newly created image into VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
// the second turns the old image into VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL
VkImageSubresourceRange subresourceRange = {
VK_IMAGE_ASPECT_COLOR_BIT,
0, VK_REMAINING_MIP_LEVELS,
0, VK_REMAINING_ARRAY_LAYERS
};
VkImageMemoryBarrier barrier = { VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER };
barrier.srcAccessMask = 0;
barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barrier.image = newImage;
barrier.subresourceRange = subresourceRange;
beginImageBarriers.push_back(barrier);
// Second barrier to convert the existing image. This one actually needs a real barrier
barrier.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT;
barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
barrier.oldLayout = allocInfo->m_ImageLayout;
barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
barrier.image = allocInfo->m_Image;
beginImageBarriers.push_back(barrier);
// And lastly we need a barrier that turns our new image into the layout of the old one
barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
barrier.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT;
barrier.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
barrier.newLayout = allocInfo->m_ImageLayout;
barrier.image = newImage;
finalizeImageBarriers.push_back(barrier);
}
else if(allocInfo->m_Buffer)
{
VkBuffer newBuffer;
const VkResult result = vkCreateBuffer(g_hDevice, &allocInfo->m_BufferInfo, g_Allocs, &newBuffer);
TEST(result >= VK_SUCCESS);
vkBindBufferMemory(g_hDevice, newBuffer, stepInfo.pMoves[i].memory, stepInfo.pMoves[i].offset);
newHandles.push_back(newBuffer);
// Keep track of our pipeline stages that we need to wait/signal on
beginSrcStageMask |= VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
finalizeDstStageMask |= VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
beginMemoryBarrier.srcAccessMask |= VK_ACCESS_MEMORY_WRITE_BIT;
beginMemoryBarrier.dstAccessMask |= VK_ACCESS_TRANSFER_READ_BIT;
finalizeMemoryBarrier.srcAccessMask |= VK_ACCESS_TRANSFER_WRITE_BIT;
finalizeMemoryBarrier.dstAccessMask |= VK_ACCESS_MEMORY_READ_BIT;
wantsMemoryBarrier = true;
}
}
if(!beginImageBarriers.empty() || wantsMemoryBarrier)
{
const uint32_t memoryBarrierCount = wantsMemoryBarrier ? 1 : 0;
vkCmdPipelineBarrier(g_hTemporaryCommandBuffer, beginSrcStageMask, beginDstStageMask, 0,
memoryBarrierCount, &beginMemoryBarrier,
0, nullptr,
(uint32_t)beginImageBarriers.size(), beginImageBarriers.data());
}
for(uint32_t i = 0; i < stepInfo.moveCount; ++ i)
{
VmaAllocationInfo info;
vmaGetAllocationInfo(g_hAllocator, stepInfo.pMoves[i].allocation, &info);
AllocInfo *allocInfo = (AllocInfo *)info.pUserData;
if(allocInfo->m_Image)
{
std::vector<VkImageCopy> imageCopies;
// Copy all mips of the source image into the target image
VkOffset3D offset = { 0, 0, 0 };
VkExtent3D extent = allocInfo->m_ImageInfo.extent;
VkImageSubresourceLayers subresourceLayers = {
VK_IMAGE_ASPECT_COLOR_BIT,
0,
0, 1
};
for(uint32_t mip = 0; mip < allocInfo->m_ImageInfo.mipLevels; ++ mip)
{
subresourceLayers.mipLevel = mip;
VkImageCopy imageCopy{
subresourceLayers,
offset,
subresourceLayers,
offset,
extent
};
imageCopies.push_back(imageCopy);
extent.width = std::max(uint32_t(1), extent.width >> 1);
extent.height = std::max(uint32_t(1), extent.height >> 1);
extent.depth = std::max(uint32_t(1), extent.depth >> 1);
}
vkCmdCopyImage(
g_hTemporaryCommandBuffer,
allocInfo->m_Image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
(VkImage)newHandles[i], VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
(uint32_t)imageCopies.size(), imageCopies.data());
imageCopies.clear();
// Update our alloc info with the new resource to be used
allocInfo->m_Image = (VkImage)newHandles[i];
}
else if(allocInfo->m_Buffer)
{
VkBufferCopy region = {
0,
0,
allocInfo->m_BufferInfo.size };
vkCmdCopyBuffer(g_hTemporaryCommandBuffer,
allocInfo->m_Buffer, (VkBuffer)newHandles[i],
1, &region);
// Update our alloc info with the new resource to be used
allocInfo->m_Buffer = (VkBuffer)newHandles[i];
}
}
if(!finalizeImageBarriers.empty() || wantsMemoryBarrier)
{
const uint32_t memoryBarrierCount = wantsMemoryBarrier ? 1 : 0;
vkCmdPipelineBarrier(g_hTemporaryCommandBuffer, finalizeSrcStageMask, finalizeDstStageMask, 0,
memoryBarrierCount, &finalizeMemoryBarrier,
0, nullptr,
(uint32_t)finalizeImageBarriers.size(), finalizeImageBarriers.data());
}
}
static void TestDefragmentationIncrementalBasic()
{
wprintf(L"Test defragmentation incremental basic\n");
g_MemoryAliasingWarningEnabled = false;
std::vector<AllocInfo> allocations;
// Create enough allocations to be sure to fill 3 new blocks of 256 MB.
const std::array<uint32_t, 3> imageSizes = { 256, 512, 1024 };
const VkDeviceSize bufSizeMin = 5ull * 1024 * 1024;
const VkDeviceSize bufSizeMax = 10ull * 1024 * 1024;
const VkDeviceSize totalSize = 3ull * 256 * 1024 * 1024;
const size_t imageCount = (size_t)(totalSize / (imageSizes[0] * imageSizes[0] * 4)) / 2;
const size_t bufCount = (size_t)(totalSize / bufSizeMin) / 2;
const size_t percentToLeave = 30;
RandomNumberGenerator rand = { 234522 };
VkImageCreateInfo imageInfo = { VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO };
imageInfo.imageType = VK_IMAGE_TYPE_2D;
imageInfo.extent.depth = 1;
imageInfo.mipLevels = 1;
imageInfo.arrayLayers = 1;
imageInfo.format = VK_FORMAT_R8G8B8A8_UNORM;
imageInfo.tiling = VK_IMAGE_TILING_OPTIMAL;
imageInfo.initialLayout = VK_IMAGE_LAYOUT_PREINITIALIZED;
imageInfo.usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT;
imageInfo.samples = VK_SAMPLE_COUNT_1_BIT;
VmaAllocationCreateInfo allocCreateInfo = {};
allocCreateInfo.usage = VMA_MEMORY_USAGE_GPU_ONLY;
allocCreateInfo.flags = 0;
// Create all intended images.
for(size_t i = 0; i < imageCount; ++i)
{
const uint32_t size = imageSizes[rand.Generate() % 3];
imageInfo.extent.width = size;
imageInfo.extent.height = size;
AllocInfo alloc;
alloc.CreateImage(imageInfo, allocCreateInfo, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
alloc.m_StartValue = 0;
allocations.push_back(alloc);
}
// And all buffers
VkBufferCreateInfo bufCreateInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO };
for(size_t i = 0; i < bufCount; ++i)
{
bufCreateInfo.size = align_up<VkDeviceSize>(bufSizeMin + rand.Generate() % (bufSizeMax - bufSizeMin), 16);
bufCreateInfo.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
AllocInfo alloc;
alloc.CreateBuffer(bufCreateInfo, allocCreateInfo);
alloc.m_StartValue = 0;
allocations.push_back(alloc);
}
// Destroy some percentage of them.
{
const size_t allocationsToDestroy = round_div<size_t>((imageCount + bufCount) * (100 - percentToLeave), 100);
for(size_t i = 0; i < allocationsToDestroy; ++i)
{
const size_t index = rand.Generate() % allocations.size();
allocations[index].Destroy();
allocations.erase(allocations.begin() + index);
}
}
{
// Set our user data pointers. A real application should probably be more clever here
const size_t allocationCount = allocations.size();
for(size_t i = 0; i < allocationCount; ++i)
{
AllocInfo &alloc = allocations[i];
vmaSetAllocationUserData(g_hAllocator, alloc.m_Allocation, &alloc);
}
}
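// ProcessDefragmentationStepInfo() depends on this: for every move it calls
// vmaGetAllocationInfo() and casts pUserData back to AllocInfo* to find the
// buffer/image that has to be re-created at the new location.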
// Fill them with meaningful data.
UploadGpuData(allocations.data(), allocations.size());
wchar_t fileName[MAX_PATH];
swprintf_s(fileName, L"GPU_defragmentation_incremental_basic_A_before.json");
SaveAllocatorStatsToFile(fileName);
// Defragment using GPU only.
{
const size_t allocCount = allocations.size();
std::vector<VmaAllocation> allocationPtrs;
for(size_t i = 0; i < allocCount; ++i)
{
VmaAllocationInfo allocInfo = {};
vmaGetAllocationInfo(g_hAllocator, allocations[i].m_Allocation, &allocInfo);
allocationPtrs.push_back(allocations[i].m_Allocation);
}
const size_t movableAllocCount = allocationPtrs.size();
VmaDefragmentationInfo2 defragInfo = {};
defragInfo.flags = VMA_DEFRAGMENTATION_FLAG_INCREMENTAL;
defragInfo.allocationCount = (uint32_t)movableAllocCount;
defragInfo.pAllocations = allocationPtrs.data();
defragInfo.maxGpuBytesToMove = VK_WHOLE_SIZE;
defragInfo.maxGpuAllocationsToMove = UINT32_MAX;
VmaDefragmentationStats stats = {};
VmaDefragmentationContext ctx = VK_NULL_HANDLE;
VkResult res = vmaDefragmentationBegin(g_hAllocator, &defragInfo, &stats, &ctx);
TEST(res >= VK_SUCCESS);
res = VK_NOT_READY;
std::vector<VmaDefragmentationStepMoveInfo> moveInfo;
moveInfo.resize(movableAllocCount);
while(res == VK_NOT_READY)
{
VmaDefragmentationStepInfo stepInfo = {};
stepInfo.pMoves = moveInfo.data();
stepInfo.moveCount = (uint32_t)moveInfo.size();
res = vmaDefragmentationStepBegin(g_hAllocator, &stepInfo, ctx);
TEST(res >= VK_SUCCESS);
BeginSingleTimeCommands();
ProcessDefragmentationStepInfo(stepInfo);
EndSingleTimeCommands();
res = vmaDefragmentationStepEnd(g_hAllocator, ctx);
}
TEST(res >= VK_SUCCESS);
vmaDefragmentationEnd(g_hAllocator, ctx);
// If corruption detection is enabled, GPU defragmentation may not work on
// memory types that have this detection active, e.g. on Intel.
#if !defined(VMA_DEBUG_DETECT_CORRUPTION) || VMA_DEBUG_DETECT_CORRUPTION == 0
TEST(stats.allocationsMoved > 0 && stats.bytesMoved > 0);
TEST(stats.deviceMemoryBlocksFreed > 0 && stats.bytesFreed > 0);
#endif
}
//ValidateGpuData(allocations.data(), allocations.size());
swprintf_s(fileName, L"GPU_defragmentation_incremental_basic_B_after.json");
SaveAllocatorStatsToFile(fileName);
// Destroy all remaining allocations.
for(size_t i = allocations.size(); i--; )
{
allocations[i].Destroy();
}
g_MemoryAliasingWarningEnabled = true;
}
void TestDefragmentationIncrementalComplex()
{
wprintf(L"Test defragmentation incremental complex\n");
g_MemoryAliasingWarningEnabled = false;
std::vector<AllocInfo> allocations;
// Create enough allocations to be sure to fill 3 new blocks of 256 MB.
const std::array<uint32_t, 3> imageSizes = { 256, 512, 1024 };
const VkDeviceSize bufSizeMin = 5ull * 1024 * 1024;
const VkDeviceSize bufSizeMax = 10ull * 1024 * 1024;
const VkDeviceSize totalSize = 3ull * 256 * 1024 * 1024;
const size_t imageCount = (size_t)(totalSize / (imageSizes[0] * imageSizes[0] * 4)) / 2;
const size_t bufCount = (size_t)(totalSize / bufSizeMin) / 2;
const size_t percentToLeave = 30;
RandomNumberGenerator rand = { 234522 };
VkImageCreateInfo imageInfo = { VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO };
imageInfo.imageType = VK_IMAGE_TYPE_2D;
imageInfo.extent.depth = 1;
imageInfo.mipLevels = 1;
imageInfo.arrayLayers = 1;
imageInfo.format = VK_FORMAT_R8G8B8A8_UNORM;
imageInfo.tiling = VK_IMAGE_TILING_OPTIMAL;
imageInfo.initialLayout = VK_IMAGE_LAYOUT_PREINITIALIZED;
imageInfo.usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT;
imageInfo.samples = VK_SAMPLE_COUNT_1_BIT;
VmaAllocationCreateInfo allocCreateInfo = {};
allocCreateInfo.usage = VMA_MEMORY_USAGE_GPU_ONLY;
allocCreateInfo.flags = 0;
// Create all intended images.
for(size_t i = 0; i < imageCount; ++i)
{
const uint32_t size = imageSizes[rand.Generate() % 3];
imageInfo.extent.width = size;
imageInfo.extent.height = size;
AllocInfo alloc;
alloc.CreateImage(imageInfo, allocCreateInfo, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
alloc.m_StartValue = 0;
allocations.push_back(alloc);
}
// And all buffers
VkBufferCreateInfo bufCreateInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO };
for(size_t i = 0; i < bufCount; ++i)
{
bufCreateInfo.size = align_up<VkDeviceSize>(bufSizeMin + rand.Generate() % (bufSizeMax - bufSizeMin), 16);
bufCreateInfo.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
AllocInfo alloc;
alloc.CreateBuffer(bufCreateInfo, allocCreateInfo);
alloc.m_StartValue = 0;
allocations.push_back(alloc);
}
// Destroy some percentage of them.
{
const size_t allocationsToDestroy = round_div<size_t>((imageCount + bufCount) * (100 - percentToLeave), 100);
for(size_t i = 0; i < allocationsToDestroy; ++i)
{
const size_t index = rand.Generate() % allocations.size();
allocations[index].Destroy();
allocations.erase(allocations.begin() + index);
}
}
{
// Set our user data pointers. A real application should probably be more clever here
const size_t allocationCount = allocations.size();
for(size_t i = 0; i < allocationCount; ++i)
{
AllocInfo &alloc = allocations[i];
vmaSetAllocationUserData(g_hAllocator, alloc.m_Allocation, &alloc);
}
}
// Fill them with meaningful data.
UploadGpuData(allocations.data(), allocations.size());
wchar_t fileName[MAX_PATH];
swprintf_s(fileName, L"GPU_defragmentation_incremental_complex_A_before.json");
SaveAllocatorStatsToFile(fileName);
std::vector<AllocInfo> additionalAllocations;
#define MakeAdditionalAllocation() \
do { \
{ \
bufCreateInfo.size = align_up<VkDeviceSize>(bufSizeMin + rand.Generate() % (bufSizeMax - bufSizeMin), 16); \
bufCreateInfo.usage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT; \
\
AllocInfo alloc; \
alloc.CreateBuffer(bufCreateInfo, allocCreateInfo); \
\
additionalAllocations.push_back(alloc); \
} \
} while(0)
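// The do { ... } while(0) wrapper makes the macro expand to a single statement,
// so it can also be used safely inside an unbraced if/else.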
// Defragment using GPU only.
{
const size_t allocCount = allocations.size();
std::vector<VmaAllocation> allocationPtrs;
for(size_t i = 0; i < allocCount; ++i)
{
VmaAllocationInfo allocInfo = {};
vmaGetAllocationInfo(g_hAllocator, allocations[i].m_Allocation, &allocInfo);
allocationPtrs.push_back(allocations[i].m_Allocation);
}
const size_t movableAllocCount = allocationPtrs.size();
VmaDefragmentationInfo2 defragInfo = {};
defragInfo.flags = VMA_DEFRAGMENTATION_FLAG_INCREMENTAL;
defragInfo.allocationCount = (uint32_t)movableAllocCount;
defragInfo.pAllocations = allocationPtrs.data();
defragInfo.maxGpuBytesToMove = VK_WHOLE_SIZE;
defragInfo.maxGpuAllocationsToMove = UINT32_MAX;
VmaDefragmentationStats stats = {};
VmaDefragmentationContext ctx = VK_NULL_HANDLE;
VkResult res = vmaDefragmentationBegin(g_hAllocator, &defragInfo, &stats, &ctx);
TEST(res >= VK_SUCCESS);
res = VK_NOT_READY;
std::vector<VmaDefragmentationStepMoveInfo> moveInfo;
moveInfo.resize(movableAllocCount);
MakeAdditionalAllocation();
while(res == VK_NOT_READY)
{
VmaDefragmentationStepInfo stepInfo = {};
stepInfo.pMoves = moveInfo.data();
stepInfo.moveCount = (uint32_t)moveInfo.size();
res = vmaDefragmentationStepBegin(g_hAllocator, &stepInfo, ctx);
TEST(res >= VK_SUCCESS);
MakeAdditionalAllocation();
BeginSingleTimeCommands();
ProcessDefragmentationStepInfo(stepInfo);
EndSingleTimeCommands();
res = vmaDefragmentationStepEnd(g_hAllocator, ctx);
MakeAdditionalAllocation();
}
TEST(res >= VK_SUCCESS);
vmaDefragmentationEnd(g_hAllocator, ctx);
// If corruption detection is enabled, GPU defragmentation may not work on
// memory types that have this detection active, e.g. on Intel.
#if !defined(VMA_DEBUG_DETECT_CORRUPTION) || VMA_DEBUG_DETECT_CORRUPTION == 0
TEST(stats.allocationsMoved > 0 && stats.bytesMoved > 0);
TEST(stats.deviceMemoryBlocksFreed > 0 && stats.bytesFreed > 0);
#endif
}
//ValidateGpuData(allocations.data(), allocations.size());
swprintf_s(fileName, L"GPU_defragmentation_incremental_complex_B_after.json");
SaveAllocatorStatsToFile(fileName);
// Destroy all remaining allocations.
for(size_t i = allocations.size(); i--; )
{
allocations[i].Destroy();
}
for(size_t i = additionalAllocations.size(); i--; )
{
additionalAllocations[i].Destroy();
}
g_MemoryAliasingWarningEnabled = true;
}
static void TestUserData()
{
VkResult res;
@@ -5499,6 +6144,8 @@ void Test()
TestDefragmentationFull();
TestDefragmentationWholePool();
TestDefragmentationGpu();
TestDefragmentationIncrementalBasic();
TestDefragmentationIncrementalComplex();
// # Detailed tests
FILE* file;


@@ -1952,6 +1952,7 @@ typedef struct VmaVulkanFunctions {
PFN_vkCreateImage vkCreateImage;
PFN_vkDestroyImage vkDestroyImage;
PFN_vkCmdCopyBuffer vkCmdCopyBuffer;
PFN_vkCmdCopyImage vkCmdCopyImage;
#if VMA_DEDICATED_ALLOCATION || VMA_VULKAN_VERSION >= 1001000
PFN_vkGetBufferMemoryRequirements2KHR vkGetBufferMemoryRequirements2KHR;
PFN_vkGetImageMemoryRequirements2KHR vkGetImageMemoryRequirements2KHR;
@@ -3111,6 +3112,7 @@ VK_DEFINE_HANDLE(VmaDefragmentationContext)
/// Flags to be used in vmaDefragmentationBegin().
typedef enum VmaDefragmentationFlagBits {
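/** \brief Perform the defragmentation incrementally, in steps driven by vmaDefragmentationStepBegin() and vmaDefragmentationStepEnd(). */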
VMA_DEFRAGMENTATION_FLAG_INCREMENTAL = 0x1,
VMA_DEFRAGMENTATION_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF
} VmaDefragmentationFlagBits;
typedef VkFlags VmaDefragmentationFlags;
@@ -3191,6 +3193,21 @@ typedef struct VmaDefragmentationInfo2 {
VkCommandBuffer commandBuffer;
} VmaDefragmentationInfo2;
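/** \brief Single allocation move to be performed as part of an incremental defragmentation step.

Filled by vmaDefragmentationStepBegin(). Members memory and offset describe where
allocation is going to live; the application should copy the data there and
re-create any buffer or image bound to the allocation before calling
vmaDefragmentationStepEnd(), which commits the move.
*/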
typedef struct VmaDefragmentationStepMoveInfo {
VmaAllocation allocation;
VkDeviceMemory memory;
VkDeviceSize offset;
} VmaDefragmentationStepMoveInfo;
/** \brief Parameters for incremental defragmentation steps.
To be used with function vmaDefragmentationStepBegin().
*/
typedef struct VmaDefragmentationStepInfo {
uint32_t moveCount;
VmaDefragmentationStepMoveInfo* pMoves;
} VmaDefragmentationStepInfo;
/** \brief Deprecated. Optional configuration parameters to be passed to function vmaDefragment().
\deprecated This is a part of the old interface. It is recommended to use structure #VmaDefragmentationInfo2 and function vmaDefragmentationBegin() instead.
@@ -3264,6 +3281,16 @@ VMA_CALL_PRE VkResult VMA_CALL_POST vmaDefragmentationEnd(
VmaAllocator allocator,
VmaDefragmentationContext context);
VMA_CALL_PRE VkResult VMA_CALL_POST vmaDefragmentationStepBegin(
VmaAllocator allocator,
VmaDefragmentationStepInfo* pInfo,
VmaDefragmentationContext context
);
VMA_CALL_PRE VkResult VMA_CALL_POST vmaDefragmentationStepEnd(
VmaAllocator allocator,
VmaDefragmentationContext context
);
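/* A minimal usage sketch for the two functions above, following the pattern of
TestDefragmentationIncrementalBasic() in the test code. Everything outside the
vma* calls (allocator, allocations, allocationCount, and the application-defined
RecreateAndCopyResources(), which re-creates buffers/images at the new
memory/offset and copies their contents, as ProcessDefragmentationStepInfo()
does in the tests) is an assumption of this example:

    VmaDefragmentationInfo2 defragInfo = {};
    defragInfo.flags = VMA_DEFRAGMENTATION_FLAG_INCREMENTAL;
    defragInfo.allocationCount = allocationCount;
    defragInfo.pAllocations = allocations;
    defragInfo.maxGpuBytesToMove = VK_WHOLE_SIZE;
    defragInfo.maxGpuAllocationsToMove = UINT32_MAX;

    VmaDefragmentationContext ctx = VK_NULL_HANDLE;
    VkResult res = vmaDefragmentationBegin(allocator, &defragInfo, nullptr, &ctx);
    // In incremental mode Begin() only records the limits and returns VK_NOT_READY;
    // the actual planning happens lazily in the first step.

    std::vector<VmaDefragmentationStepMoveInfo> moves(allocationCount);
    while(res == VK_NOT_READY)
    {
        VmaDefragmentationStepInfo stepInfo = {};
        stepInfo.pMoves = moves.data();
        stepInfo.moveCount = (uint32_t)moves.size();
        vmaDefragmentationStepBegin(allocator, &stepInfo, ctx); // fills pMoves, shrinks moveCount
        RecreateAndCopyResources(stepInfo);                     // application-side GPU copies
        res = vmaDefragmentationStepEnd(allocator, ctx);        // commits this step's moves
    }
    vmaDefragmentationEnd(allocator, ctx);
*/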
/** \brief Deprecated. Compacts memory by moving allocations.
@param pAllocations Array of allocations that can be moved during this compaction.
@@ -3672,6 +3699,7 @@ void *aligned_alloc(size_t alignment, size_t size)
public:
void Lock() { m_Mutex.lock(); }
void Unlock() { m_Mutex.unlock(); }
bool TryLock() { return m_Mutex.try_lock(); }
private:
std::mutex m_Mutex;
};
@@ -3688,8 +3716,10 @@ void *aligned_alloc(size_t alignment, size_t size)
public:
void LockRead() { m_Mutex.lock_shared(); }
void UnlockRead() { m_Mutex.unlock_shared(); }
bool TryLockRead() { return m_Mutex.try_lock_shared(); }
void LockWrite() { m_Mutex.lock(); }
void UnlockWrite() { m_Mutex.unlock(); }
bool TryLockWrite() { return m_Mutex.try_lock(); }
private:
std::shared_mutex m_Mutex;
};
@@ -3703,8 +3733,10 @@ void *aligned_alloc(size_t alignment, size_t size)
VmaRWMutex() { InitializeSRWLock(&m_Lock); }
void LockRead() { AcquireSRWLockShared(&m_Lock); }
void UnlockRead() { ReleaseSRWLockShared(&m_Lock); }
bool TryLockRead() { return TryAcquireSRWLockShared(&m_Lock); }
void LockWrite() { AcquireSRWLockExclusive(&m_Lock); }
void UnlockWrite() { ReleaseSRWLockExclusive(&m_Lock); }
bool TryLockWrite() { return TryAcquireSRWLockExclusive(&m_Lock); }
private:
SRWLOCK m_Lock;
};
@@ -3716,8 +3748,10 @@ void *aligned_alloc(size_t alignment, size_t size)
public:
void LockRead() { m_Mutex.Lock(); }
void UnlockRead() { m_Mutex.Unlock(); }
bool TryLockRead() { return m_Mutex.TryLock(); }
void LockWrite() { m_Mutex.Lock(); }
void UnlockWrite() { m_Mutex.Unlock(); }
bool TryLockWrite() { return m_Mutex.TryLock(); }
private:
VMA_MUTEX m_Mutex;
};
@@ -6241,6 +6275,9 @@ struct VmaDefragmentationMove
VkDeviceSize srcOffset;
VkDeviceSize dstOffset;
VkDeviceSize size;
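// Filled by the defragmentation algorithms so that the incremental path can
// hand moves out later (ProcessDefragmentations) and commit them separately
// (CommitDefragmentations) instead of applying them immediately.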
VmaAllocation hAllocation;
VmaDeviceMemoryBlock* pSrcBlock;
VmaDeviceMemoryBlock* pDstBlock;
};
class VmaDefragmentationAlgorithm;
@@ -6310,7 +6347,7 @@ public:
// Saves results in pCtx->res.
void Defragment(
class VmaBlockVectorDefragmentationContext* pCtx,
VmaDefragmentationStats* pStats,
VmaDefragmentationStats* pStats, VmaDefragmentationFlags flags,
VkDeviceSize& maxCpuBytesToMove, uint32_t& maxCpuAllocationsToMove,
VkDeviceSize& maxGpuBytesToMove, uint32_t& maxGpuAllocationsToMove,
VkCommandBuffer commandBuffer);
@@ -6318,6 +6355,14 @@ public:
class VmaBlockVectorDefragmentationContext* pCtx,
VmaDefragmentationStats* pStats);
uint32_t ProcessDefragmentations(
class VmaBlockVectorDefragmentationContext *pCtx,
VmaDefragmentationStepMoveInfo* pMove, uint32_t maxMoves);
void CommitDefragmentations(
class VmaBlockVectorDefragmentationContext *pCtx,
VmaDefragmentationStats* pStats);
////////////////////////////////////////////////////////////////////////////////
// To be used only while the m_Mutex is locked. Used during defragmentation.
@@ -6350,6 +6395,8 @@ private:
VkDeviceSize CalcMaxBlockSize() const;
static VkImageAspectFlags ImageAspectMaskForFormat(VkFormat format);
// Finds and removes given block from vector.
void Remove(VmaDeviceMemoryBlock* pBlock);
@@ -6386,7 +6433,7 @@ private:
// Saves result to pCtx->res.
void ApplyDefragmentationMovesGpu(
class VmaBlockVectorDefragmentationContext* pDefragCtx,
const VmaVector< VmaDefragmentationMove, VmaStlAllocator<VmaDefragmentationMove> >& moves,
VmaVector< VmaDefragmentationMove, VmaStlAllocator<VmaDefragmentationMove> >& moves,
VkCommandBuffer commandBuffer);
/*
@@ -6455,7 +6502,8 @@ public:
virtual VkResult Defragment(
VmaVector< VmaDefragmentationMove, VmaStlAllocator<VmaDefragmentationMove> >& moves,
VkDeviceSize maxBytesToMove,
uint32_t maxAllocationsToMove) = 0;
uint32_t maxAllocationsToMove,
VmaDefragmentationFlags flags) = 0;
virtual VkDeviceSize GetBytesMoved() const = 0;
virtual uint32_t GetAllocationsMoved() const = 0;
@@ -6500,7 +6548,8 @@ public:
virtual VkResult Defragment(
VmaVector< VmaDefragmentationMove, VmaStlAllocator<VmaDefragmentationMove> >& moves,
VkDeviceSize maxBytesToMove,
uint32_t maxAllocationsToMove);
uint32_t maxAllocationsToMove,
VmaDefragmentationFlags flags);
virtual VkDeviceSize GetBytesMoved() const { return m_BytesMoved; }
virtual uint32_t GetAllocationsMoved() const { return m_AllocationsMoved; }
@@ -6601,7 +6650,8 @@ private:
VkResult DefragmentRound(
VmaVector< VmaDefragmentationMove, VmaStlAllocator<VmaDefragmentationMove> >& moves,
VkDeviceSize maxBytesToMove,
uint32_t maxAllocationsToMove);
uint32_t maxAllocationsToMove,
bool freeOldAllocations);
size_t CalcBlocksWithNonMovableCount() const;
@@ -6627,7 +6677,8 @@ public:
virtual VkResult Defragment(
VmaVector< VmaDefragmentationMove, VmaStlAllocator<VmaDefragmentationMove> >& moves,
VkDeviceSize maxBytesToMove,
uint32_t maxAllocationsToMove);
uint32_t maxAllocationsToMove,
VmaDefragmentationFlags flags);
virtual VkDeviceSize GetBytesMoved() const { return m_BytesMoved; }
virtual uint32_t GetAllocationsMoved() const { return m_AllocationsMoved; }
@@ -6775,6 +6826,10 @@ public:
VkResult res;
bool mutexLocked;
VmaVector< VmaBlockDefragmentationContext, VmaStlAllocator<VmaBlockDefragmentationContext> > blockContexts;
VmaVector< VmaDefragmentationMove, VmaStlAllocator<VmaDefragmentationMove> > defragmentationMoves;
uint32_t defragmentationMovesProcessed;
uint32_t defragmentationMovesCommitted;
bool hasDefragmentationPlan;
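// defragmentationMoves is filled once by VmaBlockVector::Defragment();
// 'Processed' counts moves already handed to the user via ProcessDefragmentations(),
// 'Committed' counts moves already applied by CommitDefragmentations().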
VmaBlockVectorDefragmentationContext(
VmaAllocator hAllocator,
@@ -6790,7 +6845,7 @@ public:
void AddAllocation(VmaAllocation hAlloc, VkBool32* pChanged);
void AddAll() { m_AllAllocations = true; }
void Begin(bool overlappingMoveSupported);
void Begin(bool overlappingMoveSupported, VmaDefragmentationFlags flags);
private:
const VmaAllocator m_hAllocator;
@@ -6839,13 +6894,22 @@ public:
VkResult Defragment(
VkDeviceSize maxCpuBytesToMove, uint32_t maxCpuAllocationsToMove,
VkDeviceSize maxGpuBytesToMove, uint32_t maxGpuAllocationsToMove,
VkCommandBuffer commandBuffer, VmaDefragmentationStats* pStats);
VkCommandBuffer commandBuffer, VmaDefragmentationStats* pStats, VmaDefragmentationFlags flags);
VkResult DefragmentStepBegin(VmaDefragmentationStepInfo* pInfo);
VkResult DefragmentStepEnd();
private:
const VmaAllocator m_hAllocator;
const uint32_t m_CurrFrameIndex;
const uint32_t m_Flags;
VmaDefragmentationStats* const m_pStats;
VkDeviceSize m_MaxCpuBytesToMove;
uint32_t m_MaxCpuAllocationsToMove;
VkDeviceSize m_MaxGpuBytesToMove;
uint32_t m_MaxGpuAllocationsToMove;
// Owner of these objects.
VmaBlockVectorDefragmentationContext* m_DefaultPoolContexts[VK_MAX_MEMORY_TYPES];
// Owner of these objects.
@@ -7185,6 +7249,12 @@ public:
VkResult DefragmentationEnd(
VmaDefragmentationContext context);
VkResult DefragmentationStepBegin(
VmaDefragmentationStepInfo* pInfo,
VmaDefragmentationContext context);
VkResult DefragmentationStepEnd(
VmaDefragmentationContext context);
void GetAllocationInfo(VmaAllocation hAllocation, VmaAllocationInfo* pAllocationInfo);
bool TouchAllocation(VmaAllocation hAllocation);
@@ -12618,7 +12688,7 @@ void VmaBlockVector::ApplyDefragmentationMovesCpu(
void VmaBlockVector::ApplyDefragmentationMovesGpu(
class VmaBlockVectorDefragmentationContext* pDefragCtx,
const VmaVector< VmaDefragmentationMove, VmaStlAllocator<VmaDefragmentationMove> >& moves,
VmaVector< VmaDefragmentationMove, VmaStlAllocator<VmaDefragmentationMove> >& moves,
VkCommandBuffer commandBuffer)
{
const size_t blockCount = m_Blocks.size();
@@ -12631,9 +12701,14 @@ void VmaBlockVector::ApplyDefragmentationMovesGpu(
for(size_t moveIndex = 0; moveIndex < moveCount; ++moveIndex)
{
const VmaDefragmentationMove& move = moves[moveIndex];
//if(move.type == VMA_ALLOCATION_TYPE_UNKNOWN)
{
// Old-school moves still require us to map the whole block
pDefragCtx->blockContexts[move.srcBlockIndex].flags |= VmaBlockDefragmentationContext::BLOCK_FLAG_USED;
pDefragCtx->blockContexts[move.dstBlockIndex].flags |= VmaBlockDefragmentationContext::BLOCK_FLAG_USED;
}
}
VMA_ASSERT(pDefragCtx->res == VK_SUCCESS);
@@ -12806,7 +12881,7 @@ void VmaBlockVector::PrintDetailedMap(class VmaJsonWriter& json)
void VmaBlockVector::Defragment(
class VmaBlockVectorDefragmentationContext* pCtx,
VmaDefragmentationStats* pStats,
VmaDefragmentationStats* pStats, VmaDefragmentationFlags flags,
VkDeviceSize& maxCpuBytesToMove, uint32_t& maxCpuAllocationsToMove,
VkDeviceSize& maxGpuBytesToMove, uint32_t& maxGpuAllocationsToMove,
VkCommandBuffer commandBuffer)
@@ -12842,20 +12917,29 @@ void VmaBlockVector::Defragment(
bool overlappingMoveSupported = !defragmentOnGpu;
if(m_hAllocator->m_UseMutex)
{
if(flags & VMA_DEFRAGMENTATION_FLAG_INCREMENTAL)
{
if(!m_Mutex.TryLockWrite())
{
pCtx->res = VK_ERROR_INITIALIZATION_FAILED;
return;
}
}
else
{
m_Mutex.LockWrite();
pCtx->mutexLocked = true;
}
}
pCtx->Begin(overlappingMoveSupported);
pCtx->Begin(overlappingMoveSupported, flags);
// Defragment.
const VkDeviceSize maxBytesToMove = defragmentOnGpu ? maxGpuBytesToMove : maxCpuBytesToMove;
const uint32_t maxAllocationsToMove = defragmentOnGpu ? maxGpuAllocationsToMove : maxCpuAllocationsToMove;
VmaVector< VmaDefragmentationMove, VmaStlAllocator<VmaDefragmentationMove> > moves =
VmaVector< VmaDefragmentationMove, VmaStlAllocator<VmaDefragmentationMove> >(VmaStlAllocator<VmaDefragmentationMove>(m_hAllocator->GetAllocationCallbacks()));
pCtx->res = pCtx->GetAlgorithm()->Defragment(moves, maxBytesToMove, maxAllocationsToMove);
pCtx->res = pCtx->GetAlgorithm()->Defragment(pCtx->defragmentationMoves, maxBytesToMove, maxAllocationsToMove, flags);
// Accumulate statistics.
if(pStats != VMA_NULL)
@@ -12878,15 +12962,26 @@ void VmaBlockVector::Defragment(
}
}
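// In incremental mode stop after planning: the moves stay recorded in
// pCtx->defragmentationMoves and VK_NOT_READY tells the caller to execute them
// via vmaDefragmentationStepBegin() / vmaDefragmentationStepEnd().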
if(flags & VMA_DEFRAGMENTATION_FLAG_INCREMENTAL)
{
if(m_hAllocator->m_UseMutex)
m_Mutex.UnlockWrite();
if(pCtx->res >= VK_SUCCESS && !pCtx->defragmentationMoves.empty())
pCtx->res = VK_NOT_READY;
return;
}
if(pCtx->res >= VK_SUCCESS)
{
if(defragmentOnGpu)
{
ApplyDefragmentationMovesGpu(pCtx, moves, commandBuffer);
ApplyDefragmentationMovesGpu(pCtx, pCtx->defragmentationMoves, commandBuffer);
}
else
{
ApplyDefragmentationMovesCpu(pCtx, moves);
ApplyDefragmentationMovesCpu(pCtx, pCtx->defragmentationMoves);
}
}
}
@@ -12919,6 +13014,48 @@ void VmaBlockVector::DefragmentationEnd(
}
}
uint32_t VmaBlockVector::ProcessDefragmentations(
class VmaBlockVectorDefragmentationContext *pCtx,
VmaDefragmentationStepMoveInfo* pMove, uint32_t maxMoves)
{
VmaMutexLockWrite lock(m_Mutex, m_hAllocator->m_UseMutex);
const uint32_t moveCount = std::min(uint32_t(pCtx->defragmentationMoves.size()) - pCtx->defragmentationMovesProcessed, maxMoves);
for(uint32_t i = pCtx->defragmentationMovesProcessed; i < pCtx->defragmentationMovesProcessed + moveCount; ++ i)
{
VmaDefragmentationMove& move = pCtx->defragmentationMoves[i];
pMove->allocation = move.hAllocation;
pMove->memory = move.pDstBlock->GetDeviceMemory();
pMove->offset = move.dstOffset;
++ pMove;
}
pCtx->defragmentationMovesProcessed += moveCount;
return moveCount;
}
void VmaBlockVector::CommitDefragmentations(
class VmaBlockVectorDefragmentationContext *pCtx,
VmaDefragmentationStats* pStats)
{
VmaMutexLockWrite lock(m_Mutex, m_hAllocator->m_UseMutex);
for(uint32_t i = pCtx->defragmentationMovesCommitted; i < pCtx->defragmentationMovesProcessed; ++ i)
{
const VmaDefragmentationMove &move = pCtx->defragmentationMoves[i];
move.pSrcBlock->m_pMetadata->FreeAtOffset(move.srcOffset);
move.hAllocation->ChangeBlockAllocation(m_hAllocator, move.pDstBlock, move.dstOffset);
}
pCtx->defragmentationMovesCommitted = pCtx->defragmentationMovesProcessed;
FreeEmptyBlocks(pStats);
}
size_t VmaBlockVector::CalcAllocationCount() const
{
size_t result = 0;
@@ -13069,7 +13206,8 @@ void VmaDefragmentationAlgorithm_Generic::AddAllocation(VmaAllocation hAlloc, Vk
VkResult VmaDefragmentationAlgorithm_Generic::DefragmentRound(
VmaVector< VmaDefragmentationMove, VmaStlAllocator<VmaDefragmentationMove> >& moves,
VkDeviceSize maxBytesToMove,
uint32_t maxAllocationsToMove)
uint32_t maxAllocationsToMove,
bool freeOldAllocations)
{
if(m_Blocks.empty())
{
@@ -13161,12 +13299,16 @@ VkResult VmaDefragmentationAlgorithm_Generic::DefragmentRound(
return VK_SUCCESS;
}
VmaDefragmentationMove move;
VmaDefragmentationMove move = {};
move.srcBlockIndex = pSrcBlockInfo->m_OriginalBlockIndex;
move.dstBlockIndex = pDstBlockInfo->m_OriginalBlockIndex;
move.srcOffset = srcOffset;
move.dstOffset = dstAllocRequest.offset;
move.size = size;
move.hAllocation = allocInfo.m_hAllocation;
move.pSrcBlock = pSrcBlockInfo->m_pBlock;
move.pDstBlock = pDstBlockInfo->m_pBlock;
moves.push_back(move);
pDstBlockInfo->m_pBlock->m_pMetadata->Alloc(
@@ -13174,9 +13316,12 @@ VkResult VmaDefragmentationAlgorithm_Generic::DefragmentRound(
suballocType,
size,
allocInfo.m_hAllocation);
pSrcBlockInfo->m_pBlock->m_pMetadata->FreeAtOffset(srcOffset);
if(freeOldAllocations)
{
pSrcBlockInfo->m_pBlock->m_pMetadata->FreeAtOffset(srcOffset);
allocInfo.m_hAllocation->ChangeBlockAllocation(m_hAllocator, pDstBlockInfo->m_pBlock, dstAllocRequest.offset);
}
if(allocInfo.m_pChanged != VMA_NULL)
{
@@ -13229,7 +13374,8 @@ size_t VmaDefragmentationAlgorithm_Generic::CalcBlocksWithNonMovableCount() cons
VkResult VmaDefragmentationAlgorithm_Generic::Defragment(
VmaVector< VmaDefragmentationMove, VmaStlAllocator<VmaDefragmentationMove> >& moves,
VkDeviceSize maxBytesToMove,
uint32_t maxAllocationsToMove)
uint32_t maxAllocationsToMove,
VmaDefragmentationFlags flags)
{
if(!m_AllAllocations && m_AllocationCount == 0)
{
@@ -13275,7 +13421,7 @@ VkResult VmaDefragmentationAlgorithm_Generic::Defragment(
VkResult result = VK_SUCCESS;
for(uint32_t round = 0; (round < roundCount) && (result == VK_SUCCESS); ++round)
{
result = DefragmentRound(moves, maxBytesToMove, maxAllocationsToMove);
result = DefragmentRound(moves, maxBytesToMove, maxAllocationsToMove, !(flags & VMA_DEFRAGMENTATION_FLAG_INCREMENTAL));
}
return result;
@@ -13327,7 +13473,8 @@ VmaDefragmentationAlgorithm_Fast::~VmaDefragmentationAlgorithm_Fast()
VkResult VmaDefragmentationAlgorithm_Fast::Defragment(
VmaVector< VmaDefragmentationMove, VmaStlAllocator<VmaDefragmentationMove> >& moves,
VkDeviceSize maxBytesToMove,
uint32_t maxAllocationsToMove)
uint32_t maxAllocationsToMove,
VmaDefragmentationFlags flags)
{
VMA_ASSERT(m_AllAllocations || m_pBlockVector->CalcAllocationCount() == m_AllocationCount);
@@ -13383,6 +13530,7 @@ VkResult VmaDefragmentationAlgorithm_Fast::Defragment(
}
const VkDeviceSize srcAllocOffset = srcSuballocIt->offset;
VmaDefragmentationMove move = {};
// Try to place it in one of free spaces from the database.
size_t freeSpaceInfoIndex;
VkDeviceSize dstAllocOffset;
@@ -13413,10 +13561,12 @@ VkResult VmaDefragmentationAlgorithm_Fast::Defragment(
InsertSuballoc(pFreeSpaceMetadata, suballoc);
VmaDefragmentationMove move = {
srcOrigBlockIndex, freeSpaceOrigBlockIndex,
srcAllocOffset, dstAllocOffset,
srcAllocSize };
move.srcBlockIndex = srcOrigBlockIndex;
move.dstBlockIndex = freeSpaceOrigBlockIndex;
move.srcOffset = srcAllocOffset;
move.dstOffset = dstAllocOffset;
move.size = srcAllocSize;
moves.push_back(move);
}
// Different block
@@ -13439,10 +13589,12 @@ VkResult VmaDefragmentationAlgorithm_Fast::Defragment(
InsertSuballoc(pFreeSpaceMetadata, suballoc);
VmaDefragmentationMove move = {
srcOrigBlockIndex, freeSpaceOrigBlockIndex,
srcAllocOffset, dstAllocOffset,
srcAllocSize };
move.srcBlockIndex = srcOrigBlockIndex;
move.dstBlockIndex = freeSpaceOrigBlockIndex;
move.srcOffset = srcAllocOffset;
move.dstOffset = dstAllocOffset;
move.size = srcAllocSize;
moves.push_back(move);
}
}
@@ -13497,10 +13649,13 @@ VkResult VmaDefragmentationAlgorithm_Fast::Defragment(
m_BytesMoved += srcAllocSize;
++m_AllocationsMoved;
++srcSuballocIt;
VmaDefragmentationMove move = {
srcOrigBlockIndex, dstOrigBlockIndex,
srcAllocOffset, dstAllocOffset,
srcAllocSize };
move.srcBlockIndex = srcOrigBlockIndex;
move.dstBlockIndex = dstOrigBlockIndex;
move.srcOffset = srcAllocOffset;
move.dstOffset = dstAllocOffset;
move.size = srcAllocSize;
moves.push_back(move);
}
}
@@ -13526,10 +13681,12 @@ VkResult VmaDefragmentationAlgorithm_Fast::Defragment(
pDstMetadata->m_Suballocations.push_back(suballoc);
VmaDefragmentationMove move = {
srcOrigBlockIndex, dstOrigBlockIndex,
srcAllocOffset, dstAllocOffset,
srcAllocSize };
move.srcBlockIndex = srcOrigBlockIndex;
move.dstBlockIndex = dstOrigBlockIndex;
move.srcOffset = srcAllocOffset;
move.dstOffset = dstAllocOffset;
move.size = srcAllocSize;
moves.push_back(move);
}
}
@@ -13679,6 +13836,10 @@ VmaBlockVectorDefragmentationContext::VmaBlockVectorDefragmentationContext(
res(VK_SUCCESS),
mutexLocked(false),
blockContexts(VmaStlAllocator<VmaBlockDefragmentationContext>(hAllocator->GetAllocationCallbacks())),
defragmentationMoves(VmaStlAllocator<VmaDefragmentationMove>(hAllocator->GetAllocationCallbacks())),
defragmentationMovesProcessed(0),
defragmentationMovesCommitted(0),
hasDefragmentationPlan(false),
m_hAllocator(hAllocator),
m_hCustomPool(hCustomPool),
m_pBlockVector(pBlockVector),
@@ -13700,7 +13861,7 @@ void VmaBlockVectorDefragmentationContext::AddAllocation(VmaAllocation hAlloc, V
m_Allocations.push_back(info);
}
void VmaBlockVectorDefragmentationContext::Begin(bool overlappingMoveSupported)
void VmaBlockVectorDefragmentationContext::Begin(bool overlappingMoveSupported, VmaDefragmentationFlags flags)
{
const bool allAllocations = m_AllAllocations ||
m_Allocations.size() == m_pBlockVector->CalcAllocationCount();
@@ -13714,10 +13875,12 @@ void VmaBlockVectorDefragmentationContext::Begin(bool overlappingMoveSupported)
- VMA_DEBUG_MARGIN is 0.
- All allocations in this block vector are moveable.
- There is no possibility of image/buffer granularity conflict.
- The defragmentation is not incremental.
*/
if(VMA_DEBUG_MARGIN == 0 &&
allAllocations &&
!m_pBlockVector->IsBufferImageGranularityConflictPossible())
!m_pBlockVector->IsBufferImageGranularityConflictPossible() &&
!(flags & VMA_DEFRAGMENTATION_FLAG_INCREMENTAL))
{
m_pAlgorithm = vma_new(m_hAllocator, VmaDefragmentationAlgorithm_Fast)(
m_hAllocator, m_pBlockVector, m_CurrFrameIndex, overlappingMoveSupported);
@@ -13884,13 +14047,30 @@ void VmaDefragmentationContext_T::AddAllocations(
VkResult VmaDefragmentationContext_T::Defragment(
VkDeviceSize maxCpuBytesToMove, uint32_t maxCpuAllocationsToMove,
VkDeviceSize maxGpuBytesToMove, uint32_t maxGpuAllocationsToMove,
VkCommandBuffer commandBuffer, VmaDefragmentationStats* pStats)
VkCommandBuffer commandBuffer, VmaDefragmentationStats* pStats, VmaDefragmentationFlags flags)
{
if(pStats)
{
memset(pStats, 0, sizeof(VmaDefragmentationStats));
}
if(flags & VMA_DEFRAGMENTATION_FLAG_INCREMENTAL)
{
// For incremental defragmentation, we just earmark how much we can move.
// The real meat is in the defragmentation steps.
m_MaxCpuBytesToMove = maxCpuBytesToMove;
m_MaxCpuAllocationsToMove = maxCpuAllocationsToMove;
m_MaxGpuBytesToMove = maxGpuBytesToMove;
m_MaxGpuAllocationsToMove = maxGpuAllocationsToMove;
if(m_MaxCpuBytesToMove == 0 && m_MaxCpuAllocationsToMove == 0 &&
m_MaxGpuBytesToMove == 0 && m_MaxGpuAllocationsToMove == 0)
return VK_SUCCESS;
return VK_NOT_READY;
}
if(commandBuffer == VK_NULL_HANDLE)
{
maxGpuBytesToMove = 0;
@@ -13910,7 +14090,7 @@ VkResult VmaDefragmentationContext_T::Defragment(
VMA_ASSERT(pBlockVectorCtx->GetBlockVector());
pBlockVectorCtx->GetBlockVector()->Defragment(
pBlockVectorCtx,
pStats,
pStats, flags,
maxCpuBytesToMove, maxCpuAllocationsToMove,
maxGpuBytesToMove, maxGpuAllocationsToMove,
commandBuffer);
@@ -13930,7 +14110,7 @@ VkResult VmaDefragmentationContext_T::Defragment(
VMA_ASSERT(pBlockVectorCtx && pBlockVectorCtx->GetBlockVector());
pBlockVectorCtx->GetBlockVector()->Defragment(
pBlockVectorCtx,
pStats,
pStats, flags,
maxCpuBytesToMove, maxCpuAllocationsToMove,
maxGpuBytesToMove, maxGpuAllocationsToMove,
commandBuffer);
@@ -13943,6 +14123,132 @@ VkResult VmaDefragmentationContext_T::Defragment(
return res;
}
VkResult VmaDefragmentationContext_T::DefragmentStepBegin(VmaDefragmentationStepInfo* pInfo)
{
VmaDefragmentationStepMoveInfo* pCurrentMove = pInfo->pMoves;
uint32_t movesLeft = pInfo->moveCount;
// Process default pools.
for(uint32_t memTypeIndex = 0;
memTypeIndex < m_hAllocator->GetMemoryTypeCount();
++memTypeIndex)
{
VmaBlockVectorDefragmentationContext *pBlockVectorCtx = m_DefaultPoolContexts[memTypeIndex];
if(pBlockVectorCtx)
{
VMA_ASSERT(pBlockVectorCtx->GetBlockVector());
if(!pBlockVectorCtx->hasDefragmentationPlan)
{
pBlockVectorCtx->GetBlockVector()->Defragment(
pBlockVectorCtx,
m_pStats, m_Flags,
m_MaxCpuBytesToMove, m_MaxCpuAllocationsToMove,
m_MaxGpuBytesToMove, m_MaxGpuAllocationsToMove,
VK_NULL_HANDLE);
if(pBlockVectorCtx->res < VK_SUCCESS)
continue;
pBlockVectorCtx->hasDefragmentationPlan = true;
}
const uint32_t processed = pBlockVectorCtx->GetBlockVector()->ProcessDefragmentations(
pBlockVectorCtx,
pCurrentMove, movesLeft);
movesLeft -= processed;
pCurrentMove += processed;
}
}
// Process custom pools.
for(size_t customCtxIndex = 0, customCtxCount = m_CustomPoolContexts.size();
customCtxIndex < customCtxCount;
++customCtxIndex)
{
VmaBlockVectorDefragmentationContext *pBlockVectorCtx = m_CustomPoolContexts[customCtxIndex];
VMA_ASSERT(pBlockVectorCtx && pBlockVectorCtx->GetBlockVector());
if(!pBlockVectorCtx->hasDefragmentationPlan)
{
pBlockVectorCtx->GetBlockVector()->Defragment(
pBlockVectorCtx,
m_pStats, m_Flags,
m_MaxCpuBytesToMove, m_MaxCpuAllocationsToMove,
m_MaxGpuBytesToMove, m_MaxGpuAllocationsToMove,
VK_NULL_HANDLE);
if(pBlockVectorCtx->res < VK_SUCCESS)
continue;
pBlockVectorCtx->hasDefragmentationPlan = true;
}
const uint32_t processed = pBlockVectorCtx->GetBlockVector()->ProcessDefragmentations(
pBlockVectorCtx,
pCurrentMove, movesLeft);
movesLeft -= processed;
pCurrentMove += processed;
}
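// Report back how many moves were actually written to pInfo->pMoves.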
pInfo->moveCount = pInfo->moveCount - movesLeft;
return VK_SUCCESS;
}
VkResult VmaDefragmentationContext_T::DefragmentStepEnd()
{
VkResult res = VK_SUCCESS;
// Process default pools.
for(uint32_t memTypeIndex = 0;
memTypeIndex < m_hAllocator->GetMemoryTypeCount();
++memTypeIndex)
{
VmaBlockVectorDefragmentationContext *pBlockVectorCtx = m_DefaultPoolContexts[memTypeIndex];
if(pBlockVectorCtx)
{
VMA_ASSERT(pBlockVectorCtx->GetBlockVector());
if(!pBlockVectorCtx->hasDefragmentationPlan)
{
res = VK_NOT_READY;
continue;
}
pBlockVectorCtx->GetBlockVector()->CommitDefragmentations(
pBlockVectorCtx, m_pStats);
if(pBlockVectorCtx->defragmentationMoves.size() != pBlockVectorCtx->defragmentationMovesCommitted)
res = VK_NOT_READY;
}
}
// Process custom pools.
for(size_t customCtxIndex = 0, customCtxCount = m_CustomPoolContexts.size();
customCtxIndex < customCtxCount;
++customCtxIndex)
{
VmaBlockVectorDefragmentationContext *pBlockVectorCtx = m_CustomPoolContexts[customCtxIndex];
VMA_ASSERT(pBlockVectorCtx && pBlockVectorCtx->GetBlockVector());
if(!pBlockVectorCtx->hasDefragmentationPlan)
{
res = VK_NOT_READY;
continue;
}
pBlockVectorCtx->GetBlockVector()->CommitDefragmentations(
pBlockVectorCtx, m_pStats);
if(pBlockVectorCtx->defragmentationMoves.size() != pBlockVectorCtx->defragmentationMovesCommitted)
res = VK_NOT_READY;
}
return res;
}
////////////////////////////////////////////////////////////////////////////////
// VmaRecorder
@@ -14759,6 +15065,7 @@ void VmaAllocator_T::ImportVulkanFunctions(const VmaVulkanFunctions* pVulkanFunc
m_VulkanFunctions.vkCreateImage = (PFN_vkCreateImage)vkCreateImage;
m_VulkanFunctions.vkDestroyImage = (PFN_vkDestroyImage)vkDestroyImage;
m_VulkanFunctions.vkCmdCopyBuffer = (PFN_vkCmdCopyBuffer)vkCmdCopyBuffer;
m_VulkanFunctions.vkCmdCopyImage = (PFN_vkCmdCopyImage)vkCmdCopyImage;
#if VMA_VULKAN_VERSION >= 1001000
if(m_VulkanApiVersion >= VK_MAKE_VERSION(1, 1, 0))
{
@@ -14825,6 +15132,7 @@ void VmaAllocator_T::ImportVulkanFunctions(const VmaVulkanFunctions* pVulkanFunc
VMA_COPY_IF_NOT_NULL(vkCreateImage);
VMA_COPY_IF_NOT_NULL(vkDestroyImage);
VMA_COPY_IF_NOT_NULL(vkCmdCopyBuffer);
VMA_COPY_IF_NOT_NULL(vkCmdCopyImage);
#if VMA_DEDICATED_ALLOCATION || VMA_VULKAN_VERSION >= 1001000
VMA_COPY_IF_NOT_NULL(vkGetBufferMemoryRequirements2KHR);
VMA_COPY_IF_NOT_NULL(vkGetImageMemoryRequirements2KHR);
@@ -14859,6 +15167,7 @@ void VmaAllocator_T::ImportVulkanFunctions(const VmaVulkanFunctions* pVulkanFunc
VMA_ASSERT(m_VulkanFunctions.vkCreateImage != VMA_NULL);
VMA_ASSERT(m_VulkanFunctions.vkDestroyImage != VMA_NULL);
VMA_ASSERT(m_VulkanFunctions.vkCmdCopyBuffer != VMA_NULL);
VMA_ASSERT(m_VulkanFunctions.vkCmdCopyImage != VMA_NULL);
#if VMA_DEDICATED_ALLOCATION || VMA_VULKAN_VERSION >= 1001000
if(m_VulkanApiVersion >= VK_MAKE_VERSION(1, 1, 0) || m_UseKhrDedicatedAllocation)
{
@@ -15578,7 +15887,7 @@ VkResult VmaAllocator_T::DefragmentationBegin(
VkResult res = (*pContext)->Defragment(
info.maxCpuBytesToMove, info.maxCpuAllocationsToMove,
info.maxGpuBytesToMove, info.maxGpuAllocationsToMove,
info.commandBuffer, pStats);
info.commandBuffer, pStats, info.flags);
if(res != VK_NOT_READY)
{
@@ -15596,6 +15905,19 @@ VkResult VmaAllocator_T::DefragmentationEnd(
return VK_SUCCESS;
}
VkResult VmaAllocator_T::DefragmentationStepBegin(
VmaDefragmentationStepInfo* pInfo,
VmaDefragmentationContext context)
{
return context->DefragmentStepBegin(pInfo);
}
VkResult VmaAllocator_T::DefragmentationStepEnd(
VmaDefragmentationContext context)
{
return context->DefragmentStepEnd();
}
void VmaAllocator_T::GetAllocationInfo(VmaAllocation hAllocation, VmaAllocationInfo* pAllocationInfo)
{
if(hAllocation->CanBecomeLost())
@@ -17414,6 +17736,42 @@ VMA_CALL_PRE VkResult VMA_CALL_POST vmaDefragmentationEnd(
}
}
VMA_CALL_PRE VkResult VMA_CALL_POST vmaDefragmentationStepBegin(
VmaAllocator allocator,
VmaDefragmentationStepInfo* pInfo,
VmaDefragmentationContext context)
{
VMA_ASSERT(allocator);
VMA_ASSERT(pInfo);
VMA_HEAVY_ASSERT(VmaValidatePointerArray(pInfo->moveCount, pInfo->pMoves));
VMA_DEBUG_LOG("vmaDefragmentationStepBegin");
VMA_DEBUG_GLOBAL_MUTEX_LOCK
if(context == VK_NULL_HANDLE)
{
pInfo->moveCount = 0;
return VK_SUCCESS;
}
return allocator->DefragmentationStepBegin(pInfo, context);
}
VMA_CALL_PRE VkResult VMA_CALL_POST vmaDefragmentationStepEnd(
VmaAllocator allocator,
VmaDefragmentationContext context)
{
VMA_ASSERT(allocator);
VMA_DEBUG_LOG("vmaDefragmentationStepEnd");
VMA_DEBUG_GLOBAL_MUTEX_LOCK
if(context == VK_NULL_HANDLE)
return VK_SUCCESS;
return allocator->DefragmentationStepEnd(context);
}
VMA_CALL_PRE VkResult VMA_CALL_POST vmaBindBufferMemory(
VmaAllocator allocator,
VmaAllocation allocation,