From 6c7eb81f7d871f5c08a4844471633a67725aae73 Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow <fsahmkow27@gmail.com>
Date: Wed, 4 Jan 2023 22:05:20 -0500
Subject: [PATCH] video_core: Cache GPU internal writes.

---
 src/video_core/CMakeLists.txt                 |  1 +
 src/video_core/engines/engine_upload.cpp      |  2 +-
 src/video_core/engines/maxwell_3d.cpp         |  7 +-
 src/video_core/engines/maxwell_dma.cpp        | 17 ++--
 src/video_core/invalidation_accumulator.h     | 78 +++++++++++++++++++
 src/video_core/memory_manager.cpp             | 62 +++++++++++----
 src/video_core/memory_manager.h               | 17 +++-
 src/video_core/rasterizer_interface.h         |  7 ++
 .../renderer_vulkan/vk_rasterizer.cpp         | 23 ++++++
 .../renderer_vulkan/vk_rasterizer.h           |  1 +
 10 files changed, 185 insertions(+), 30 deletions(-)
 create mode 100644 src/video_core/invalidation_accumulator.h

diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index aa271a377..b7095ae13 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -85,6 +85,7 @@ add_library(video_core STATIC
     gpu.h
     gpu_thread.cpp
     gpu_thread.h
+    invalidation_accumulator.h
     memory_manager.cpp
     memory_manager.h
     precompiled_headers.h
diff --git a/src/video_core/engines/engine_upload.cpp b/src/video_core/engines/engine_upload.cpp
index cea1dd8b0..7f5a0c29d 100644
--- a/src/video_core/engines/engine_upload.cpp
+++ b/src/video_core/engines/engine_upload.cpp
@@ -76,7 +76,7 @@ void State::ProcessData(std::span<const u8> read_buffer) {
                                        regs.dest.height, regs.dest.depth, x_offset, regs.dest.y,
                                        x_elements, regs.line_count, regs.dest.BlockHeight(),
                                        regs.dest.BlockDepth(), regs.line_length_in);
-        memory_manager.WriteBlock(address, tmp_buffer.data(), dst_size);
+        memory_manager.WriteBlockCached(address, tmp_buffer.data(), dst_size);
     }
 }
 
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index fbfd1ddd2..97f547789 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -485,11 +485,6 @@ void Maxwell3D::StampQueryResult(u64 payload, bool long_query) {
 }
 
 void Maxwell3D::ProcessQueryGet() {
-    // TODO(Subv): Support the other query units.
-    if (regs.report_semaphore.query.location != Regs::ReportSemaphore::Location::All) {
-        LOG_DEBUG(HW_GPU, "Locations other than ALL are unimplemented");
-    }
-
     switch (regs.report_semaphore.query.operation) {
     case Regs::ReportSemaphore::Operation::Release:
         if (regs.report_semaphore.query.short_query != 0) {
@@ -649,7 +644,7 @@ void Maxwell3D::ProcessCBMultiData(const u32* start_base, u32 amount) {
 
     const GPUVAddr address{buffer_address + regs.const_buffer.offset};
     const size_t copy_size = amount * sizeof(u32);
-    memory_manager.WriteBlock(address, start_base, copy_size);
+    memory_manager.WriteBlockCached(address, start_base, copy_size);
 
     // Increment the current buffer position.
     regs.const_buffer.offset += static_cast<u32>(copy_size);
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index 01f70ea9e..7bf08e3e0 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -69,7 +69,7 @@ void MaxwellDMA::Launch() {
     if (launch.multi_line_enable) {
         const bool is_src_pitch = launch.src_memory_layout == LaunchDMA::MemoryLayout::PITCH;
         const bool is_dst_pitch = launch.dst_memory_layout == LaunchDMA::MemoryLayout::PITCH;
-
+        memory_manager.FlushCaching();
         if (!is_src_pitch && !is_dst_pitch) {
             // If both the source and the destination are in block layout, assert.
             CopyBlockLinearToBlockLinear();
@@ -104,6 +104,7 @@ void MaxwellDMA::Launch() {
                                             reinterpret_cast<u8*>(tmp_buffer.data()),
                                             regs.line_length_in * sizeof(u32));
         } else {
+            memory_manager.FlushCaching();
             const auto convert_linear_2_blocklinear_addr = [](u64 address) {
                 return (address & ~0x1f0ULL) | ((address & 0x40) >> 2) | ((address & 0x10) << 1) |
                        ((address & 0x180) >> 1) | ((address & 0x20) << 3);
@@ -121,7 +122,7 @@ void MaxwellDMA::Launch() {
                     memory_manager.ReadBlockUnsafe(
                         convert_linear_2_blocklinear_addr(regs.offset_in + offset),
                         tmp_buffer.data(), tmp_buffer.size());
-                    memory_manager.WriteBlock(regs.offset_out + offset, tmp_buffer.data(),
+                    memory_manager.WriteBlockCached(regs.offset_out + offset, tmp_buffer.data(),
                                               tmp_buffer.size());
                 }
             } else if (is_src_pitch && !is_dst_pitch) {
@@ -132,7 +133,7 @@ void MaxwellDMA::Launch() {
                 for (u32 offset = 0; offset < regs.line_length_in; offset += 16) {
                     memory_manager.ReadBlockUnsafe(regs.offset_in + offset, tmp_buffer.data(),
                                                    tmp_buffer.size());
-                    memory_manager.WriteBlock(
+                    memory_manager.WriteBlockCached(
                         convert_linear_2_blocklinear_addr(regs.offset_out + offset),
                         tmp_buffer.data(), tmp_buffer.size());
                 }
@@ -141,7 +142,7 @@ void MaxwellDMA::Launch() {
                     std::vector<u8> tmp_buffer(regs.line_length_in);
                     memory_manager.ReadBlockUnsafe(regs.offset_in, tmp_buffer.data(),
                                                    regs.line_length_in);
-                    memory_manager.WriteBlock(regs.offset_out, tmp_buffer.data(),
+                    memory_manager.WriteBlockCached(regs.offset_out, tmp_buffer.data(),
                                               regs.line_length_in);
                 }
             }
@@ -204,7 +205,7 @@ void MaxwellDMA::CopyBlockLinearToPitch() {
                      src_params.origin.y, x_elements, regs.line_count, block_height, block_depth,
                      regs.pitch_out);
 
-    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+    memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
 }
 
 void MaxwellDMA::CopyPitchToBlockLinear() {
@@ -256,7 +257,7 @@ void MaxwellDMA::CopyPitchToBlockLinear() {
                    dst_params.origin.y, x_elements, regs.line_count, block_height, block_depth,
                    regs.pitch_in);
 
-    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+    memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
 }
 
 void MaxwellDMA::FastCopyBlockLinearToPitch() {
@@ -287,7 +288,7 @@ void MaxwellDMA::FastCopyBlockLinearToPitch() {
                      regs.src_params.block_size.height, regs.src_params.block_size.depth,
                      regs.pitch_out);
 
-    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+    memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
 }
 
 void MaxwellDMA::CopyBlockLinearToBlockLinear() {
@@ -347,7 +348,7 @@ void MaxwellDMA::CopyBlockLinearToBlockLinear() {
                    dst.depth, dst_x_offset, dst.origin.y, x_elements, regs.line_count,
                    dst.block_size.height, dst.block_size.depth, pitch);
 
-    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+    memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
 }
 
 void MaxwellDMA::ReleaseSemaphore() {
diff --git a/src/video_core/invalidation_accumulator.h b/src/video_core/invalidation_accumulator.h
new file mode 100644
index 000000000..42420e31c
--- /dev/null
+++ b/src/video_core/invalidation_accumulator.h
@@ -0,0 +1,105 @@
+// SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include <cstddef>
+#include <utility>
+#include <vector>
+
+#include "common/common_types.h"
+
+namespace VideoCommon {
+
+/**
+ * Collects the GPU virtual address ranges touched by cached GPU writes so that they can be
+ * handed to a callback later in one batch.
+ *
+ * Every range is widened to whole atomicity_size-byte units. A range that begins exactly where
+ * the pending range ends extends the pending range; any other range starts a new pending range
+ * and the previous one is stored.
+ */
+class InvalidationAccumulator {
+public:
+    InvalidationAccumulator() = default;
+    ~InvalidationAccumulator() = default;
+
+    /**
+     * Records that [address, address + size) was written.
+     *
+     * @param address GPU virtual address of the first written byte.
+     * @param size    Number of bytes written.
+     */
+    void Add(GPUVAddr address, size_t size) {
+        // Fast path: the write lies entirely inside the pending range.
+        if (address >= start_address && address + size <= last_collection) [[likely]] {
+            return;
+        }
+        // Widen the write to atomicity boundaries. The end is rounded up before the start is
+        // rounded down so that the resulting size is measured from the aligned start and the
+        // aligned range covers every written byte.
+        const GPUVAddr aligned_end = (address + size + atomicity_side_mask) & atomicity_mask;
+        address &= atomicity_mask;
+        size = static_cast<size_t>(aligned_end - address);
+        if (has_collected && address == last_collection) {
+            // Contiguous with the pending range: grow it in place.
+            accumulated_size += size;
+            last_collection += size;
+            return;
+        }
+        // First range, or not contiguous: store the pending range (if any) and start a new one.
+        if (has_collected) {
+            buffer.emplace_back(start_address, accumulated_size);
+        }
+        start_address = address;
+        accumulated_size = size;
+        last_collection = address + size;
+        has_collected = true;
+    }
+
+    /// Discards every stored range and returns the accumulator to its initial empty state.
+    void Clear() {
+        buffer.clear();
+        start_address = 0;
+        last_collection = 0;
+        accumulated_size = 0;
+        has_collected = false;
+    }
+
+    /// Returns true once Add() has recorded a range since construction or the last Clear().
+    bool AnyAccumulated() const {
+        return has_collected;
+    }
+
+    /**
+     * Invokes func(address, size) for every stored range in insertion order, then for the
+     * pending range. Does nothing if no range has been recorded.
+     */
+    template <typename Func>
+    void Callback(Func&& func) {
+        if (!has_collected) {
+            return;
+        }
+        // The pending range is passed directly rather than appended to the buffer, so calling
+        // Callback() again without Clear() does not report it twice.
+        for (const auto& [address, size] : buffer) {
+            func(address, size);
+        }
+        func(start_address, accumulated_size);
+    }
+
+private:
+    // Ranges are tracked in units of atomicity_size (1 << 5 = 32) bytes.
+    static constexpr size_t atomicity_bits = 5;
+    static constexpr size_t atomicity_size = 1ULL << atomicity_bits;
+    static constexpr size_t atomicity_side_mask = atomicity_size - 1;
+    static constexpr size_t atomicity_mask = ~atomicity_side_mask;
+    GPUVAddr start_address{};   // Aligned start of the pending range.
+    GPUVAddr last_collection{}; // One past the aligned end of the pending range.
+    size_t accumulated_size{};  // Byte length of the pending range.
+    bool has_collected{};       // True once a pending range exists.
+    // Completed ranges as (GPU virtual address, size) pairs, in insertion order.
+    std::vector<std::pair<GPUVAddr, size_t>> buffer;
+};
+
+} // namespace VideoCommon
diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp
index 3a5cdeb39..83924475b 100644
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -11,6 +11,7 @@
 #include "core/hle/kernel/k_page_table.h"
 #include "core/hle/kernel/k_process.h"
 #include "core/memory.h"
+#include "video_core/invalidation_accumulator.h"
 #include "video_core/memory_manager.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_base.h"
@@ -26,7 +27,8 @@ MemoryManager::MemoryManager(Core::System& system_, u64 address_space_bits_, u64
       entries{}, big_entries{}, page_table{address_space_bits, address_space_bits + page_bits - 38,
                                            page_bits != big_page_bits ? page_bits : 0},
       kind_map{PTEKind::INVALID}, unique_identifier{unique_identifier_generator.fetch_add(
-                                      1, std::memory_order_acq_rel)} {
+                                      1, std::memory_order_acq_rel)},
+      accumulator{std::make_unique<VideoCommon::InvalidationAccumulator>()} {
     address_space_size = 1ULL << address_space_bits;
     page_size = 1ULL << page_bits;
     page_mask = page_size - 1ULL;
@@ -185,15 +187,12 @@ void MemoryManager::Unmap(GPUVAddr gpu_addr, std::size_t size) {
     if (size == 0) {
         return;
     }
-    const auto submapped_ranges = GetSubmappedRange(gpu_addr, size);
+    GetSubmappedRangeImpl<false>(gpu_addr, size, page_stash);
 
-    for (const auto& [map_addr, map_size] : submapped_ranges) {
-        // Flush and invalidate through the GPU interface, to be asynchronous if possible.
-        const std::optional<VAddr> cpu_addr = GpuToCpuAddress(map_addr);
-        ASSERT(cpu_addr);
-
-        rasterizer->UnmapMemory(*cpu_addr, map_size);
+    for (const auto& [map_addr, map_size] : page_stash) {
+        rasterizer->UnmapMemory(map_addr, map_size);
     }
+    page_stash.clear();
 
     BigPageTableOp<EntryType::Free>(gpu_addr, 0, size, PTEKind::INVALID);
     PageTableOp<EntryType::Free>(gpu_addr, 0, size, PTEKind::INVALID);
@@ -454,6 +453,12 @@ void MemoryManager::WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buf
     WriteBlockImpl<false>(gpu_dest_addr, src_buffer, size, VideoCommon::CacheType::None);
 }
 
+void MemoryManager::WriteBlockCached(GPUVAddr gpu_dest_addr, const void* src_buffer,
+                                     std::size_t size) {
+    WriteBlockImpl<false>(gpu_dest_addr, src_buffer, size, VideoCommon::CacheType::None);
+    accumulator->Add(gpu_dest_addr, size);
+}
+
 void MemoryManager::FlushRegion(GPUVAddr gpu_addr, size_t size,
                                 VideoCommon::CacheType which) const {
     auto do_nothing = [&]([[maybe_unused]] std::size_t page_index,
@@ -663,7 +668,17 @@ bool MemoryManager::IsFullyMappedRange(GPUVAddr gpu_addr, std::size_t size) cons
 std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
     GPUVAddr gpu_addr, std::size_t size) const {
     std::vector<std::pair<GPUVAddr, std::size_t>> result{};
-    std::optional<std::pair<GPUVAddr, std::size_t>> last_segment{};
+    GetSubmappedRangeImpl<true>(gpu_addr, size, result);
+    return result;
+}
+
+template <bool is_gpu_address>
+void MemoryManager::GetSubmappedRangeImpl(
+    GPUVAddr gpu_addr, std::size_t size,
+    std::vector<std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>>&
+        result) const {
+    std::optional<std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>>
+        last_segment{};
     std::optional<VAddr> old_page_addr{};
     const auto split = [&last_segment, &result]([[maybe_unused]] std::size_t page_index,
                                                 [[maybe_unused]] std::size_t offset,
@@ -685,8 +700,12 @@ std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
         }
         old_page_addr = {cpu_addr_base + copy_amount};
         if (!last_segment) {
-            const GPUVAddr new_base_addr = (page_index << big_page_bits) + offset;
-            last_segment = {new_base_addr, copy_amount};
+            if constexpr (is_gpu_address) {
+                const GPUVAddr new_base_addr = (page_index << big_page_bits) + offset;
+                last_segment = {new_base_addr, copy_amount};
+            } else {
+                last_segment = {cpu_addr_base, copy_amount};
+            }
         } else {
             last_segment->second += copy_amount;
         }
@@ -703,8 +722,12 @@ std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
         }
         old_page_addr = {cpu_addr_base + copy_amount};
         if (!last_segment) {
-            const GPUVAddr new_base_addr = (page_index << page_bits) + offset;
-            last_segment = {new_base_addr, copy_amount};
+            if constexpr (is_gpu_address) {
+                const GPUVAddr new_base_addr = (page_index << page_bits) + offset;
+                last_segment = {new_base_addr, copy_amount};
+            } else {
+                last_segment = {cpu_addr_base, copy_amount};
+            }
         } else {
             last_segment->second += copy_amount;
         }
@@ -715,7 +738,18 @@ std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
     };
     MemoryOperation<true>(gpu_addr, size, extend_size_big, split, do_short_pages);
     split(0, 0, 0);
-    return result;
+}
+
+void MemoryManager::FlushCaching() {
+    if (!accumulator->AnyAccumulated()) {
+        return;
+    }
+    accumulator->Callback([this](GPUVAddr addr, size_t size) {
+        GetSubmappedRangeImpl<false>(addr, size, page_stash);
+    });
+    rasterizer->InnerInvalidation(page_stash);
+    page_stash.clear();
+    accumulator->Clear();
 }
 
 } // namespace Tegra
diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h
index 828e13439..e6de0d0cb 100644
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -19,6 +19,10 @@ namespace VideoCore {
 class RasterizerInterface;
 }
 
+namespace VideoCommon {
+class InvalidationAccumulator;
+}
+
 namespace Core {
 class DeviceMemory;
 namespace Memory {
@@ -80,6 +84,7 @@ public:
      */
     void ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const;
     void WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size);
+    void WriteBlockCached(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size);
 
     /**
      * Checks if a gpu region can be simply read with a pointer.
@@ -102,7 +107,7 @@ public:
      * will be returned;
      */
     std::vector<std::pair<GPUVAddr, std::size_t>> GetSubmappedRange(GPUVAddr gpu_addr,
-                                                                    std::size_t size) const;
+                                                                 std::size_t size) const;
 
     GPUVAddr Map(GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size,
                  PTEKind kind = PTEKind::INVALID, bool is_big_pages = true);
@@ -129,6 +134,8 @@ public:
     size_t GetMemoryLayoutSize(GPUVAddr gpu_addr,
                                size_t max_size = std::numeric_limits<size_t>::max()) const;
 
+    void FlushCaching();
+
 private:
     template <bool is_big_pages, typename FuncMapped, typename FuncReserved, typename FuncUnmapped>
     inline void MemoryOperation(GPUVAddr gpu_src_addr, std::size_t size, FuncMapped&& func_mapped,
@@ -154,6 +161,12 @@ private:
     inline bool IsBigPageContinous(size_t big_page_index) const;
     inline void SetBigPageContinous(size_t big_page_index, bool value);
 
+    template <bool is_gpu_address>
+    void GetSubmappedRangeImpl(
+        GPUVAddr gpu_addr, std::size_t size,
+        std::vector<std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>>&
+            result) const;
+
     Core::System& system;
     Core::Memory::Memory& memory;
     Core::DeviceMemory& device_memory;
@@ -201,10 +214,12 @@ private:
     Common::VirtualBuffer<u32> big_page_table_cpu;
 
     std::vector<u64> big_page_continous;
+    std::vector<std::pair<VAddr, std::size_t>> page_stash{};
 
     constexpr static size_t continous_bits = 64;
 
     const size_t unique_identifier;
+    std::unique_ptr<VideoCommon::InvalidationAccumulator> accumulator;
 
     static std::atomic<size_t> unique_identifier_generator;
 };
diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h
index f44c7df50..6b66ad7b6 100644
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -6,6 +6,7 @@
 #include <functional>
 #include <optional>
 #include <span>
+#include <utility>
 #include "common/common_types.h"
 #include "common/polyfill_thread.h"
 #include "video_core/cache_types.h"
@@ -95,6 +96,12 @@ public:
     virtual void InvalidateRegion(VAddr addr, u64 size,
                                   VideoCommon::CacheType which = VideoCommon::CacheType::All) = 0;
 
+    virtual void InnerInvalidation(std::span<const std::pair<VAddr, std::size_t>> sequences) {
+        for (const auto [cpu_addr, size] : sequences) {
+            InvalidateRegion(cpu_addr, size);
+        }
+    }
+
     /// Notify rasterizer that any caches of the specified region are desync with guest
     virtual void OnCPUWrite(VAddr addr, u64 size) = 0;
 
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 242bf9602..6c4d74564 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -186,6 +186,7 @@ void RasterizerVulkan::PrepareDraw(bool is_indexed, Func&& draw_func) {
 
     SCOPE_EXIT({ gpu.TickWork(); });
     FlushWork();
+    gpu_memory->FlushCaching();
 
     query_cache.UpdateCounters();
 
@@ -393,6 +394,7 @@ void RasterizerVulkan::Clear(u32 layer_count) {
 
 void RasterizerVulkan::DispatchCompute() {
     FlushWork();
+    gpu_memory->FlushCaching();
 
     ComputePipeline* const pipeline{pipeline_cache.CurrentComputePipeline()};
     if (!pipeline) {
@@ -481,6 +483,27 @@ void RasterizerVulkan::InvalidateRegion(VAddr addr, u64 size, VideoCommon::Cache
     }
 }
 
+/// Calls WriteMemory on the texture and buffer caches, then invalidates query/pipeline caches.
+void RasterizerVulkan::InnerInvalidation(std::span<const std::pair<VAddr, std::size_t>> sequences) {
+    {
+        std::scoped_lock guard{texture_cache.mutex};
+        for (const auto [region_addr, region_size] : sequences) {
+            texture_cache.WriteMemory(region_addr, region_size);
+        }
+    }
+    {
+        std::scoped_lock guard{buffer_cache.mutex};
+        for (const auto [region_addr, region_size] : sequences) {
+            buffer_cache.WriteMemory(region_addr, region_size);
+        }
+    }
+    // No lock is taken in this function for the query and pipeline caches.
+    for (const auto [region_addr, region_size] : sequences) {
+        query_cache.InvalidateRegion(region_addr, region_size);
+        pipeline_cache.InvalidateRegion(region_addr, region_size);
+    }
+}
+
 void RasterizerVulkan::OnCPUWrite(VAddr addr, u64 size) {
     if (addr == 0 || size == 0) {
         return;
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index c661e5b19..472cc64d9 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -79,6 +79,7 @@ public:
                          VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
     void InvalidateRegion(VAddr addr, u64 size,
                           VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
+    void InnerInvalidation(std::span<const std::pair<VAddr, std::size_t>> sequences) override;
     void OnCPUWrite(VAddr addr, u64 size) override;
     void InvalidateGPUCache() override;
     void UnmapMemory(VAddr addr, u64 size) override;