From 6c7eb81f7d871f5c08a4844471633a67725aae73 Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow <fsahmkow27@gmail.com>
Date: Wed, 4 Jan 2023 22:05:20 -0500
Subject: [PATCH 1/4] video_core: Cache GPU internal writes.

---
 src/video_core/CMakeLists.txt                 |  1 +
 src/video_core/engines/engine_upload.cpp      |  2 +-
 src/video_core/engines/maxwell_3d.cpp         |  7 +-
 src/video_core/engines/maxwell_dma.cpp        | 17 ++--
 src/video_core/invalidation_accumulator.h     | 78 +++++++++++++++++++
 src/video_core/memory_manager.cpp             | 62 +++++++++++----
 src/video_core/memory_manager.h               | 17 +++-
 src/video_core/rasterizer_interface.h         |  7 ++
 .../renderer_vulkan/vk_rasterizer.cpp         | 23 ++++++
 .../renderer_vulkan/vk_rasterizer.h           |  1 +
 10 files changed, 185 insertions(+), 30 deletions(-)
 create mode 100644 src/video_core/invalidation_accumulator.h

diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index aa271a3770..b7095ae133 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -85,6 +85,7 @@ add_library(video_core STATIC
     gpu.h
     gpu_thread.cpp
     gpu_thread.h
+    invalidation_accumulator.h
     memory_manager.cpp
     memory_manager.h
     precompiled_headers.h
diff --git a/src/video_core/engines/engine_upload.cpp b/src/video_core/engines/engine_upload.cpp
index cea1dd8b0f..7f5a0c29d7 100644
--- a/src/video_core/engines/engine_upload.cpp
+++ b/src/video_core/engines/engine_upload.cpp
@@ -76,7 +76,7 @@ void State::ProcessData(std::span<const u8> read_buffer) {
                                        regs.dest.height, regs.dest.depth, x_offset, regs.dest.y,
                                        x_elements, regs.line_count, regs.dest.BlockHeight(),
                                        regs.dest.BlockDepth(), regs.line_length_in);
-        memory_manager.WriteBlock(address, tmp_buffer.data(), dst_size);
+        memory_manager.WriteBlockCached(address, tmp_buffer.data(), dst_size);
     }
 }
 
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index fbfd1ddd24..97f5477897 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -485,11 +485,6 @@ void Maxwell3D::StampQueryResult(u64 payload, bool long_query) {
 }
 
 void Maxwell3D::ProcessQueryGet() {
-    // TODO(Subv): Support the other query units.
-    if (regs.report_semaphore.query.location != Regs::ReportSemaphore::Location::All) {
-        LOG_DEBUG(HW_GPU, "Locations other than ALL are unimplemented");
-    }
-
     switch (regs.report_semaphore.query.operation) {
     case Regs::ReportSemaphore::Operation::Release:
         if (regs.report_semaphore.query.short_query != 0) {
@@ -649,7 +644,7 @@ void Maxwell3D::ProcessCBMultiData(const u32* start_base, u32 amount) {
 
     const GPUVAddr address{buffer_address + regs.const_buffer.offset};
     const size_t copy_size = amount * sizeof(u32);
-    memory_manager.WriteBlock(address, start_base, copy_size);
+    memory_manager.WriteBlockCached(address, start_base, copy_size);
 
     // Increment the current buffer position.
     regs.const_buffer.offset += static_cast<u32>(copy_size);
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index 01f70ea9e5..7bf08e3e0d 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -69,7 +69,7 @@ void MaxwellDMA::Launch() {
     if (launch.multi_line_enable) {
         const bool is_src_pitch = launch.src_memory_layout == LaunchDMA::MemoryLayout::PITCH;
         const bool is_dst_pitch = launch.dst_memory_layout == LaunchDMA::MemoryLayout::PITCH;
-
+        memory_manager.FlushCaching();
         if (!is_src_pitch && !is_dst_pitch) {
             // If both the source and the destination are in block layout, assert.
             CopyBlockLinearToBlockLinear();
@@ -104,6 +104,7 @@ void MaxwellDMA::Launch() {
                                             reinterpret_cast<u8*>(tmp_buffer.data()),
                                             regs.line_length_in * sizeof(u32));
         } else {
+            memory_manager.FlushCaching();
             const auto convert_linear_2_blocklinear_addr = [](u64 address) {
                 return (address & ~0x1f0ULL) | ((address & 0x40) >> 2) | ((address & 0x10) << 1) |
                        ((address & 0x180) >> 1) | ((address & 0x20) << 3);
@@ -121,7 +122,7 @@ void MaxwellDMA::Launch() {
                     memory_manager.ReadBlockUnsafe(
                         convert_linear_2_blocklinear_addr(regs.offset_in + offset),
                         tmp_buffer.data(), tmp_buffer.size());
-                    memory_manager.WriteBlock(regs.offset_out + offset, tmp_buffer.data(),
+                    memory_manager.WriteBlockCached(regs.offset_out + offset, tmp_buffer.data(),
                                               tmp_buffer.size());
                 }
             } else if (is_src_pitch && !is_dst_pitch) {
@@ -132,7 +133,7 @@ void MaxwellDMA::Launch() {
                 for (u32 offset = 0; offset < regs.line_length_in; offset += 16) {
                     memory_manager.ReadBlockUnsafe(regs.offset_in + offset, tmp_buffer.data(),
                                                    tmp_buffer.size());
-                    memory_manager.WriteBlock(
+                    memory_manager.WriteBlockCached(
                         convert_linear_2_blocklinear_addr(regs.offset_out + offset),
                         tmp_buffer.data(), tmp_buffer.size());
                 }
@@ -141,7 +142,7 @@ void MaxwellDMA::Launch() {
                     std::vector<u8> tmp_buffer(regs.line_length_in);
                     memory_manager.ReadBlockUnsafe(regs.offset_in, tmp_buffer.data(),
                                                    regs.line_length_in);
-                    memory_manager.WriteBlock(regs.offset_out, tmp_buffer.data(),
+                    memory_manager.WriteBlockCached(regs.offset_out, tmp_buffer.data(),
                                               regs.line_length_in);
                 }
             }
@@ -204,7 +205,7 @@ void MaxwellDMA::CopyBlockLinearToPitch() {
                      src_params.origin.y, x_elements, regs.line_count, block_height, block_depth,
                      regs.pitch_out);
 
-    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+    memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
 }
 
 void MaxwellDMA::CopyPitchToBlockLinear() {
@@ -256,7 +257,7 @@ void MaxwellDMA::CopyPitchToBlockLinear() {
                    dst_params.origin.y, x_elements, regs.line_count, block_height, block_depth,
                    regs.pitch_in);
 
-    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+    memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
 }
 
 void MaxwellDMA::FastCopyBlockLinearToPitch() {
@@ -287,7 +288,7 @@ void MaxwellDMA::FastCopyBlockLinearToPitch() {
                      regs.src_params.block_size.height, regs.src_params.block_size.depth,
                      regs.pitch_out);
 
-    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+    memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
 }
 
 void MaxwellDMA::CopyBlockLinearToBlockLinear() {
@@ -347,7 +348,7 @@ void MaxwellDMA::CopyBlockLinearToBlockLinear() {
                    dst.depth, dst_x_offset, dst.origin.y, x_elements, regs.line_count,
                    dst.block_size.height, dst.block_size.depth, pitch);
 
-    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+    memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
 }
 
 void MaxwellDMA::ReleaseSemaphore() {
diff --git a/src/video_core/invalidation_accumulator.h b/src/video_core/invalidation_accumulator.h
new file mode 100644
index 0000000000..42420e31c8
--- /dev/null
+++ b/src/video_core/invalidation_accumulator.h
@@ -0,0 +1,78 @@
+// SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include <vector>
+
+#include "common/common_types.h"
+
+namespace VideoCommon {
+
+/// Gathers GPU ranges written via the cached path so they can be invalidated in one batch.
+class InvalidationAccumulator {
+public:
+    /// Records a write, widened to whole 32-byte granules and merged when contiguous.
+    void Add(GPUVAddr address, size_t size) {
+        const auto reset_values = [&]() {
+            if (has_collected) {
+                buffer.emplace_back(start_address, accumulated_size);
+            }
+            start_address = address;
+            accumulated_size = size;
+            last_collection = start_address + size;
+        };
+        // Fast path: the write lies inside the range that is already pending.
+        if (address >= start_address && address + size <= last_collection) [[likely]] {
+            return;
+        }
+        // Round both ends; mask the end before subtracting, as `-` binds tighter than `&`.
+        const GPUVAddr aligned_end = (address + size + atomicy_side_mask) & atomicy_mask;
+        address &= atomicy_mask;
+        size = aligned_end - address;
+        if (!has_collected || address != last_collection) [[unlikely]] {
+            reset_values();
+            has_collected = true;
+            return;
+        }
+        accumulated_size += size;
+        last_collection += size;
+    }
+
+    /// Drops every recorded range and returns to the empty state.
+    void Clear() {
+        buffer.clear();
+        start_address = 0;
+        last_collection = 0;
+        has_collected = false;
+    }
+
+    /// Returns true while at least one range is pending.
+    bool AnyAccumulated() const {
+        return has_collected;
+    }
+
+    /// Invokes func(address, size) for each recorded range; call Clear() afterwards.
+    template <typename Func>
+    void Callback(Func&& func) {
+        if (has_collected) {
+            buffer.emplace_back(start_address, accumulated_size);
+            for (auto& [address, size] : buffer) {
+                func(address, size);
+            }
+        }
+    }
+
+private:
+    static constexpr size_t atomicy_bits = 5;
+    static constexpr size_t atomicy_size = 1ULL << atomicy_bits;
+    static constexpr size_t atomicy_side_mask = atomicy_size - 1;
+    static constexpr size_t atomicy_mask = ~atomicy_side_mask;
+    GPUVAddr start_address{};
+    GPUVAddr last_collection{};
+    size_t accumulated_size{};
+    bool has_collected{};
+    std::vector<std::pair<VAddr, size_t>> buffer;
+};
+
+} // namespace VideoCommon
diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp
index 3a5cdeb39f..83924475b5 100644
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -11,6 +11,7 @@
 #include "core/hle/kernel/k_page_table.h"
 #include "core/hle/kernel/k_process.h"
 #include "core/memory.h"
+#include "video_core/invalidation_accumulator.h"
 #include "video_core/memory_manager.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_base.h"
@@ -26,7 +27,8 @@ MemoryManager::MemoryManager(Core::System& system_, u64 address_space_bits_, u64
       entries{}, big_entries{}, page_table{address_space_bits, address_space_bits + page_bits - 38,
                                            page_bits != big_page_bits ? page_bits : 0},
       kind_map{PTEKind::INVALID}, unique_identifier{unique_identifier_generator.fetch_add(
-                                      1, std::memory_order_acq_rel)} {
+                                      1, std::memory_order_acq_rel)},
+      accumulator{std::make_unique<VideoCommon::InvalidationAccumulator>()} {
     address_space_size = 1ULL << address_space_bits;
     page_size = 1ULL << page_bits;
     page_mask = page_size - 1ULL;
@@ -185,15 +187,12 @@ void MemoryManager::Unmap(GPUVAddr gpu_addr, std::size_t size) {
     if (size == 0) {
         return;
     }
-    const auto submapped_ranges = GetSubmappedRange(gpu_addr, size);
+    GetSubmappedRangeImpl<false>(gpu_addr, size, page_stash);
 
-    for (const auto& [map_addr, map_size] : submapped_ranges) {
-        // Flush and invalidate through the GPU interface, to be asynchronous if possible.
-        const std::optional<VAddr> cpu_addr = GpuToCpuAddress(map_addr);
-        ASSERT(cpu_addr);
-
-        rasterizer->UnmapMemory(*cpu_addr, map_size);
+    for (const auto& [map_addr, map_size] : page_stash) {
+        rasterizer->UnmapMemory(map_addr, map_size);
     }
+    page_stash.clear();
 
     BigPageTableOp<EntryType::Free>(gpu_addr, 0, size, PTEKind::INVALID);
     PageTableOp<EntryType::Free>(gpu_addr, 0, size, PTEKind::INVALID);
@@ -454,6 +453,12 @@ void MemoryManager::WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buf
     WriteBlockImpl<false>(gpu_dest_addr, src_buffer, size, VideoCommon::CacheType::None);
 }
 
+void MemoryManager::WriteBlockCached(GPUVAddr gpu_dest_addr, const void* src_buffer,
+                                     std::size_t size) {
+    WriteBlockImpl<false>(gpu_dest_addr, src_buffer, size, VideoCommon::CacheType::None);
+    accumulator->Add(gpu_dest_addr, size); // Only recorded here; invalidated by FlushCaching().
+}
+
 void MemoryManager::FlushRegion(GPUVAddr gpu_addr, size_t size,
                                 VideoCommon::CacheType which) const {
     auto do_nothing = [&]([[maybe_unused]] std::size_t page_index,
@@ -663,7 +668,17 @@ bool MemoryManager::IsFullyMappedRange(GPUVAddr gpu_addr, std::size_t size) cons
 std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
     GPUVAddr gpu_addr, std::size_t size) const {
     std::vector<std::pair<GPUVAddr, std::size_t>> result{};
-    std::optional<std::pair<GPUVAddr, std::size_t>> last_segment{};
+    GetSubmappedRangeImpl<true>(gpu_addr, size, result); // <true>: segments carry GPU addresses.
+    return result;
+}
+
+template <bool is_gpu_address>
+void MemoryManager::GetSubmappedRangeImpl(
+    GPUVAddr gpu_addr, std::size_t size,
+    std::vector<std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>>&
+        result) const {
+    std::optional<std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>>
+        last_segment{};
     std::optional<VAddr> old_page_addr{};
     const auto split = [&last_segment, &result]([[maybe_unused]] std::size_t page_index,
                                                 [[maybe_unused]] std::size_t offset,
@@ -685,8 +700,12 @@ std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
         }
         old_page_addr = {cpu_addr_base + copy_amount};
         if (!last_segment) {
-            const GPUVAddr new_base_addr = (page_index << big_page_bits) + offset;
-            last_segment = {new_base_addr, copy_amount};
+            if constexpr (is_gpu_address) {
+                const GPUVAddr new_base_addr = (page_index << big_page_bits) + offset;
+                last_segment = {new_base_addr, copy_amount};
+            } else {
+                last_segment = {cpu_addr_base, copy_amount};
+            }
         } else {
             last_segment->second += copy_amount;
         }
@@ -703,8 +722,12 @@ std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
         }
         old_page_addr = {cpu_addr_base + copy_amount};
         if (!last_segment) {
-            const GPUVAddr new_base_addr = (page_index << page_bits) + offset;
-            last_segment = {new_base_addr, copy_amount};
+            if constexpr (is_gpu_address) {
+                const GPUVAddr new_base_addr = (page_index << page_bits) + offset;
+                last_segment = {new_base_addr, copy_amount};
+            } else {
+                last_segment = {cpu_addr_base, copy_amount};
+            }
         } else {
             last_segment->second += copy_amount;
         }
@@ -715,7 +738,18 @@ std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
     };
     MemoryOperation<true>(gpu_addr, size, extend_size_big, split, do_short_pages);
     split(0, 0, 0);
-    return result;
+}
+
+void MemoryManager::FlushCaching() {
+    if (!accumulator->AnyAccumulated()) {
+        return; // Nothing recorded since the last flush.
+    }
+    accumulator->Callback([this](GPUVAddr addr, size_t size) {
+        GetSubmappedRangeImpl<false>(addr, size, page_stash); // <false>: collect CPU addresses.
+    });
+    rasterizer->InnerInvalidation(page_stash);
+    page_stash.clear(); // Scratch vector is shared with Unmap(); leave it empty.
+    accumulator->Clear();
 }
 
 } // namespace Tegra
diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h
index 828e13439e..e6de0d0cba 100644
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -19,6 +19,10 @@ namespace VideoCore {
 class RasterizerInterface;
 }
 
+namespace VideoCommon {
+class InvalidationAccumulator;
+}
+
 namespace Core {
 class DeviceMemory;
 namespace Memory {
@@ -80,6 +84,7 @@ public:
      */
     void ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const;
     void WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size);
+    void WriteBlockCached(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size);
 
     /**
      * Checks if a gpu region can be simply read with a pointer.
@@ -102,7 +107,7 @@ public:
      * will be returned;
      */
     std::vector<std::pair<GPUVAddr, std::size_t>> GetSubmappedRange(GPUVAddr gpu_addr,
-                                                                    std::size_t size) const;
+                                                                 std::size_t size) const;
 
     GPUVAddr Map(GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size,
                  PTEKind kind = PTEKind::INVALID, bool is_big_pages = true);
@@ -129,6 +134,8 @@ public:
     size_t GetMemoryLayoutSize(GPUVAddr gpu_addr,
                                size_t max_size = std::numeric_limits<size_t>::max()) const;
 
+    void FlushCaching();
+
 private:
     template <bool is_big_pages, typename FuncMapped, typename FuncReserved, typename FuncUnmapped>
     inline void MemoryOperation(GPUVAddr gpu_src_addr, std::size_t size, FuncMapped&& func_mapped,
@@ -154,6 +161,12 @@ private:
     inline bool IsBigPageContinous(size_t big_page_index) const;
     inline void SetBigPageContinous(size_t big_page_index, bool value);
 
+    template <bool is_gpu_address>
+    void GetSubmappedRangeImpl(
+        GPUVAddr gpu_addr, std::size_t size,
+        std::vector<std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>>&
+            result) const;
+
     Core::System& system;
     Core::Memory::Memory& memory;
     Core::DeviceMemory& device_memory;
@@ -201,10 +214,12 @@ private:
     Common::VirtualBuffer<u32> big_page_table_cpu;
 
     std::vector<u64> big_page_continous;
+    std::vector<std::pair<VAddr, std::size_t>> page_stash{};
 
     constexpr static size_t continous_bits = 64;
 
     const size_t unique_identifier;
+    std::unique_ptr<VideoCommon::InvalidationAccumulator> accumulator;
 
     static std::atomic<size_t> unique_identifier_generator;
 };
diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h
index f44c7df506..6b66ad7b60 100644
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -6,6 +6,7 @@
 #include <functional>
 #include <optional>
 #include <span>
+#include <utility>
 #include "common/common_types.h"
 #include "common/polyfill_thread.h"
 #include "video_core/cache_types.h"
@@ -95,6 +96,12 @@ public:
     virtual void InvalidateRegion(VAddr addr, u64 size,
                                   VideoCommon::CacheType which = VideoCommon::CacheType::All) = 0;
 
+    virtual void InnerInvalidation(std::span<const std::pair<VAddr, std::size_t>> sequences) {
+        for (const auto [cpu_addr, size] : sequences) {
+            InvalidateRegion(cpu_addr, size); // Default: one full invalidation per range.
+        }
+    }
+
     /// Notify rasterizer that any caches of the specified region are desync with guest
     virtual void OnCPUWrite(VAddr addr, u64 size) = 0;
 
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 242bf9602a..6c4d745649 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -186,6 +186,7 @@ void RasterizerVulkan::PrepareDraw(bool is_indexed, Func&& draw_func) {
 
     SCOPE_EXIT({ gpu.TickWork(); });
     FlushWork();
+    gpu_memory->FlushCaching();
 
     query_cache.UpdateCounters();
 
@@ -393,6 +394,7 @@ void RasterizerVulkan::Clear(u32 layer_count) {
 
 void RasterizerVulkan::DispatchCompute() {
     FlushWork();
+    gpu_memory->FlushCaching();
 
     ComputePipeline* const pipeline{pipeline_cache.CurrentComputePipeline()};
     if (!pipeline) {
@@ -481,6 +483,27 @@ void RasterizerVulkan::InvalidateRegion(VAddr addr, u64 size, VideoCommon::Cache
     }
 }
 
+void RasterizerVulkan::InnerInvalidation(std::span<const std::pair<VAddr, std::size_t>> sequences) {
+    {
+        std::scoped_lock lock{texture_cache.mutex}; // One lock acquisition for the whole batch.
+        for (const auto [addr, size] : sequences) {
+            texture_cache.WriteMemory(addr, size);
+        }
+    }
+    {
+        std::scoped_lock lock{buffer_cache.mutex}; // Likewise for the buffer cache.
+        for (const auto [addr, size] : sequences) {
+            buffer_cache.WriteMemory(addr, size);
+        }
+    }
+    { // No lock is taken for the query and pipeline caches here.
+        for (const auto [addr, size] : sequences) {
+            query_cache.InvalidateRegion(addr, size);
+            pipeline_cache.InvalidateRegion(addr, size);
+        }
+    }
+}
+
 void RasterizerVulkan::OnCPUWrite(VAddr addr, u64 size) {
     if (addr == 0 || size == 0) {
         return;
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index c661e5b197..472cc64d91 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -79,6 +79,7 @@ public:
                          VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
     void InvalidateRegion(VAddr addr, u64 size,
                           VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
+    void InnerInvalidation(std::span<const std::pair<VAddr, std::size_t>> sequences) override;
     void OnCPUWrite(VAddr addr, u64 size) override;
     void InvalidateGPUCache() override;
     void UnmapMemory(VAddr addr, u64 size) override;

From af5ecb0b15d4449f58434e70eed835cf71fc5527 Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow <fsahmkow27@gmail.com>
Date: Thu, 5 Jan 2023 06:06:33 -0500
Subject: [PATCH 2/4] MemoryManager: use fastmem directly.

---
 src/core/memory.cpp               |  2 +-
 src/video_core/memory_manager.cpp | 40 ++++++++++++++++++++++++-------
 src/video_core/memory_manager.h   |  3 ++-
 3 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/src/core/memory.cpp b/src/core/memory.cpp
index 26be74df4a..a1e41faffb 100644
--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@@ -436,7 +436,7 @@ struct Memory::Impl {
         }
 
         if (Settings::IsFastmemEnabled()) {
-            const bool is_read_enable = Settings::IsGPULevelHigh() || !cached;
+            const bool is_read_enable = !Settings::IsGPULevelExtreme() || !cached;
             system.DeviceMemory().buffer.Protect(vaddr, size, is_read_enable, !cached);
         }
 
diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp
index 83924475b5..0a63900543 100644
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -6,6 +6,7 @@
 #include "common/alignment.h"
 #include "common/assert.h"
 #include "common/logging/log.h"
+#include "common/settings.h"
 #include "core/core.h"
 #include "core/device_memory.h"
 #include "core/hle/kernel/k_page_table.h"
@@ -45,6 +46,11 @@ MemoryManager::MemoryManager(Core::System& system_, u64 address_space_bits_, u64
     big_page_table_cpu.resize(big_page_table_size);
     big_page_continous.resize(big_page_table_size / continous_bits, 0);
     entries.resize(page_table_size / 32, 0);
+    if (!Settings::IsGPULevelExtreme()) {
+        fastmem_arena = system.DeviceMemory().buffer.VirtualBasePointer();
+    } else { // NOTE(review): level is sampled once; confirm it cannot change at runtime.
+        fastmem_arena = nullptr; // Null selects the non-fastmem ReadBlockImpl path.
+    }
 }
 
 MemoryManager::~MemoryManager() = default;
@@ -354,7 +360,7 @@ inline void MemoryManager::MemoryOperation(GPUVAddr gpu_src_addr, std::size_t si
     }
 }
 
-template <bool is_safe>
+template <bool is_safe, bool use_fastmem>
 void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size,
                                   [[maybe_unused]] VideoCommon::CacheType which) const {
     auto set_to_zero = [&]([[maybe_unused]] std::size_t page_index,
@@ -368,8 +374,12 @@ void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std:
         if constexpr (is_safe) {
             rasterizer->FlushRegion(cpu_addr_base, copy_amount, which);
         }
-        u8* physical = memory.GetPointer(cpu_addr_base);
-        std::memcpy(dest_buffer, physical, copy_amount);
+        if constexpr (use_fastmem) {
+            std::memcpy(dest_buffer, &fastmem_arena[cpu_addr_base], copy_amount);
+        } else {
+            u8* physical = memory.GetPointer(cpu_addr_base);
+            std::memcpy(dest_buffer, physical, copy_amount);
+        }
         dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount;
     };
     auto mapped_big = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) {
@@ -378,11 +388,15 @@ void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std:
         if constexpr (is_safe) {
             rasterizer->FlushRegion(cpu_addr_base, copy_amount, which);
         }
-        if (!IsBigPageContinous(page_index)) [[unlikely]] {
-            memory.ReadBlockUnsafe(cpu_addr_base, dest_buffer, copy_amount);
+        if constexpr (use_fastmem) {
+            std::memcpy(dest_buffer, &fastmem_arena[cpu_addr_base], copy_amount);
         } else {
-            u8* physical = memory.GetPointer(cpu_addr_base);
-            std::memcpy(dest_buffer, physical, copy_amount);
+            if (!IsBigPageContinous(page_index)) [[unlikely]] {
+                memory.ReadBlockUnsafe(cpu_addr_base, dest_buffer, copy_amount);
+            } else {
+                u8* physical = memory.GetPointer(cpu_addr_base);
+                std::memcpy(dest_buffer, physical, copy_amount);
+            }
         }
         dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount;
     };
@@ -396,12 +410,20 @@ void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std:
 
 void MemoryManager::ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size,
                               VideoCommon::CacheType which) const {
-    ReadBlockImpl<true>(gpu_src_addr, dest_buffer, size, which);
+    if (fastmem_arena) [[likely]] { // Set in the constructor; null disables the fast path.
+        ReadBlockImpl<true, true>(gpu_src_addr, dest_buffer, size, which);
+        return;
+    }
+    ReadBlockImpl<true, false>(gpu_src_addr, dest_buffer, size, which); // No fastmem.
 }
 
 void MemoryManager::ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer,
                                     const std::size_t size) const {
-    ReadBlockImpl<false>(gpu_src_addr, dest_buffer, size, VideoCommon::CacheType::None);
+    if (fastmem_arena) [[likely]] { // Same dispatch as ReadBlock, with is_safe = false.
+        ReadBlockImpl<false, true>(gpu_src_addr, dest_buffer, size, VideoCommon::CacheType::None);
+        return;
+    }
+    ReadBlockImpl<false, false>(gpu_src_addr, dest_buffer, size, VideoCommon::CacheType::None);
 }
 
 template <bool is_safe>
diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h
index e6de0d0cba..ff9e3c0b3a 100644
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -141,7 +141,7 @@ private:
     inline void MemoryOperation(GPUVAddr gpu_src_addr, std::size_t size, FuncMapped&& func_mapped,
                                 FuncReserved&& func_reserved, FuncUnmapped&& func_unmapped) const;
 
-    template <bool is_safe>
+    template <bool is_safe, bool use_fastmem>
     void ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size,
                        VideoCommon::CacheType which) const;
 
@@ -215,6 +215,7 @@ private:
 
     std::vector<u64> big_page_continous;
     std::vector<std::pair<VAddr, std::size_t>> page_stash{};
+    u8* fastmem_arena{};
 
     constexpr static size_t continous_bits = 64;
 

From 2d0c4f2b1d8a069754725a2fb3597a7506e265a5 Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow <fsahmkow27@gmail.com>
Date: Thu, 5 Jan 2023 06:43:28 -0500
Subject: [PATCH 3/4] Fermi2D: sync cache flushes

---
 src/video_core/engines/fermi_2d.cpp | 6 ++++--
 src/video_core/engines/fermi_2d.h   | 1 +
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp
index e655e72543..a126c359cd 100644
--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@@ -6,6 +6,7 @@
 #include "common/microprofile.h"
 #include "video_core/engines/fermi_2d.h"
 #include "video_core/engines/sw_blitter/blitter.h"
+#include "video_core/memory_manager.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/surface.h"
 #include "video_core/textures/decoders.h"
@@ -20,8 +21,8 @@ namespace Tegra::Engines {
 
 using namespace Texture;
 
-Fermi2D::Fermi2D(MemoryManager& memory_manager_) {
-    sw_blitter = std::make_unique<Blitter::SoftwareBlitEngine>(memory_manager_);
+Fermi2D::Fermi2D(MemoryManager& memory_manager_) : memory_manager{memory_manager_} {
+    sw_blitter = std::make_unique<Blitter::SoftwareBlitEngine>(memory_manager);
     // Nvidia's OpenGL driver seems to assume these values
     regs.src.depth = 1;
     regs.dst.depth = 1;
@@ -104,6 +105,7 @@ void Fermi2D::Blit() {
         config.src_x0 = 0;
     }
 
+    memory_manager.FlushCaching();
     if (!rasterizer->AccelerateSurfaceCopy(src, regs.dst, config)) {
         sw_blitter->Blit(src, regs.dst, config);
     }
diff --git a/src/video_core/engines/fermi_2d.h b/src/video_core/engines/fermi_2d.h
index 523fbdec25..705b323e15 100644
--- a/src/video_core/engines/fermi_2d.h
+++ b/src/video_core/engines/fermi_2d.h
@@ -305,6 +305,7 @@ public:
 private:
     VideoCore::RasterizerInterface* rasterizer = nullptr;
     std::unique_ptr<Blitter::SoftwareBlitEngine> sw_blitter;
+    MemoryManager& memory_manager;
 
     /// Performs the copy from the source surface to the destination surface as configured in the
     /// registers.

From b56ad93bbc9ac38820c1e1cb4b03256dd50aa17a Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow <fsahmkow27@gmail.com>
Date: Thu, 5 Jan 2023 06:43:54 -0500
Subject: [PATCH 4/4] BufferBase: Don't ignore GPU pages.

---
 src/tests/video_core/buffer_base.cpp             |  2 +-
 src/video_core/buffer_cache/buffer_base.h        | 14 ++++++--------
 src/video_core/engines/maxwell_dma.cpp           |  4 ++--
 src/video_core/invalidation_accumulator.h        | 13 +++++++------
 src/video_core/memory_manager.cpp                |  2 +-
 src/video_core/memory_manager.h                  |  2 +-
 src/video_core/rasterizer_interface.h            |  2 +-
 src/video_core/renderer_vulkan/vk_rasterizer.cpp |  6 +++---
 8 files changed, 22 insertions(+), 23 deletions(-)

diff --git a/src/tests/video_core/buffer_base.cpp b/src/tests/video_core/buffer_base.cpp
index f7236afabf..5cd0628f22 100644
--- a/src/tests/video_core/buffer_base.cpp
+++ b/src/tests/video_core/buffer_base.cpp
@@ -538,7 +538,7 @@ TEST_CASE("BufferBase: Cached write downloads") {
     int num = 0;
     buffer.ForEachDownloadRangeAndClear(c, WORD, [&](u64 offset, u64 size) { ++num; });
     buffer.ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { ++num; });
-    REQUIRE(num == 0);
+    REQUIRE(num == 1);
     REQUIRE(!buffer.IsRegionCpuModified(c + PAGE, PAGE));
     REQUIRE(!buffer.IsRegionGpuModified(c + PAGE, PAGE));
     buffer.FlushCachedWrites();
diff --git a/src/video_core/buffer_cache/buffer_base.h b/src/video_core/buffer_cache/buffer_base.h
index 92d77eef2d..c47b7d8666 100644
--- a/src/video_core/buffer_cache/buffer_base.h
+++ b/src/video_core/buffer_cache/buffer_base.h
@@ -430,7 +430,7 @@ private:
         if (query_begin >= SizeBytes() || size < 0) {
             return;
         }
-        u64* const untracked_words = Array<Type::Untracked>();
+        [[maybe_unused]] u64* const untracked_words = Array<Type::Untracked>();
         u64* const state_words = Array<type>();
         const u64 query_end = query_begin + std::min(static_cast<u64>(size), SizeBytes());
         u64* const words_begin = state_words + query_begin / BYTES_PER_WORD;
@@ -483,7 +483,7 @@ private:
                 NotifyRasterizer<true>(word_index, current_bits, ~u64{0});
             }
             // Exclude CPU modified pages when visiting GPU pages
-            const u64 word = current_word & ~(type == Type::GPU ? untracked_words[word_index] : 0);
+            const u64 word = current_word;
             u64 page = page_begin;
             page_begin = 0;
 
@@ -531,7 +531,7 @@ private:
     [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept {
         static_assert(type != Type::Untracked);
 
-        const u64* const untracked_words = Array<Type::Untracked>();
+        [[maybe_unused]] const u64* const untracked_words = Array<Type::Untracked>();
         const u64* const state_words = Array<type>();
         const u64 num_query_words = size / BYTES_PER_WORD + 1;
         const u64 word_begin = offset / BYTES_PER_WORD;
@@ -539,8 +539,7 @@ private:
         const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE);
         u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD;
         for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) {
-            const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0;
-            const u64 word = state_words[word_index] & ~off_word;
+            const u64 word = state_words[word_index];
             if (word == 0) {
                 continue;
             }
@@ -564,7 +563,7 @@ private:
     [[nodiscard]] std::pair<u64, u64> ModifiedRegion(u64 offset, u64 size) const noexcept {
         static_assert(type != Type::Untracked);
 
-        const u64* const untracked_words = Array<Type::Untracked>();
+        [[maybe_unused]] const u64* const untracked_words = Array<Type::Untracked>();
         const u64* const state_words = Array<type>();
         const u64 num_query_words = size / BYTES_PER_WORD + 1;
         const u64 word_begin = offset / BYTES_PER_WORD;
@@ -574,8 +573,7 @@ private:
         u64 begin = std::numeric_limits<u64>::max();
         u64 end = 0;
         for (u64 word_index = word_begin; word_index < word_end; ++word_index) {
-            const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0;
-            const u64 word = state_words[word_index] & ~off_word;
+            const u64 word = state_words[word_index];
             if (word == 0) {
                 continue;
             }
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index 7bf08e3e0d..7762c7d96f 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -123,7 +123,7 @@ void MaxwellDMA::Launch() {
                         convert_linear_2_blocklinear_addr(regs.offset_in + offset),
                         tmp_buffer.data(), tmp_buffer.size());
                     memory_manager.WriteBlockCached(regs.offset_out + offset, tmp_buffer.data(),
-                                              tmp_buffer.size());
+                                                    tmp_buffer.size());
                 }
             } else if (is_src_pitch && !is_dst_pitch) {
                 UNIMPLEMENTED_IF(regs.line_length_in % 16 != 0);
@@ -143,7 +143,7 @@ void MaxwellDMA::Launch() {
                     memory_manager.ReadBlockUnsafe(regs.offset_in, tmp_buffer.data(),
                                                    regs.line_length_in);
                     memory_manager.WriteBlockCached(regs.offset_out, tmp_buffer.data(),
-                                              regs.line_length_in);
+                                                    regs.line_length_in);
                 }
             }
         }
diff --git a/src/video_core/invalidation_accumulator.h b/src/video_core/invalidation_accumulator.h
index 42420e31c8..2c2aaf7bb0 100644
--- a/src/video_core/invalidation_accumulator.h
+++ b/src/video_core/invalidation_accumulator.h
@@ -3,6 +3,7 @@
 
 #pragma once
 
+#include <utility>
 #include <vector>
 
 #include "common/common_types.h"
@@ -26,8 +27,8 @@ public:
         if (address >= start_address && address + size <= last_collection) [[likely]] {
             return;
         }
-        size = (address + size + atomicy_side_mask) & atomicy_mask - address;
-        address = address & atomicy_mask;
+        size = ((address + size + atomicity_size_mask) & atomicity_mask) - address;
+        address = address & atomicity_mask;
         if (!has_collected) [[unlikely]] {
             reset_values();
             has_collected = true;
@@ -64,10 +65,10 @@ public:
     }
 
 private:
-    static constexpr size_t atomicy_bits = 5;
-    static constexpr size_t atomicy_size = 1ULL << atomicy_bits;
-    static constexpr size_t atomicy_side_mask = atomicy_size - 1;
-    static constexpr size_t atomicy_mask = ~atomicy_side_mask;
+    static constexpr size_t atomicity_bits = 5;
+    static constexpr size_t atomicity_size = 1ULL << atomicity_bits;
+    static constexpr size_t atomicity_size_mask = atomicity_size - 1;
+    static constexpr size_t atomicity_mask = ~atomicity_size_mask;
     GPUVAddr start_address{};
     GPUVAddr last_collection{};
     size_t accumulated_size{};
diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp
index 0a63900543..3bcae3503b 100644
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -46,7 +46,7 @@ MemoryManager::MemoryManager(Core::System& system_, u64 address_space_bits_, u64
     big_page_table_cpu.resize(big_page_table_size);
     big_page_continous.resize(big_page_table_size / continous_bits, 0);
     entries.resize(page_table_size / 32, 0);
-    if (!Settings::IsGPULevelExtreme()) {
+    if (!Settings::IsGPULevelExtreme() && Settings::IsFastmemEnabled()) {
         fastmem_arena = system.DeviceMemory().buffer.VirtualBasePointer();
     } else {
         fastmem_arena = nullptr;
diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h
index ff9e3c0b3a..2936364f09 100644
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -107,7 +107,7 @@ public:
      * will be returned;
      */
     std::vector<std::pair<GPUVAddr, std::size_t>> GetSubmappedRange(GPUVAddr gpu_addr,
-                                                                 std::size_t size) const;
+                                                                    std::size_t size) const;
 
     GPUVAddr Map(GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size,
                  PTEKind kind = PTEKind::INVALID, bool is_big_pages = true);
diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h
index 6b66ad7b60..1735b61645 100644
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -97,7 +97,7 @@ public:
                                   VideoCommon::CacheType which = VideoCommon::CacheType::All) = 0;
 
     virtual void InnerInvalidation(std::span<const std::pair<VAddr, std::size_t>> sequences) {
-        for (const auto [cpu_addr, size] : sequences) {
+        for (const auto& [cpu_addr, size] : sequences) {
             InvalidateRegion(cpu_addr, size);
         }
     }
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 6c4d745649..ed4a721668 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -486,18 +486,18 @@ void RasterizerVulkan::InvalidateRegion(VAddr addr, u64 size, VideoCommon::Cache
 void RasterizerVulkan::InnerInvalidation(std::span<const std::pair<VAddr, std::size_t>> sequences) {
     {
         std::scoped_lock lock{texture_cache.mutex};
-        for (const auto [addr, size] : sequences) {
+        for (const auto& [addr, size] : sequences) {
             texture_cache.WriteMemory(addr, size);
         }
     }
     {
         std::scoped_lock lock{buffer_cache.mutex};
-        for (const auto [addr, size] : sequences) {
+        for (const auto& [addr, size] : sequences) {
             buffer_cache.WriteMemory(addr, size);
         }
     }
     {
-        for (const auto [addr, size] : sequences) {
+        for (const auto& [addr, size] : sequences) {
             query_cache.InvalidateRegion(addr, size);
             pipeline_cache.InvalidateRegion(addr, size);
         }