From cb1497d0d7711a1c0e527aaa3e1dc3f95e5a6644 Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow <fsahmkow27@gmail.com>
Date: Sun, 27 Nov 2022 00:58:06 +0100
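Subject: [PATCH] DMAPusher: Improve collection of non executing methods

Each engine now carries an execution mask and a method sink. When
DmaPusher::CallMethod sees a method whose mask bit is clear, it queues the
(method, argument) pair in the sink and returns. Methods whose bit is set,
and every CallMultiMethod, flush the sink through ConsumeSink() first.

The default ConsumeSinkImpl() replays the queue through CallMethod.
Fermi2D, KeplerCompute, KeplerMemory and MaxwellDMA override it to store
the queued values directly into their register arrays. Maxwell3D applies
its shadow RAM control and dirty-register tracking instead, and also
flushes the sink before running a macro.

macro_hle.cpp additionally sets regs.global_base_instance_index to the
base instance on the extended path and resets it to 0 afterwards.
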
---
 src/video_core/dma_pusher.cpp             |  6 ++
 src/video_core/engines/engine_interface.h | 21 +++++
 src/video_core/engines/fermi_2d.cpp       | 10 +++
 src/video_core/engines/fermi_2d.h         |  2 +
 src/video_core/engines/kepler_compute.cpp | 14 +++-
 src/video_core/engines/kepler_compute.h   |  2 +
 src/video_core/engines/kepler_memory.cpp  | 11 +++
 src/video_core/engines/kepler_memory.h    |  2 +
 src/video_core/engines/maxwell_3d.cpp     | 94 +++++++++++++++++++++++
 src/video_core/engines/maxwell_3d.h       |  4 +
 src/video_core/engines/maxwell_dma.cpp    | 12 ++-
 src/video_core/engines/maxwell_dma.h      |  2 +
 src/video_core/macro/macro_hle.cpp        |  3 +
 13 files changed, 181 insertions(+), 2 deletions(-)

diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp
index b3e9cb82e6..5519298248 100644
--- a/src/video_core/dma_pusher.cpp
+++ b/src/video_core/dma_pusher.cpp
@@ -178,6 +178,11 @@ void DmaPusher::CallMethod(u32 argument) const {
         });
     } else {
         auto subchannel = subchannels[dma_state.subchannel];
+        if (!subchannel->execution_mask[dma_state.method]) [[likely]] {
+            subchannel->method_sink.emplace_back(dma_state.method, argument);
+            return;
+        }
+        subchannel->ConsumeSink();
         subchannel->current_dma_segment = dma_state.dma_get + dma_state.dma_word_offset;
         subchannel->CallMethod(dma_state.method, argument, dma_state.is_last_call);
     }
@@ -189,6 +194,7 @@ void DmaPusher::CallMultiMethod(const u32* base_start, u32 num_methods) const {
                                dma_state.method_count);
     } else {
         auto subchannel = subchannels[dma_state.subchannel];
+        subchannel->ConsumeSink();
         subchannel->current_dma_segment = dma_state.dma_get + dma_state.dma_word_offset;
         subchannel->CallMultiMethod(dma_state.method, base_start, num_methods,
                                     dma_state.method_count);
diff --git a/src/video_core/engines/engine_interface.h b/src/video_core/engines/engine_interface.h
index 38f1abdc49..3923223589 100644
--- a/src/video_core/engines/engine_interface.h
+++ b/src/video_core/engines/engine_interface.h
@@ -3,6 +3,11 @@
 
 #pragma once
 
+#include <bitset>
+#include <limits>
+#include <utility>
+#include <vector>
+
 #include "common/common_types.h"
 
 namespace Tegra::Engines {
@@ -18,8 +23,29 @@ public:
     virtual void CallMultiMethod(u32 method, const u32* base_start, u32 amount,
                                  u32 methods_pending) = 0;
 
+    /// Flushes the queued methods through ConsumeSinkImpl(); no-op when the sink is empty.
+    void ConsumeSink() {
+        if (method_sink.empty()) {
+            return;
+        }
+        ConsumeSinkImpl();
+    }
+
+    /// Per-method flag; DmaPusher queues a method in method_sink when its bit is clear.
+    std::bitset<std::numeric_limits<u16>::max()> execution_mask{};
+    /// Queued (method, argument) pairs waiting for the next ConsumeSink().
+    std::vector<std::pair<u32, u32>> method_sink{};
     bool current_dirty{};
     GPUVAddr current_dma_segment;
+
+protected:
+    /// Default flush: forwards every queued pair to CallMethod, then empties the sink.
+    virtual void ConsumeSinkImpl() {
+        for (auto [method, value] : method_sink) {
+            CallMethod(method, value, true);
+        }
+        method_sink.clear();
+    }
 };
 
 } // namespace Tegra::Engines
diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp
index c6478ae85a..e655e72543 100644
--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@@ -25,6 +25,9 @@ Fermi2D::Fermi2D(MemoryManager& memory_manager_) {
     // Nvidia's OpenGL driver seems to assume these values
     regs.src.depth = 1;
     regs.dst.depth = 1;
+
+    execution_mask.reset();
+    execution_mask[FERMI2D_REG_INDEX(pixels_from_memory.src_y0) + 1] = true;
 }
 
 Fermi2D::~Fermi2D() = default;
@@ -49,6 +52,18 @@ void Fermi2D::CallMultiMethod(u32 method, const u32* base_start, u32 amount, u32
     }
 }
 
+// Applies the queued non-executing methods as plain register writes.
+void Fermi2D::ConsumeSinkImpl() {
+    for (auto [method, value] : method_sink) {
+        // The sink bypasses CallMethod, so drop indices outside the register array here.
+        if (method >= std::size(regs.reg_array)) {
+            continue;
+        }
+        regs.reg_array[method] = value;
+    }
+    method_sink.clear();
+}
+
 void Fermi2D::Blit() {
     MICROPROFILE_SCOPE(GPU_BlitEngine);
     LOG_DEBUG(HW_GPU, "called. source address=0x{:x}, destination address=0x{:x}",
diff --git a/src/video_core/engines/fermi_2d.h b/src/video_core/engines/fermi_2d.h
index 100b21bac9..523fbdec25 100644
--- a/src/video_core/engines/fermi_2d.h
+++ b/src/video_core/engines/fermi_2d.h
@@ -309,6 +309,8 @@ private:
     /// Performs the copy from the source surface to the destination surface as configured in the
     /// registers.
     void Blit();
+
+    void ConsumeSinkImpl() override;
 };
 
 #define ASSERT_REG_POSITION(field_name, position)                                                  \
diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp
index e5c622155e..601095f038 100644
--- a/src/video_core/engines/kepler_compute.cpp
+++ b/src/video_core/engines/kepler_compute.cpp
@@ -14,7 +14,12 @@
 namespace Tegra::Engines {
 
 KeplerCompute::KeplerCompute(Core::System& system_, MemoryManager& memory_manager_)
-    : system{system_}, memory_manager{memory_manager_}, upload_state{memory_manager, regs.upload} {}
+    : system{system_}, memory_manager{memory_manager_}, upload_state{memory_manager, regs.upload} {
+    execution_mask.reset();
+    execution_mask[KEPLER_COMPUTE_REG_INDEX(exec_upload)] = true;
+    execution_mask[KEPLER_COMPUTE_REG_INDEX(data_upload)] = true;
+    execution_mask[KEPLER_COMPUTE_REG_INDEX(launch)] = true;
+}
 
 KeplerCompute::~KeplerCompute() = default;
 
@@ -23,6 +28,18 @@ void KeplerCompute::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_)
     upload_state.BindRasterizer(rasterizer);
 }
 
+// Applies the queued non-executing methods as plain register writes.
+void KeplerCompute::ConsumeSinkImpl() {
+    for (auto [method, value] : method_sink) {
+        // CallMethod asserts method < Regs::NUM_REGS; the sink bypasses it, so check here.
+        if (method >= Regs::NUM_REGS) {
+            continue;
+        }
+        regs.reg_array[method] = value;
+    }
+    method_sink.clear();
+}
+
 void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
     ASSERT_MSG(method < Regs::NUM_REGS,
                "Invalid KeplerCompute register, increase the size of the Regs structure");
diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h
index e154e3f062..2092e685fe 100644
--- a/src/video_core/engines/kepler_compute.h
+++ b/src/video_core/engines/kepler_compute.h
@@ -204,6 +204,8 @@ public:
 private:
     void ProcessLaunch();
 
+    void ConsumeSinkImpl() override;
+
     /// Retrieves information about a specific TIC entry from the TIC buffer.
     Texture::TICEntry GetTICEntry(u32 tic_index) const;
 
diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp
index 08045d1cf4..c026801a35 100644
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -18,6 +18,24 @@ KeplerMemory::~KeplerMemory() = default;
 
 void KeplerMemory::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) {
     upload_state.BindRasterizer(rasterizer_);
+
+    // NOTE(review): the mask is all-clear until this runs, so DmaPusher::CallMethod queues
+    // every method before then; the other engines fill their mask in the constructor.
+    execution_mask.reset();
+    execution_mask[KEPLERMEMORY_REG_INDEX(exec)] = true;
+    execution_mask[KEPLERMEMORY_REG_INDEX(data)] = true;
+}
+
+// Applies the queued non-executing methods as plain register writes.
+void KeplerMemory::ConsumeSinkImpl() {
+    for (auto [method, value] : method_sink) {
+        // The sink bypasses CallMethod, so drop indices outside the register array here.
+        if (method >= std::size(regs.reg_array)) {
+            continue;
+        }
+        regs.reg_array[method] = value;
+    }
+    method_sink.clear();
 }
 
 void KeplerMemory::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
diff --git a/src/video_core/engines/kepler_memory.h b/src/video_core/engines/kepler_memory.h
index 5fe7489f0c..fb1eecbba9 100644
--- a/src/video_core/engines/kepler_memory.h
+++ b/src/video_core/engines/kepler_memory.h
@@ -73,6 +73,8 @@ public:
     } regs{};
 
 private:
+    void ConsumeSinkImpl() override;
+
     Core::System& system;
     Upload::State upload_state;
 };
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index bbe3202fe2..d44a5cabfc 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -4,6 +4,7 @@
 #include <cstring>
 #include <optional>
 #include "common/assert.h"
+#include "common/scope_exit.h"
 #include "common/settings.h"
 #include "core/core.h"
 #include "core/core_timing.h"
@@ -30,6 +31,10 @@ Maxwell3D::Maxwell3D(Core::System& system_, MemoryManager& memory_manager_)
                                                                                 regs.upload} {
     dirty.flags.flip();
     InitializeRegisterDefaults();
+    execution_mask.reset();
+    for (size_t i = 0; i < execution_mask.size(); i++) {
+        execution_mask[i] = IsMethodExecutable(static_cast<u32>(i));
+    }
 }
 
 Maxwell3D::~Maxwell3D() = default;
@@ -123,6 +128,71 @@ void Maxwell3D::InitializeRegisterDefaults() {
     shadow_state = regs;
 }
 
+bool Maxwell3D::IsMethodExecutable(u32 method) {
+    if (method >= MacroRegistersStart) {
+        return true;
+    }
+    switch (method) {
+    case MAXWELL3D_REG_INDEX(draw.end):
+    case MAXWELL3D_REG_INDEX(draw.begin):
+    case MAXWELL3D_REG_INDEX(vertex_buffer.first):
+    case MAXWELL3D_REG_INDEX(vertex_buffer.count):
+    case MAXWELL3D_REG_INDEX(index_buffer.first):
+    case MAXWELL3D_REG_INDEX(index_buffer.count):
+    case MAXWELL3D_REG_INDEX(draw_inline_index):
+    case MAXWELL3D_REG_INDEX(index_buffer32_subsequent):
+    case MAXWELL3D_REG_INDEX(index_buffer16_subsequent):
+    case MAXWELL3D_REG_INDEX(index_buffer8_subsequent):
+    case MAXWELL3D_REG_INDEX(index_buffer32_first):
+    case MAXWELL3D_REG_INDEX(index_buffer16_first):
+    case MAXWELL3D_REG_INDEX(index_buffer8_first):
+    case MAXWELL3D_REG_INDEX(inline_index_2x16.even):
+    case MAXWELL3D_REG_INDEX(inline_index_4x8.index0):
+    case MAXWELL3D_REG_INDEX(vertex_array_instance_first):
+    case MAXWELL3D_REG_INDEX(vertex_array_instance_subsequent):
+    case MAXWELL3D_REG_INDEX(wait_for_idle):
+    case MAXWELL3D_REG_INDEX(shadow_ram_control):
+    case MAXWELL3D_REG_INDEX(load_mme.instruction_ptr):
+    case MAXWELL3D_REG_INDEX(load_mme.instruction):
+    case MAXWELL3D_REG_INDEX(load_mme.start_address):
+    case MAXWELL3D_REG_INDEX(falcon[4]):
+    case MAXWELL3D_REG_INDEX(const_buffer.buffer):
+    case MAXWELL3D_REG_INDEX(const_buffer.buffer) + 1:
+    case MAXWELL3D_REG_INDEX(const_buffer.buffer) + 2:
+    case MAXWELL3D_REG_INDEX(const_buffer.buffer) + 3:
+    case MAXWELL3D_REG_INDEX(const_buffer.buffer) + 4:
+    case MAXWELL3D_REG_INDEX(const_buffer.buffer) + 5:
+    case MAXWELL3D_REG_INDEX(const_buffer.buffer) + 6:
+    case MAXWELL3D_REG_INDEX(const_buffer.buffer) + 7:
+    case MAXWELL3D_REG_INDEX(const_buffer.buffer) + 8:
+    case MAXWELL3D_REG_INDEX(const_buffer.buffer) + 9:
+    case MAXWELL3D_REG_INDEX(const_buffer.buffer) + 10:
+    case MAXWELL3D_REG_INDEX(const_buffer.buffer) + 11:
+    case MAXWELL3D_REG_INDEX(const_buffer.buffer) + 12:
+    case MAXWELL3D_REG_INDEX(const_buffer.buffer) + 13:
+    case MAXWELL3D_REG_INDEX(const_buffer.buffer) + 14:
+    case MAXWELL3D_REG_INDEX(const_buffer.buffer) + 15:
+    case MAXWELL3D_REG_INDEX(bind_groups[0].raw_config):
+    case MAXWELL3D_REG_INDEX(bind_groups[1].raw_config):
+    case MAXWELL3D_REG_INDEX(bind_groups[2].raw_config):
+    case MAXWELL3D_REG_INDEX(bind_groups[3].raw_config):
+    case MAXWELL3D_REG_INDEX(bind_groups[4].raw_config):
+    case MAXWELL3D_REG_INDEX(topology_override):
+    case MAXWELL3D_REG_INDEX(clear_surface):
+    case MAXWELL3D_REG_INDEX(report_semaphore.query):
+    case MAXWELL3D_REG_INDEX(render_enable.mode):
+    case MAXWELL3D_REG_INDEX(clear_report_value):
+    case MAXWELL3D_REG_INDEX(sync_info):
+    case MAXWELL3D_REG_INDEX(launch_dma):
+    case MAXWELL3D_REG_INDEX(inline_data):
+    case MAXWELL3D_REG_INDEX(fragment_barrier):
+    case MAXWELL3D_REG_INDEX(tiled_cache_barrier):
+        return true;
+    default:
+        return false;
+    }
+}
+
 void Maxwell3D::ProcessMacro(u32 method, const u32* base_start, u32 amount, bool is_last_call) {
     if (executing_macro == 0) {
         // A macro call must begin by writing the macro method's register, not its argument.
@@ -141,6 +211,7 @@ void Maxwell3D::ProcessMacro(u32 method, const u32* base_start, u32 amount, bool
 
     // Call the macro when there are no more parameters in the command buffer
     if (is_last_call) {
+        ConsumeSink();
         CallMacroMethod(executing_macro, macro_params);
         macro_params.clear();
         macro_addresses.clear();
@@ -214,6 +285,29 @@ u32 Maxwell3D::ProcessShadowRam(u32 method, u32 argument) {
     return argument;
 }
 
+void Maxwell3D::ConsumeSinkImpl() {
+    SCOPE_EXIT({ method_sink.clear(); });
+    const auto control = shadow_state.shadow_ram_control;
+    if (control == Regs::ShadowRamControl::Track ||
+        control == Regs::ShadowRamControl::TrackWithFilter) {
+
+        for (auto [method, value] : method_sink) {
+            shadow_state.reg_array[method] = value;
+            ProcessDirtyRegisters(method, value);
+        }
+        return;
+    }
+    if (control == Regs::ShadowRamControl::Replay) {
+        for (auto [method, value] : method_sink) {
+            ProcessDirtyRegisters(method, shadow_state.reg_array[method]);
+        }
+        return;
+    }
+    for (auto [method, value] : method_sink) {
+        ProcessDirtyRegisters(method, value);
+    }
+}
+
 void Maxwell3D::ProcessDirtyRegisters(u32 method, u32 argument) {
     if (regs.reg_array[method] == argument) {
         return;
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index f0a3798018..478ba4dc7c 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -3123,6 +3123,8 @@ private:
 
     void ProcessDirtyRegisters(u32 method, u32 argument);
 
+    void ConsumeSinkImpl() override;
+
     void ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argument, bool is_last_call);
 
     /// Retrieves information about a specific TIC entry from the TIC buffer.
@@ -3172,6 +3174,8 @@ private:
 
     void RefreshParametersImpl();
 
+    bool IsMethodExecutable(u32 method);
+
     Core::System& system;
     MemoryManager& memory_manager;
 
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index f73d7bf0f7..01f70ea9e5 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -21,7 +21,10 @@ namespace Tegra::Engines {
 using namespace Texture;
 
 MaxwellDMA::MaxwellDMA(Core::System& system_, MemoryManager& memory_manager_)
-    : system{system_}, memory_manager{memory_manager_} {}
+    : system{system_}, memory_manager{memory_manager_} {
+    execution_mask.reset();
+    execution_mask[offsetof(Regs, launch_dma) / sizeof(u32)] = true;
+}
 
 MaxwellDMA::~MaxwellDMA() = default;
 
@@ -29,6 +32,18 @@ void MaxwellDMA::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) {
     rasterizer = rasterizer_;
 }
 
+// Applies the queued non-executing methods as plain register writes.
+void MaxwellDMA::ConsumeSinkImpl() {
+    for (auto [method, value] : method_sink) {
+        // CallMethod asserts method < NUM_REGS; the sink bypasses it, so check here.
+        if (method >= NUM_REGS) {
+            continue;
+        }
+        regs.reg_array[method] = value;
+    }
+    method_sink.clear();
+}
+
 void MaxwellDMA::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
     ASSERT_MSG(method < NUM_REGS, "Invalid MaxwellDMA register");
 
diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h
index c88191a613..0e594fa745 100644
--- a/src/video_core/engines/maxwell_dma.h
+++ b/src/video_core/engines/maxwell_dma.h
@@ -231,6 +231,8 @@ private:
 
     void ReleaseSemaphore();
 
+    void ConsumeSinkImpl() override;
+
     Core::System& system;
 
     MemoryManager& memory_manager;
diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp
index 3eac50975b..294a338d2b 100644
--- a/src/video_core/macro/macro_hle.cpp
+++ b/src/video_core/macro/macro_hle.cpp
@@ -135,6 +135,7 @@ private:
 
         const u32 base_instance = parameters[4];
         if (extended) {
+            maxwell3d.regs.global_base_instance_index = base_instance;
             maxwell3d.engine_state = Maxwell::EngineHint::OnHLEMacro;
             maxwell3d.setHLEReplacementName(0, 0x640, Maxwell::HLEReplaceName::BaseInstance);
         }
@@ -144,6 +145,7 @@ private:
             vertex_first, vertex_count, base_instance, instance_count);
 
         if (extended) {
+            maxwell3d.regs.global_base_instance_index = 0;
             maxwell3d.engine_state = Maxwell::EngineHint::None;
             maxwell3d.replace_table.clear();
         }