From aad0cbf024fb8077a9b375a093c60a7e2ab1db3d Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow <fsahmkow27@gmail.com>
Date: Wed, 9 Nov 2022 17:58:10 +0100
Subject: [PATCH] MacroHLE: Add HLE replacement for base vertex and base
 instance.

---
 .../spirv/emit_spirv_context_get_set.cpp      |   8 ++
 .../backend/spirv/spirv_emit_context.cpp      |  10 ++
 src/shader_recompiler/environment.h           |   5 +
 .../frontend/ir/attribute.cpp                 |   4 +
 src/shader_recompiler/frontend/ir/attribute.h |   4 +
 .../frontend/ir/ir_emitter.cpp                |   8 ++
 .../frontend/ir/ir_emitter.h                  |   2 +
 .../frontend/maxwell/translate_program.cpp    |   2 +-
 .../ir_opt/constant_propagation_pass.cpp      |  45 ++++++-
 src/shader_recompiler/ir_opt/passes.h         |   2 +-
 src/shader_recompiler/shader_info.h           |   5 +
 src/shader_recompiler/varying_state.h         |   2 +-
 src/video_core/engines/maxwell_3d.cpp         |  15 ++-
 src/video_core/engines/maxwell_3d.h           |  17 +++
 src/video_core/macro/macro_hle.cpp            | 115 +++++++++---------
 src/video_core/memory_manager.cpp             |  10 +-
 src/video_core/memory_manager.h               |   3 +-
 .../renderer_vulkan/fixed_pipeline_state.cpp  |   1 +
 .../renderer_vulkan/fixed_pipeline_state.h    |   1 +
 .../renderer_vulkan/vk_pipeline_cache.cpp     |   2 +-
 src/video_core/shader_environment.cpp         |  53 ++++++++
 src/video_core/shader_environment.h           |  21 ++++
 22 files changed, 265 insertions(+), 70 deletions(-)

diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
index 73b67f0af6..e4802bf9e7 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
@@ -339,6 +339,10 @@ Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, Id vertex) {
             const Id base{ctx.OpLoad(ctx.U32[1], ctx.base_vertex)};
             return ctx.OpBitcast(ctx.F32[1], ctx.OpISub(ctx.U32[1], index, base));
         }
+    case IR::Attribute::BaseInstance:
+        return ctx.OpBitcast(ctx.F32[1], ctx.OpLoad(ctx.U32[1], ctx.base_instance));
+    case IR::Attribute::BaseVertex:
+        return ctx.OpBitcast(ctx.F32[1], ctx.OpLoad(ctx.U32[1], ctx.base_vertex));
     case IR::Attribute::FrontFace:
         return ctx.OpSelect(ctx.F32[1], ctx.OpLoad(ctx.U1, ctx.front_face),
                             ctx.OpBitcast(ctx.F32[1], ctx.Const(std::numeric_limits<u32>::max())),
@@ -380,6 +384,10 @@ Id EmitGetAttributeU32(EmitContext& ctx, IR::Attribute attr, Id) {
             const Id base{ctx.OpLoad(ctx.U32[1], ctx.base_vertex)};
             return ctx.OpISub(ctx.U32[1], index, base);
         }
+    case IR::Attribute::BaseInstance:
+        return ctx.OpLoad(ctx.U32[1], ctx.base_instance);
+    case IR::Attribute::BaseVertex:
+        return ctx.OpLoad(ctx.U32[1], ctx.base_vertex);
     default:
         throw NotImplementedException("Read U32 attribute {}", attr);
     }
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
index 41dc6d0319..563a5fc49a 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
@@ -1379,18 +1379,28 @@ void EmitContext::DefineInputs(const IR::Program& program) {
     if (loads[IR::Attribute::InstanceId]) {
         if (profile.support_vertex_instance_id) {
             instance_id = DefineInput(*this, U32[1], true, spv::BuiltIn::InstanceId);
+            if (loads[IR::Attribute::BaseInstance]) {
+                base_instance = DefineInput(*this, U32[1], true, spv::BuiltIn::BaseVertex);
+            }
         } else {
             instance_index = DefineInput(*this, U32[1], true, spv::BuiltIn::InstanceIndex);
             base_instance = DefineInput(*this, U32[1], true, spv::BuiltIn::BaseInstance);
         }
+    } else if (loads[IR::Attribute::BaseInstance]) {
+        base_instance = DefineInput(*this, U32[1], true, spv::BuiltIn::BaseInstance);
     }
     if (loads[IR::Attribute::VertexId]) {
         if (profile.support_vertex_instance_id) {
             vertex_id = DefineInput(*this, U32[1], true, spv::BuiltIn::VertexId);
+            if (loads[IR::Attribute::BaseVertex]) {
+                base_vertex = DefineInput(*this, U32[1], true, spv::BuiltIn::BaseVertex);
+            }
         } else {
             vertex_index = DefineInput(*this, U32[1], true, spv::BuiltIn::VertexIndex);
             base_vertex = DefineInput(*this, U32[1], true, spv::BuiltIn::BaseVertex);
         }
+    } else if (loads[IR::Attribute::BaseVertex]) {
+        base_vertex = DefineInput(*this, U32[1], true, spv::BuiltIn::BaseVertex);
     }
     if (loads[IR::Attribute::FrontFace]) {
         front_face = DefineInput(*this, U1, true, spv::BuiltIn::FrontFacing);
diff --git a/src/shader_recompiler/environment.h b/src/shader_recompiler/environment.h
index 402f2664f6..b9b4455f69 100644
--- a/src/shader_recompiler/environment.h
+++ b/src/shader_recompiler/environment.h
@@ -34,6 +34,11 @@ public:
 
     [[nodiscard]] virtual std::array<u32, 3> WorkgroupSize() const = 0;
 
+    [[nodiscard]] virtual bool HasHLEMacroState() const = 0;
+
+    [[nodiscard]] virtual std::optional<ReplaceConstant> GetReplaceConstBuffer(
+        u32 bank, u32 offset) = 0;
+
     virtual void Dump(u64 hash) = 0;
 
     [[nodiscard]] const ProgramHeader& SPH() const noexcept {
diff --git a/src/shader_recompiler/frontend/ir/attribute.cpp b/src/shader_recompiler/frontend/ir/attribute.cpp
index 7d3d882e44..73e189a895 100644
--- a/src/shader_recompiler/frontend/ir/attribute.cpp
+++ b/src/shader_recompiler/frontend/ir/attribute.cpp
@@ -446,6 +446,10 @@ std::string NameOf(Attribute attribute) {
         return "ViewportMask";
     case Attribute::FrontFace:
         return "FrontFace";
+    case Attribute::BaseInstance:
+        return "BaseInstance";
+    case Attribute::BaseVertex:
+        return "BaseVertex";
     }
     return fmt::format("<reserved attribute {}>", static_cast<int>(attribute));
 }
diff --git a/src/shader_recompiler/frontend/ir/attribute.h b/src/shader_recompiler/frontend/ir/attribute.h
index 6ee3947b12..364d8a9124 100644
--- a/src/shader_recompiler/frontend/ir/attribute.h
+++ b/src/shader_recompiler/frontend/ir/attribute.h
@@ -219,6 +219,10 @@ enum class Attribute : u64 {
     FixedFncTexture9Q = 231,
     ViewportMask = 232,
     FrontFace = 255,
+
+    // Implementation attributes
+    BaseInstance = 256,
+    BaseVertex = 257,
 };
 
 constexpr size_t NUM_GENERICS = 32;
diff --git a/src/shader_recompiler/frontend/ir/ir_emitter.cpp b/src/shader_recompiler/frontend/ir/ir_emitter.cpp
index 0cdac0effd..eb2e49a688 100644
--- a/src/shader_recompiler/frontend/ir/ir_emitter.cpp
+++ b/src/shader_recompiler/frontend/ir/ir_emitter.cpp
@@ -294,6 +294,14 @@ F32 IREmitter::GetAttribute(IR::Attribute attribute, const U32& vertex) {
     return Inst<F32>(Opcode::GetAttribute, attribute, vertex);
 }
 
+U32 IREmitter::GetAttributeU32(IR::Attribute attribute) {
+    return GetAttributeU32(attribute, Imm32(0));
+}
+
+U32 IREmitter::GetAttributeU32(IR::Attribute attribute, const U32& vertex) {
+    return Inst<U32>(Opcode::GetAttributeU32, attribute, vertex);
+}
+
 void IREmitter::SetAttribute(IR::Attribute attribute, const F32& value, const U32& vertex) {
     Inst(Opcode::SetAttribute, attribute, value, vertex);
 }
diff --git a/src/shader_recompiler/frontend/ir/ir_emitter.h b/src/shader_recompiler/frontend/ir/ir_emitter.h
index 2df992feb6..7aaaa4ab06 100644
--- a/src/shader_recompiler/frontend/ir/ir_emitter.h
+++ b/src/shader_recompiler/frontend/ir/ir_emitter.h
@@ -74,6 +74,8 @@ public:
 
     [[nodiscard]] F32 GetAttribute(IR::Attribute attribute);
     [[nodiscard]] F32 GetAttribute(IR::Attribute attribute, const U32& vertex);
+    [[nodiscard]] U32 GetAttributeU32(IR::Attribute attribute);
+    [[nodiscard]] U32 GetAttributeU32(IR::Attribute attribute, const U32& vertex);
     void SetAttribute(IR::Attribute attribute, const F32& value, const U32& vertex);
 
     [[nodiscard]] F32 GetAttributeIndexed(const U32& phys_address);
diff --git a/src/shader_recompiler/frontend/maxwell/translate_program.cpp b/src/shader_recompiler/frontend/maxwell/translate_program.cpp
index 3adbd2b166..ac159d24b9 100644
--- a/src/shader_recompiler/frontend/maxwell/translate_program.cpp
+++ b/src/shader_recompiler/frontend/maxwell/translate_program.cpp
@@ -219,7 +219,7 @@ IR::Program TranslateProgram(ObjectPool<IR::Inst>& inst_pool, ObjectPool<IR::Blo
     }
     Optimization::SsaRewritePass(program);
 
-    Optimization::ConstantPropagationPass(program);
+    Optimization::ConstantPropagationPass(env, program);
 
     Optimization::PositionPass(env, program);
 
diff --git a/src/shader_recompiler/ir_opt/constant_propagation_pass.cpp b/src/shader_recompiler/ir_opt/constant_propagation_pass.cpp
index 826f9a54a5..ac10405f34 100644
--- a/src/shader_recompiler/ir_opt/constant_propagation_pass.cpp
+++ b/src/shader_recompiler/ir_opt/constant_propagation_pass.cpp
@@ -7,6 +7,7 @@
 #include <type_traits>
 
 #include "common/bit_cast.h"
+#include "shader_recompiler/environment.h"
 #include "shader_recompiler/exception.h"
 #include "shader_recompiler/frontend/ir/ir_emitter.h"
 #include "shader_recompiler/frontend/ir/value.h"
@@ -515,6 +516,8 @@ void FoldBitCast(IR::Inst& inst, IR::Opcode reverse) {
             case IR::Attribute::PrimitiveId:
             case IR::Attribute::InstanceId:
             case IR::Attribute::VertexId:
+            case IR::Attribute::BaseVertex:
+            case IR::Attribute::BaseInstance:
                 break;
             default:
                 return;
@@ -644,7 +647,37 @@ void FoldFSwizzleAdd(IR::Block& block, IR::Inst& inst) {
     }
 }
 
-void ConstantPropagation(IR::Block& block, IR::Inst& inst) {
+void FoldConstBuffer(Environment& env, IR::Block& block, IR::Inst& inst) {
+    const IR::Value bank{inst.Arg(0)};
+    const IR::Value offset{inst.Arg(1)};
+    if (!bank.IsImmediate() || !offset.IsImmediate()) {
+        return;
+    }
+    const auto bank_value = bank.U32();
+    const auto offset_value = offset.U32();
+    auto replacement = env.GetReplaceConstBuffer(bank_value, offset_value);
+    if (!replacement) {
+        return;
+    }
+    const auto new_attribute = [replacement]() {
+        switch (*replacement) {
+        case ReplaceConstant::BaseInstance:
+            return IR::Attribute::BaseInstance;
+        case ReplaceConstant::BaseVertex:
+            return IR::Attribute::BaseVertex;
+        default:
+            throw NotImplementedException("Not implemented replacement variable {}", *replacement);
+        }
+    }();
+    IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
+    if (inst.GetOpcode() == IR::Opcode::GetCbufU32) {
+        inst.ReplaceUsesWith(ir.GetAttributeU32(new_attribute));
+    } else {
+        inst.ReplaceUsesWith(ir.GetAttribute(new_attribute));
+    }
+}
+
+void ConstantPropagation(Environment& env, IR::Block& block, IR::Inst& inst) {
     switch (inst.GetOpcode()) {
     case IR::Opcode::GetRegister:
         return FoldGetRegister(inst);
@@ -789,18 +822,24 @@ void ConstantPropagation(IR::Block& block, IR::Inst& inst) {
                                     IR::Opcode::CompositeInsertF16x4);
     case IR::Opcode::FSwizzleAdd:
         return FoldFSwizzleAdd(block, inst);
+    case IR::Opcode::GetCbufF32:
+    case IR::Opcode::GetCbufU32:
+        if (env.HasHLEMacroState()) {
+            return FoldConstBuffer(env, block, inst);
+        }
+        break;
     default:
         break;
     }
 }
 } // Anonymous namespace
 
-void ConstantPropagationPass(IR::Program& program) {
+void ConstantPropagationPass(Environment& env, IR::Program& program) {
     const auto end{program.post_order_blocks.rend()};
     for (auto it = program.post_order_blocks.rbegin(); it != end; ++it) {
         IR::Block* const block{*it};
         for (IR::Inst& inst : block->Instructions()) {
-            ConstantPropagation(*block, inst);
+            ConstantPropagation(env, *block, inst);
         }
     }
 }
diff --git a/src/shader_recompiler/ir_opt/passes.h b/src/shader_recompiler/ir_opt/passes.h
index 11bfe801a2..1f8f2ba95e 100644
--- a/src/shader_recompiler/ir_opt/passes.h
+++ b/src/shader_recompiler/ir_opt/passes.h
@@ -13,7 +13,7 @@ struct HostTranslateInfo;
 namespace Shader::Optimization {
 
 void CollectShaderInfoPass(Environment& env, IR::Program& program);
-void ConstantPropagationPass(IR::Program& program);
+void ConstantPropagationPass(Environment& env, IR::Program& program);
 void DeadCodeEliminationPass(IR::Program& program);
 void GlobalMemoryToStorageBufferPass(IR::Program& program);
 void IdentityRemovalPass(IR::Program& program);
diff --git a/src/shader_recompiler/shader_info.h b/src/shader_recompiler/shader_info.h
index d9c6e92dbe..ea0f483441 100644
--- a/src/shader_recompiler/shader_info.h
+++ b/src/shader_recompiler/shader_info.h
@@ -16,6 +16,11 @@
 
 namespace Shader {
 
+enum class ReplaceConstant : u32 {
+    BaseInstance,
+    BaseVertex,
+};
+
 enum class TextureType : u32 {
     Color1D,
     ColorArray1D,
diff --git a/src/shader_recompiler/varying_state.h b/src/shader_recompiler/varying_state.h
index 7b28a285f9..18a9aaf50f 100644
--- a/src/shader_recompiler/varying_state.h
+++ b/src/shader_recompiler/varying_state.h
@@ -11,7 +11,7 @@
 namespace Shader {
 
 struct VaryingState {
-    std::bitset<256> mask{};
+    std::bitset<512> mask{};
 
     void Set(IR::Attribute attribute, bool state = true) {
         mask[static_cast<size_t>(attribute)] = state;
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index a0dd7400df..50d8a94b15 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -182,8 +182,14 @@ u32 Maxwell3D::GetMaxCurrentVertices() {
 size_t Maxwell3D::EstimateIndexBufferSize() {
     GPUVAddr start_address = regs.index_buffer.StartAddress();
     GPUVAddr end_address = regs.index_buffer.EndAddress();
-    return std::min<size_t>(memory_manager.GetMemoryLayoutSize(start_address),
-                            static_cast<size_t>(end_address - start_address));
+    constexpr std::array<size_t, 4> max_sizes = {
+        std::numeric_limits<u8>::max(), std::numeric_limits<u16>::max(),
+        std::numeric_limits<u32>::max(), std::numeric_limits<u32>::max()};
+    const size_t byte_size = regs.index_buffer.FormatSizeInBytes();
+    return std::min<size_t>(
+        memory_manager.GetMemoryLayoutSize(start_address, byte_size * max_sizes[byte_size]) /
+            byte_size,
+        static_cast<size_t>(end_address - start_address));
 }
 
 u32 Maxwell3D::ProcessShadowRam(u32 method, u32 argument) {
@@ -572,4 +578,9 @@ u32 Maxwell3D::GetRegisterValue(u32 method) const {
     return regs.reg_array[method];
 }
 
+void Maxwell3D::setHLEReplacementName(u32 bank, u32 offset, HLEReplaceName name) {
+    const u64 key = (static_cast<u64>(bank) << 32) | offset;
+    replace_table.emplace(key, name);
+}
+
 } // namespace Tegra::Engines
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index cfe1e48832..397e88f675 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -3020,6 +3020,23 @@ public:
     /// Store temporary hw register values, used by some calls to restore state after a operation
     Regs shadow_state;
 
+    // None Engine
+    enum class EngineHint : u32 {
+        None = 0x0,
+        OnHLEMacro = 0x1,
+    };
+
+    EngineHint engine_state{EngineHint::None};
+
+    enum class HLEReplaceName : u32 {
+        BaseVertex = 0x0,
+        BaseInstance = 0x1,
+    };
+
+    void setHLEReplacementName(u32 bank, u32 offset, HLEReplaceName name);
+
+    std::unordered_map<u64, HLEReplaceName> replace_table;
+
     static_assert(sizeof(Regs) == Regs::NUM_REGS * sizeof(u32), "Maxwell3D Regs has wrong size");
     static_assert(std::is_trivially_copyable_v<Regs>, "Maxwell3D Regs must be trivially copyable");
 
diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp
index 93b6d42a47..638247e55d 100644
--- a/src/video_core/macro/macro_hle.cpp
+++ b/src/video_core/macro/macro_hle.cpp
@@ -14,26 +14,29 @@
 #include "video_core/rasterizer_interface.h"
 
 namespace Tegra {
+
+using Maxwell = Engines::Maxwell3D;
+
 namespace {
 
-bool IsTopologySafe(Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology topology) {
+bool IsTopologySafe(Maxwell::Regs::PrimitiveTopology topology) {
     switch (topology) {
-    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Points:
-    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Lines:
-    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LineLoop:
-    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LineStrip:
-    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Triangles:
-    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleStrip:
-    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleFan:
-    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LinesAdjacency:
-    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::LineStripAdjacency:
-    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TrianglesAdjacency:
-    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleStripAdjacency:
-    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Patches:
+    case Maxwell::Regs::PrimitiveTopology::Points:
+    case Maxwell::Regs::PrimitiveTopology::Lines:
+    case Maxwell::Regs::PrimitiveTopology::LineLoop:
+    case Maxwell::Regs::PrimitiveTopology::LineStrip:
+    case Maxwell::Regs::PrimitiveTopology::Triangles:
+    case Maxwell::Regs::PrimitiveTopology::TriangleStrip:
+    case Maxwell::Regs::PrimitiveTopology::TriangleFan:
+    case Maxwell::Regs::PrimitiveTopology::LinesAdjacency:
+    case Maxwell::Regs::PrimitiveTopology::LineStripAdjacency:
+    case Maxwell::Regs::PrimitiveTopology::TrianglesAdjacency:
+    case Maxwell::Regs::PrimitiveTopology::TriangleStripAdjacency:
+    case Maxwell::Regs::PrimitiveTopology::Patches:
         return true;
-    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Quads:
-    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::QuadStrip:
-    case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::Polygon:
+    case Maxwell::Regs::PrimitiveTopology::Quads:
+    case Maxwell::Regs::PrimitiveTopology::QuadStrip:
+    case Maxwell::Regs::PrimitiveTopology::Polygon:
     default:
         return false;
     }
@@ -82,8 +85,7 @@ public:
         : HLEMacroImpl(maxwell3d_), extended(extended_) {}
 
     void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
-        auto topology =
-            static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0]);
+        auto topology = static_cast<Maxwell::Regs::PrimitiveTopology>(parameters[0]);
         if (!IsTopologySafe(topology)) {
             Fallback(parameters);
             return;
@@ -99,18 +101,16 @@ public:
         params.stride = 0;
 
         if (extended) {
-            maxwell3d.CallMethod(0x8e3, 0x640, true);
-            maxwell3d.CallMethod(0x8e4, parameters[4], true);
+            maxwell3d.engine_state = Maxwell::EngineHint::OnHLEMacro;
+            maxwell3d.setHLEReplacementName(0, 0x640, Maxwell::HLEReplaceName::BaseInstance);
         }
 
         maxwell3d.draw_manager->DrawArrayIndirect(topology);
 
         if (extended) {
-            maxwell3d.CallMethod(0x8e3, 0x640, true);
-            maxwell3d.CallMethod(0x8e4, 0, true);
+            maxwell3d.engine_state = Maxwell::EngineHint::None;
+            maxwell3d.replace_table.clear();
         }
-        maxwell3d.regs.vertex_buffer.first = 0;
-        maxwell3d.regs.vertex_buffer.count = 0;
     }
 
 private:
@@ -134,13 +134,18 @@ private:
 
         const u32 base_instance = parameters[4];
         if (extended) {
-            maxwell3d.CallMethod(0x8e3, 0x640, true);
-            maxwell3d.CallMethod(0x8e4, base_instance, true);
+            maxwell3d.engine_state = Maxwell::EngineHint::OnHLEMacro;
+            maxwell3d.setHLEReplacementName(0, 0x640, Maxwell::HLEReplaceName::BaseInstance);
         }
 
         maxwell3d.draw_manager->DrawArray(
             static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0]),
             vertex_first, vertex_count, base_instance, instance_count);
+
+        if (extended) {
+            maxwell3d.engine_state = Maxwell::EngineHint::None;
+            maxwell3d.replace_table.clear();
+        }
     }
 
     bool extended;
@@ -151,8 +156,7 @@ public:
     explicit HLE_DrawIndexedIndirect(Engines::Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
 
     void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
-        auto topology =
-            static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0]);
+        auto topology = static_cast<Maxwell::Regs::PrimitiveTopology>(parameters[0]);
         if (!IsTopologySafe(topology)) {
             Fallback(parameters);
             return;
@@ -164,16 +168,12 @@ public:
             minimum_limit = std::max(parameters[3], minimum_limit);
         }
         const u32 estimate = static_cast<u32>(maxwell3d.EstimateIndexBufferSize());
-        const u32 base_size = std::max(minimum_limit, estimate);
-        const u32 element_base = parameters[4];
-        const u32 base_instance = parameters[5];
-        maxwell3d.regs.index_buffer.first = 0;
-        maxwell3d.regs.index_buffer.count = base_size; // Use a fixed size, just for mapping
+        const u32 base_size = std::max<u32>(minimum_limit, estimate);
         maxwell3d.regs.draw.topology.Assign(topology);
         maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
-        maxwell3d.CallMethod(0x8e3, 0x640, true);
-        maxwell3d.CallMethod(0x8e4, element_base, true);
-        maxwell3d.CallMethod(0x8e5, base_instance, true);
+        maxwell3d.engine_state = Maxwell::EngineHint::OnHLEMacro;
+        maxwell3d.setHLEReplacementName(0, 0x640, Maxwell::HLEReplaceName::BaseVertex);
+        maxwell3d.setHLEReplacementName(0, 0x644, Maxwell::HLEReplaceName::BaseInstance);
         auto& params = maxwell3d.draw_manager->GetIndirectParams();
         params.is_indexed = true;
         params.include_count = false;
@@ -184,9 +184,8 @@ public:
         params.stride = 0;
         maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
         maxwell3d.draw_manager->DrawIndexedIndirect(topology, 0, base_size);
-        maxwell3d.CallMethod(0x8e3, 0x640, true);
-        maxwell3d.CallMethod(0x8e4, 0x0, true);
-        maxwell3d.CallMethod(0x8e5, 0x0, true);
+        maxwell3d.engine_state = Maxwell::EngineHint::None;
+        maxwell3d.replace_table.clear();
     }
 
 private:
@@ -197,18 +196,17 @@ private:
         const u32 base_instance = parameters[5];
         maxwell3d.regs.vertex_id_base = element_base;
         maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
-        maxwell3d.CallMethod(0x8e3, 0x640, true);
-        maxwell3d.CallMethod(0x8e4, element_base, true);
-        maxwell3d.CallMethod(0x8e5, base_instance, true);
+        maxwell3d.engine_state = Maxwell::EngineHint::OnHLEMacro;
+        maxwell3d.setHLEReplacementName(0, 0x640, Maxwell::HLEReplaceName::BaseVertex);
+        maxwell3d.setHLEReplacementName(0, 0x644, Maxwell::HLEReplaceName::BaseInstance);
 
         maxwell3d.draw_manager->DrawIndex(
             static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0]),
             parameters[3], parameters[1], element_base, base_instance, instance_count);
 
         maxwell3d.regs.vertex_id_base = 0x0;
-        maxwell3d.CallMethod(0x8e3, 0x640, true);
-        maxwell3d.CallMethod(0x8e4, 0x0, true);
-        maxwell3d.CallMethod(0x8e5, 0x0, true);
+        maxwell3d.engine_state = Maxwell::EngineHint::None;
+        maxwell3d.replace_table.clear();
     }
 
     u32 minimum_limit{1 << 18};
@@ -238,8 +236,7 @@ public:
         : HLEMacroImpl(maxwell3d_) {}
 
     void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
-        const auto topology =
-            static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[2]);
+        const auto topology = static_cast<Maxwell::Regs::PrimitiveTopology>(parameters[2]);
         if (!IsTopologySafe(topology)) {
             Fallback(parameters);
             return;
@@ -277,9 +274,6 @@ public:
         }
         const u32 estimate = static_cast<u32>(maxwell3d.EstimateIndexBufferSize());
         const u32 base_size = std::max(minimum_limit, estimate);
-
-        maxwell3d.regs.index_buffer.first = 0;
-        maxwell3d.regs.index_buffer.count = std::max(highest_limit, base_size);
         maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
         auto& params = maxwell3d.draw_manager->GetIndirectParams();
         params.is_indexed = true;
@@ -290,7 +284,12 @@ public:
         params.max_draw_counts = draw_count;
         params.stride = stride;
         maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
-        maxwell3d.draw_manager->DrawIndexedIndirect(topology, 0, highest_limit);
+        maxwell3d.engine_state = Maxwell::EngineHint::OnHLEMacro;
+        maxwell3d.setHLEReplacementName(0, 0x640, Maxwell::HLEReplaceName::BaseVertex);
+        maxwell3d.setHLEReplacementName(0, 0x644, Maxwell::HLEReplaceName::BaseInstance);
+        maxwell3d.draw_manager->DrawIndexedIndirect(topology, 0, base_size);
+        maxwell3d.engine_state = Maxwell::EngineHint::None;
+        maxwell3d.replace_table.clear();
     }
 
 private:
@@ -299,9 +298,8 @@ private:
             // Clean everything.
             // Clean everything.
             maxwell3d.regs.vertex_id_base = 0x0;
-            maxwell3d.CallMethod(0x8e3, 0x640, true);
-            maxwell3d.CallMethod(0x8e4, 0x0, true);
-            maxwell3d.CallMethod(0x8e5, 0x0, true);
+            maxwell3d.engine_state = Maxwell::EngineHint::None;
+            maxwell3d.replace_table.clear();
         });
         maxwell3d.RefreshParameters();
         const u32 start_indirect = parameters[0];
@@ -310,8 +308,7 @@ private:
             // Nothing to do.
             return;
         }
-        const auto topology =
-            static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[2]);
+        const auto topology = static_cast<Maxwell::Regs::PrimitiveTopology>(parameters[2]);
         maxwell3d.regs.draw.topology.Assign(topology);
         const u32 padding = parameters[3];
         const std::size_t max_draws = parameters[4];
@@ -326,9 +323,9 @@ private:
             const u32 base_vertex = parameters[base + 3];
             const u32 base_instance = parameters[base + 4];
             maxwell3d.regs.vertex_id_base = base_vertex;
-            maxwell3d.CallMethod(0x8e3, 0x640, true);
-            maxwell3d.CallMethod(0x8e4, base_vertex, true);
-            maxwell3d.CallMethod(0x8e5, base_instance, true);
+            maxwell3d.engine_state = Maxwell::EngineHint::OnHLEMacro;
+            maxwell3d.setHLEReplacementName(0, 0x640, Maxwell::HLEReplaceName::BaseVertex);
+            maxwell3d.setHLEReplacementName(0, 0x644, Maxwell::HLEReplaceName::BaseInstance);
             maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
             maxwell3d.draw_manager->DrawIndex(topology, parameters[base + 2], parameters[base],
                                               base_vertex, base_instance, parameters[base + 1]);
diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp
index 8f6c510458..11e7d225ec 100644
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -577,7 +577,7 @@ size_t MemoryManager::MaxContinousRange(GPUVAddr gpu_addr, size_t size) const {
     return range_so_far;
 }
 
-size_t MemoryManager::GetMemoryLayoutSize(GPUVAddr gpu_addr) const {
+size_t MemoryManager::GetMemoryLayoutSize(GPUVAddr gpu_addr, size_t max_size) const {
     PTEKind base_kind = GetPageKind(gpu_addr);
     if (base_kind == PTEKind::INVALID) {
         return 0;
@@ -596,6 +596,10 @@ size_t MemoryManager::GetMemoryLayoutSize(GPUVAddr gpu_addr) const {
             return true;
         }
         range_so_far += copy_amount;
+        if (range_so_far >= max_size) {
+            result = true;
+            return true;
+        }
         return false;
     };
     auto big_check = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) {
@@ -605,6 +609,10 @@ size_t MemoryManager::GetMemoryLayoutSize(GPUVAddr gpu_addr) const {
             return true;
         }
         range_so_far += copy_amount;
+        if (range_so_far >= max_size) {
+            result = true;
+            return true;
+        }
         return false;
     };
     auto check_short_pages = [&](std::size_t page_index, std::size_t offset,
diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h
index 65f6e8134d..ca22520d75 100644
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -118,7 +118,8 @@ public:
 
     PTEKind GetPageKind(GPUVAddr gpu_addr) const;
 
-    size_t GetMemoryLayoutSize(GPUVAddr gpu_addr) const;
+    size_t GetMemoryLayoutSize(GPUVAddr gpu_addr,
+                               size_t max_size = std::numeric_limits<size_t>::max()) const;
 
 private:
     template <bool is_big_pages, typename FuncMapped, typename FuncReserved, typename FuncUnmapped>
diff --git a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp
index e62b36822f..df229f41b8 100644
--- a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp
+++ b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp
@@ -97,6 +97,7 @@ void FixedPipelineState::Refresh(Tegra::Engines::Maxwell3D& maxwell3d,
     smooth_lines.Assign(regs.line_anti_alias_enable != 0 ? 1 : 0);
     alpha_to_coverage_enabled.Assign(regs.anti_alias_alpha_control.alpha_to_coverage != 0 ? 1 : 0);
     alpha_to_one_enabled.Assign(regs.anti_alias_alpha_control.alpha_to_one != 0 ? 1 : 0);
+    app_stage.Assign(maxwell3d.engine_state);
 
     for (size_t i = 0; i < regs.rt.size(); ++i) {
         color_formats[i] = static_cast<u8>(regs.rt[i].format);
diff --git a/src/video_core/renderer_vulkan/fixed_pipeline_state.h b/src/video_core/renderer_vulkan/fixed_pipeline_state.h
index ab79fb8f36..03bf64b575 100644
--- a/src/video_core/renderer_vulkan/fixed_pipeline_state.h
+++ b/src/video_core/renderer_vulkan/fixed_pipeline_state.h
@@ -197,6 +197,7 @@ struct FixedPipelineState {
         BitField<14, 1, u32> smooth_lines;
         BitField<15, 1, u32> alpha_to_coverage_enabled;
         BitField<16, 1, u32> alpha_to_one_enabled;
+        BitField<17, 3, Tegra::Engines::Maxwell3D::EngineHint> app_stage;
     };
     std::array<u8, Maxwell::NumRenderTargets> color_formats;
 
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index e7262420c9..58b955821e 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -54,7 +54,7 @@ using VideoCommon::FileEnvironment;
 using VideoCommon::GenericEnvironment;
 using VideoCommon::GraphicsEnvironment;
 
-constexpr u32 CACHE_VERSION = 8;
+constexpr u32 CACHE_VERSION = 9;
 
 template <typename Container>
 auto MakeSpan(Container& container) {
diff --git a/src/video_core/shader_environment.cpp b/src/video_core/shader_environment.cpp
index 9588107473..99d85bfb3e 100644
--- a/src/video_core/shader_environment.cpp
+++ b/src/video_core/shader_environment.cpp
@@ -202,12 +202,15 @@ void GenericEnvironment::Serialize(std::ofstream& file) const {
     const u64 num_texture_types{static_cast<u64>(texture_types.size())};
     const u64 num_texture_pixel_formats{static_cast<u64>(texture_pixel_formats.size())};
     const u64 num_cbuf_values{static_cast<u64>(cbuf_values.size())};
+    const u64 num_cbuf_replacement_values{static_cast<u64>(cbuf_replacements.size())};
 
     file.write(reinterpret_cast<const char*>(&code_size), sizeof(code_size))
         .write(reinterpret_cast<const char*>(&num_texture_types), sizeof(num_texture_types))
         .write(reinterpret_cast<const char*>(&num_texture_pixel_formats),
                sizeof(num_texture_pixel_formats))
         .write(reinterpret_cast<const char*>(&num_cbuf_values), sizeof(num_cbuf_values))
+        .write(reinterpret_cast<const char*>(&num_cbuf_replacement_values),
+               sizeof(num_cbuf_replacement_values))
         .write(reinterpret_cast<const char*>(&local_memory_size), sizeof(local_memory_size))
         .write(reinterpret_cast<const char*>(&texture_bound), sizeof(texture_bound))
         .write(reinterpret_cast<const char*>(&start_address), sizeof(start_address))
@@ -229,6 +232,10 @@ void GenericEnvironment::Serialize(std::ofstream& file) const {
         file.write(reinterpret_cast<const char*>(&key), sizeof(key))
             .write(reinterpret_cast<const char*>(&type), sizeof(type));
     }
+    for (const auto& [key, type] : cbuf_replacements) {
+        file.write(reinterpret_cast<const char*>(&key), sizeof(key))
+            .write(reinterpret_cast<const char*>(&type), sizeof(type));
+    }
     if (stage == Shader::Stage::Compute) {
         file.write(reinterpret_cast<const char*>(&workgroup_size), sizeof(workgroup_size))
             .write(reinterpret_cast<const char*>(&shared_memory_size), sizeof(shared_memory_size));
@@ -318,6 +325,8 @@ GraphicsEnvironment::GraphicsEnvironment(Tegra::Engines::Maxwell3D& maxwell3d_,
     ASSERT(local_size <= std::numeric_limits<u32>::max());
     local_memory_size = static_cast<u32>(local_size) + sph.common3.shader_local_memory_crs_size;
     texture_bound = maxwell3d->regs.bindless_texture_const_buffer_slot;
+    has_hle_engine_state =
+        maxwell3d->engine_state == Tegra::Engines::Maxwell3D::EngineHint::OnHLEMacro;
 }
 
 u32 GraphicsEnvironment::ReadCbufValue(u32 cbuf_index, u32 cbuf_offset) {
@@ -331,6 +340,30 @@ u32 GraphicsEnvironment::ReadCbufValue(u32 cbuf_index, u32 cbuf_offset) {
     return value;
 }
 
+std::optional<Shader::ReplaceConstant> GraphicsEnvironment::GetReplaceConstBuffer(u32 bank,
+                                                                                  u32 offset) {
+    if (!has_hle_engine_state) {
+        return std::nullopt;
+    }
+    const u64 key = (static_cast<u64>(bank) << 32) | static_cast<u64>(offset);
+    auto it = maxwell3d->replace_table.find(key);
+    if (it == maxwell3d->replace_table.end()) {
+        return std::nullopt;
+    }
+    const auto converted_value = [](Tegra::Engines::Maxwell3D::HLEReplaceName name) {
+        switch (name) {
+        case Tegra::Engines::Maxwell3D::HLEReplaceName::BaseVertex:
+            return Shader::ReplaceConstant::BaseVertex;
+        case Tegra::Engines::Maxwell3D::HLEReplaceName::BaseInstance:
+            return Shader::ReplaceConstant::BaseInstance;
+        default:
+            UNREACHABLE();
+        }
+    }(it->second);
+    cbuf_replacements.emplace(key, converted_value);
+    return converted_value;
+}
+
 Shader::TextureType GraphicsEnvironment::ReadTextureType(u32 handle) {
     const auto& regs{maxwell3d->regs};
     const bool via_header_index{regs.sampler_binding == Maxwell::SamplerBinding::ViaHeaderBinding};
@@ -409,11 +442,14 @@ void FileEnvironment::Deserialize(std::ifstream& file) {
     u64 num_texture_types{};
     u64 num_texture_pixel_formats{};
     u64 num_cbuf_values{};
+    u64 num_cbuf_replacement_values{};
     file.read(reinterpret_cast<char*>(&code_size), sizeof(code_size))
         .read(reinterpret_cast<char*>(&num_texture_types), sizeof(num_texture_types))
         .read(reinterpret_cast<char*>(&num_texture_pixel_formats),
               sizeof(num_texture_pixel_formats))
         .read(reinterpret_cast<char*>(&num_cbuf_values), sizeof(num_cbuf_values))
+        .read(reinterpret_cast<char*>(&num_cbuf_replacement_values),
+              sizeof(num_cbuf_replacement_values))
         .read(reinterpret_cast<char*>(&local_memory_size), sizeof(local_memory_size))
         .read(reinterpret_cast<char*>(&texture_bound), sizeof(texture_bound))
         .read(reinterpret_cast<char*>(&start_address), sizeof(start_address))
@@ -444,6 +480,13 @@ void FileEnvironment::Deserialize(std::ifstream& file) {
             .read(reinterpret_cast<char*>(&value), sizeof(value));
         cbuf_values.emplace(key, value);
     }
+    for (size_t i = 0; i < num_cbuf_replacement_values; ++i) {
+        u64 key;
+        Shader::ReplaceConstant value;
+        file.read(reinterpret_cast<char*>(&key), sizeof(key))
+            .read(reinterpret_cast<char*>(&value), sizeof(value));
+        cbuf_replacements.emplace(key, value);
+    }
     if (stage == Shader::Stage::Compute) {
         file.read(reinterpret_cast<char*>(&workgroup_size), sizeof(workgroup_size))
             .read(reinterpret_cast<char*>(&shared_memory_size), sizeof(shared_memory_size));
@@ -512,6 +555,16 @@ std::array<u32, 3> FileEnvironment::WorkgroupSize() const {
     return workgroup_size;
 }
 
+std::optional<Shader::ReplaceConstant> FileEnvironment::GetReplaceConstBuffer(u32 bank,
+                                                                              u32 offset) {
+    const u64 key = (static_cast<u64>(bank) << 32) | static_cast<u64>(offset);
+    auto it = cbuf_replacements.find(key);
+    if (it == cbuf_replacements.end()) {
+        return std::nullopt;
+    }
+    return it->second;
+}
+
 void SerializePipeline(std::span<const char> key, std::span<const GenericEnvironment* const> envs,
                        const std::filesystem::path& filename, u32 cache_version) try {
     std::ofstream file(filename, std::ios::binary | std::ios::ate | std::ios::app);
diff --git a/src/video_core/shader_environment.h b/src/video_core/shader_environment.h
index 1342fab1e9..d75987a52e 100644
--- a/src/video_core/shader_environment.h
+++ b/src/video_core/shader_environment.h
@@ -60,6 +60,10 @@ public:
 
     void Serialize(std::ofstream& file) const;
 
+    bool HasHLEMacroState() const override {
+        return has_hle_engine_state;
+    }
+
 protected:
     std::optional<u64> TryFindSize();
 
@@ -73,6 +77,7 @@ protected:
     std::unordered_map<u32, Shader::TextureType> texture_types;
     std::unordered_map<u32, Shader::TexturePixelFormat> texture_pixel_formats;
     std::unordered_map<u64, u32> cbuf_values;
+    std::unordered_map<u64, Shader::ReplaceConstant> cbuf_replacements;
 
     u32 local_memory_size{};
     u32 texture_bound{};
@@ -89,6 +94,7 @@ protected:
     u32 viewport_transform_state = 1;
 
     bool has_unbound_instructions = false;
+    bool has_hle_engine_state = false;
 };
 
 class GraphicsEnvironment final : public GenericEnvironment {
@@ -109,6 +115,8 @@ public:
 
     u32 ReadViewportTransformState() override;
 
+    std::optional<Shader::ReplaceConstant> GetReplaceConstBuffer(u32 bank, u32 offset) override;
+
 private:
     Tegra::Engines::Maxwell3D* maxwell3d{};
     size_t stage_index{};
@@ -131,6 +139,11 @@ public:
 
     u32 ReadViewportTransformState() override;
 
+    std::optional<Shader::ReplaceConstant> GetReplaceConstBuffer(
+        [[maybe_unused]] u32 bank, [[maybe_unused]] u32 offset) override {
+        return std::nullopt;
+    }
+
 private:
     Tegra::Engines::KeplerCompute* kepler_compute{};
 };
@@ -166,6 +179,13 @@ public:
 
     [[nodiscard]] std::array<u32, 3> WorkgroupSize() const override;
 
+    [[nodiscard]] std::optional<Shader::ReplaceConstant> GetReplaceConstBuffer(u32 bank,
+                                                                               u32 offset) override;
+
+    [[nodiscard]] bool HasHLEMacroState() const override {
+        return cbuf_replacements.size() != 0;
+    }
+
     void Dump(u64 hash) override;
 
 private:
@@ -173,6 +193,7 @@ private:
     std::unordered_map<u32, Shader::TextureType> texture_types;
     std::unordered_map<u32, Shader::TexturePixelFormat> texture_pixel_formats;
     std::unordered_map<u64, u32> cbuf_values;
+    std::unordered_map<u64, Shader::ReplaceConstant> cbuf_replacements;
     std::array<u32, 3> workgroup_size{};
     u32 local_memory_size{};
     u32 shared_memory_size{};