From 3b84e04af1ce2f9e218e7bcf225dd3eff1ddc61d Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Sat, 29 Dec 2018 02:44:54 -0300
Subject: [PATCH] shader_decode: Implement LDG and basic cbuf tracking

---
 src/video_core/CMakeLists.txt                 |  1 +
 src/video_core/engines/shader_bytecode.h      |  8 ++
 .../renderer_opengl/gl_shader_decompiler.cpp  | 44 +++++++++--
 .../renderer_opengl/gl_shader_decompiler.h    | 34 +++++++++
 src/video_core/shader/decode/memory.cpp       | 49 ++++++++++++
 src/video_core/shader/shader_ir.h             | 38 +++++++++-
 src/video_core/shader/track.cpp               | 76 +++++++++++++++++++
 7 files changed, 240 insertions(+), 10 deletions(-)
 create mode 100644 src/video_core/shader/track.cpp

diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 509ca117a1..6113e17ff1 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -87,6 +87,7 @@ add_library(video_core STATIC
     shader/decode.cpp
     shader/shader_ir.cpp
     shader/shader_ir.h
+    shader/track.cpp
     surface.cpp
     surface.h
     textures/astc.cpp
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index cdef97bc68..9989825f8e 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -208,6 +208,8 @@ enum class UniformType : u64 {
     SignedShort = 3,
     Single = 4,
     Double = 5,
+    Quad = 6,
+    UnsignedQuad = 7,
 };
 
 enum class StoreType : u64 {
@@ -784,6 +786,12 @@ union Instruction {
         BitField<44, 2, u64> unknown;
     } st_l;
 
+    union { // Operand fields read when decoding LDG
+        BitField<48, 3, UniformType> type; // selects how many values are loaded
+        BitField<46, 2, u64> cache_mode;
+        BitField<20, 24, s64> immediate_offset; // signed; added to the address register
+    } ldg;
+
     union {
         BitField<0, 3, u64> pred0;
         BitField<3, 3, u64> pred3;
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 3411cf9e64..e072216f04 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -34,6 +34,8 @@ using Operation = const OperationNode&;
 enum : u32 { POSITION_VARYING_LOCATION = 0, GENERIC_VARYING_START_LOCATION = 1 };
 constexpr u32 MAX_CONSTBUFFER_ELEMENTS =
     static_cast<u32>(RasterizerOpenGL::MaxConstbufferSize) / (4 * sizeof(float));
+constexpr u32 MAX_GLOBALMEMORY_ELEMENTS =
+    static_cast<u32>(RasterizerOpenGL::MaxGlobalMemorySize) / sizeof(float); // float array length
 
 enum class Type { Bool, Bool2, Float, Int, Uint, HalfFloat };
 
@@ -143,6 +145,7 @@ public:
         DeclareInputAttributes();
         DeclareOutputAttributes();
         DeclareConstantBuffers();
+        DeclareGlobalMemory();
         DeclareSamplers();
 
         code.AddLine("void execute_" + suffix + "() {");
@@ -190,12 +193,15 @@ public:
     ShaderEntries GetShaderEntries() const {
         ShaderEntries entries;
         for (const auto& cbuf : ir.GetConstantBuffers()) {
-            ConstBufferEntry desc(cbuf.second, stage, GetConstBufferBlock(cbuf.first), cbuf.first);
-            entries.const_buffers.push_back(desc);
+            entries.const_buffers.emplace_back(cbuf.second, stage, GetConstBufferBlock(cbuf.first),
+                                               cbuf.first);
         }
         for (const auto& sampler : ir.GetSamplers()) {
-            SamplerEntry desc(sampler, stage, GetSampler(sampler));
-            entries.samplers.push_back(desc);
+            entries.samplers.emplace_back(sampler, stage, GetSampler(sampler));
+        }
+        for (const auto& gmem : ir.GetGlobalMemoryBases()) {
+            entries.global_memory_entries.emplace_back(gmem.cbuf_index, gmem.cbuf_offset, stage,
+                                                       GetGlobalMemoryBlock(gmem));
         }
         entries.clip_distances = ir.GetClipDistances();
         entries.shader_length = ir.GetLength();
@@ -375,6 +381,15 @@ private:
         }
     }
 
+    void DeclareGlobalMemory() { // Emits one std430 buffer block per tracked global memory base
+        for (const auto& entry : ir.GetGlobalMemoryBases()) {
+            code.AddLine("layout (std430) buffer " + GetGlobalMemoryBlock(entry) + " {");
+            code.AddLine("    float " + GetGlobalMemory(entry) + "[MAX_GLOBALMEMORY_ELEMENTS];");
+            code.AddLine("};");
+            code.AddNewLine();
+        }
+    }
+
     void DeclareSamplers() {
         const auto& samplers = ir.GetSamplers();
         for (const auto& sampler : samplers) {
@@ -538,6 +553,12 @@ private:
                 UNREACHABLE_MSG("Unmanaged offset node type");
             }
 
+        } else if (const auto gmem = std::get_if<GmemNode>(node)) {
+            const std::string real = Visit(gmem->GetRealAddress());
+            const std::string base = Visit(gmem->GetBaseAddress());
+            const std::string final_offset = "(ftou(" + real + ") - ftou(" + base + ")) / 4";
+            return fmt::format("{}[{}]", GetGlobalMemory(gmem->GetDescriptor()), final_offset);
+
         } else if (const auto lmem = std::get_if<LmemNode>(node)) {
             return fmt::format("{}[ftou({}) / 4]", GetLocalMemory(), Visit(lmem->GetAddress()));
 
@@ -1471,6 +1492,15 @@ private:
         return GetDeclarationWithSuffix(index, "cbuf");
     }
 
+    std::string GetGlobalMemory(const GlobalMemoryBase& descriptor) const { // GLSL array name
+        return fmt::format("gmem_{}_{}_{}", descriptor.cbuf_index, descriptor.cbuf_offset, suffix);
+    }
+
+    std::string GetGlobalMemoryBlock(const GlobalMemoryBase& descriptor) const { // GLSL block name
+        return fmt::format("gmem_block_{}_{}_{}", descriptor.cbuf_index, descriptor.cbuf_offset,
+                           suffix);
+    }
+
     std::string GetConstBufferBlock(u32 index) const {
         return GetDeclarationWithSuffix(index, "cbuf_block");
     }
@@ -1505,8 +1535,10 @@ private:
 };
 
 std::string GetCommonDeclarations() {
-    return "#define MAX_CONSTBUFFER_ELEMENTS " + std::to_string(MAX_CONSTBUFFER_ELEMENTS) +
-           "\n"
+    const auto cbuf = std::to_string(MAX_CONSTBUFFER_ELEMENTS);
+    const auto gmem = std::to_string(MAX_GLOBALMEMORY_ELEMENTS);
+    return "#define MAX_CONSTBUFFER_ELEMENTS " + cbuf + "\n" +
+           "#define MAX_GLOBALMEMORY_ELEMENTS " + gmem + "\n" +
            "#define ftoi floatBitsToInt\n"
            "#define ftou floatBitsToUint\n"
            "#define itof intBitsToFloat\n"
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h
index 396a560d8e..e47bc37293 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.h
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h
@@ -71,9 +71,43 @@ private:
     Maxwell::ShaderStage stage{};
 };
 
+/// Global memory region used by a shader, identified by the const buffer slot of its base.
+class GlobalMemoryEntry {
+public:
+    explicit GlobalMemoryEntry(u32 cbuf_index, u32 cbuf_offset, Maxwell::ShaderStage stage,
+                               std::string name)
+        : cbuf_index{cbuf_index}, cbuf_offset{cbuf_offset}, stage{stage}, name{std::move(name)} {}
+    /// Index of the const buffer the base address was tracked to.
+    u32 GetCbufIndex() const {
+        return cbuf_index;
+    }
+    /// Offset inside that const buffer, as recorded by the shader decoder.
+    u32 GetCbufOffset() const {
+        return cbuf_offset;
+    }
+    /// Name given at construction (the decompiler passes the GLSL buffer block name).
+    const std::string& GetName() const {
+        return name;
+    }
+    /// Shader stage this entry belongs to.
+    Maxwell::ShaderStage GetStage() const {
+        return stage;
+    }
+    /// ORs stage << 24, cbuf_index << 16 and cbuf_offset together; fields are not masked.
+    u32 GetHash() const {
+        return (static_cast<u32>(stage) << 24) | (cbuf_index << 16) | cbuf_offset;
+    }
+private:
+    u32 cbuf_index{};
+    u32 cbuf_offset{};
+    Maxwell::ShaderStage stage{};
+    std::string name;
+};
+
 struct ShaderEntries {
     std::vector<ConstBufferEntry> const_buffers;
     std::vector<SamplerEntry> samplers;
+    std::vector<GlobalMemoryEntry> global_memory_entries;
     std::array<bool, Maxwell::NumClipDistances> clip_distances{};
     std::size_t shader_length{};
 };
diff --git a/src/video_core/shader/decode/memory.cpp b/src/video_core/shader/decode/memory.cpp
index ae71672d68..04cb386b76 100644
--- a/src/video_core/shader/decode/memory.cpp
+++ b/src/video_core/shader/decode/memory.cpp
@@ -4,6 +4,7 @@
 
 #include <algorithm>
 #include <vector>
+#include <fmt/format.h>
 
 #include "common/assert.h"
 #include "common/common_types.h"
@@ -119,6 +120,54 @@ u32 ShaderIR::DecodeMemory(BasicBlock& bb, const BasicBlock& code, u32 pc) {
         }
         break;
     }
+    case OpCode::Id::LDG: { // Load from global memory into consecutive registers from gpr0
+        const u32 count = [&]() { // number of registers to load
+            switch (instr.ldg.type) {
+            case Tegra::Shader::UniformType::Single:
+                return 1;
+            case Tegra::Shader::UniformType::Double:
+                return 2;
+            case Tegra::Shader::UniformType::Quad:
+            case Tegra::Shader::UniformType::UnsignedQuad:
+                return 4;
+            default:
+                UNIMPLEMENTED_MSG("Unimplemented LDG size!");
+                return 1;
+            }
+        }();
+        // Find the const buffer read that the address register was assigned from
+        const Node addr_register = GetRegister(instr.gpr8);
+        const Node base_address = TrackCbuf(addr_register, code, static_cast<s64>(code.size()));
+        const auto cbuf = std::get_if<CbufNode>(base_address);
+        ASSERT(cbuf != nullptr);
+        const auto cbuf_offset_imm = std::get_if<ImmediateNode>(cbuf->GetOffset());
+        ASSERT(cbuf_offset_imm != nullptr);
+        const auto cbuf_offset = cbuf_offset_imm->GetValue() * 4; // presumably words to bytes
+        // Leave a comment node in the block naming the resolved base location
+        bb.push_back(Comment(
+            fmt::format("Base address is c[0x{:x}][0x{:x}]", cbuf->GetIndex(), cbuf_offset)));
+        // Remember this base so GetGlobalMemoryBases() reports it
+        const GlobalMemoryBase descriptor{cbuf->GetIndex(), cbuf_offset};
+        used_global_memory_bases.insert(descriptor);
+        // First address to read: address register plus the instruction's immediate offset
+        const Node immediate_offset =
+            Immediate(static_cast<u32>(instr.ldg.immediate_offset.Value()));
+        const Node base_real_address =
+            Operation(OperationCode::UAdd, NO_PRECISE, immediate_offset, addr_register);
+        // Read everything into temporals first, then assign the destination registers
+        for (u32 i = 0; i < count; ++i) {
+            const Node it_offset = Immediate(i * 4);
+            const Node real_address =
+                Operation(OperationCode::UAdd, NO_PRECISE, base_real_address, it_offset);
+            const Node gmem = StoreNode(GmemNode(real_address, base_address, descriptor));
+
+            SetTemporal(bb, i, gmem);
+        }
+        for (u32 i = 0; i < count; ++i) {
+            SetRegister(bb, instr.gpr0.Value() + i, GetTemporal(i));
+        }
+        break;
+    }
     case OpCode::Id::ST_A: {
         UNIMPLEMENTED_IF_MSG(instr.gpr8.Value() != Register::ZeroIndex,
                              "Indirect attribute loads are not supported");
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h
index ef8f94480b..c4ecb2e3cd 100644
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -257,6 +257,15 @@ private:
     bool is_indirect{};
 };
 
+struct GlobalMemoryBase { // Const buffer slot that holds a global memory base address
+    u32 cbuf_index{};
+    u32 cbuf_offset{};
+    // Orders by (cbuf_index, cbuf_offset) so the struct can be used as a std::set key
+    bool operator<(const GlobalMemoryBase& rhs) const {
+        return std::tie(cbuf_index, cbuf_offset) < std::tie(rhs.cbuf_index, rhs.cbuf_offset);
+    }
+};
+
 struct MetaArithmetic {
     bool precise{};
 };
@@ -478,14 +487,26 @@ private:
 /// Global memory node
 class GmemNode final {
 public:
-    explicit constexpr GmemNode(Node address) : address{address} {}
+    explicit constexpr GmemNode(Node real_address, Node base_address,
+                                const GlobalMemoryBase& descriptor)
+        : real_address{real_address}, base_address{base_address}, descriptor{descriptor} {}
 
-    Node GetAddress() const {
-        return address;
+    Node GetRealAddress() const {
+        return real_address;
+    }
+
+    Node GetBaseAddress() const {
+        return base_address;
+    }
+
+    const GlobalMemoryBase& GetDescriptor() const {
+        return descriptor;
     }
 
 private:
-    const Node address;
+    const Node real_address;
+    const Node base_address;
+    const GlobalMemoryBase descriptor;
 };
 
 /// Commentary, can be dropped
@@ -543,6 +564,10 @@ public:
         return used_clip_distances;
     }
 
+    const std::set<GlobalMemoryBase>& GetGlobalMemoryBases() const {
+        return used_global_memory_bases; // bases recorded while decoding LDG
+    }
+
     std::size_t GetLength() const {
         return static_cast<std::size_t>(coverage_end * sizeof(u64));
     }
@@ -734,6 +759,10 @@ private:
     void WriteLop3Instruction(BasicBlock& bb, Tegra::Shader::Register dest, Node op_a, Node op_b,
                               Node op_c, Node imm_lut, bool sets_cc);
 
+    Node TrackCbuf(Node tracked, const BasicBlock& code, s64 cursor); // nullptr if none is found
+    // Returns the source node and position of the latest assignment to a register, or {}
+    std::pair<Node, s64> TrackRegister(const GprNode* tracked, const BasicBlock& code, s64 cursor);
+
     template <typename... T>
     Node Operation(OperationCode code, const T*... operands) {
         return StoreNode(OperationNode(code, operands...));
@@ -786,6 +815,7 @@ private:
     std::map<u32, ConstBuffer> used_cbufs;
     std::set<Sampler> used_samplers;
     std::array<bool, Tegra::Engines::Maxwell3D::Regs::NumClipDistances> used_clip_distances{};
+    std::set<GlobalMemoryBase> used_global_memory_bases;
 
     Tegra::Shader::Header header;
 };
diff --git a/src/video_core/shader/track.cpp b/src/video_core/shader/track.cpp
new file mode 100644
index 0000000000..d6d29ee9fc
--- /dev/null
+++ b/src/video_core/shader/track.cpp
@@ -0,0 +1,76 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <utility>
+#include <variant>
+
+#include "video_core/shader/shader_ir.h"
+
+namespace VideoCommon::Shader {
+
+namespace {
+// Searches code backwards from cursor; returns the first node matching operation_code, or {}
+std::pair<Node, s64> FindOperation(const BasicBlock& code, s64 cursor,
+                                   OperationCode operation_code) {
+    for (s64 index = cursor; index >= 0; --index) {
+        const auto operation = std::get_if<OperationNode>(code[index]);
+        if (operation && operation->GetCode() == operation_code) {
+            return {code[index], index};
+        }
+    }
+    return {};
+}
+} // namespace
+
+// Resolves a node to the constant buffer read (with immediate offset) that it derives from,
+// following register assignments backwards through code; returns nullptr when none is found.
+Node ShaderIR::TrackCbuf(Node tracked, const BasicBlock& code, s64 cursor) {
+    if (const auto cbuf_node = std::get_if<CbufNode>(tracked)) {
+        // Only constant buffers addressed by an immediate offset are accepted
+        if (std::holds_alternative<ImmediateNode>(*cbuf_node->GetOffset())) {
+            return tracked;
+        }
+        return nullptr;
+    }
+    if (const auto reg = std::get_if<GprNode>(tracked)) {
+        if (reg->GetIndex() == Tegra::Shader::Register::ZeroIndex) {
+            return nullptr;
+        }
+        // Start one node earlier so an instruction that writes the register it reads does
+        // not keep resolving to itself
+        const auto [source, new_cursor] = TrackRegister(reg, code, cursor - 1);
+        return source ? TrackCbuf(source, code, new_cursor) : nullptr;
+    }
+    if (const auto op = std::get_if<OperationNode>(tracked)) {
+        for (std::size_t index = 0; index < op->GetOperandsCount(); ++index) {
+            if (const auto result = TrackCbuf((*op)[index], code, cursor)) {
+                return result; // First operand that leads to a cbuf wins
+            }
+        }
+    }
+    return nullptr;
+}
+
+// Walks code backwards from cursor for the latest Assign to the tracked register; returns
+// its source operand and index, or {} when no such assignment is found.
+std::pair<Node, s64> ShaderIR::TrackRegister(const GprNode* tracked, const BasicBlock& code,
+                                             s64 cursor) {
+    while (cursor >= 0) {
+        const auto [found_node, new_cursor] = FindOperation(code, cursor, OperationCode::Assign);
+        if (!found_node) {
+            return {};
+        }
+        const auto operation = std::get_if<OperationNode>(found_node);
+        ASSERT(operation);
+        const auto gpr_target = std::get_if<GprNode>((*operation)[0]);
+        if (gpr_target && gpr_target->GetIndex() == tracked->GetIndex()) {
+            return {(*operation)[1], new_cursor};
+        }
+        cursor = new_cursor - 1; // skip this Assign instead of finding it again
+    }
+    return {};
+}
+
+} // namespace VideoCommon::Shader