From 63ba41a26d6bfdd30a4d7cd0879003fb4000332b Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Thu, 16 Jan 2020 04:00:52 -0300
Subject: [PATCH] shader/memory: Implement ATOMS.ADD.U32

---
 src/video_core/engines/shader_bytecode.h      | 37 +++++++++++++++++--
 .../renderer_opengl/gl_shader_decompiler.cpp  | 12 ++++++
 .../renderer_vulkan/vk_shader_decompiler.cpp  |  7 ++++
 src/video_core/shader/decode/memory.cpp       | 19 ++++++++++
 src/video_core/shader/node.h                  |  2 +
 5 files changed, 74 insertions(+), 3 deletions(-)

diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index 57b57c647d..6f98bd8277 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -215,6 +215,18 @@ enum class F2fRoundingOp : u64 {
     Trunc = 11,
 };
 
+enum class AtomicOp : u64 {
+    Add = 0,
+    Min = 1,
+    Max = 2,
+    Inc = 3,
+    Dec = 4,
+    And = 5,
+    Or = 6,
+    Xor = 7,
+    Exch = 8,
+};
+
 enum class UniformType : u64 {
     UnsignedByte = 0,
     SignedByte = 1,
@@ -236,6 +248,13 @@ enum class StoreType : u64 {
     Bits128 = 6,
 };
 
+enum class AtomicType : u64 {
+    U32 = 0,
+    S32 = 1,
+    U64 = 2,
+    S64 = 3,
+};
+
 enum class IMinMaxExchange : u64 {
     None = 0,
     XLo = 1,
@@ -938,6 +957,16 @@ union Instruction {
         BitField<46, 2, u64> cache_mode;
     } stg;
 
+    union {
+        BitField<52, 4, AtomicOp> operation;
+        BitField<28, 2, AtomicType> type;
+        BitField<30, 22, s64> offset;
+
+        s32 GetImmediateOffset() const {
+            return static_cast<s32>(offset << 2);
+        }
+    } atoms;
+
     union {
         BitField<32, 1, PhysicalAttributeDirection> direction;
         BitField<47, 3, AttributeSize> size;
@@ -1659,9 +1688,10 @@ public:
         ST_A,
         ST_L,
         ST_S,
-        ST,   // Store in generic memory
-        STG,  // Store in global memory
-        AL2P, // Transforms attribute memory into physical memory
+        ST,    // Store in generic memory
+        STG,   // Store in global memory
+        ATOMS, // Atomic operation on shared memory
+        AL2P,  // Transforms attribute memory into physical memory
         TEX,
         TEX_B,  // Texture Load Bindless
         TXQ,    // Texture Query
@@ -1964,6 +1994,7 @@ private:
             INST("1110111101010---", Id::ST_L, Type::Memory, "ST_L"),
             INST("101-------------", Id::ST, Type::Memory, "ST"),
             INST("1110111011011---", Id::STG, Type::Memory, "STG"),
+            INST("11101100--------", Id::ATOMS, Type::Memory, "ATOMS"),
             INST("1110111110100---", Id::AL2P, Type::Memory, "AL2P"),
             INST("110000----111---", Id::TEX, Type::Texture, "TEX"),
             INST("1101111010111---", Id::TEX_B, Type::Texture, "TEX_B"),
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index f9f7a97b5a..19751939a0 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -1856,6 +1856,16 @@ private:
                 Type::Uint};
     }
 
+    template <const std::string_view& opname, Type type>
+    Expression Atomic(Operation operation) {
+        ASSERT(stage == ShaderType::Compute);
+        auto& smem = std::get<SmemNode>(*operation[0]);
+
+        return {fmt::format("atomic{}(smem[{} >> 2], {})", opname, Visit(smem.GetAddress()).AsInt(),
+                            Visit(operation[1]).As(type)),
+                type};
+    }
+
     Expression Branch(Operation operation) {
         const auto target = std::get_if<ImmediateNode>(&*operation[0]);
         UNIMPLEMENTED_IF(!target);
@@ -2194,6 +2204,8 @@ private:
         &GLSLDecompiler::AtomicImage<Func::Xor>,
         &GLSLDecompiler::AtomicImage<Func::Exchange>,
 
+        &GLSLDecompiler::Atomic<Func::Add, Type::Uint>,
+
         &GLSLDecompiler::Branch,
         &GLSLDecompiler::BranchIndirect,
         &GLSLDecompiler::PushFlowStack,
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index 8fe852ce83..0cf97cafac 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -1796,6 +1796,11 @@ private:
         return {};
     }
 
+    Expression UAtomicAdd(Operation) {
+        UNIMPLEMENTED();
+        return {};
+    }
+
     Expression Branch(Operation operation) {
         const auto& target = std::get<ImmediateNode>(*operation[0]);
         OpStore(jmp_to, Constant(t_uint, target.GetValue()));
@@ -2373,6 +2378,8 @@ private:
         &SPIRVDecompiler::AtomicImageXor,
         &SPIRVDecompiler::AtomicImageExchange,
 
+        &SPIRVDecompiler::UAtomicAdd,
+
         &SPIRVDecompiler::Branch,
         &SPIRVDecompiler::BranchIndirect,
         &SPIRVDecompiler::PushFlowStack,
diff --git a/src/video_core/shader/decode/memory.cpp b/src/video_core/shader/decode/memory.cpp
index 8cc84e935d..7591a715fd 100644
--- a/src/video_core/shader/decode/memory.cpp
+++ b/src/video_core/shader/decode/memory.cpp
@@ -16,6 +16,8 @@
 
 namespace VideoCommon::Shader {
 
+using Tegra::Shader::AtomicOp;
+using Tegra::Shader::AtomicType;
 using Tegra::Shader::Attribute;
 using Tegra::Shader::Instruction;
 using Tegra::Shader::OpCode;
@@ -333,6 +335,23 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
         }
         break;
     }
+    case OpCode::Id::ATOMS: {
+        UNIMPLEMENTED_IF_MSG(instr.atoms.operation != AtomicOp::Add, "operation={}",
+                             static_cast<int>(instr.atoms.operation.Value()));
+        UNIMPLEMENTED_IF_MSG(instr.atoms.type != AtomicType::U32, "type={}",
+                             static_cast<int>(instr.atoms.type.Value()));
+
+        const s32 offset = instr.atoms.GetImmediateOffset();
+        Node address = GetRegister(instr.gpr8);
+        address = Operation(OperationCode::IAdd, std::move(address), Immediate(offset));
+
+        Node memory = GetSharedMemory(std::move(address));
+        Node data = GetRegister(instr.gpr20);
+
+        Node value = Operation(OperationCode::UAtomicAdd, std::move(memory), std::move(data));
+        SetRegister(bb, instr.gpr0, std::move(value));
+        break;
+    }
     case OpCode::Id::AL2P: {
         // Ignore al2p.direction since we don't care about it.
 
diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h
index 4e155542aa..075c7d07ce 100644
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@@ -162,6 +162,8 @@ enum class OperationCode {
     AtomicImageXor,      /// (MetaImage, int[N] coords) -> void
     AtomicImageExchange, /// (MetaImage, int[N] coords) -> void
 
+    UAtomicAdd, /// (smem, uint) -> uint
+
     Branch,         /// (uint branch_target) -> void
     BranchIndirect, /// (uint branch_target) -> void
     PushFlowStack,  /// (uint branch_target) -> void