From 9343b81afd5132aa41f50fe033faca4f9713b8cf Mon Sep 17 00:00:00 2001
From: Ameer J <52414509+ameerj@users.noreply.github.com>
Date: Fri, 17 Nov 2023 22:29:12 -0500
Subject: [PATCH] shader_recompiler: add byteswap pattern workaround for Nvidia

---
 src/shader_recompiler/CMakeLists.txt          |  1 +
 .../frontend/maxwell/translate_program.cpp    |  1 +
 src/shader_recompiler/ir_opt/passes.h         |  1 +
 .../ir_opt/vendor_workaround_pass.cpp         | 79 +++++++++++++++++++
 4 files changed, 82 insertions(+)
 create mode 100644 src/shader_recompiler/ir_opt/vendor_workaround_pass.cpp

diff --git a/src/shader_recompiler/CMakeLists.txt b/src/shader_recompiler/CMakeLists.txt
index 83b763447e..19db17c6d2 100644
--- a/src/shader_recompiler/CMakeLists.txt
+++ b/src/shader_recompiler/CMakeLists.txt
@@ -231,6 +231,7 @@ add_library(shader_recompiler STATIC
     ir_opt/rescaling_pass.cpp
     ir_opt/ssa_rewrite_pass.cpp
     ir_opt/texture_pass.cpp
+    ir_opt/vendor_workaround_pass.cpp
     ir_opt/verification_pass.cpp
     object_pool.h
     precompiled_headers.h
diff --git a/src/shader_recompiler/frontend/maxwell/translate_program.cpp b/src/shader_recompiler/frontend/maxwell/translate_program.cpp
index 928b355611..8fac6bad3f 100644
--- a/src/shader_recompiler/frontend/maxwell/translate_program.cpp
+++ b/src/shader_recompiler/frontend/maxwell/translate_program.cpp
@@ -310,6 +310,7 @@ IR::Program TranslateProgram(ObjectPool<IR::Inst>& inst_pool, ObjectPool<IR::Blo
     }
     Optimization::CollectShaderInfoPass(env, program);
     Optimization::LayerPass(program, host_info);
+    Optimization::VendorWorkaroundPass(program);
 
     CollectInterpolationInfo(env, program);
     AddNVNStorageBuffers(program);
diff --git a/src/shader_recompiler/ir_opt/passes.h b/src/shader_recompiler/ir_opt/passes.h
index 629d18fa19..d4d5285e53 100644
--- a/src/shader_recompiler/ir_opt/passes.h
+++ b/src/shader_recompiler/ir_opt/passes.h
@@ -26,6 +26,7 @@ void SsaRewritePass(IR::Program& program);
 void PositionPass(Environment& env, IR::Program& program);
 void TexturePass(Environment& env, IR::Program& program, const HostTranslateInfo& host_info);
 void LayerPass(IR::Program& program, const HostTranslateInfo& host_info);
+void VendorWorkaroundPass(IR::Program& program);
 void VerificationPass(const IR::Program& program);
 
 // Dual Vertex
diff --git a/src/shader_recompiler/ir_opt/vendor_workaround_pass.cpp b/src/shader_recompiler/ir_opt/vendor_workaround_pass.cpp
new file mode 100644
index 0000000000..08c658cb81
--- /dev/null
+++ b/src/shader_recompiler/ir_opt/vendor_workaround_pass.cpp
@@ -0,0 +1,79 @@
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "shader_recompiler/frontend/ir/basic_block.h"
+#include "shader_recompiler/frontend/ir/ir_emitter.h"
+#include "shader_recompiler/frontend/ir/value.h"
+#include "shader_recompiler/ir_opt/passes.h"
+
+namespace Shader::Optimization {
+
+namespace {
+void AddingByteSwapsWorkaround(IR::Block& block, IR::Inst& inst) {
+    // Workaround for an NVIDIA bug seen in Super Mario RPG.
+    //
+    // Matched sequence, where `inst` is the trailing IAdd32:
+    //   %lhs_bfe = BitFieldUExtract %factor_a, #0, #16
+    //   %lhs_mul = IMul32 %lhs_bfe, %factor_b           // may be absent
+    //   %lhs_shl = ShiftLeftLogical32 %lhs_mul, #16
+    //   %rhs_bfe = BitFieldUExtract %factor_a, #16, #16
+    //   %result  = IAdd32 %lhs_shl, %rhs_bfe
+    // On a match, every use of the IAdd32 is redirected to a newly emitted
+    //   %result  = BitwiseOr32 %lhs_shl, %rhs_bfe
+    const IR::Value half_width{16U};
+    IR::Inst* const shift{inst.Arg(0).TryInstRecursive()};
+    IR::Inst* const upper{inst.Arg(1).TryInstRecursive()};
+    if (shift == nullptr || upper == nullptr) {
+        return;
+    }
+    // First addend: a left shift by #16.
+    const bool is_shift{shift->GetOpcode() == IR::Opcode::ShiftLeftLogical32};
+    if (!is_shift || shift->Arg(1) != half_width) {
+        return;
+    }
+    // Second addend: an unsigned bit-field extract whose last two operands are both #16.
+    const bool is_upper_bfe{upper->GetOpcode() == IR::Opcode::BitFieldUExtract};
+    if (!is_upper_bfe || upper->Arg(1) != half_width || upper->Arg(2) != half_width) {
+        return;
+    }
+    IR::Inst* const shifted{shift->Arg(0).TryInstRecursive()};
+    if (shifted == nullptr) {
+        return;
+    }
+    // The shifted value is either the low extract itself or an IMul32 fed by it.
+    IR::Inst* lower{};
+    switch (shifted->GetOpcode()) {
+    case IR::Opcode::BitFieldUExtract:
+        lower = shifted;
+        break;
+    case IR::Opcode::IMul32:
+        lower = shifted->Arg(0).TryInstRecursive();
+        break;
+    default:
+        return;
+    }
+    if (lower == nullptr || lower->GetOpcode() != IR::Opcode::BitFieldUExtract ||
+        lower->Arg(1) != IR::Value{0U} || lower->Arg(2) != half_width) {
+        return;
+    }
+    IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)};
+    inst.ReplaceUsesWith(ir.BitwiseOr(IR::U32{inst.Arg(0)}, IR::U32{inst.Arg(1)}));
+}
+
+} // Anonymous namespace
+
+// Applies the vendor-specific IR workarounds to every instruction of the program.
+void VendorWorkaroundPass(IR::Program& program) {
+    for (IR::Block* const current_block : program.post_order_blocks) {
+        for (IR::Inst& candidate : current_block->Instructions()) {
+            // Only IAdd32 is inspected; every other opcode is skipped.
+            if (candidate.GetOpcode() != IR::Opcode::IAdd32) {
+                continue;
+            }
+            // Rewrites the addition when it closes the byte-swap sequence, otherwise no-op.
+            AddingByteSwapsWorkaround(*current_block, candidate);
+        }
+    }
+}
+
+} // namespace Shader::Optimization