From 82107b33a2251eb4f55ab2006a8fc0cb47cc39e8 Mon Sep 17 00:00:00 2001
From: ameerj <52414509+ameerj@users.noreply.github.com>
Date: Sun, 25 Jun 2023 18:43:23 -0400
Subject: [PATCH 1/4] OpenGL: Add Local Memory warmup shader

---
 src/video_core/host_shaders/CMakeLists.txt    |  1 +
 .../host_shaders/opengl_lmem_warmup.comp      | 47 +++++++++++++++++++
 .../renderer_opengl/gl_rasterizer.cpp         |  2 +
 .../renderer_opengl/gl_shader_manager.cpp     | 10 +++-
 .../renderer_opengl/gl_shader_manager.h       |  3 ++
 5 files changed, 62 insertions(+), 1 deletion(-)
 create mode 100644 src/video_core/host_shaders/opengl_lmem_warmup.comp

diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt
index 2442c3c294..e61d9af806 100644
--- a/src/video_core/host_shaders/CMakeLists.txt
+++ b/src/video_core/host_shaders/CMakeLists.txt
@@ -33,6 +33,7 @@ set(SHADER_FILES
     opengl_fidelityfx_fsr.frag
     opengl_fidelityfx_fsr_easu.frag
     opengl_fidelityfx_fsr_rcas.frag
+    opengl_lmem_warmup.comp
     opengl_present.frag
     opengl_present.vert
     opengl_present_scaleforce.frag
diff --git a/src/video_core/host_shaders/opengl_lmem_warmup.comp b/src/video_core/host_shaders/opengl_lmem_warmup.comp
new file mode 100644
index 0000000000..518268477a
--- /dev/null
+++ b/src/video_core/host_shaders/opengl_lmem_warmup.comp
@@ -0,0 +1,47 @@
+// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+// This shader is a workaround for a quirk in NVIDIA OpenGL drivers.
+// Shaders using local memory see a great performance benefit if a shader dispatched before
+// them had more local memory allocated.
+// This shader allocates the maximum local memory allowed on NVIDIA drivers to ensure that
+// subsequent shaders see the performance boost.
+
+// NOTE: This shader does no actual meaningful work and returns immediately,
+// it is simply a means to have the driver expect a shader using lots of local memory.
+
+#version 450
+
+layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
+
+layout(location = 0) uniform uint uniform_data;
+
+layout(binding = 0, rgba8) uniform writeonly restrict image2DArray dest_image;
+
+#define MAX_LMEM_SIZE 4080 // Size chosen to avoid errors in Nvidia's GLSL compiler
+#define NUM_LMEM_CONSTANTS 1
+#define ARRAY_SIZE (MAX_LMEM_SIZE - NUM_LMEM_CONSTANTS) // Parenthesized to expand as one term
+
+uint lmem_0[ARRAY_SIZE];
+const uvec4 constant_values[NUM_LMEM_CONSTANTS] = uvec4[](uvec4(0));
+
+void main() {
+    const uint global_id = gl_GlobalInvocationID.x;
+    if (global_id <= 128) {
+        // This shader is dispatched as 1x1x1 with a 1x1x1 local size, so global_id is 0 here:
+        // this branch is always taken and none of the code below actually executes.
+        return;
+    }
+    for (uint t = 0; t < uniform_data; t++) {
+        const uint offset = (t * uniform_data);
+        lmem_0[offset] = t;
+    }
+    const uint offset = (gl_GlobalInvocationID.y * uniform_data + gl_GlobalInvocationID.x);
+    const uint value = lmem_0[offset];
+    const uint const_value = constant_values[offset / 4][offset % 4];
+    const uvec4 color = uvec4(value + const_value);
+
+    // A "side-effect" is needed so the variables don't get optimized out,
+    // but this should never execute so there should be no clobbering of previously bound state.
+    imageStore(dest_image, ivec3(gl_GlobalInvocationID), color);
+}
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index fc711c44ae..d032885168 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -222,6 +222,7 @@ void RasterizerOpenGL::PrepareDraw(bool is_indexed, Func&& draw_func) {
     gpu.TickWork();
 
     std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
+    program_manager.LocalMemoryWarmup();
     pipeline->SetEngine(maxwell3d, gpu_memory);
     pipeline->Configure(is_indexed);
 
@@ -371,6 +372,7 @@ void RasterizerOpenGL::DispatchCompute() {
     if (!pipeline) {
         return;
     }
+    program_manager.LocalMemoryWarmup();
     pipeline->SetEngine(kepler_compute, gpu_memory);
     pipeline->Configure();
     const auto& qmd{kepler_compute->launch_description};
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp
index 98841ae65e..2f6ba68239 100644
--- a/src/video_core/renderer_opengl/gl_shader_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp
@@ -3,7 +3,9 @@
 
 #include <glad/glad.h>
 
+#include "video_core/host_shaders/opengl_lmem_warmup_comp.h"
 #include "video_core/renderer_opengl/gl_shader_manager.h"
+#include "video_core/renderer_opengl/gl_shader_util.h"
 
 namespace OpenGL {
 
@@ -12,7 +14,8 @@ static constexpr std::array ASSEMBLY_PROGRAM_ENUMS{
     GL_GEOMETRY_PROGRAM_NV, GL_FRAGMENT_PROGRAM_NV,
 };
 
-ProgramManager::ProgramManager(const Device& device) {
+ProgramManager::ProgramManager(const Device& device)
+    : lmem_warmup_program(CreateProgram(HostShaders::OPENGL_LMEM_WARMUP_COMP, GL_COMPUTE_SHADER)) {
     glCreateProgramPipelines(1, &pipeline.handle);
     if (device.UseAssemblyShaders()) {
         glEnable(GL_COMPUTE_PROGRAM_NV);
@@ -98,6 +101,11 @@ void ProgramManager::BindAssemblyPrograms(std::span<const OGLAssemblyProgram, NU
 
 void ProgramManager::RestoreGuestCompute() {}
 
+void ProgramManager::LocalMemoryWarmup() {
+    BindComputeProgram(lmem_warmup_program.handle);
+    glDispatchCompute(1, 1, 1);
+}
+
 void ProgramManager::BindPipeline() {
     if (!is_pipeline_bound) {
         is_pipeline_bound = true;
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.h b/src/video_core/renderer_opengl/gl_shader_manager.h
index 07ffab77f1..852d8c88e8 100644
--- a/src/video_core/renderer_opengl/gl_shader_manager.h
+++ b/src/video_core/renderer_opengl/gl_shader_manager.h
@@ -30,6 +30,8 @@ public:
 
     void RestoreGuestCompute();
 
+    void LocalMemoryWarmup();
+
 private:
     void BindPipeline();
 
@@ -44,6 +46,7 @@ private:
     u32 current_stage_mask = 0;
     std::array<GLuint, NUM_STAGES> current_programs{};
     GLuint current_assembly_compute_program = 0;
+    OGLProgram lmem_warmup_program;
 };
 
 } // namespace OpenGL

From b198339580f1a54c4c670eb58593eb64e2ef945c Mon Sep 17 00:00:00 2001
From: ameerj <52414509+ameerj@users.noreply.github.com>
Date: Sun, 25 Jun 2023 18:43:52 -0400
Subject: [PATCH 2/4] emit_glasm: Fix lmem size computation

---
 src/shader_recompiler/backend/glasm/emit_glasm.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/shader_recompiler/backend/glasm/emit_glasm.cpp b/src/shader_recompiler/backend/glasm/emit_glasm.cpp
index fd4a61a4de..b795c01797 100644
--- a/src/shader_recompiler/backend/glasm/emit_glasm.cpp
+++ b/src/shader_recompiler/backend/glasm/emit_glasm.cpp
@@ -461,7 +461,7 @@ std::string EmitGLASM(const Profile& profile, const RuntimeInfo& runtime_info, I
         header += fmt::format("R{},", index);
     }
     if (program.local_memory_size > 0) {
-        header += fmt::format("lmem[{}],", program.local_memory_size);
+        header += fmt::format("lmem[{}],", Common::DivCeil(program.local_memory_size, 4U));
     }
     if (program.info.uses_fswzadd) {
         header += "FSWZA[4],FSWZB[4],";

From 405eae3734dd6bfb259df0afceecf4de1f1262ce Mon Sep 17 00:00:00 2001
From: ameerj <52414509+ameerj@users.noreply.github.com>
Date: Sun, 25 Jun 2023 18:59:33 -0400
Subject: [PATCH 3/4] shaders: Track local memory usage

---
 src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp | 4 ++++
 src/shader_recompiler/shader_info.h                       | 1 +
 src/video_core/renderer_opengl/gl_compute_pipeline.cpp    | 1 +
 src/video_core/renderer_opengl/gl_compute_pipeline.h      | 5 +++++
 src/video_core/renderer_opengl/gl_graphics_pipeline.cpp   | 1 +
 src/video_core/renderer_opengl/gl_graphics_pipeline.h     | 5 +++++
 src/video_core/renderer_opengl/gl_rasterizer.cpp          | 8 ++++++--
 7 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp b/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp
index 5a41952175..70292686fc 100644
--- a/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp
+++ b/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp
@@ -424,6 +424,10 @@ void VisitUsages(Info& info, IR::Inst& inst) {
         info.used_constant_buffer_types |= IR::Type::U32 | IR::Type::U32x2;
         info.used_storage_buffer_types |= IR::Type::U32 | IR::Type::U32x2 | IR::Type::U32x4;
         break;
+    case IR::Opcode::LoadLocal:
+    case IR::Opcode::WriteLocal:
+        info.uses_local_memory = true;
+        break;
     default:
         break;
     }
diff --git a/src/shader_recompiler/shader_info.h b/src/shader_recompiler/shader_info.h
index d308db9424..b4b4afd37d 100644
--- a/src/shader_recompiler/shader_info.h
+++ b/src/shader_recompiler/shader_info.h
@@ -172,6 +172,7 @@ struct Info {
     bool stores_indexed_attributes{};
 
     bool stores_global_memory{};
+    bool uses_local_memory{};
 
     bool uses_fp16{};
     bool uses_fp64{};
diff --git a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp
index 3151c0db8b..f9ca55c36d 100644
--- a/src/video_core/renderer_opengl/gl_compute_pipeline.cpp
+++ b/src/video_core/renderer_opengl/gl_compute_pipeline.cpp
@@ -63,6 +63,7 @@ ComputePipeline::ComputePipeline(const Device& device, TextureCache& texture_cac
     writes_global_memory = !use_storage_buffers &&
                            std::ranges::any_of(info.storage_buffers_descriptors,
                                                [](const auto& desc) { return desc.is_written; });
+    uses_local_memory = info.uses_local_memory;
     if (force_context_flush) {
         std::scoped_lock lock{built_mutex};
         built_fence.Create();
diff --git a/src/video_core/renderer_opengl/gl_compute_pipeline.h b/src/video_core/renderer_opengl/gl_compute_pipeline.h
index 9bcc72b59e..c26b4fa5e2 100644
--- a/src/video_core/renderer_opengl/gl_compute_pipeline.h
+++ b/src/video_core/renderer_opengl/gl_compute_pipeline.h
@@ -59,6 +59,10 @@ public:
         return writes_global_memory;
     }
 
+    [[nodiscard]] bool UsesLocalMemory() const noexcept {
+        return uses_local_memory; // Set in the constructor from info.uses_local_memory
+    }
+
     void SetEngine(Tegra::Engines::KeplerCompute* kepler_compute_,
                    Tegra::MemoryManager* gpu_memory_) {
         kepler_compute = kepler_compute_;
@@ -84,6 +88,7 @@ private:
 
     bool use_storage_buffers{};
     bool writes_global_memory{};
+    bool uses_local_memory{};
 
     std::mutex built_mutex;
     std::condition_variable built_condvar;
diff --git a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp
index c58f760b8f..23a48c6fe7 100644
--- a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp
+++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp
@@ -215,6 +215,7 @@ GraphicsPipeline::GraphicsPipeline(const Device& device, TextureCache& texture_c
 
         writes_global_memory |= std::ranges::any_of(
             info.storage_buffers_descriptors, [](const auto& desc) { return desc.is_written; });
+        uses_local_memory |= info.uses_local_memory;
     }
     ASSERT(num_textures <= MAX_TEXTURES);
     ASSERT(num_images <= MAX_IMAGES);
diff --git a/src/video_core/renderer_opengl/gl_graphics_pipeline.h b/src/video_core/renderer_opengl/gl_graphics_pipeline.h
index 7bab3be0a7..7b3d7eae81 100644
--- a/src/video_core/renderer_opengl/gl_graphics_pipeline.h
+++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.h
@@ -98,6 +98,10 @@ public:
         return writes_global_memory;
     }
 
+    [[nodiscard]] bool UsesLocalMemory() const noexcept {
+        return uses_local_memory; // Accumulated (|=) from info.uses_local_memory in the constructor
+    }
+
     [[nodiscard]] bool IsBuilt() noexcept;
 
     template <typename Spec>
@@ -146,6 +150,7 @@ private:
 
     bool use_storage_buffers{};
     bool writes_global_memory{};
+    bool uses_local_memory{};
 
     static constexpr std::size_t XFB_ENTRY_STRIDE = 3;
     GLsizei num_xfb_attribs{};
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index d032885168..edf527f2d3 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -222,7 +222,9 @@ void RasterizerOpenGL::PrepareDraw(bool is_indexed, Func&& draw_func) {
     gpu.TickWork();
 
     std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
-    program_manager.LocalMemoryWarmup();
+    if (pipeline->UsesLocalMemory()) {
+        program_manager.LocalMemoryWarmup();
+    }
     pipeline->SetEngine(maxwell3d, gpu_memory);
     pipeline->Configure(is_indexed);
 
@@ -372,7 +374,9 @@ void RasterizerOpenGL::DispatchCompute() {
     if (!pipeline) {
         return;
     }
-    program_manager.LocalMemoryWarmup();
+    if (pipeline->UsesLocalMemory()) {
+        program_manager.LocalMemoryWarmup();
+    }
     pipeline->SetEngine(kepler_compute, gpu_memory);
     pipeline->Configure();
     const auto& qmd{kepler_compute->launch_description};

From 4f160633d369b702a45ace9b6ff133312761c5f8 Mon Sep 17 00:00:00 2001
From: ameerj <52414509+ameerj@users.noreply.github.com>
Date: Sun, 25 Jun 2023 19:06:51 -0400
Subject: [PATCH 4/4] OpenGL: Limit lmem warmup to NVIDIA

:frog:
---
 src/video_core/renderer_opengl/gl_device.cpp        |  1 +
 src/video_core/renderer_opengl/gl_device.h          |  5 +++++
 .../renderer_opengl/gl_shader_manager.cpp           | 13 +++++++++----
 3 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index 03d234f2fd..33e63c17d7 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -194,6 +194,7 @@ Device::Device(Core::Frontend::EmuWindow& emu_window) {
             has_bool_ref_bug = true;
         }
     }
+    has_lmem_perf_bug = is_nvidia;
 
     strict_context_required = emu_window.StrictContextRequired();
     // Blocks AMD and Intel OpenGL drivers on Windows from using asynchronous shader compilation.
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
index ad27264e5f..a5a6bbbba7 100644
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -192,6 +192,10 @@ public:
         return supports_conditional_barriers;
     }
 
+    bool HasLmemPerfBug() const {
+        return has_lmem_perf_bug; // Assigned from is_nvidia in the Device constructor
+    }
+
 private:
     static bool TestVariableAoffi();
     static bool TestPreciseBug();
@@ -238,6 +242,7 @@ private:
     bool can_report_memory{};
     bool strict_context_required{};
     bool supports_conditional_barriers{};
+    bool has_lmem_perf_bug{};
 
     std::string vendor_name;
 };
diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp
index 2f6ba68239..03d4b9d061 100644
--- a/src/video_core/renderer_opengl/gl_shader_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp
@@ -14,12 +14,15 @@ static constexpr std::array ASSEMBLY_PROGRAM_ENUMS{
     GL_GEOMETRY_PROGRAM_NV, GL_FRAGMENT_PROGRAM_NV,
 };
 
-ProgramManager::ProgramManager(const Device& device)
-    : lmem_warmup_program(CreateProgram(HostShaders::OPENGL_LMEM_WARMUP_COMP, GL_COMPUTE_SHADER)) {
+ProgramManager::ProgramManager(const Device& device) {
     glCreateProgramPipelines(1, &pipeline.handle);
     if (device.UseAssemblyShaders()) {
         glEnable(GL_COMPUTE_PROGRAM_NV);
     }
+    if (device.HasLmemPerfBug()) {
+        lmem_warmup_program =
+            CreateProgram(HostShaders::OPENGL_LMEM_WARMUP_COMP, GL_COMPUTE_SHADER);
+    }
 }
 
 void ProgramManager::BindComputeProgram(GLuint program) {
@@ -102,8 +105,10 @@ void ProgramManager::BindAssemblyPrograms(std::span<const OGLAssemblyProgram, NU
 void ProgramManager::RestoreGuestCompute() {}
 
 void ProgramManager::LocalMemoryWarmup() {
-    BindComputeProgram(lmem_warmup_program.handle);
-    glDispatchCompute(1, 1, 1);
+    if (lmem_warmup_program.handle != 0) { // Only created when device.HasLmemPerfBug()
+        BindComputeProgram(lmem_warmup_program.handle);
+        glDispatchCompute(1, 1, 1);
+    }
 }
 
 void ProgramManager::BindPipeline() {