From 287ae2b9e8ea38642a4c8e36f7863d881d4c0e87 Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Wed, 13 Nov 2019 00:25:52 -0300
Subject: [PATCH] gl_shader_cache: Specialize local memory size for compute
 shaders

Local memory size in compute shaders was stubbed with an arbitrary size.
This commit specializes local memory size from guest GPU parameters.
---
 src/video_core/engines/kepler_compute.h        |  7 ++++++-
 .../renderer_opengl/gl_rasterizer.cpp          |  3 ++-
 .../renderer_opengl/gl_shader_cache.cpp        |  5 +++++
 .../renderer_opengl/gl_shader_decompiler.cpp   | 18 ++++++++----------
 .../renderer_opengl/gl_shader_disk_cache.cpp   |  4 ++--
 .../renderer_opengl/gl_shader_disk_cache.h     | 16 +++++++++-------
 6 files changed, 32 insertions(+), 21 deletions(-)

diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h
index bd49c6627c..c526287b7a 100644
--- a/src/video_core/engines/kepler_compute.h
+++ b/src/video_core/engines/kepler_compute.h
@@ -178,7 +178,12 @@ public:
             BitField<24, 5, u32> gpr_alloc;
         };
 
-        INSERT_PADDING_WORDS(0x11);
+        union {
+            BitField<0, 20, u32> local_crs_alloc;
+            BitField<24, 5, u32> sass_version;
+        };
+
+        INSERT_PADDING_WORDS(0x10);
     } launch_description{};
 
     struct {
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index ebfe52e6dc..d890076f8f 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -731,7 +731,8 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
 
     const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
     const ProgramVariant variant(launch_desc.block_dim_x, launch_desc.block_dim_y,
-                                 launch_desc.block_dim_z, launch_desc.shared_alloc);
+                                 launch_desc.block_dim_z, launch_desc.shared_alloc,
+                                 launch_desc.local_pos_alloc);
     std::tie(state.draw.shader_program, std::ignore) = kernel->GetHandle(variant);
     state.draw.program_pipeline = 0;
 
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 982c4e23aa..b23a982d76 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -329,6 +329,11 @@ CachedProgram BuildShader(const Device& device, u64 unique_identifier, ProgramTy
             source += fmt::format("shared uint smem[{}];",
                                   Common::AlignUp(variant.shared_memory_size, 4) / 4);
         }
+
+        if (variant.local_memory_size > 0) {
+            source += fmt::format("#define LOCAL_MEMORY_SIZE {}",
+                                  Common::AlignUp(variant.local_memory_size, 4) / 4);
+        }
     }
 
     source += '\n';
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index fb2ba09056..fe016c05c3 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -510,10 +510,14 @@ private:
     }
 
     void DeclareLocalMemory() {
-        // TODO(Rodrigo): Unstub kernel local memory size and pass it from a register at
-        // specialization time.
-        const u64 local_memory_size =
-            stage == ProgramType::Compute ? 0x400 : header.GetLocalMemorySize();
+        if (stage == ProgramType::Compute) {
+            code.AddLine("#ifdef LOCAL_MEMORY_SIZE");
+            code.AddLine("uint {}[LOCAL_MEMORY_SIZE];", GetLocalMemory());
+            code.AddLine("#endif");
+            return;
+        }
+
+        const u64 local_memory_size = header.GetLocalMemorySize();
         if (local_memory_size == 0) {
             return;
         }
@@ -851,9 +855,6 @@ private:
         }
 
         if (const auto lmem = std::get_if<LmemNode>(&*node)) {
-            if (stage == ProgramType::Compute) {
-                LOG_WARNING(Render_OpenGL, "Local memory is stubbed on compute shaders");
-            }
             return {
                 fmt::format("{}[{} >> 2]", GetLocalMemory(), Visit(lmem->GetAddress()).AsUint()),
                 Type::Uint};
@@ -1228,9 +1229,6 @@ private:
             }
             target = std::move(*output);
         } else if (const auto lmem = std::get_if<LmemNode>(&*dest)) {
-            if (stage == ProgramType::Compute) {
-                LOG_WARNING(Render_OpenGL, "Local memory is stubbed on compute shaders");
-            }
             target = {
                 fmt::format("{}[{} >> 2]", GetLocalMemory(), Visit(lmem->GetAddress()).AsUint()),
                 Type::Uint};
diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
index d2bb8502a5..5ebcbbbbae 100644
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
@@ -52,11 +52,11 @@ struct BindlessSamplerKey {
     Tegra::Engines::SamplerDescriptor sampler{};
 };
 
-constexpr u32 NativeVersion = 8;
+constexpr u32 NativeVersion = 9;
 
 // Making sure sizes doesn't change by accident
 static_assert(sizeof(BaseBindings) == 16);
-static_assert(sizeof(ProgramVariant) == 32);
+static_assert(sizeof(ProgramVariant) == 36);
 
 ShaderCacheVersionHash GetShaderCacheVersionHash() {
     ShaderCacheVersionHash hash{};
diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.h b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
index 6f8e51364f..28689f6c7b 100644
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
@@ -64,10 +64,10 @@ struct ProgramVariant final {
         : base_bindings{base_bindings}, primitive_mode{primitive_mode} {}
 
     /// Compute constructor.
-    explicit constexpr ProgramVariant(u32 block_x, u32 block_y, u32 block_z,
-                                      u32 shared_memory_size) noexcept
+    explicit constexpr ProgramVariant(u32 block_x, u32 block_y, u32 block_z, u32 shared_memory_size,
+                                      u32 local_memory_size) noexcept
         : block_x{block_x}, block_y{static_cast<u16>(block_y)}, block_z{static_cast<u16>(block_z)},
-          shared_memory_size{shared_memory_size} {}
+          shared_memory_size{shared_memory_size}, local_memory_size{local_memory_size} {}
 
     // Graphics specific parameters.
     BaseBindings base_bindings{};
@@ -78,12 +78,13 @@ struct ProgramVariant final {
     u16 block_y{};
     u16 block_z{};
     u32 shared_memory_size{};
+    u32 local_memory_size{};
 
     bool operator==(const ProgramVariant& rhs) const noexcept {
         return std::tie(base_bindings, primitive_mode, block_x, block_y, block_z,
-                        shared_memory_size) == std::tie(rhs.base_bindings, rhs.primitive_mode,
-                                                        rhs.block_x, rhs.block_y, rhs.block_z,
-                                                        rhs.shared_memory_size);
+                        shared_memory_size, local_memory_size) ==
+               std::tie(rhs.base_bindings, rhs.primitive_mode, rhs.block_x, rhs.block_y,
+                        rhs.block_z, rhs.shared_memory_size, rhs.local_memory_size);
     }
 
     bool operator!=(const ProgramVariant& rhs) const noexcept {
@@ -133,7 +134,8 @@ struct hash<OpenGL::ProgramVariant> {
                static_cast<std::size_t>(variant.block_x) ^
                (static_cast<std::size_t>(variant.block_y) << 32) ^
                (static_cast<std::size_t>(variant.block_z) << 48) ^
-               (static_cast<std::size_t>(variant.shared_memory_size) << 16);
+               (static_cast<std::size_t>(variant.shared_memory_size) << 16) ^
+               (static_cast<std::size_t>(variant.local_memory_size) << 36);
     }
 };