From 4f5d8e434278cd5999bf21e91f0923d55ec8d52b Mon Sep 17 00:00:00 2001
From: ReinUsesLisp <reinuseslisp@airmail.cc>
Date: Tue, 12 Nov 2019 23:26:56 -0300
Subject: [PATCH] gl_shader_cache: Specialize shader workgroup

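Drop the usage of ARB_compute_variable_group_size and instead specialize
compute shaders on their workgroup size, emitting the block dimensions
as a fixed local_size layout in the generated GLSL. This permits compute
to run on the AMD and Intel proprietary drivers.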
---
 src/video_core/engines/kepler_compute.h       |  2 +-
 .../renderer_opengl/gl_rasterizer.cpp         | 21 +++----
 .../renderer_opengl/gl_shader_cache.cpp       | 63 +++++++++----------
 .../renderer_opengl/gl_shader_cache.h         |  6 +-
 .../renderer_opengl/gl_shader_disk_cache.cpp  |  4 +-
 .../renderer_opengl/gl_shader_disk_cache.h    | 48 +++++++++-----
 6 files changed, 75 insertions(+), 69 deletions(-)

diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h
index 5259d92bd7..bd49c6627c 100644
--- a/src/video_core/engines/kepler_compute.h
+++ b/src/video_core/engines/kepler_compute.h
@@ -140,7 +140,7 @@ public:
 
         INSERT_PADDING_WORDS(0x3);
 
-        BitField<0, 16, u32> shared_alloc;
+        BitField<0, 18, u32> shared_alloc;
 
         BitField<16, 16, u32> block_dim_x;
         union {
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index b76de71ec6..bd4e5f6e3a 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -273,8 +273,8 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
         SetupDrawGlobalMemory(stage, shader);
         SetupDrawTextures(stage, shader, base_bindings);
 
-        const ProgramVariant variant{base_bindings, primitive_mode};
-        const auto [program_handle, next_bindings] = shader->GetProgramHandle(variant);
+        const ProgramVariant variant(base_bindings, primitive_mode);
+        const auto [program_handle, next_bindings] = shader->GetHandle(variant);
 
         switch (program) {
         case Maxwell::ShaderProgram::VertexA:
@@ -725,18 +725,14 @@ bool RasterizerOpenGL::DrawMultiBatch(bool is_indexed) {
 }
 
 void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
-    if (!GLAD_GL_ARB_compute_variable_group_size) {
-        LOG_ERROR(Render_OpenGL, "Compute is currently not supported on this device due to the "
-                                 "lack of GL_ARB_compute_variable_group_size");
-        return;
-    }
-
     auto kernel = shader_cache.GetComputeKernel(code_addr);
     SetupComputeTextures(kernel);
     SetupComputeImages(kernel);
 
-    const auto [program, next_bindings] = kernel->GetProgramHandle({});
-    state.draw.shader_program = program;
+    const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
+    const ProgramVariant variant(launch_desc.block_dim_x, launch_desc.block_dim_y,
+                                 launch_desc.block_dim_z);
+    std::tie(state.draw.shader_program, std::ignore) = kernel->GetHandle(variant);
     state.draw.program_pipeline = 0;
 
     const std::size_t buffer_size =
@@ -760,10 +756,7 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
     state.ApplyShaderProgram();
     state.ApplyProgramPipeline();
 
-    const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
-    glDispatchComputeGroupSizeARB(launch_desc.grid_dim_x, launch_desc.grid_dim_y,
-                                  launch_desc.grid_dim_z, launch_desc.block_dim_x,
-                                  launch_desc.block_dim_y, launch_desc.block_dim_z);
+    glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z);
 }
 
 void RasterizerOpenGL::FlushAll() {}
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 7ce06a978a..a5789b6d3d 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -255,7 +255,7 @@ void FillLocker(ConstBufferLocker& locker, const ShaderDiskCacheUsage& usage) {
 
 CachedProgram BuildShader(const Device& device, u64 unique_identifier, ProgramType program_type,
                           const ProgramCode& program_code, const ProgramCode& program_code_b,
-                          const ProgramVariant& variant, ConstBufferLocker& locker,
+                          ConstBufferLocker& locker, const ProgramVariant& variant,
                           bool hint_retrievable = false) {
     LOG_INFO(Render_OpenGL, "called. {}", GetShaderId(unique_identifier, program_type));
 
@@ -268,17 +268,11 @@ CachedProgram BuildShader(const Device& device, u64 unique_identifier, ProgramTy
     }
     const auto entries = GLShader::GetEntries(ir);
 
-    auto base_bindings{variant.base_bindings};
-    const auto primitive_mode{variant.primitive_mode};
-
     std::string source = fmt::format(R"(// {}
 #version 430 core
 #extension GL_ARB_separate_shader_objects : enable
 )",
                                      GetShaderId(unique_identifier, program_type));
-    if (is_compute) {
-        source += "#extension GL_ARB_compute_variable_group_size : require\n";
-    }
     if (device.HasShaderBallot()) {
         source += "#extension GL_ARB_shader_ballot : require\n";
     }
@@ -295,6 +289,7 @@ CachedProgram BuildShader(const Device& device, u64 unique_identifier, ProgramTy
     }
     source += '\n';
 
+    auto base_bindings = variant.base_bindings;
     if (!is_compute) {
         source += fmt::format("#define EMULATION_UBO_BINDING {}\n", base_bindings.cbuf++);
     }
@@ -318,13 +313,15 @@ CachedProgram BuildShader(const Device& device, u64 unique_identifier, ProgramTy
 
     if (program_type == ProgramType::Geometry) {
         const auto [glsl_topology, debug_name, max_vertices] =
-            GetPrimitiveDescription(primitive_mode);
+            GetPrimitiveDescription(variant.primitive_mode);
 
-        source += "layout (" + std::string(glsl_topology) + ") in;\n\n";
-        source += "#define MAX_VERTEX_INPUT " + std::to_string(max_vertices) + '\n';
+        source += fmt::format("layout ({}) in;\n\n", glsl_topology);
+        source += fmt::format("#define MAX_VERTEX_INPUT {}\n", max_vertices);
     }
     if (program_type == ProgramType::Compute) {
-        source += "layout (local_size_variable) in;\n";
+        source +=
+            fmt::format("layout (local_size_x = {}, local_size_y = {}, local_size_z = {}) in;\n",
+                        variant.block_x, variant.block_y, variant.block_z);
     }
 
     source += '\n';
@@ -422,58 +419,53 @@ Shader CachedShader::CreateFromCache(const ShaderParameters& params,
                                                           unspecialized.code_b));
 }
 
-std::tuple<GLuint, BaseBindings> CachedShader::GetProgramHandle(const ProgramVariant& variant) {
-    UpdateVariant();
+std::tuple<GLuint, BaseBindings> CachedShader::GetHandle(const ProgramVariant& variant) {
+    EnsureValidLockerVariant();
 
-    const auto [entry, is_cache_miss] = curr_variant->programs.try_emplace(variant);
+    const auto [entry, is_cache_miss] = curr_locker_variant->programs.try_emplace(variant);
     auto& program = entry->second;
     if (is_cache_miss) {
         program = BuildShader(device, unique_identifier, program_type, program_code, program_code_b,
-                              variant, *curr_variant->locker);
-        disk_cache.SaveUsage(GetUsage(variant, *curr_variant->locker));
+                              *curr_locker_variant->locker, variant);
+        disk_cache.SaveUsage(GetUsage(variant, *curr_locker_variant->locker));
 
         LabelGLObject(GL_PROGRAM, program->handle, cpu_addr);
     }
 
     auto base_bindings = variant.base_bindings;
     base_bindings.cbuf += static_cast<u32>(entries.const_buffers.size());
-    if (program_type != ProgramType::Compute) {
-        base_bindings.cbuf += STAGE_RESERVED_UBOS;
-    }
+    base_bindings.cbuf += STAGE_RESERVED_UBOS;
     base_bindings.gmem += static_cast<u32>(entries.global_memory_entries.size());
     base_bindings.sampler += static_cast<u32>(entries.samplers.size());
 
     return {program->handle, base_bindings};
 }
 
-void CachedShader::UpdateVariant() {
-    if (curr_variant && !curr_variant->locker->IsConsistent()) {
-        curr_variant = nullptr;
+bool CachedShader::EnsureValidLockerVariant() {
+    const auto previous_variant = curr_locker_variant;
+    if (curr_locker_variant && !curr_locker_variant->locker->IsConsistent()) {
+        curr_locker_variant = nullptr;
     }
-    if (!curr_variant) {
+    if (!curr_locker_variant) {
         for (auto& variant : locker_variants) {
             if (variant->locker->IsConsistent()) {
-                curr_variant = variant.get();
+                curr_locker_variant = variant.get();
             }
         }
     }
-    if (!curr_variant) {
+    if (!curr_locker_variant) {
         auto& new_variant = locker_variants.emplace_back();
         new_variant = std::make_unique<LockerVariant>();
         new_variant->locker = MakeLocker(system, program_type);
-        curr_variant = new_variant.get();
+        curr_locker_variant = new_variant.get();
     }
+    return previous_variant == curr_locker_variant;
 }
 
 ShaderDiskCacheUsage CachedShader::GetUsage(const ProgramVariant& variant,
                                             const ConstBufferLocker& locker) const {
-    ShaderDiskCacheUsage usage;
-    usage.unique_identifier = unique_identifier;
-    usage.variant = variant;
-    usage.keys = locker.GetKeys();
-    usage.bound_samplers = locker.GetBoundSamplers();
-    usage.bindless_samplers = locker.GetBindlessSamplers();
-    return usage;
+    return ShaderDiskCacheUsage{unique_identifier, variant, locker.GetKeys(),
+                                locker.GetBoundSamplers(), locker.GetBindlessSamplers()};
 }
 
 ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system,
@@ -534,9 +526,10 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
             if (!shader) {
                 auto locker{MakeLocker(system, unspecialized.program_type)};
                 FillLocker(*locker, usage);
+
                 shader = BuildShader(device, usage.unique_identifier, unspecialized.program_type,
-                                     unspecialized.code, unspecialized.code_b, usage.variant,
-                                     *locker, true);
+                                     unspecialized.code, unspecialized.code_b, *locker,
+                                     usage.variant, true);
             }
 
             std::scoped_lock lock{mutex};
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h
index 6bd7c9cf15..795b05a194 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -86,7 +86,7 @@ public:
     }
 
     /// Gets the GL program handle for the shader
-    std::tuple<GLuint, BaseBindings> GetProgramHandle(const ProgramVariant& variant);
+    std::tuple<GLuint, BaseBindings> GetHandle(const ProgramVariant& variant);
 
 private:
     struct LockerVariant {
@@ -98,7 +98,7 @@ private:
                           GLShader::ShaderEntries entries, ProgramCode program_code,
                           ProgramCode program_code_b);
 
-    void UpdateVariant();
+    bool EnsureValidLockerVariant();
 
     ShaderDiskCacheUsage GetUsage(const ProgramVariant& variant,
                                   const VideoCommon::Shader::ConstBufferLocker& locker) const;
@@ -117,7 +117,7 @@ private:
     ProgramCode program_code;
     ProgramCode program_code_b;
 
-    LockerVariant* curr_variant = nullptr;
+    LockerVariant* curr_locker_variant = nullptr;
     std::vector<std::unique_ptr<LockerVariant>> locker_variants;
 };
 
diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
index 3f4daf28d7..9156f180a4 100644
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
@@ -52,11 +52,11 @@ struct BindlessSamplerKey {
     Tegra::Engines::SamplerDescriptor sampler{};
 };
 
-constexpr u32 NativeVersion = 6;
+constexpr u32 NativeVersion = 7;
 
 // Making sure sizes doesn't change by accident
 static_assert(sizeof(BaseBindings) == 16);
-static_assert(sizeof(ProgramVariant) == 20);
+static_assert(sizeof(ProgramVariant) == 28);
 
 ShaderCacheVersionHash GetShaderCacheVersionHash() {
     ShaderCacheVersionHash hash{};
diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.h b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
index 55311dc6df..4c7ca004d0 100644
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
@@ -44,32 +44,49 @@ struct BaseBindings {
     u32 sampler{};
     u32 image{};
 
-    bool operator==(const BaseBindings& rhs) const {
+    bool operator==(const BaseBindings& rhs) const noexcept {
         return std::tie(cbuf, gmem, sampler, image) ==
                std::tie(rhs.cbuf, rhs.gmem, rhs.sampler, rhs.image);
     }
 
-    bool operator!=(const BaseBindings& rhs) const {
+    bool operator!=(const BaseBindings& rhs) const noexcept {
         return !operator==(rhs);
     }
 };
 static_assert(std::is_trivially_copyable_v<BaseBindings>);
 
-/// Describes the different variants a single program can be compiled.
-struct ProgramVariant {
-    BaseBindings base_bindings;
-    GLenum primitive_mode{};
+/// Describes the different variants a program can be compiled with.
+struct ProgramVariant final {
+    ProgramVariant() = default;
 
-    bool operator==(const ProgramVariant& rhs) const {
-        return std::tie(base_bindings, primitive_mode) ==
-               std::tie(rhs.base_bindings, rhs.primitive_mode);
+    /// Graphics constructor.
+    explicit constexpr ProgramVariant(BaseBindings base_bindings, GLenum primitive_mode) noexcept
+        : base_bindings{base_bindings}, primitive_mode{primitive_mode} {}
+
+    /// Compute constructor.
+    explicit constexpr ProgramVariant(u32 block_x, u32 block_y, u32 block_z) noexcept
+        : block_x{block_x}, block_y{static_cast<u16>(block_y)}, block_z{static_cast<u16>(block_z)} {
     }
 
-    bool operator!=(const ProgramVariant& rhs) const {
+    // Graphics specific parameters.
+    BaseBindings base_bindings{};
+    GLenum primitive_mode{};
+
+    // Compute specific parameters.
+    u32 block_x{};
+    u16 block_y{};
+    u16 block_z{};
+
+    bool operator==(const ProgramVariant& rhs) const noexcept {
+        return std::tie(base_bindings, primitive_mode, block_x, block_y, block_z) ==
+               std::tie(rhs.base_bindings, rhs.primitive_mode, rhs.block_x, rhs.block_y,
+                        rhs.block_z);
+    }
+
+    bool operator!=(const ProgramVariant& rhs) const noexcept {
         return !operator==(rhs);
     }
 };
-
 static_assert(std::is_trivially_copyable_v<ProgramVariant>);
 
 /// Describes how a shader is used.
@@ -108,8 +125,11 @@ struct hash<OpenGL::BaseBindings> {
 template <>
 struct hash<OpenGL::ProgramVariant> {
     std::size_t operator()(const OpenGL::ProgramVariant& variant) const noexcept {
-        return std::hash<OpenGL::BaseBindings>()(variant.base_bindings) ^
-               (static_cast<std::size_t>(variant.primitive_mode) << 6);
+        // Shift in u64 so the result is well-defined even when std::size_t is 32 bits wide.
+        return std::hash<OpenGL::BaseBindings>{}(variant.base_bindings) ^
+               (static_cast<std::size_t>(variant.primitive_mode) << 6) ^ variant.block_x ^
+               static_cast<std::size_t>(static_cast<u64>(variant.block_y) << 32) ^
+               static_cast<std::size_t>(static_cast<u64>(variant.block_z) << 48);
     }
 };
 
@@ -117,7 +137,7 @@ template <>
 struct hash<OpenGL::ShaderDiskCacheUsage> {
     std::size_t operator()(const OpenGL::ShaderDiskCacheUsage& usage) const noexcept {
         return static_cast<std::size_t>(usage.unique_identifier) ^
-               std::hash<OpenGL::ProgramVariant>()(usage.variant);
+               std::hash<OpenGL::ProgramVariant>{}(usage.variant);
     }
 };