From d3ad9469a172eeaaf34ca641a6bf679b7b10eedf Mon Sep 17 00:00:00 2001
From: Markus Wick <markus@selfnet.de>
Date: Wed, 5 Sep 2018 11:36:50 +0200
Subject: [PATCH] gl_rasterizer: Implement a VAO cache.

This patch caches VAO objects instead of re-emitting all pointers per draw call.
Configuring these pointers is known to be a fast task, but it requires too many
GL calls. So for better performance, just bind the VAO instead of 16 pointers.
---
 src/video_core/engines/maxwell_3d.h           |  9 +-
 .../renderer_opengl/gl_rasterizer.cpp         | 96 +++++++++----------
 .../renderer_opengl/gl_rasterizer.h           |  8 +-
 3 files changed, 60 insertions(+), 53 deletions(-)

diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 92bfda053a..f59d01738d 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -127,6 +127,7 @@ public:
                 BitField<21, 6, Size> size;
                 BitField<27, 3, Type> type;
                 BitField<31, 1, u32> bgra;
+                u32 hex;
             };
 
             u32 ComponentCount() const {
@@ -262,6 +263,10 @@ public:
             bool IsValid() const {
                 return size != Size::Invalid;
             }
+
+            bool operator<(const VertexAttribute& other) const {
+                return hex < other.hex;
+            }
         };
 
         enum class PrimitiveTopology : u32 {
@@ -545,7 +550,7 @@ public:
 
                 INSERT_PADDING_WORDS(0x5B);
 
-                VertexAttribute vertex_attrib_format[NumVertexAttributes];
+                std::array<VertexAttribute, NumVertexAttributes> vertex_attrib_format;
 
                 INSERT_PADDING_WORDS(0xF);
 
@@ -964,7 +969,7 @@ ASSERT_REG_POSITION(stencil_back_func_ref, 0x3D5);
 ASSERT_REG_POSITION(stencil_back_mask, 0x3D6);
 ASSERT_REG_POSITION(stencil_back_func_mask, 0x3D7);
 ASSERT_REG_POSITION(zeta, 0x3F8);
-ASSERT_REG_POSITION(vertex_attrib_format[0], 0x458);
+ASSERT_REG_POSITION(vertex_attrib_format, 0x458);
 ASSERT_REG_POSITION(rt_control, 0x487);
 ASSERT_REG_POSITION(zeta_width, 0x48a);
 ASSERT_REG_POSITION(zeta_height, 0x48b);
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index c66a18155e..5d493a2b2f 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -70,28 +70,13 @@ RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& window, ScreenInfo
     // Clipping plane 0 is always enabled for PICA fixed clip plane z <= 0
     state.clip_distance[0] = true;
 
-    // Generate VAO and UBO
-    sw_vao.Create();
-    uniform_buffer.Create();
-
-    state.draw.vertex_array = sw_vao.handle;
-    state.draw.uniform_buffer = uniform_buffer.handle;
-    state.Apply();
-
     // Create render framebuffer
     framebuffer.Create();
 
-    hw_vao.Create();
-
-    state.draw.vertex_buffer = buffer_cache.GetHandle();
-
     shader_program_manager = std::make_unique<GLShader::ProgramManager>();
     state.draw.shader_program = 0;
-    state.draw.vertex_array = hw_vao.handle;
     state.Apply();
 
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, buffer_cache.GetHandle());
-
     glEnable(GL_BLEND);
 
     glGetIntegerv(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT, &uniform_buffer_alignment);
@@ -106,7 +91,54 @@ void RasterizerOpenGL::SetupVertexArrays() {
     const auto& gpu = Core::System::GetInstance().GPU().Maxwell3D();
     const auto& regs = gpu.regs;
 
-    state.draw.vertex_array = hw_vao.handle;
+    auto [iter, is_cache_miss] = vertex_array_cache.try_emplace(regs.vertex_attrib_format);
+    auto& VAO = iter->second;
+
+    if (is_cache_miss) {
+        VAO.Create();
+        state.draw.vertex_array = VAO.handle;
+        state.Apply();
+
+        // The index buffer binding is stored within the VAO. Stupid OpenGL, but easy to work
+        // around.
+        glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, buffer_cache.GetHandle());
+
+        // Use the vertex array as-is, assumes that the data is formatted correctly for OpenGL.
+        // Enables the first 16 vertex attributes always, as we don't know which ones are actually
+        // used until shader time. Note, Tegra technically supports 32, but we're capping this to 16
+        // for now to avoid OpenGL errors.
+        // TODO(Subv): Analyze the shader to identify which attributes are actually used and don't
+        // assume every shader uses them all.
+        for (unsigned index = 0; index < 16; ++index) {
+            const auto& attrib = regs.vertex_attrib_format[index];
+
+            // Ignore invalid attributes.
+            if (!attrib.IsValid())
+                continue;
+
+            const auto& buffer = regs.vertex_array[attrib.buffer];
+            LOG_TRACE(HW_GPU,
+                      "vertex attrib {}, count={}, size={}, type={}, offset={}, normalize={}",
+                      index, attrib.ComponentCount(), attrib.SizeString(), attrib.TypeString(),
+                      attrib.offset.Value(), attrib.IsNormalized());
+
+            ASSERT(buffer.IsEnabled());
+
+            glEnableVertexAttribArray(index);
+            if (attrib.type == Tegra::Engines::Maxwell3D::Regs::VertexAttribute::Type::SignedInt ||
+                attrib.type ==
+                    Tegra::Engines::Maxwell3D::Regs::VertexAttribute::Type::UnsignedInt) {
+                glVertexAttribIFormat(index, attrib.ComponentCount(),
+                                      MaxwellToGL::VertexType(attrib), attrib.offset);
+            } else {
+                glVertexAttribFormat(index, attrib.ComponentCount(),
+                                     MaxwellToGL::VertexType(attrib),
+                                     attrib.IsNormalized() ? GL_TRUE : GL_FALSE, attrib.offset);
+            }
+            glVertexAttribBinding(index, attrib.buffer);
+        }
+    }
+    state.draw.vertex_array = VAO.handle;
     state.draw.vertex_buffer = buffer_cache.GetHandle();
     state.Apply();
 
@@ -142,38 +174,6 @@ void RasterizerOpenGL::SetupVertexArrays() {
             glVertexBindingDivisor(index, 0);
         }
     }
-
-    // Use the vertex array as-is, assumes that the data is formatted correctly for OpenGL.
-    // Enables the first 16 vertex attributes always, as we don't know which ones are actually used
-    // until shader time. Note, Tegra technically supports 32, but we're capping this to 16 for now
-    // to avoid OpenGL errors.
-    // TODO(Subv): Analyze the shader to identify which attributes are actually used and don't
-    // assume every shader uses them all.
-    for (unsigned index = 0; index < 16; ++index) {
-        auto& attrib = regs.vertex_attrib_format[index];
-
-        // Ignore invalid attributes.
-        if (!attrib.IsValid())
-            continue;
-
-        auto& buffer = regs.vertex_array[attrib.buffer];
-        LOG_TRACE(HW_GPU, "vertex attrib {}, count={}, size={}, type={}, offset={}, normalize={}",
-                  index, attrib.ComponentCount(), attrib.SizeString(), attrib.TypeString(),
-                  attrib.offset.Value(), attrib.IsNormalized());
-
-        ASSERT(buffer.IsEnabled());
-
-        glEnableVertexAttribArray(index);
-        if (attrib.type == Tegra::Engines::Maxwell3D::Regs::VertexAttribute::Type::SignedInt ||
-            attrib.type == Tegra::Engines::Maxwell3D::Regs::VertexAttribute::Type::UnsignedInt) {
-            glVertexAttribIFormat(index, attrib.ComponentCount(), MaxwellToGL::VertexType(attrib),
-                                  attrib.offset);
-        } else {
-            glVertexAttribFormat(index, attrib.ComponentCount(), MaxwellToGL::VertexType(attrib),
-                                 attrib.IsNormalized() ? GL_TRUE : GL_FALSE, attrib.offset);
-        }
-        glVertexAttribBinding(index, attrib.buffer);
-    }
 }
 
 void RasterizerOpenGL::SetupShaders() {
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 4c4b084b8f..9c30dc0e8f 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -6,6 +6,7 @@
 
 #include <array>
 #include <cstddef>
+#include <map>
 #include <memory>
 #include <tuple>
 #include <utility>
@@ -168,14 +169,15 @@ private:
     ScreenInfo& screen_info;
 
     std::unique_ptr<GLShader::ProgramManager> shader_program_manager;
-    OGLVertexArray sw_vao;
-    OGLVertexArray hw_vao;
+    std::map<std::array<Tegra::Engines::Maxwell3D::Regs::VertexAttribute,
+                        Tegra::Engines::Maxwell3D::Regs::NumVertexAttributes>,
+             OGLVertexArray>
+        vertex_array_cache;
 
     std::array<SamplerInfo, GLShader::NumTextureSamplers> texture_samplers;
 
     static constexpr size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024;
     OGLBufferCache buffer_cache;
-    OGLBuffer uniform_buffer;
     OGLFramebuffer framebuffer;
     GLint uniform_buffer_alignment;