From ace6c2318be5e8c5b2ad5f489d8144f28505d5f9 Mon Sep 17 00:00:00 2001
From: Kelebek1 <eeeedddccc@hotmail.co.uk>
Date: Sun, 4 Jun 2023 21:09:57 +0100
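Subject: [PATCH] Combine vertex/transform feedback buffer binding into a
 single call

Collect the host vertex buffer and transform feedback buffer bindings into
a HostBindings batch and pass each batch to the runtime through the new
BindVertexBuffers / BindTransformFeedbackBuffers entry points, instead of
making one runtime call per slot. The Vulkan runtime records a single
multi-buffer bind command per batch; the OpenGL runtime still binds slot by
slot inside the new entry points.

Also fold NotifyBufferDeletion into DeleteBuffer, which now marks the
index/vertex buffer state dirty only for the bindings that referenced the
deleted buffer.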

---
 src/video_core/buffer_cache/buffer_cache.h    | 82 ++++++++++++++-----
 .../buffer_cache/buffer_cache_base.h          | 11 ++-
 .../renderer_opengl/gl_buffer_cache.cpp       | 18 ++++
 .../renderer_opengl/gl_buffer_cache.h         |  4 +-
 .../renderer_vulkan/vk_buffer_cache.cpp       | 54 +++++++++++-
 .../renderer_vulkan/vk_buffer_cache.h         |  3 +
 6 files changed, 148 insertions(+), 24 deletions(-)

diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 2f281b3705..251a4a8804 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -715,20 +715,38 @@ void BufferCache<P>::BindHostIndexBuffer() {
 
 template <class P>
 void BufferCache<P>::BindHostVertexBuffers() {
+    HostBindings host_bindings; // batch handed to the runtime in a single call
+    bool any_valid{false};
     auto& flags = maxwell3d->dirty.flags;
     for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) {
+        // Touch and synchronize every slot on each call; only the dirty span is re-bound below.
         const Binding& binding = channel_state->vertex_buffers[index];
         Buffer& buffer = slot_buffers[binding.buffer_id];
         TouchBuffer(buffer, binding.buffer_id);
         SynchronizeBuffer(buffer, binding.cpu_addr, binding.size);
         if (!flags[Dirty::VertexBuffer0 + index]) {
             continue;
         }
-        flags[Dirty::VertexBuffer0 + index] = false;
+        host_bindings.min_index = std::min(host_bindings.min_index, index);
+        host_bindings.max_index = std::max(host_bindings.max_index, index);
+        any_valid = true;
+    }
 
-        const u32 stride = maxwell3d->regs.vertex_streams[index].stride;
-        const u32 offset = buffer.Offset(binding.cpu_addr);
-        runtime.BindVertexBuffer(index, buffer, offset, binding.size, stride);
+    if (any_valid) {
+        host_bindings.max_index++; // make the last dirty slot an exclusive end
+        for (u32 index = host_bindings.min_index; index < host_bindings.max_index; index++) {
+            // The whole [min_index, max_index) span is re-bound, including clean slots inside it.
+            flags[Dirty::VertexBuffer0 + index] = false;
+            const Binding& binding = channel_state->vertex_buffers[index];
+            Buffer& buffer = slot_buffers[binding.buffer_id];
+            const u32 stride = maxwell3d->regs.vertex_streams[index].stride;
+            const u32 offset = buffer.Offset(binding.cpu_addr);
+            host_bindings.buffers.push_back(static_cast<void*>(&buffer));
+            host_bindings.offsets.push_back(offset);
+            host_bindings.sizes.push_back(binding.size);
+            host_bindings.strides.push_back(stride);
+        }
+        runtime.BindVertexBuffers(host_bindings);
     }
 }
 
@@ -882,15 +900,25 @@ void BufferCache<P>::BindHostTransformFeedbackBuffers() {
     if (maxwell3d->regs.transform_feedback_enabled == 0) {
         return;
     }
+    HostBindings host_bindings;
     for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) {
         const Binding& binding = channel_state->transform_feedback_buffers[index];
+        if (maxwell3d->regs.transform_feedback.controls[index].varying_count == 0 &&
+            maxwell3d->regs.transform_feedback.controls[index].stride == 0) {
+            break;
+        }
         Buffer& buffer = slot_buffers[binding.buffer_id];
         TouchBuffer(buffer, binding.buffer_id);
         const u32 size = binding.size;
         SynchronizeBuffer(buffer, binding.cpu_addr, size);
 
         const u32 offset = buffer.Offset(binding.cpu_addr);
-        runtime.BindTransformFeedbackBuffer(index, buffer, offset, size);
+        host_bindings.buffers.push_back(reinterpret_cast<void*>(&buffer));
+        host_bindings.offsets.push_back(offset);
+        host_bindings.sizes.push_back(binding.size);
+    }
+    if (host_bindings.buffers.size() > 0) {
+        runtime.BindTransformFeedbackBuffers(host_bindings);
     }
 }
 
@@ -1616,6 +1644,8 @@ void BufferCache<P>::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 si
 
 template <class P>
 void BufferCache<P>::DeleteBuffer(BufferId buffer_id, bool do_not_mark) {
+    bool dirty_index{false};
+    boost::container::small_vector<u64, NUM_VERTEX_BUFFERS> dirty_vertex_buffers;
     const auto scalar_replace = [buffer_id](Binding& binding) {
         if (binding.buffer_id == buffer_id) {
             binding.buffer_id = BufferId{};
@@ -1624,8 +1654,19 @@ void BufferCache<P>::DeleteBuffer(BufferId buffer_id, bool do_not_mark) {
     const auto replace = [scalar_replace](std::span<Binding> bindings) {
         std::ranges::for_each(bindings, scalar_replace);
     };
-    scalar_replace(channel_state->index_buffer);
-    replace(channel_state->vertex_buffers);
+
+    if (channel_state->index_buffer.buffer_id == buffer_id) {
+        channel_state->index_buffer.buffer_id = BufferId{};
+        dirty_index = true;
+    }
+
+    for (u32 index = 0; index < channel_state->vertex_buffers.size(); index++) {
+        auto& binding = channel_state->vertex_buffers[index];
+        if (binding.buffer_id == buffer_id) {
+            binding.buffer_id = BufferId{};
+            dirty_vertex_buffers.push_back(index);
+        }
+    }
     std::ranges::for_each(channel_state->uniform_buffers, replace);
     std::ranges::for_each(channel_state->storage_buffers, replace);
     replace(channel_state->transform_feedback_buffers);
@@ -1642,20 +1683,21 @@ void BufferCache<P>::DeleteBuffer(BufferId buffer_id, bool do_not_mark) {
     delayed_destruction_ring.Push(std::move(slot_buffers[buffer_id]));
     slot_buffers.erase(buffer_id);
 
-    NotifyBufferDeletion();
-}
-
-template <class P>
-void BufferCache<P>::NotifyBufferDeletion() {
     if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) {
         channel_state->dirty_uniform_buffers.fill(~u32{0});
         channel_state->uniform_buffer_binding_sizes.fill({});
     }
+
     auto& flags = maxwell3d->dirty.flags;
-    flags[Dirty::IndexBuffer] = true;
-    flags[Dirty::VertexBuffers] = true;
-    for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) {
-        flags[Dirty::VertexBuffer0 + index] = true;
+    if (dirty_index) {
+        flags[Dirty::IndexBuffer] = true;
+    }
+
+    if (dirty_vertex_buffers.size() > 0) {
+        flags[Dirty::VertexBuffers] = true;
+        for (auto index : dirty_vertex_buffers) {
+            flags[Dirty::VertexBuffer0 + index] = true;
+        }
     }
     channel_state->has_deleted_buffers = true;
 }
diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h
index 60a1f285ed..cf359e2413 100644
--- a/src/video_core/buffer_cache/buffer_cache_base.h
+++ b/src/video_core/buffer_cache/buffer_cache_base.h
@@ -105,6 +105,15 @@ static constexpr Binding NULL_BINDING{
     .buffer_id = NULL_BUFFER_ID,
 };
 
+struct HostBindings { // Batch of host buffer bindings handed to a runtime in one call
+    boost::container::small_vector<void*, NUM_VERTEX_BUFFERS> buffers; // type-erased Buffer*
+    boost::container::small_vector<u64, NUM_VERTEX_BUFFERS> offsets; // per entry, into its buffer
+    boost::container::small_vector<u64, NUM_VERTEX_BUFFERS> sizes; // per entry
+    boost::container::small_vector<u64, NUM_VERTEX_BUFFERS> strides; // filled for vertex buffers only
+    u32 min_index{NUM_VERTEX_BUFFERS}; // first vertex slot; starts at the max so std::min works
+    u32 max_index{0}; // one past the last vertex slot once the vertex batch is built
+};
+
 class BufferCacheChannelInfo : public ChannelInfo {
 public:
     BufferCacheChannelInfo() = delete;
@@ -519,8 +528,6 @@ private:
 
     void DeleteBuffer(BufferId buffer_id, bool do_not_mark = false);
 
-    void NotifyBufferDeletion();
-
     [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr, u32 cbuf_index,
                                                bool is_written) const;
 
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index c419714d48..0cc546a3a0 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -232,6 +232,15 @@ void BufferCacheRuntime::BindVertexBuffer(u32 index, Buffer& buffer, u32 offset,
     }
 }
 
+void BufferCacheRuntime::BindVertexBuffers(VideoCommon::HostBindings& bindings) {
+    for (u32 i = 0; i < bindings.buffers.size(); ++i) { // batch entry i -> slot min_index + i
+        auto& buffer = *reinterpret_cast<Buffer*>(bindings.buffers[i]);
+        const auto stride = static_cast<u32>(bindings.strides[i]);
+        BindVertexBuffer(bindings.min_index + i, buffer, static_cast<u32>(bindings.offsets[i]),
+                         static_cast<u32>(bindings.sizes[i]), stride);
+    }
+}
+
 void BufferCacheRuntime::BindUniformBuffer(size_t stage, u32 binding_index, Buffer& buffer,
                                            u32 offset, u32 size) {
     if (use_assembly_shaders) {
@@ -320,6 +329,15 @@ void BufferCacheRuntime::BindTransformFeedbackBuffer(u32 index, Buffer& buffer,
                       static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size));
 }
 
+void BufferCacheRuntime::BindTransformFeedbackBuffers(VideoCommon::HostBindings& bindings) {
+    for (u32 i = 0; i < bindings.buffers.size(); ++i) { // batch entry i -> binding point i
+        const auto handle = reinterpret_cast<Buffer*>(bindings.buffers[i])->Handle();
+        const auto offset = static_cast<GLintptr>(bindings.offsets[i]);
+        const auto size = static_cast<GLsizeiptr>(bindings.sizes[i]);
+        glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, i, handle, offset, size);
+    }
+}
+
 void BufferCacheRuntime::BindTextureBuffer(Buffer& buffer, u32 offset, u32 size,
                                            PixelFormat format) {
     *texture_handles++ = buffer.View(offset, size, format);
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h
index a24991585f..e4e0002848 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -7,7 +7,7 @@
 #include <span>
 
 #include "common/common_types.h"
-#include "video_core/buffer_cache/buffer_cache.h"
+#include "video_core/buffer_cache/buffer_cache_base.h"
 #include "video_core/buffer_cache/memory_tracker_base.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_opengl/gl_device.h"
@@ -87,6 +87,7 @@ public:
     void BindIndexBuffer(Buffer& buffer, u32 offset, u32 size);
 
     void BindVertexBuffer(u32 index, Buffer& buffer, u32 offset, u32 size, u32 stride);
+    void BindVertexBuffers(VideoCommon::HostBindings& bindings);
 
     void BindUniformBuffer(size_t stage, u32 binding_index, Buffer& buffer, u32 offset, u32 size);
 
@@ -99,6 +100,7 @@ public:
                                   bool is_written);
 
     void BindTransformFeedbackBuffer(u32 index, Buffer& buffer, u32 offset, u32 size);
+    void BindTransformFeedbackBuffers(VideoCommon::HostBindings& bindings);
 
     void BindTextureBuffer(Buffer& buffer, u32 offset, u32 size,
                            VideoCore::Surface::PixelFormat format);
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
index daa128399f..d72d99899b 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
@@ -7,7 +7,6 @@
 #include <span>
 #include <vector>
 
-#include "video_core/buffer_cache/buffer_cache.h"
 #include "video_core/renderer_vulkan/maxwell_to_vk.h"
 #include "video_core/renderer_vulkan/vk_buffer_cache.h"
 #include "video_core/renderer_vulkan/vk_scheduler.h"
@@ -502,6 +501,40 @@ void BufferCacheRuntime::BindVertexBuffer(u32 index, VkBuffer buffer, u32 offset
     }
 }
 
+void BufferCacheRuntime::BindVertexBuffers(VideoCommon::HostBindings& bindings) {
+    // Resolve a VkBuffer per batch entry. An entry whose buffer has no handle is rewritten in
+    // `bindings` to offset 0 / VK_WHOLE_SIZE and, absent null descriptors, bound to null_buffer.
+    boost::container::small_vector<VkBuffer, 32> buffer_handles;
+    for (u32 slot = 0; slot < bindings.buffers.size(); ++slot) {
+        auto vk_buffer = reinterpret_cast<Buffer*>(bindings.buffers[slot])->Handle();
+        if (vk_buffer == VK_NULL_HANDLE) {
+            bindings.offsets[slot] = 0;
+            bindings.sizes[slot] = VK_WHOLE_SIZE;
+            if (!device.HasNullDescriptor()) {
+                ReserveNullBuffer();
+                vk_buffer = *null_buffer;
+            }
+        }
+        buffer_handles.push_back(vk_buffer);
+    }
+    // One recorded command; with extended dynamic state it also passes sizes and strides.
+    const bool has_dynamic_state = device.IsExtExtendedDynamicStateSupported();
+    scheduler.Record([bindings = bindings, buffer_handles = buffer_handles,
+                      has_dynamic_state](vk::CommandBuffer cmdbuf) {
+        const u32 first = bindings.min_index;
+        const u32 count = bindings.max_index - bindings.min_index;
+        const auto* offsets = reinterpret_cast<const VkDeviceSize*>(bindings.offsets.data());
+        if (has_dynamic_state) {
+            cmdbuf.BindVertexBuffers2EXT(
+                first, count, buffer_handles.data(), offsets,
+                reinterpret_cast<const VkDeviceSize*>(bindings.sizes.data()),
+                reinterpret_cast<const VkDeviceSize*>(bindings.strides.data()));
+        } else {
+            cmdbuf.BindVertexBuffers(first, count, buffer_handles.data(), offsets);
+        }
+    });
+}
+
 void BufferCacheRuntime::BindTransformFeedbackBuffer(u32 index, VkBuffer buffer, u32 offset,
                                                      u32 size) {
     if (!device.IsExtTransformFeedbackSupported()) {
@@ -523,6 +556,25 @@ void BufferCacheRuntime::BindTransformFeedbackBuffer(u32 index, VkBuffer buffer,
     });
 }
 
+void BufferCacheRuntime::BindTransformFeedbackBuffers(VideoCommon::HostBindings& bindings) {
+    if (!device.IsExtTransformFeedbackSupported()) {
+        // Already logged in the rasterizer
+        return;
+    }
+    boost::container::small_vector<VkBuffer, 4> buffer_handles;
+    for (void* const erased_buffer : bindings.buffers) {
+        buffer_handles.push_back(reinterpret_cast<Buffer*>(erased_buffer)->Handle());
+    }
+    // Binds the batch starting at binding 0; the lambda captures copies of what it reads.
+    scheduler.Record(
+        [bindings = bindings, buffer_handles = buffer_handles](vk::CommandBuffer cmdbuf) {
+            const auto* offsets = reinterpret_cast<const VkDeviceSize*>(bindings.offsets.data());
+            const auto* sizes = reinterpret_cast<const VkDeviceSize*>(bindings.sizes.data());
+            cmdbuf.BindTransformFeedbackBuffersEXT(0, static_cast<u32>(buffer_handles.size()),
+                                                   buffer_handles.data(), offsets, sizes);
+        });
+}
+
 void BufferCacheRuntime::ReserveNullBuffer() {
     if (null_buffer) {
         return;
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h
index 92b4f78596..92d3e9f323 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.h
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h
@@ -18,6 +18,7 @@ namespace Vulkan {
 class Device;
 class DescriptorPool;
 class Scheduler;
+struct HostVertexBinding; // NOTE(review): not referenced anywhere in this patch; confirm it is needed
 
 class BufferCacheRuntime;
 
@@ -96,8 +97,10 @@ public:
     void BindQuadIndexBuffer(PrimitiveTopology topology, u32 first, u32 count);
 
     void BindVertexBuffer(u32 index, VkBuffer buffer, u32 offset, u32 size, u32 stride);
+    void BindVertexBuffers(VideoCommon::HostBindings& bindings);
 
     void BindTransformFeedbackBuffer(u32 index, VkBuffer buffer, u32 offset, u32 size);
+    void BindTransformFeedbackBuffers(VideoCommon::HostBindings& bindings);
 
     std::span<u8> BindMappedUniformBuffer([[maybe_unused]] size_t stage,
                                           [[maybe_unused]] u32 binding_index, u32 size) {