diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 2605c3b429..c297bc31ba 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -397,14 +397,6 @@ void Maxwell3D::StampQueryResult(u64 payload, bool long_query) {
     }
 }
 
-void Maxwell3D::ReleaseFences() {
-    for (const auto pair : delay_fences) {
-        const auto [addr, payload] = pair;
-        memory_manager.Write<u32>(addr, static_cast<u32>(payload));
-    }
-    delay_fences.clear();
-}
-
 void Maxwell3D::ProcessQueryGet() {
     // TODO(Subv): Support the other query units.
     ASSERT_MSG(regs.query.query_get.unit == Regs::QueryUnit::Crop,
@@ -412,10 +404,12 @@ void Maxwell3D::ProcessQueryGet() {
 
     switch (regs.query.query_get.operation) {
     case Regs::QueryOperation::Release: {
-        rasterizer.FlushCommands();
-        rasterizer.SyncGuestHost();
         const u64 result = regs.query.query_sequence;
-        delay_fences.emplace_back(regs.query.QueryAddress(), result);
+        if (regs.query.query_get.fence == 1) {
+            rasterizer.SignalFence(regs.query.QueryAddress(), static_cast<u32>(result));
+        } else {
+            StampQueryResult(result, regs.query.query_get.short_query == 0);
+        }
         break;
     }
     case Regs::QueryOperation::Acquire:
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 0a93827ec8..59d5752d26 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -1427,8 +1427,6 @@ public:
         Tables tables{};
     } dirty;
 
-    void ReleaseFences();
-
 private:
     void InitializeRegisterDefaults();
 
@@ -1469,8 +1467,6 @@ private:
 
     std::array<u8, Regs::NUM_REGS> dirty_pointers{};
 
-    std::vector<std::pair<GPUVAddr, u64>> delay_fences;
-
     /// Retrieves information about a specific TIC entry from the TIC buffer.
     Texture::TICEntry GetTICEntry(u32 tic_index) const;
 
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 71ddfbd26e..d05b6a9d2f 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -147,7 +147,7 @@ void GPU::SyncGuestHost() {
 }
 
 void GPU::OnCommandListEnd() {
-    maxwell_3d->ReleaseFences();
+    renderer.Rasterizer().ReleaseFences(); // fence release is delegated to the rasterizer
 }
 // Note that, traditionally, methods are treated as 4-byte addressable locations, and hence
 // their numbers are written down multiplied by 4 in Docs. Here we are not multiply by 4.
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index b884456349..fa9991c871 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -157,7 +157,7 @@ public:
 
     void FlushCommands();
     void SyncGuestHost();
-    void OnCommandListEnd();
+    virtual void OnCommandListEnd();
 
     /// Returns a reference to the Maxwell3D GPU engine.
     Engines::Maxwell3D& Maxwell3D();
diff --git a/src/video_core/gpu_asynch.cpp b/src/video_core/gpu_asynch.cpp
index 20e73a37e4..53305ab436 100644
--- a/src/video_core/gpu_asynch.cpp
+++ b/src/video_core/gpu_asynch.cpp
@@ -52,4 +52,8 @@ void GPUAsynch::WaitIdle() const {
     gpu_thread.WaitIdle();
 }
 
+void GPUAsynch::OnCommandListEnd() {
+    gpu_thread.OnCommandListEnd(); // delegated to gpu_thread; the base GPU releases directly
+}
+
 } // namespace VideoCommon
diff --git a/src/video_core/gpu_asynch.h b/src/video_core/gpu_asynch.h
index 03fd0eef0a..5176586127 100644
--- a/src/video_core/gpu_asynch.h
+++ b/src/video_core/gpu_asynch.h
@@ -32,6 +32,8 @@ public:
     void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
     void WaitIdle() const override;
 
+    void OnCommandListEnd() override;
+
 protected:
     void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const override;
 
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index 1994d3bb45..251a9d911f 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -37,6 +37,8 @@ static void RunThread(VideoCore::RendererBase& renderer, Core::Frontend::Graphic
             dma_pusher.DispatchCalls();
         } else if (const auto data = std::get_if<SwapBuffersCommand>(&next.data)) {
             renderer.SwapBuffers(data->framebuffer ? &*data->framebuffer : nullptr);
+        } else if (const auto data = std::get_if<OnCommandListEndCommand>(&next.data)) {
+            renderer.Rasterizer().ReleaseFences();
         } else if (const auto data = std::get_if<FlushRegionCommand>(&next.data)) {
             renderer.Rasterizer().FlushRegion(data->addr, data->size);
         } else if (const auto data = std::get_if<InvalidateRegionCommand>(&next.data)) {
@@ -95,6 +97,10 @@ void ThreadManager::WaitIdle() const {
     }
 }
 
+void ThreadManager::OnCommandListEnd() {
+    PushCommand(OnCommandListEndCommand{}); // the fence value PushCommand returns is discarded
+}
+
 u64 ThreadManager::PushCommand(CommandData&& command_data) {
     const u64 fence{++state.last_fence};
     state.queue.Push(CommandDataContainer(std::move(command_data), fence));
diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h
index cd74ad330a..9d08779216 100644
--- a/src/video_core/gpu_thread.h
+++ b/src/video_core/gpu_thread.h
@@ -70,9 +70,12 @@ struct FlushAndInvalidateRegionCommand final {
     u64 size;
 };
 
+/// Command to notify the GPU thread that a command list has ended so it can release fences
+struct OnCommandListEndCommand final {};
+
 using CommandData =
     std::variant<EndProcessingCommand, SubmitListCommand, SwapBuffersCommand, FlushRegionCommand,
-                 InvalidateRegionCommand, FlushAndInvalidateRegionCommand>;
+                 InvalidateRegionCommand, FlushAndInvalidateRegionCommand, OnCommandListEndCommand>;
 
 struct CommandDataContainer {
     CommandDataContainer() = default;
@@ -122,6 +125,8 @@ public:
     // Wait until the gpu thread is idle.
     void WaitIdle() const;
 
+    void OnCommandListEnd();
+
 private:
     /// Pushes a command to be executed by the GPU thread
     u64 PushCommand(CommandData&& command_data);
diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h
index 0d05a3fc71..72f65b166c 100644
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -49,6 +49,14 @@ public:
     /// Records a GPU query and caches it
     virtual void Query(GPUVAddr gpu_addr, QueryType type, std::optional<u64> timestamp) = 0;
 
+    /// Signals a fence carrying `value` at GPU address `addr`.
+    /// This base implementation ignores the request; backends may override it.
+    virtual void SignalFence([[maybe_unused]] GPUVAddr addr, [[maybe_unused]] u32 value) {}
+
+    /// Releases the fences that are still pending.
+    /// This base implementation has nothing to release; backends may override it.
+    virtual void ReleaseFences() {}
+
     /// Notify rasterizer that all caches should be flushed to Switch memory
     virtual void FlushAll() = 0;
 
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 988eaeaa5a..93bb33e8cc 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -676,6 +676,34 @@ void RasterizerOpenGL::SyncGuestHost() {
     buffer_cache.SyncGuestHost();
 }
 
+void RasterizerOpenGL::SignalFence(GPUVAddr addr, u32 value) {
+    // Retire the oldest queued fence, if any, before the new one is queued.
+    if (!fences.empty()) {
+        const auto [pending_addr, pending_value] = fences.front();
+        // Flush the surface batch at the head of the texture cache queue first.
+        texture_cache.PopAsyncFlushes();
+        system.GPU().MemoryManager().Write<u32>(pending_addr, pending_value);
+        fences.pop_front();
+    }
+    fences.emplace_back(addr, value);
+    // Seal the surfaces deferred so far into the batch that belongs to this new fence.
+    texture_cache.CommitAsyncFlushes();
+    FlushCommands();
+    SyncGuestHost();
+}
+
+void RasterizerOpenGL::ReleaseFences() {
+    // Retire every fence that is still queued, oldest first, until none remain.
+    for (; !fences.empty(); fences.pop_front()) {
+        const auto [pending_addr, pending_value] = fences.front();
+        // Flush the surface batch at the head of the texture cache queue before the
+        // fence value is written.
+        texture_cache.PopAsyncFlushes();
+        // The value is stored at the fence address through the GPU memory manager.
+        system.GPU().MemoryManager().Write<u32>(pending_addr, pending_value);
+    }
+}
+
 void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size) {
     if (Settings::IsGPULevelExtreme()) {
         FlushRegion(addr, size);
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index a870024c63..486a154ad6 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -69,6 +69,8 @@ public:
     void InvalidateRegion(VAddr addr, u64 size) override;
     void OnCPUWrite(VAddr addr, u64 size) override;
     void SyncGuestHost() override;
+    void SignalFence(GPUVAddr addr, u32 value) override;
+    void ReleaseFences() override;
     void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
     void FlushCommands() override;
     void TickFrame() override;
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index d8c8390bb9..6629c59ed2 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -238,7 +238,7 @@ public:
             surface->MarkAsRenderTarget(false, NO_RT);
             const auto& cr_params = surface->GetSurfaceParams();
             if (!cr_params.is_tiled) {
-                FlushSurface(surface);
+                AsyncFlushSurface(surface);
             }
         }
         render_targets[index].target = surface_view.first;
@@ -317,6 +317,26 @@ public:
         return ++ticks;
     }
 
+    /// Seals the surfaces queued so far into one batch; a null batch is stored if none exist.
+    void CommitAsyncFlushes() {
+        commited_flushes.emplace_back().swap(uncommited_flushes);
+    }
+
+    /// Flushes every surface in the oldest committed batch and then discards that batch.
+    /// Does nothing when no batch has been committed.
+    void PopAsyncFlushes() {
+        if (commited_flushes.empty()) {
+            return;
+        }
+        // A null entry comes from a commit with no queued surfaces; it is only discarded.
+        if (const auto& oldest_batch = commited_flushes.front(); oldest_batch) {
+            for (TSurface& surface : *oldest_batch) {
+                FlushSurface(surface);
+            }
+        }
+        commited_flushes.pop_front();
+    }
+
 protected:
     explicit TextureCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
                           bool is_astc_supported)
@@ -1152,6 +1172,13 @@ private:
         TView view;
     };
 
+    void AsyncFlushSurface(TSurface& surface) {
+        if (uncommited_flushes == nullptr) { // no open batch yet; it is created lazily
+            uncommited_flushes = std::make_shared<std::list<TSurface>>();
+        }
+        uncommited_flushes->emplace_back(surface); // flushed later, once its batch is popped
+    }
+
     VideoCore::RasterizerInterface& rasterizer;
 
     FormatLookupTable format_lookup_table;
@@ -1198,6 +1225,9 @@ private:
 
     std::list<TSurface> marked_for_unregister;
 
+    std::shared_ptr<std::list<TSurface>> uncommited_flushes{};
+    std::list<std::shared_ptr<std::list<TSurface>>> commited_flushes;
+
     StagingCache staging_cache;
     std::recursive_mutex mutex;
 };