From 8a099ac99f61871f0492864d7e95a5922e57223d Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow <fsahmkow27@gmail.com>
Date: Mon, 15 Apr 2019 12:43:37 -0400
Subject: [PATCH 1/4] Correct Kepler Memory on Linear Pushes.

---
 src/video_core/engines/kepler_memory.cpp | 40 +++++++++++++++---------
 src/video_core/engines/kepler_memory.h   | 24 ++++++++++++--
 2 files changed, 48 insertions(+), 16 deletions(-)

diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp
index cd51a31d78..3ed28f4a78 100644
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -10,6 +10,8 @@
 #include "video_core/memory_manager.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_base.h"
+#include "video_core/textures/convert.h"
+#include "video_core/textures/decoders.h"
 
 namespace Tegra::Engines {
 
@@ -27,30 +29,40 @@ void KeplerMemory::CallMethod(const GPU::MethodCall& method_call) {
 
     switch (method_call.method) {
     case KEPLERMEMORY_REG_INDEX(exec): {
-        state.write_offset = 0;
+        ProcessExec();
         break;
     }
     case KEPLERMEMORY_REG_INDEX(data): {
-        ProcessData(method_call.argument);
+        ProcessData(method_call.argument, method_call.IsLastCall());
         break;
     }
     }
 }
 
-void KeplerMemory::ProcessData(u32 data) {
-    ASSERT_MSG(regs.exec.linear, "Non-linear uploads are not supported");
-    ASSERT(regs.dest.x == 0 && regs.dest.y == 0 && regs.dest.z == 0);
+void KeplerMemory::ProcessExec() {
+    state.write_offset = 0;
+    state.copy_size = regs.line_length_in * regs.line_count;
+    state.inner_buffer.resize(state.copy_size);
+}
 
-    // We have to invalidate the destination region to evict any outdated surfaces from the cache.
-    // We do this before actually writing the new data because the destination address might
-    // contain a dirty surface that will have to be written back to memory.
-    const GPUVAddr address{regs.dest.Address() + state.write_offset * sizeof(u32)};
-    rasterizer.InvalidateRegion(ToCacheAddr(memory_manager.GetPointer(address)), sizeof(u32));
-    memory_manager.Write<u32>(address, data);
+void KeplerMemory::ProcessData(u32 data, bool is_last_call) {
+    const u32 sub_copy_size = std::min(4U, state.copy_size - state.write_offset);
+    std::memcpy(&state.inner_buffer[state.write_offset], &data, sub_copy_size);
+    state.write_offset += sub_copy_size;
+    if (is_last_call) {
+        UNIMPLEMENTED_IF_MSG(regs.exec.linear == 0, "Block Linear Copy is not implemented");
+        if (regs.exec.linear != 0) {
+            const GPUVAddr address{regs.dest.Address()};
+            const auto host_ptr = memory_manager.GetPointer(address);
+            // We have to invalidate the destination region to evict any outdated surfaces from the
+            // cache. We do this before actually writing the new data because the destination
+            // address might contain a dirty surface that will have to be written back to memory.
 
-    system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
-
-    state.write_offset++;
+            rasterizer.InvalidateRegion(ToCacheAddr(host_ptr), state.copy_size);
+            std::memcpy(host_ptr, state.inner_buffer.data(), state.copy_size);
+            system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
+        }
+    }
 }
 
 } // namespace Tegra::Engines
diff --git a/src/video_core/engines/kepler_memory.h b/src/video_core/engines/kepler_memory.h
index 78b6c3e458..5f892ddad1 100644
--- a/src/video_core/engines/kepler_memory.h
+++ b/src/video_core/engines/kepler_memory.h
@@ -6,6 +6,7 @@
 
 #include <array>
 #include <cstddef>
+#include <vector>
 #include "common/bit_field.h"
 #include "common/common_funcs.h"
 #include "common/common_types.h"
@@ -51,7 +52,11 @@ public:
                     u32 address_high;
                     u32 address_low;
                     u32 pitch;
-                    u32 block_dimensions;
+                    union {
+                        BitField<0, 4, u32> block_width;
+                        BitField<4, 4, u32> block_height;
+                        BitField<8, 4, u32> block_depth;
+                    };
                     u32 width;
                     u32 height;
                     u32 depth;
@@ -63,6 +68,18 @@ public:
                         return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
                                                      address_low);
                     }
+
+                    u32 BlockWidth() const {
+                        return 1U << block_width.Value();
+                    }
+
+                    u32 BlockHeight() const {
+                        return 1U << block_height.Value();
+                    }
+
+                    u32 BlockDepth() const {
+                        return 1U << block_depth.Value();
+                    }
                 } dest;
 
                 struct {
@@ -81,6 +98,8 @@ public:
 
     struct {
         u32 write_offset = 0;
+        u32 copy_size = 0;
+        std::vector<u8> inner_buffer;
     } state{};
 
 private:
@@ -88,7 +107,8 @@ private:
     VideoCore::RasterizerInterface& rasterizer;
     MemoryManager& memory_manager;
 
-    void ProcessData(u32 data);
+    void ProcessExec();
+    void ProcessData(u32 data, bool is_last_call);
 };
 
 #define ASSERT_REG_POSITION(field_name, position)                                                  \

From bec28d692d21a42f17ae26f0ab6271aca1c233cd Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow <fsahmkow27@gmail.com>
Date: Mon, 15 Apr 2019 21:06:04 -0400
Subject: [PATCH 2/4] Implement Block Linear copies in Kepler Memory.

---
 src/video_core/engines/kepler_memory.cpp | 19 ++++++++++++++-----
 src/video_core/textures/decoders.cpp     | 21 +++++++++++++++++++++
 src/video_core/textures/decoders.h       |  3 +++
 3 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp
index 3ed28f4a78..4df19c1f5c 100644
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -10,7 +10,6 @@
 #include "video_core/memory_manager.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_base.h"
-#include "video_core/textures/convert.h"
 #include "video_core/textures/decoders.h"
 
 namespace Tegra::Engines {
@@ -47,13 +46,12 @@ void KeplerMemory::ProcessExec() {
 
 void KeplerMemory::ProcessData(u32 data, bool is_last_call) {
     const u32 sub_copy_size = std::min(4U, state.copy_size - state.write_offset);
-    std::memcpy(&state.inner_buffer[state.write_offset], &data, sub_copy_size);
+    std::memcpy(&state.inner_buffer[state.write_offset], &regs.data, sub_copy_size);
     state.write_offset += sub_copy_size;
     if (is_last_call) {
-        UNIMPLEMENTED_IF_MSG(regs.exec.linear == 0, "Block Linear Copy is not implemented");
+        const GPUVAddr address{regs.dest.Address()};
+        const auto host_ptr = memory_manager.GetPointer(address);
         if (regs.exec.linear != 0) {
-            const GPUVAddr address{regs.dest.Address()};
-            const auto host_ptr = memory_manager.GetPointer(address);
             // We have to invalidate the destination region to evict any outdated surfaces from the
             // cache. We do this before actually writing the new data because the destination
             // address might contain a dirty surface that will have to be written back to memory.
@@ -61,6 +59,17 @@ void KeplerMemory::ProcessData(u32 data, bool is_last_call) {
             rasterizer.InvalidateRegion(ToCacheAddr(host_ptr), state.copy_size);
             std::memcpy(host_ptr, state.inner_buffer.data(), state.copy_size);
             system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
+        } else {
+            UNIMPLEMENTED_IF(regs.dest.z != 0);
+            UNIMPLEMENTED_IF(regs.dest.depth != 1);
+            UNIMPLEMENTED_IF(regs.dest.BlockWidth() != 1);
+            UNIMPLEMENTED_IF(regs.dest.BlockDepth() != 1);
+            const std::size_t dst_size = Tegra::Texture::CalculateSize(
+                true, 1, regs.dest.width, regs.dest.height, 1, regs.dest.BlockHeight(), 1);
+            rasterizer.InvalidateRegion(ToCacheAddr(host_ptr), dst_size);
+            Tegra::Texture::SwizzleKepler(regs.dest.width, regs.dest.height, regs.dest.x,
+                                          regs.dest.y, regs.dest.BlockHeight(), state.copy_size,
+                                          state.inner_buffer.data(), host_ptr);
         }
     }
 }
diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp
index 995d0e068a..6e02a64078 100644
--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -288,6 +288,27 @@ void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32
     }
 }
 
+void SwizzleKepler(u32 width, u32 height, u32 dst_x, u32 dst_y, u32 block_height,
+                   std::size_t copy_size, u8* source_data, u8* swizzle_data) {
+    const u32 image_width_in_gobs{(width + gob_size_x - 1) / gob_size_x};
+    std::size_t count = 0;
+    for (u32 y = dst_y; y < height && count < copy_size; ++y) {
+        const u32 gob_address_y =
+            (y / (gob_size_y * block_height)) * gob_size * block_height * image_width_in_gobs +
+            ((y % (gob_size_y * block_height)) / gob_size_y) * gob_size;
+        const auto& table = legacy_swizzle_table[y % gob_size_y];
+        for (u32 x = dst_x; x < width && count < copy_size; ++x) {
+            const u32 gob_address = gob_address_y + (x / gob_size_x) * gob_size * block_height;
+            const u32 swizzled_offset = gob_address + table[x % gob_size_x];
+            const u8* source_line = source_data + count;
+            u8* dest_addr = swizzle_data + swizzled_offset;
+            count++;
+
+            std::memcpy(dest_addr, source_line, 1);
+        }
+    }
+}
+
 std::vector<u8> DecodeTexture(const std::vector<u8>& texture_data, TextureFormat format, u32 width,
                               u32 height) {
     std::vector<u8> rgba_data;
diff --git a/src/video_core/textures/decoders.h b/src/video_core/textures/decoders.h
index e078fa2744..21d4b37fc5 100644
--- a/src/video_core/textures/decoders.h
+++ b/src/video_core/textures/decoders.h
@@ -51,4 +51,7 @@ void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32
                       u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, u32 block_height,
                       u32 offset_x, u32 offset_y);
 
+void SwizzleKepler(u32 width, u32 height, u32 dst_x, u32 dst_y, u32 block_height,
+                   std::size_t copy_size, u8* source_data, u8* swizzle_data);
+
 } // namespace Tegra::Texture

From 3e96c367bd1729d1a6c8bfd8b532301da85d4b5a Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow <fsahmkow27@gmail.com>
Date: Mon, 15 Apr 2019 22:42:34 -0400
Subject: [PATCH 3/4] Use WriteBlock and ReadBlock.

---
 src/video_core/engines/kepler_memory.cpp | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp
index 4df19c1f5c..7387886a3f 100644
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -50,15 +50,8 @@ void KeplerMemory::ProcessData(u32 data, bool is_last_call) {
     state.write_offset += sub_copy_size;
     if (is_last_call) {
         const GPUVAddr address{regs.dest.Address()};
-        const auto host_ptr = memory_manager.GetPointer(address);
         if (regs.exec.linear != 0) {
-            // We have to invalidate the destination region to evict any outdated surfaces from the
-            // cache. We do this before actually writing the new data because the destination
-            // address might contain a dirty surface that will have to be written back to memory.
-
-            rasterizer.InvalidateRegion(ToCacheAddr(host_ptr), state.copy_size);
-            std::memcpy(host_ptr, state.inner_buffer.data(), state.copy_size);
-            system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
+            memory_manager.WriteBlock(address, state.inner_buffer.data(), state.copy_size);
         } else {
             UNIMPLEMENTED_IF(regs.dest.z != 0);
             UNIMPLEMENTED_IF(regs.dest.depth != 1);
@@ -66,11 +59,14 @@ void KeplerMemory::ProcessData(u32 data, bool is_last_call) {
             UNIMPLEMENTED_IF(regs.dest.BlockDepth() != 1);
             const std::size_t dst_size = Tegra::Texture::CalculateSize(
                 true, 1, regs.dest.width, regs.dest.height, 1, regs.dest.BlockHeight(), 1);
-            rasterizer.InvalidateRegion(ToCacheAddr(host_ptr), dst_size);
+            std::vector<u8> tmp_buffer(dst_size);
+            memory_manager.ReadBlock(address, tmp_buffer.data(), dst_size);
             Tegra::Texture::SwizzleKepler(regs.dest.width, regs.dest.height, regs.dest.x,
                                           regs.dest.y, regs.dest.BlockHeight(), state.copy_size,
-                                          state.inner_buffer.data(), host_ptr);
+                                          state.inner_buffer.data(), tmp_buffer.data());
+            memory_manager.WriteBlock(address, tmp_buffer.data(), dst_size);
         }
+        system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
     }
 }
 

From da91e6e4b6aabe13f7b748930de8afa28d10aa6c Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow <fsahmkow27@gmail.com>
Date: Tue, 16 Apr 2019 12:00:46 -0400
Subject: [PATCH 4/4] Apply Const correctness to SwizzleKepler and replace u32
 with size_t on iterators.

---
 src/video_core/textures/decoders.cpp | 16 +++++++++-------
 src/video_core/textures/decoders.h   |  5 +++--
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp
index 6e02a64078..2178053865 100644
--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -288,18 +288,20 @@ void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32
     }
 }
 
-void SwizzleKepler(u32 width, u32 height, u32 dst_x, u32 dst_y, u32 block_height,
-                   std::size_t copy_size, u8* source_data, u8* swizzle_data) {
+void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32 dst_y,
+                   const u32 block_height, const std::size_t copy_size, const u8* source_data,
+                   u8* swizzle_data) {
     const u32 image_width_in_gobs{(width + gob_size_x - 1) / gob_size_x};
     std::size_t count = 0;
-    for (u32 y = dst_y; y < height && count < copy_size; ++y) {
-        const u32 gob_address_y =
+    for (std::size_t y = dst_y; y < height && count < copy_size; ++y) {
+        const std::size_t gob_address_y =
             (y / (gob_size_y * block_height)) * gob_size * block_height * image_width_in_gobs +
             ((y % (gob_size_y * block_height)) / gob_size_y) * gob_size;
         const auto& table = legacy_swizzle_table[y % gob_size_y];
-        for (u32 x = dst_x; x < width && count < copy_size; ++x) {
-            const u32 gob_address = gob_address_y + (x / gob_size_x) * gob_size * block_height;
-            const u32 swizzled_offset = gob_address + table[x % gob_size_x];
+        for (std::size_t x = dst_x; x < width && count < copy_size; ++x) {
+            const std::size_t gob_address =
+                gob_address_y + (x / gob_size_x) * gob_size * block_height;
+            const std::size_t swizzled_offset = gob_address + table[x % gob_size_x];
             const u8* source_line = source_data + count;
             u8* dest_addr = swizzle_data + swizzled_offset;
             count++;
diff --git a/src/video_core/textures/decoders.h b/src/video_core/textures/decoders.h
index 21d4b37fc5..e072d84017 100644
--- a/src/video_core/textures/decoders.h
+++ b/src/video_core/textures/decoders.h
@@ -51,7 +51,8 @@ void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32
                       u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, u32 block_height,
                       u32 offset_x, u32 offset_y);
 
-void SwizzleKepler(u32 width, u32 height, u32 dst_x, u32 dst_y, u32 block_height,
-                   std::size_t copy_size, u8* source_data, u8* swizzle_data);
+void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32 dst_y,
+                   const u32 block_height, const std::size_t copy_size, const u8* source_data,
+                   u8* swizzle_data);
 
 } // namespace Tegra::Texture