diff --git a/src/citra_qt/debugger/graphics_vertex_shader.cpp b/src/citra_qt/debugger/graphics_vertex_shader.cpp
index b1657620ee..359193226d 100644
--- a/src/citra_qt/debugger/graphics_vertex_shader.cpp
+++ b/src/citra_qt/debugger/graphics_vertex_shader.cpp
@@ -6,11 +6,16 @@
 #include <sstream>
 #include <QBoxLayout>
+#include <QFileDialog>
+#include <QGroupBox>
 #include <QLabel>
+#include <QLineEdit>
 #include <QPushButton>
+#include <QSignalMapper>
+#include <QSpinBox>
 #include <QTreeView>
-#include "video_core/shader/shader_interpreter.h"
+#include "video_core/shader/shader.h"
 #include "graphics_vertex_shader.h"
@@ -19,7 +24,7 @@ using nihstro::Instruction;
 using nihstro::SourceRegister;
 using nihstro::SwizzlePattern;
-GraphicsVertexShaderModel::GraphicsVertexShaderModel(QObject* parent): QAbstractItemModel(parent) {
+GraphicsVertexShaderModel::GraphicsVertexShaderModel(GraphicsVertexShaderWidget* parent): QAbstractItemModel(parent), par(parent) {
@@ -36,7 +41,7 @@ int GraphicsVertexShaderModel::columnCount(const QModelIndex& parent) const {
 int GraphicsVertexShaderModel::rowCount(const QModelIndex& parent) const {
-    return static_cast<int>(info.code.size());
+    return static_cast<int>(par->info.code.size());
 QVariant GraphicsVertexShaderModel::headerData(int section, Qt::Orientation orientation, int role) const {
@@ -64,21 +69,21 @@ QVariant GraphicsVertexShaderModel::data(const QModelIndex& index, int role) con
         switch (index.column()) {
         case 0:
-            if (info.HasLabel(index.row()))
-                return QString::fromStdString(info.GetLabel(index.row()));
+            if (par->info.HasLabel(index.row()))
+                return QString::fromStdString(par->info.GetLabel(index.row()));
             return QString("%1").arg(4*index.row(), 4, 16, QLatin1Char('0'));
         case 1:
-            return QString("%1").arg(info.code[index.row()].hex, 8, 16, QLatin1Char('0'));
+            return QString("%1").arg(par->info.code[index.row()].hex, 8, 16, QLatin1Char('0'));
         case 2:
             std::stringstream output;
-            Instruction instr = info.code[index.row()];
-            const SwizzlePattern& swizzle = info.swizzle_info[instr.common.operand_desc_id].pattern;
+            Instruction instr = par->info.code[index.row()];
+            const SwizzlePattern& swizzle = par->info.swizzle_info[instr.common.operand_desc_id].pattern;
             // longest known instruction name: "setemit "
             output << std::setw(8) << std::left << instr.opcode.Value().GetInfo().name;
@@ -242,6 +247,18 @@ QVariant GraphicsVertexShaderModel::data(const QModelIndex& index, int role) con
     case Qt::FontRole:
         return QFont("monospace");
+    case Qt::BackgroundRole:
+        // Highlight instructions which have no debug data associated to them
+        for (const auto& record : par->debug_data.records)
+            if (index.row() == record.instruction_offset)
+                return QVariant();
+        return QBrush(QColor(255, 255, 127));
+    // TODO: Draw arrows for each "reachable" instruction to visualize control flow
@@ -249,13 +266,153 @@ QVariant GraphicsVertexShaderModel::data(const QModelIndex& index, int role) con
     return QVariant();
-void GraphicsVertexShaderModel::OnUpdate()
-    beginResetModel();
+// Slot for the "Dump" button: writes the current vertex shader to a user-chosen file.
+void GraphicsVertexShaderWidget::DumpShader() {
+    // Let the user pick the destination of the shader binary
+    const QString target_path = QFileDialog::getSaveFileName(
+        this, tr("Save Shader Dump"), "shader_dump.shbin", tr("Shader Binary (*.shbin)"));
+    // An empty path means the dialog was canceled => nothing to dump
+    if (target_path.isEmpty())
+        return;
+    Pica::DebugUtils::DumpShader(target_path.toStdString(), Pica::g_state.regs.vs, Pica::g_state.vs,
+                                 Pica::g_state.regs.vs_output_attributes);
+// Builds the dock's UI: the per-attribute input fields, the disassembly view, the "Dump" button,
+// the iteration index selector and the label describing the selected iteration.
+GraphicsVertexShaderWidget::GraphicsVertexShaderWidget(std::shared_ptr< Pica::DebugContext > debug_context,
+                                                       QWidget* parent)
+        : BreakPointObserverDock(debug_context, "Pica Vertex Shader", parent) {
+    setObjectName("PicaVertexShader");
+    // Routes the textEdited() signal of every input field to OnInputAttributeChanged(int)
+    auto input_data_mapper = new QSignalMapper(this);
+    // TODO: Support inputting data in hexadecimal raw format
+    for (unsigned i = 0; i < ARRAY_SIZE(input_data); ++i) {
+        input_data[i] = new QLineEdit;
+        input_data[i]->setValidator(new QDoubleValidator(input_data[i]));
+    }
+    breakpoint_warning = new QLabel(tr("(data only available at VertexLoaded breakpoints)"));
+    // TODO: Add some button for jumping to the shader entry point
+    model = new GraphicsVertexShaderModel(this);
+    binary_list = new QTreeView;
+    binary_list->setModel(model);
+    binary_list->setRootIsDecorated(false);
+    binary_list->setAlternatingRowColors(true);
+    auto dump_shader = new QPushButton(QIcon::fromTheme("document-save"), tr("Dump"));
+    instruction_description = new QLabel;
+    iteration_index = new QSpinBox;
+    // Forward SelectCommand to the selection model of the disassembly view
+    connect(this, SIGNAL(SelectCommand(const QModelIndex&, QItemSelectionModel::SelectionFlags)),
+            binary_list->selectionModel(), SLOT(select(const QModelIndex&, QItemSelectionModel::SelectionFlags)));
+    connect(dump_shader, SIGNAL(clicked()), this, SLOT(DumpShader()));
+    connect(iteration_index, SIGNAL(valueChanged(int)), this, SLOT(OnIterationIndexChanged(int)));
+    // Map each input field to its position in input_data (4 * attribute + component)
+    for (unsigned i = 0; i < ARRAY_SIZE(input_data); ++i) {
+        connect(input_data[i], SIGNAL(textEdited(const QString&)), input_data_mapper, SLOT(map()));
+        input_data_mapper->setMapping(input_data[i], i);
+    }
+    connect(input_data_mapper, SIGNAL(mapped(int)), this, SLOT(OnInputAttributeChanged(int)));
+    auto main_widget = new QWidget;
+    auto main_layout = new QVBoxLayout;
+    {
+        auto input_data_group = new QGroupBox(tr("Input Data"));
+        // For each vertex attribute, add a QHBoxLayout consisting of:
+        // - A QLabel denoting the source attribute index
+        // - Four QLineEdits for showing and manipulating attribute data
+        // - A QLabel denoting the shader input attribute index
+        auto sub_layout = new QVBoxLayout;
+        for (unsigned i = 0; i < 16; ++i) {
+            // Create an HBoxLayout to store the widgets used to specify a particular attribute
+            // and store it in a QWidget to allow for easy hiding and unhiding.
+            auto row_layout = new QHBoxLayout;
+            row_layout->addWidget(new QLabel(tr("Attribute %1").arg(i, 2)));
+            for (unsigned comp = 0; comp < 4; ++comp)
+                row_layout->addWidget(input_data[4 * i + comp]);
+            row_layout->addWidget(input_data_mapping[i] = new QLabel);
+            input_data_container[i] = new QWidget;
+            input_data_container[i]->setLayout(row_layout);
+            // Rows start hidden; Reload() shows the ones whose attribute is used
+            input_data_container[i]->hide();
+            sub_layout->addWidget(input_data_container[i]);
+        }
+        sub_layout->addWidget(breakpoint_warning);
+        breakpoint_warning->hide();
+        input_data_group->setLayout(sub_layout);
+        main_layout->addWidget(input_data_group);
+    }
+    {
+        auto sub_layout = new QHBoxLayout;
+        sub_layout->addWidget(binary_list);
+        main_layout->addLayout(sub_layout);
+    }
+    main_layout->addWidget(dump_shader);
+    {
+        auto sub_layout = new QHBoxLayout;
+        sub_layout->addWidget(new QLabel(tr("Iteration Index:")));
+        sub_layout->addWidget(iteration_index);
+        main_layout->addLayout(sub_layout);
+    }
+    main_layout->addWidget(instruction_description);
+    main_widget->setLayout(main_layout);
+    setWidget(main_widget);
+    // Start out disabled; OnBreakPointHit() enables the widget
+    widget()->setEnabled(false);
+// Refreshes the widget when a breakpoint is hit. The event payload is forwarded to Reload() as
+// vertex data only for VertexLoaded events; any other event invalidates the stored vertex data.
+void GraphicsVertexShaderWidget::OnBreakPointHit(Pica::DebugContext::Event event, void* data) {
+    if (event == Pica::DebugContext::Event::VertexLoaded) {
+        Reload(true, data);
+    } else {
+        // No vertex data is retrievable => invalidate currently stored vertex data
+        Reload(true, nullptr);
+    }
+    widget()->setEnabled(true);
+// Rebuilds the input fields, the disassembly, the debug records and the visible attribute rows
+// from the current PICA state; everything happens inside a model reset.
+void GraphicsVertexShaderWidget::Reload(bool replace_vertex_data, void* vertex_data) {
+    model->beginResetModel();
+    if (replace_vertex_data) {
+        if (vertex_data) {
+            // Keep a private copy of the vertex so it can be edited and re-run later
+            memcpy(&input_vertex, vertex_data, sizeof(input_vertex));
+            for (unsigned attr = 0; attr < 16; ++attr) {
+                for (unsigned comp = 0; comp < 4; ++comp) {
+                    input_data[4 * attr + comp]->setText(QString("%1").arg(input_vertex.attr[attr][comp].ToFloat32()));
+                }
+            }
+            breakpoint_warning->hide();
+        } else {
+            for (unsigned attr = 0; attr < 16; ++attr) {
+                for (unsigned comp = 0; comp < 4; ++comp) {
+                    input_data[4 * attr + comp]->setText(QString("???"));
+                }
+            }
+            breakpoint_warning->show();
+        }
+    }
+    // Reload shader code
     auto& shader_setup = Pica::g_state.vs;
+    auto& shader_config = Pica::g_state.regs.vs;
     for (auto instr : shader_setup.program_code)
@@ -265,49 +422,75 @@ void GraphicsVertexShaderModel::OnUpdate()
     u32 entry_point = Pica::g_state.regs.vs.main_offset;
     info.labels.insert({ entry_point, "main" });
-    endResetModel();
+    // Generate debug information. Pass the real attribute count (the same one that decides which
+    // input rows are shown below) instead of a fixed 1, so every used attribute reaches the shader.
+    const int num_attributes = Pica::g_state.regs.vertex_attributes.GetNumTotalAttributes();
+    debug_data = Pica::Shader::ProduceDebugInfo(input_vertex, num_attributes, shader_config, shader_setup);
-void GraphicsVertexShaderModel::DumpShader() {
-    auto& setup  = Pica::g_state.vs;
-    auto& config = Pica::g_state.regs.vs;
+    // Reload widget state
-    Pica::DebugUtils::DumpShader(config, setup, Pica::g_state.regs.vs_output_attributes);
-GraphicsVertexShaderWidget::GraphicsVertexShaderWidget(std::shared_ptr< Pica::DebugContext > debug_context,
-                                                       QWidget* parent)
-        : BreakPointObserverDock(debug_context, "Pica Vertex Shader", parent) {
-    setObjectName("PicaVertexShader");
-    auto binary_model = new GraphicsVertexShaderModel(this);
-    auto binary_list = new QTreeView;
-    binary_list->setModel(binary_model);
-    binary_list->setRootIsDecorated(false);
-    binary_list->setAlternatingRowColors(true);
-    auto dump_shader = new QPushButton(tr("Dump"));
-    connect(dump_shader, SIGNAL(clicked()), binary_model, SLOT(DumpShader()));
-    connect(this, SIGNAL(Update()), binary_model, SLOT(OnUpdate()));
-    auto main_widget = new QWidget;
-    auto main_layout = new QVBoxLayout;
-    {
-        auto sub_layout = new QHBoxLayout;
-        sub_layout->addWidget(binary_list);
-        main_layout->addLayout(sub_layout);
+    // Only show input attributes which are used as input to the shader
+    for (unsigned int attr = 0; attr < 16; ++attr) {
+        input_data_container[attr]->setVisible(false);
+    }
+    for (unsigned int attr = 0; attr < Pica::g_state.regs.vertex_attributes.GetNumTotalAttributes(); ++attr) {
+        unsigned source_attr = shader_config.input_register_map.GetRegisterForAttribute(attr);
+        input_data_mapping[source_attr]->setText(QString("-> v%1").arg(attr));
+        input_data_container[source_attr]->setVisible(true);
-    main_layout->addWidget(dump_shader);
-    main_widget->setLayout(main_layout);
-    setWidget(main_widget);
-void GraphicsVertexShaderWidget::OnBreakPointHit(Pica::DebugContext::Event event, void* data) {
-    emit Update();
-    widget()->setEnabled(true);
+    // Initialize debug info text for current iteration count
+    iteration_index->setMaximum(debug_data.records.size() - 1);
+    OnIterationIndexChanged(iteration_index->value());
+    model->endResetModel();
 void GraphicsVertexShaderWidget::OnResumed() {
+// Slot invoked through the signal mapper when the user edits an input field.
+// `index` is the field's position in input_data, i.e. 4 * attribute + component.
+void GraphicsVertexShaderWidget::OnInputAttributeChanged(int index) {
+    float value = input_data[index]->text().toFloat();
+    // Store the edited value in the vertex that is fed to the shader; without this the re-run
+    // below would still use the old data.
+    // NOTE(review): assumes float24 lives in namespace Pica and offers FromFloat32() as the
+    // counterpart of the ToFloat32() used elsewhere in this file - confirm against its header.
+    input_vertex.attr[index / 4][index % 4] = Pica::float24::FromFloat32(value);
+    // Re-execute the shader with the updated input
+    Reload();
+// Shows the debug record of shader iteration `index` in the description label and selects the
+// instruction that record belongs to in the disassembly view.
+void GraphicsVertexShaderWidget::OnIterationIndexChanged(int index) {
+    // Reload() derives the spin box maximum from records.size() - 1, which is not a valid index
+    // when no records exist, so verify the index before touching the record list.
+    if (index < 0 || index >= static_cast<int>(debug_data.records.size())) {
+        instruction_description->setText(QString());
+        return;
+    }
+    QString text;
+    auto& record = debug_data.records[index];
+    // Only fields flagged in the record's mask were written for this iteration
+    if (record.mask & Pica::Shader::DebugDataRecord::SRC1)
+        text += tr("SRC1: %1, %2, %3, %4\n").arg(record.src1.x.ToFloat32()).arg(record.src1.y.ToFloat32()).arg(record.src1.z.ToFloat32()).arg(record.src1.w.ToFloat32());
+    if (record.mask & Pica::Shader::DebugDataRecord::SRC2)
+        text += tr("SRC2: %1, %2, %3, %4\n").arg(record.src2.x.ToFloat32()).arg(record.src2.y.ToFloat32()).arg(record.src2.z.ToFloat32()).arg(record.src2.w.ToFloat32());
+    if (record.mask & Pica::Shader::DebugDataRecord::SRC3)
+        text += tr("SRC3: %1, %2, %3, %4\n").arg(record.src3.x.ToFloat32()).arg(record.src3.y.ToFloat32()).arg(record.src3.z.ToFloat32()).arg(record.src3.w.ToFloat32());
+    if (record.mask & Pica::Shader::DebugDataRecord::DEST_IN)
+        text += tr("DEST_IN: %1, %2, %3, %4\n").arg(record.dest_in.x.ToFloat32()).arg(record.dest_in.y.ToFloat32()).arg(record.dest_in.z.ToFloat32()).arg(record.dest_in.w.ToFloat32());
+    if (record.mask & Pica::Shader::DebugDataRecord::DEST_OUT)
+        text += tr("DEST_OUT: %1, %2, %3, %4\n").arg(record.dest_out.x.ToFloat32()).arg(record.dest_out.y.ToFloat32()).arg(record.dest_out.z.ToFloat32()).arg(record.dest_out.w.ToFloat32());
+    if (record.mask & Pica::Shader::DebugDataRecord::ADDR_REG_OUT)
+        text += tr("Address Registers: %1, %2\n").arg(record.address_registers[0]).arg(record.address_registers[1]);
+    if (record.mask & Pica::Shader::DebugDataRecord::CMP_RESULT)
+        text += tr("Compare Result: %1, %2\n").arg(record.conditional_code[0] ? "true" : "false").arg(record.conditional_code[1] ? "true" : "false");
+    if (record.mask & Pica::Shader::DebugDataRecord::COND_BOOL_IN)
+        text += tr("Static Condition: %1\n").arg(record.cond_bool ? "true" : "false");
+    if (record.mask & Pica::Shader::DebugDataRecord::COND_CMP_IN)
+        text += tr("Dynamic Conditions: %1, %2\n").arg(record.cond_cmp[0] ? "true" : "false").arg(record.cond_cmp[1] ? "true" : "false");
+    if (record.mask & Pica::Shader::DebugDataRecord::LOOP_INT_IN)
+        text += tr("Loop Parameters: %1 (repeats), %2 (initializer), %3 (increment), %4\n").arg(record.loop_int.x).arg(record.loop_int.y).arg(record.loop_int.z).arg(record.loop_int.w);
+    // Offsets are stored in words; multiply by 4 to display byte offsets
+    text += tr("Instruction offset: 0x%1").arg(4 * record.instruction_offset, 4, 16, QLatin1Char('0'));
+    if (record.mask & Pica::Shader::DebugDataRecord::NEXT_INSTR) {
+        text += tr(" -> 0x%1").arg(4 * record.next_instruction, 4, 16, QLatin1Char('0'));
+    } else {
+        text += tr(" (last instruction)");
+    }
+    instruction_description->setText(text);
+    // Scroll to current instruction
+    const QModelIndex& instr_index = model->index(record.instruction_offset, 0);
+    emit SelectCommand(instr_index, QItemSelectionModel::ClearAndSelect | QItemSelectionModel::Rows);
+    binary_list->scrollTo(instr_index, QAbstractItemView::EnsureVisible);
diff --git a/src/citra_qt/debugger/graphics_vertex_shader.h b/src/citra_qt/debugger/graphics_vertex_shader.h
index 5dc9e37033..1b46aa0d9a 100644
--- a/src/citra_qt/debugger/graphics_vertex_shader.h
+++ b/src/citra_qt/debugger/graphics_vertex_shader.h
@@ -10,11 +10,18 @@
 #include "nihstro/parser_shbin.h"
+#include "video_core/shader/shader.h"
+class QLabel;
+class QSpinBox;
+class GraphicsVertexShaderWidget;
 class GraphicsVertexShaderModel : public QAbstractItemModel {
-    GraphicsVertexShaderModel(QObject* parent);
+    GraphicsVertexShaderModel(GraphicsVertexShaderWidget* parent);
     QModelIndex index(int row, int column, const QModelIndex& parent = QModelIndex()) const override;
     QModelIndex parent(const QModelIndex& child) const override;
@@ -23,13 +30,10 @@ public:
     QVariant data(const QModelIndex& index, int role = Qt::DisplayRole) const override;
     QVariant headerData(int section, Qt::Orientation orientation, int role = Qt::DisplayRole) const override;
-public slots:
-    void OnUpdate();
-    void DumpShader();
-    nihstro::ShaderInfo info;
+    GraphicsVertexShaderWidget* par;
+    friend class GraphicsVertexShaderWidget;
 class GraphicsVertexShaderWidget : public BreakPointObserverDock {
@@ -45,9 +49,41 @@ private slots:
     void OnBreakPointHit(Pica::DebugContext::Event event, void* data) override;
     void OnResumed() override;
+    void OnInputAttributeChanged(int index);
+    void OnIterationIndexChanged(int index);
+    void DumpShader();
+    /** Reload widget based on the current PICA200 state
+      * @param replace_vertex_data If true, invalidate all current vertex data
+      * @param vertex_data New vertex data to use, as passed to OnBreakPointHit. May be nullptr to specify that no valid vertex data can be retrieved currently. Only used if replace_vertex_data is true.
+      */
+    void Reload(bool replace_vertex_data = false, void* vertex_data = nullptr);
-    void Update();
+    // Call this to change the current command selection in the disassembly view
+    void SelectCommand(const QModelIndex&, QItemSelectionModel::SelectionFlags);
+    QLabel* instruction_description;
+    QTreeView* binary_list;
+    GraphicsVertexShaderModel* model;
+    // TODO: Move these into a single struct
+    std::array<QLineEdit*, 4*16> input_data;  // A text box for each of the 4 components of up to 16 vertex attributes
+    std::array<QWidget*, 16> input_data_container; // QWidget containing the QLayout containing each vertex attribute
+    std::array<QLabel*, 16> input_data_mapping; // A QLabel denoting the shader input attribute which the vertex attribute maps to
+    // Text to be shown when input vertex data is not retrievable
+    QLabel* breakpoint_warning;
+    QSpinBox* iteration_index;
+    nihstro::ShaderInfo info;
+    Pica::Shader::DebugData<true> debug_data;
+    Pica::Shader::InputVertex input_vertex;
+    friend class GraphicsVertexShaderModel;
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 374c4748d5..8c741f31f6 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -215,7 +215,7 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
             unsigned int vertex_cache_pos = 0;
-            Shader::UnitState shader_unit;
+            Shader::UnitState<false> shader_unit;
             for (unsigned int index = 0; index < regs.num_vertices; ++index)
diff --git a/src/video_core/debug_utils/debug_utils.cpp b/src/video_core/debug_utils/debug_utils.cpp
index a79d90ef14..ac071790a4 100644
--- a/src/video_core/debug_utils/debug_utils.cpp
+++ b/src/video_core/debug_utils/debug_utils.cpp
@@ -111,7 +111,7 @@ void GeometryDumper::Dump() {
-void DumpShader(const Regs::ShaderConfig& config, const State::ShaderSetup& setup, const Regs::VSOutputAttributes* output_attributes)
+void DumpShader(const std::string& filename, const Regs::ShaderConfig& config, const State::ShaderSetup& setup, const Regs::VSOutputAttributes* output_attributes)
     struct StuffToWrite {
         u8* pointer;
@@ -294,7 +294,6 @@ void DumpShader(const Regs::ShaderConfig& config, const State::ShaderSetup& setu
     // Write data to file
+    // The destination path is supplied by the caller, so the running dump counter is no longer needed.
-    static int dump_index = 0;
-    std::string filename = std::string("shader_dump") + std::to_string(++dump_index) + std::string(".shbin");
     std::ofstream file(filename, std::ios_base::out | std::ios_base::binary);
     for (auto& chunk : writing_queue) {
diff --git a/src/video_core/debug_utils/debug_utils.h b/src/video_core/debug_utils/debug_utils.h
index 1450e5bf3a..0b30d7ffa9 100644
--- a/src/video_core/debug_utils/debug_utils.h
+++ b/src/video_core/debug_utils/debug_utils.h
@@ -181,7 +181,8 @@ private:
     std::vector<Face> faces;
-void DumpShader(const Regs::ShaderConfig& config, const State::ShaderSetup& setup, const Regs::VSOutputAttributes* output_attributes);
+void DumpShader(const std::string& filename, const Regs::ShaderConfig& config,
+                const State::ShaderSetup& setup, const Regs::VSOutputAttributes* output_attributes);
 // Utility class to log Pica commands.
diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp
index 2692b91e4a..4e9836c804 100644
--- a/src/video_core/shader/shader.cpp
+++ b/src/video_core/shader/shader.cpp
@@ -5,6 +5,8 @@
 #include <memory>
 #include <unordered_map>
+#include <boost/range/algorithm/fill.hpp>
 #include "common/hash.h"
 #include "common/make_unique.h"
 #include "common/profiler.h"
@@ -30,7 +32,7 @@ static JitCompiler jit;
 static CompiledShader* jit_shader;
 #endif // ARCHITECTURE_x86_64
-void Setup(UnitState& state) {
+void Setup(UnitState<false>& state) {
 #ifdef ARCHITECTURE_x86_64
     if (VideoCore::g_shader_jit_enabled) {
         u64 cache_key = (Common::ComputeHash64(&g_state.vs.program_code, sizeof(g_state.vs.program_code)) ^
@@ -54,9 +56,8 @@ void Shutdown() {
 static Common::Profiling::TimingCategory shader_category("Vertex Shader");
-OutputVertex Run(UnitState& state, const InputVertex& input, int num_attributes) {
+OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attributes) {
     auto& config = g_state.regs.vs;
-    auto& setup = g_state.vs;
     Common::Profiling::ScopeTimer timer(shader_category);
@@ -67,6 +68,8 @@ OutputVertex Run(UnitState& state, const InputVertex& input, int num_attributes)
     // Setup input register table
     const auto& attribute_register_map = config.input_register_map;
+    // TODO: Instead of this cumbersome logic, just load the input data directly in a loop, like
+    // for (int attr = 0; attr < num_attributes; ++attr) { state.registers.input[attribute_register_map.GetRegisterForAttribute(attr)] = input.attr[attr]; }
     if (num_attributes > 0) state.registers.input[attribute_register_map.attribute0_register] = input.attr[0];
     if (num_attributes > 1) state.registers.input[attribute_register_map.attribute1_register] = input.attr[1];
     if (num_attributes > 2) state.registers.input[attribute_register_map.attribute2_register] = input.attr[2];
@@ -126,14 +129,52 @@ OutputVertex Run(UnitState& state, const InputVertex& input, int num_attributes)
             std::fmin(std::fabs(ret.color[i].ToFloat32()), 1.0f));
-    LOG_TRACE(Render_Software, "Output vertex: pos (%.2f, %.2f, %.2f, %.2f), col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f)",
+    LOG_TRACE(Render_Software, "Output vertex: pos (%.2f, %.2f, %.2f, %.2f), quat (%.2f, %.2f, %.2f, %.2f), col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f)",
         ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), ret.pos.w.ToFloat32(),
+        ret.quat.x.ToFloat32(), ret.quat.y.ToFloat32(), ret.quat.z.ToFloat32(), ret.quat.w.ToFloat32(),
         ret.color.x.ToFloat32(), ret.color.y.ToFloat32(), ret.color.z.ToFloat32(), ret.color.w.ToFloat32(),
         ret.tc0.u().ToFloat32(), ret.tc0.v().ToFloat32());
     return ret;
+// Runs the interpreter once on `input` with full debugging enabled and returns the collected
+// debug data. Only the first `num_attributes` attributes are bound to input registers; all
+// other input registers point at a dummy value.
+// NOTE(review): `setup` is not consulted here - the visible part of RunInterpreter reads the
+// program from g_state.vs - so the unused `shader_memory` alias was dropped.
+DebugData<true> ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const State::ShaderSetup& setup) {
+    UnitState<true> state;
+    state.program_counter = config.main_offset;
+    state.debug.max_offset = 0;
+    state.debug.max_opdesc_id = 0;
+    // Setup input register table
+    const auto& attribute_register_map = config.input_register_map;
+    // Fallback target for every input register that no attribute is mapped to
+    float24 dummy_register;
+    boost::fill(state.registers.input, &dummy_register);
+    if (num_attributes > 0) state.registers.input[attribute_register_map.attribute0_register] = &input.attr[0].x;
+    if (num_attributes > 1) state.registers.input[attribute_register_map.attribute1_register] = &input.attr[1].x;
+    if (num_attributes > 2) state.registers.input[attribute_register_map.attribute2_register] = &input.attr[2].x;
+    if (num_attributes > 3) state.registers.input[attribute_register_map.attribute3_register] = &input.attr[3].x;
+    if (num_attributes > 4) state.registers.input[attribute_register_map.attribute4_register] = &input.attr[4].x;
+    if (num_attributes > 5) state.registers.input[attribute_register_map.attribute5_register] = &input.attr[5].x;
+    if (num_attributes > 6) state.registers.input[attribute_register_map.attribute6_register] = &input.attr[6].x;
+    if (num_attributes > 7) state.registers.input[attribute_register_map.attribute7_register] = &input.attr[7].x;
+    if (num_attributes > 8) state.registers.input[attribute_register_map.attribute8_register] = &input.attr[8].x;
+    if (num_attributes > 9) state.registers.input[attribute_register_map.attribute9_register] = &input.attr[9].x;
+    if (num_attributes > 10) state.registers.input[attribute_register_map.attribute10_register] = &input.attr[10].x;
+    if (num_attributes > 11) state.registers.input[attribute_register_map.attribute11_register] = &input.attr[11].x;
+    if (num_attributes > 12) state.registers.input[attribute_register_map.attribute12_register] = &input.attr[12].x;
+    if (num_attributes > 13) state.registers.input[attribute_register_map.attribute13_register] = &input.attr[13].x;
+    if (num_attributes > 14) state.registers.input[attribute_register_map.attribute14_register] = &input.attr[14].x;
+    if (num_attributes > 15) state.registers.input[attribute_register_map.attribute15_register] = &input.attr[15].x;
+    state.conditional_code[0] = false;
+    state.conditional_code[1] = false;
+    RunInterpreter(state);
+    return state.debug;
 } // namespace Shader
 } // namespace Pica
diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h
index 2007a28445..58d21f7cd5 100644
--- a/src/video_core/shader/shader.h
+++ b/src/video_core/shader/shader.h
@@ -4,7 +4,10 @@
 #pragma once
+#include <vector>
 #include <boost/container/static_vector.hpp>
 #include <nihstro/shader_binary.h>
 #include "common/common_funcs.h"
@@ -72,12 +75,185 @@ struct OutputVertex {
 static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD");
 static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size");
+// Helper structure used to keep track of data useful for inspection of shader emulation
+template<bool full_debugging>
+struct DebugData;
+struct DebugData<false> {
+    // TODO: Hide these behind an interface and move them to DebugData<true>
+    u32 max_offset; // maximum program counter ever reached
+    u32 max_opdesc_id; // maximum swizzle pattern index ever used
+// Full-debugging variant: in addition to the two maxima it keeps a list of per-instruction records.
+struct DebugData<true> {
+    // Records store the input and output operands of a particular instruction.
+    struct Record {
+        // Bit flags naming the fields of a Record; OR-ed together in `mask`
+        enum Type {
+            // Floating point arithmetic operands
+            SRC1         = 0x1,
+            SRC2         = 0x2,
+            SRC3         = 0x4,
+            // Initial and final output operand value
+            DEST_IN      = 0x8,
+            DEST_OUT     = 0x10,
+            // Current and next instruction offset (in words)
+            CUR_INSTR    = 0x20,
+            NEXT_INSTR   = 0x40,
+            // Output address register value
+            ADDR_REG_OUT = 0x80,
+            // Result of a comparison instruction
+            CMP_RESULT   = 0x100,
+            // Input values for conditional flow control instructions
+            COND_BOOL_IN = 0x200,
+            COND_CMP_IN  = 0x400,
+            // Input values for a loop
+            LOOP_INT_IN  = 0x800,
+        };
+        Math::Vec4<float24> src1; // written for SRC1
+        Math::Vec4<float24> src2; // written for SRC2
+        Math::Vec4<float24> src3; // written for SRC3
+        Math::Vec4<float24> dest_in; // written for DEST_IN
+        Math::Vec4<float24> dest_out; // written for DEST_OUT
+        s32 address_registers[2]; // written for ADDR_REG_OUT
+        bool conditional_code[2]; // written for CMP_RESULT
+        bool cond_bool; // written for COND_BOOL_IN
+        bool cond_cmp[2]; // written for COND_CMP_IN
+        Math::Vec4<u8> loop_int; // written for LOOP_INT_IN
+        u32 instruction_offset; // written for CUR_INSTR
+        u32 next_instruction; // written for NEXT_INSTR
+        // set of enabled fields (as a combination of Type flags)
+        unsigned mask = 0;
+    };
+    u32 max_offset; // maximum program counter ever reached
+    u32 max_opdesc_id; // maximum swizzle pattern index ever used
+    // List of records for each executed shader instruction
+    std::vector<DebugData<true>::Record> records;
+// Type alias for better readability
+using DebugDataRecord = DebugData<true>::Record;
+// Helper function to set a DebugData<true>::Record field based on the template enum parameter.
+// Each specialization below copies `value` into the one field that matches its Type flag; the
+// mask bit itself is not touched here.
+template<DebugDataRecord::Type type, typename ValueType>
+inline void SetField(DebugDataRecord& record, ValueType value);
+// Vector-valued fields: `value` must point at four consecutive float24 components
+inline void SetField<DebugDataRecord::SRC1>(DebugDataRecord& record, float24* value) {
+    record.src1.x = value[0];
+    record.src1.y = value[1];
+    record.src1.z = value[2];
+    record.src1.w = value[3];
+inline void SetField<DebugDataRecord::SRC2>(DebugDataRecord& record, float24* value) {
+    record.src2.x = value[0];
+    record.src2.y = value[1];
+    record.src2.z = value[2];
+    record.src2.w = value[3];
+inline void SetField<DebugDataRecord::SRC3>(DebugDataRecord& record, float24* value) {
+    record.src3.x = value[0];
+    record.src3.y = value[1];
+    record.src3.z = value[2];
+    record.src3.w = value[3];
+inline void SetField<DebugDataRecord::DEST_IN>(DebugDataRecord& record, float24* value) {
+    record.dest_in.x = value[0];
+    record.dest_in.y = value[1];
+    record.dest_in.z = value[2];
+    record.dest_in.w = value[3];
+inline void SetField<DebugDataRecord::DEST_OUT>(DebugDataRecord& record, float24* value) {
+    record.dest_out.x = value[0];
+    record.dest_out.y = value[1];
+    record.dest_out.z = value[2];
+    record.dest_out.w = value[3];
+// Pair-valued fields: `value` must point at two consecutive elements
+inline void SetField<DebugDataRecord::ADDR_REG_OUT>(DebugDataRecord& record, s32* value) {
+    record.address_registers[0] = value[0];
+    record.address_registers[1] = value[1];
+inline void SetField<DebugDataRecord::CMP_RESULT>(DebugDataRecord& record, bool* value) {
+    record.conditional_code[0] = value[0];
+    record.conditional_code[1] = value[1];
+inline void SetField<DebugDataRecord::COND_BOOL_IN>(DebugDataRecord& record, bool value) {
+    record.cond_bool = value;
+inline void SetField<DebugDataRecord::COND_CMP_IN>(DebugDataRecord& record, bool* value) {
+    record.cond_cmp[0] = value[0];
+    record.cond_cmp[1] = value[1];
+inline void SetField<DebugDataRecord::LOOP_INT_IN>(DebugDataRecord& record, Math::Vec4<u8> value) {
+    record.loop_int = value;
+inline void SetField<DebugDataRecord::CUR_INSTR>(DebugDataRecord& record, u32 value) {
+    record.instruction_offset = value;
+inline void SetField<DebugDataRecord::NEXT_INSTR>(DebugDataRecord& record, u32 value) {
+    record.next_instruction = value;
+// Stores one piece of debug information in the record at index `offset`.
+// With debugging disabled (DebugData<false>) this is a no-op.
+template<DebugDataRecord::Type type, typename ValueType>
+inline void Record(DebugData<false>& debug_data, u32 offset, ValueType value) {
+    // Debugging disabled => nothing to do
+// With debugging enabled the record list grows on demand, the matching field is written and
+// its flag is added to the record's mask.
+template<DebugDataRecord::Type type, typename ValueType>
+inline void Record(DebugData<true>& debug_data, u32 offset, ValueType value) {
+    auto& records = debug_data.records;
+    if (records.size() <= offset)
+        records.resize(offset + 1);
+    DebugDataRecord& entry = records[offset];
+    SetField<type, ValueType>(entry, value);
+    entry.mask |= type;
  * This structure contains the state information that needs to be unique for a shader unit. The 3DS
  * has four shader units that process shaders in parallel. At the present, Citra only implements a
  * single shader unit that processes all shaders serially. Putting the state information in a struct
  * here will make it easier for us to parallelize the shader processing later.
+template<bool Debug>
 struct UnitState {
     struct Registers {
         // The registers are accessed by the shader JIT using SSE instructions, and are therefore
@@ -111,10 +287,7 @@ struct UnitState {
     // TODO: Is there a maximal size for this?
     boost::container::static_vector<CallStackElement, 16> call_stack;
-    struct {
-        u32 max_offset; // maximum program counter ever reached
-        u32 max_opdesc_id; // maximum swizzle pattern index ever used
-    } debug;
+    DebugData<Debug> debug;
     static int InputOffset(const SourceRegister& reg) {
         switch (reg.GetRegisterType()) {
@@ -150,7 +323,7 @@ struct UnitState {
  * vertex, which would happen within the `Run` function).
  * @param state Shader unit state, must be setup per shader and per shader unit
-void Setup(UnitState& state);
+void Setup(UnitState<false>& state);
 /// Performs any cleanup when the emulator is shutdown
 void Shutdown();
@@ -162,7 +335,17 @@ void Shutdown();
  * @param num_attributes The number of vertex shader attributes
  * @return The output vertex, after having been processed by the vertex shader
-OutputVertex Run(UnitState& state, const InputVertex& input, int num_attributes);
+OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attributes);
+ * Produces debug information for the given shader and input vertex.
+ * @param input Input vertex into the shader
+ * @param num_attributes The number of vertex shader attributes
+ * @param config Configuration object for the shader pipeline
+ * @param setup Setup object for the shader pipeline
+ * @return Debug information for this shader with regards to the given vertex
+ */
+DebugData<true> ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const State::ShaderSetup& setup);
 } // namespace Shader
diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp
index c8489f920e..e14de07685 100644
--- a/src/video_core/shader/shader_interpreter.cpp
+++ b/src/video_core/shader/shader_interpreter.cpp
@@ -21,7 +21,8 @@ namespace Pica {
 namespace Shader {
-void RunInterpreter(UnitState& state) {
+template<bool Debug>
+void RunInterpreter(UnitState<Debug>& state) {
     const auto& uniforms = g_state.vs.uniforms;
     const auto& swizzle_data = g_state.vs.swizzle_data;
     const auto& program_code = g_state.vs.program_code;
@@ -29,7 +30,9 @@ void RunInterpreter(UnitState& state) {
     // Placeholder for invalid inputs
     static float24 dummy_vec4_float24[4];
-    while (true) {
+    unsigned iteration = 0;
+    bool exit_loop = false;
+    while (!exit_loop) {
         if (!state.call_stack.empty()) {
             auto& top = state.call_stack.back();
             if (state.program_counter == top.final_address) {
@@ -47,16 +50,19 @@ void RunInterpreter(UnitState& state) {
-        bool exit_loop = false;
         const Instruction instr = { program_code[state.program_counter] };
         const SwizzlePattern swizzle = { swizzle_data[instr.common.operand_desc_id] };
-        static auto call = [](UnitState& state, u32 offset, u32 num_instructions,
+        static auto call = [](UnitState<Debug>& state, u32 offset, u32 num_instructions,
                               u32 return_offset, u8 repeat_count, u8 loop_increment) {
             state.program_counter = offset - 1; // -1 to make sure when incrementing the PC we end up at the correct offset
             ASSERT(state.call_stack.size() < state.call_stack.capacity());
             state.call_stack.push_back({ offset + num_instructions, return_offset, repeat_count, loop_increment, offset });
+        Record<DebugDataRecord::CUR_INSTR>(state.debug, iteration, state.program_counter);
+        if (iteration > 0)
+            Record<DebugDataRecord::NEXT_INSTR>(state.debug, iteration - 1, state.program_counter);
         state.debug.max_offset = std::max<u32>(state.debug.max_offset, 1 + state.program_counter);
         auto LookupSourceRegister = [&](const SourceRegister& source_reg) -> const float24* {
@@ -123,58 +129,78 @@ void RunInterpreter(UnitState& state) {
             switch (instr.opcode.Value().EffectiveOpCode()) {
             case OpCode::Id::ADD:
+                Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+                Record<DebugDataRecord::SRC2>(state.debug, iteration, src2);
+                Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
                 for (int i = 0; i < 4; ++i) {
                     if (!swizzle.DestComponentEnabled(i))
                     dest[i] = src1[i] + src2[i];
+                Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
             case OpCode::Id::MUL:
+                Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+                Record<DebugDataRecord::SRC2>(state.debug, iteration, src2);
+                Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
                 for (int i = 0; i < 4; ++i) {
                     if (!swizzle.DestComponentEnabled(i))
                     dest[i] = src1[i] * src2[i];
+                Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
             case OpCode::Id::FLR:
+                Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+                Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
                 for (int i = 0; i < 4; ++i) {
                     if (!swizzle.DestComponentEnabled(i))
                     dest[i] = float24::FromFloat32(std::floor(src1[i].ToFloat32()));
+                Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
             case OpCode::Id::MAX:
+                Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+                Record<DebugDataRecord::SRC2>(state.debug, iteration, src2);
+                Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
                 for (int i = 0; i < 4; ++i) {
                     if (!swizzle.DestComponentEnabled(i))
                     dest[i] = std::max(src1[i], src2[i]);
+                Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
             case OpCode::Id::MIN:
+                Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+                Record<DebugDataRecord::SRC2>(state.debug, iteration, src2);
+                Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
                 for (int i = 0; i < 4; ++i) {
                     if (!swizzle.DestComponentEnabled(i))
                     dest[i] = std::min(src1[i], src2[i]);
+                Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
             case OpCode::Id::DP3:
             case OpCode::Id::DP4:
+                Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+                Record<DebugDataRecord::SRC2>(state.debug, iteration, src2);
+                Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
                 float24 dot = float24::FromFloat32(0.f);
                 int num_components = (instr.opcode.Value() == OpCode::Id::DP3) ? 3 : 4;
                 for (int i = 0; i < num_components; ++i)
@@ -186,12 +212,15 @@ void RunInterpreter(UnitState& state) {
                     dest[i] = dot;
+                Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
             // Reciprocal
             case OpCode::Id::RCP:
+                Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+                Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
                 for (int i = 0; i < 4; ++i) {
                     if (!swizzle.DestComponentEnabled(i))
@@ -200,13 +229,15 @@ void RunInterpreter(UnitState& state) {
                     // TODO: I think this might be wrong... we should only use one component here
                     dest[i] = float24::FromFloat32(1.0f / src1[i].ToFloat32());
+                Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
             // Reciprocal Square Root
             case OpCode::Id::RSQ:
+                Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+                Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
                 for (int i = 0; i < 4; ++i) {
                     if (!swizzle.DestComponentEnabled(i))
@@ -215,12 +246,13 @@ void RunInterpreter(UnitState& state) {
                     // TODO: I think this might be wrong... we should only use one component here
                     dest[i] = float24::FromFloat32(1.0f / sqrt(src1[i].ToFloat32()));
+                Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
             case OpCode::Id::MOVA:
+                Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
                 for (int i = 0; i < 2; ++i) {
                     if (!swizzle.DestComponentEnabled(i))
@@ -228,32 +260,41 @@ void RunInterpreter(UnitState& state) {
                     // TODO: Figure out how the rounding is done on hardware
                     state.address_registers[i] = static_cast<s32>(src1[i].ToFloat32());
+                Record<DebugDataRecord::ADDR_REG_OUT>(state.debug, iteration, state.address_registers);
             case OpCode::Id::MOV:
+                Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+                Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
                 for (int i = 0; i < 4; ++i) {
                     if (!swizzle.DestComponentEnabled(i))
                     dest[i] = src1[i];
+                Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
             case OpCode::Id::SLT:
             case OpCode::Id::SLTI:
+                Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+                Record<DebugDataRecord::SRC2>(state.debug, iteration, src2);
+                Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
                 for (int i = 0; i < 4; ++i) {
                     if (!swizzle.DestComponentEnabled(i))
                     dest[i] = (src1[i] < src2[i]) ? float24::FromFloat32(1.0f) : float24::FromFloat32(0.0f);
+                Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
             case OpCode::Id::CMP:
+                Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+                Record<DebugDataRecord::SRC2>(state.debug, iteration, src2);
                 for (int i = 0; i < 2; ++i) {
                     // TODO: Can you restrict to one compare via dest masking?
@@ -261,27 +302,27 @@ void RunInterpreter(UnitState& state) {
                     auto op = (i == 0) ? compare_op.x.Value() : compare_op.y.Value();
                     switch (op) {
-                        case compare_op.Equal:
+                        case Instruction::Common::CompareOpType::Equal:
                             state.conditional_code[i] = (src1[i] == src2[i]);
-                        case compare_op.NotEqual:
+                        case Instruction::Common::CompareOpType::NotEqual:
                             state.conditional_code[i] = (src1[i] != src2[i]);
-                        case compare_op.LessThan:
+                        case Instruction::Common::CompareOpType::LessThan:
                             state.conditional_code[i] = (src1[i] <  src2[i]);
-                        case compare_op.LessEqual:
+                        case Instruction::Common::CompareOpType::LessEqual:
                             state.conditional_code[i] = (src1[i] <= src2[i]);
-                        case compare_op.GreaterThan:
+                        case Instruction::Common::CompareOpType::GreaterThan:
                             state.conditional_code[i] = (src1[i] >  src2[i]);
-                        case compare_op.GreaterEqual:
+                        case Instruction::Common::CompareOpType::GreaterEqual:
                             state.conditional_code[i] = (src1[i] >= src2[i]);
@@ -290,6 +331,7 @@ void RunInterpreter(UnitState& state) {
+                Record<DebugDataRecord::CMP_RESULT>(state.debug, iteration, state.conditional_code);
@@ -359,12 +401,17 @@ void RunInterpreter(UnitState& state) {
                             : (instr.mad.dest.Value() < 0x20) ? &state.registers.temporary[instr.mad.dest.Value().GetIndex()][0]
                             : dummy_vec4_float24;
+                Record<DebugDataRecord::SRC1>(state.debug, iteration, src1);
+                Record<DebugDataRecord::SRC2>(state.debug, iteration, src2);
+                Record<DebugDataRecord::SRC3>(state.debug, iteration, src3);
+                Record<DebugDataRecord::DEST_IN>(state.debug, iteration, dest);
                 for (int i = 0; i < 4; ++i) {
                     if (!swizzle.DestComponentEnabled(i))
                     dest[i] = src1[i] * src2[i] + src3[i];
+                Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
             } else {
                 LOG_ERROR(HW_GPU, "Unhandled multiply-add instruction: 0x%02x (%s): 0x%08x",
                           (int)instr.opcode.Value().EffectiveOpCode(), instr.opcode.Value().GetInfo().name, instr.hex);
@@ -374,7 +421,7 @@ void RunInterpreter(UnitState& state) {
-            static auto evaluate_condition = [](const UnitState& state, bool refx, bool refy, Instruction::FlowControlType flow_control) {
+            static auto evaluate_condition = [](const UnitState<Debug>& state, bool refx, bool refy, Instruction::FlowControlType flow_control) {
                 bool results[2] = { refx == state.conditional_code[0],
                                     refy == state.conditional_code[1] };
@@ -400,12 +447,14 @@ void RunInterpreter(UnitState& state) {
             case OpCode::Id::JMPC:
+                Record<DebugDataRecord::COND_CMP_IN>(state.debug, iteration, state.conditional_code);
                 if (evaluate_condition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) {
                     state.program_counter = instr.flow_control.dest_offset - 1;
             case OpCode::Id::JMPU:
+                Record<DebugDataRecord::COND_BOOL_IN>(state.debug, iteration, uniforms.b[instr.flow_control.bool_uniform_id]);
                 if (uniforms.b[instr.flow_control.bool_uniform_id]) {
                     state.program_counter = instr.flow_control.dest_offset - 1;
@@ -419,6 +468,7 @@ void RunInterpreter(UnitState& state) {
             case OpCode::Id::CALLU:
+                Record<DebugDataRecord::COND_BOOL_IN>(state.debug, iteration, uniforms.b[instr.flow_control.bool_uniform_id]);
                 if (uniforms.b[instr.flow_control.bool_uniform_id]) {
@@ -428,6 +478,7 @@ void RunInterpreter(UnitState& state) {
             case OpCode::Id::CALLC:
+                Record<DebugDataRecord::COND_CMP_IN>(state.debug, iteration, state.conditional_code);
                 if (evaluate_condition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) {
@@ -440,6 +491,7 @@ void RunInterpreter(UnitState& state) {
             case OpCode::Id::IFU:
+                Record<DebugDataRecord::COND_BOOL_IN>(state.debug, iteration, uniforms.b[instr.flow_control.bool_uniform_id]);
                 if (uniforms.b[instr.flow_control.bool_uniform_id]) {
                          state.program_counter + 1,
@@ -458,6 +510,7 @@ void RunInterpreter(UnitState& state) {
                 // TODO: Do we need to consider swizzlers here?
+                Record<DebugDataRecord::COND_CMP_IN>(state.debug, iteration, state.conditional_code);
                 if (evaluate_condition(state, instr.flow_control.refx, instr.flow_control.refy, instr.flow_control)) {
                          state.program_counter + 1,
@@ -475,14 +528,19 @@ void RunInterpreter(UnitState& state) {
             case OpCode::Id::LOOP:
-                state.address_registers[2] = uniforms.i[instr.flow_control.int_uniform_id].y;
+                Math::Vec4<u8> loop_param(uniforms.i[instr.flow_control.int_uniform_id].x,
+                                          uniforms.i[instr.flow_control.int_uniform_id].y,
+                                          uniforms.i[instr.flow_control.int_uniform_id].z,
+                                          uniforms.i[instr.flow_control.int_uniform_id].w);
+                state.address_registers[2] = loop_param.y;
+                Record<DebugDataRecord::LOOP_INT_IN>(state.debug, iteration, loop_param);
                      state.program_counter + 1,
                      instr.flow_control.dest_offset - state.program_counter + 1,
                      instr.flow_control.dest_offset + 1,
-                     uniforms.i[instr.flow_control.int_uniform_id].x,
-                     uniforms.i[instr.flow_control.int_uniform_id].z);
+                     loop_param.x,
+                     loop_param.z);
@@ -497,12 +555,14 @@ void RunInterpreter(UnitState& state) {
-        if (exit_loop)
-            break;
+        ++iteration;
+// Explicit instantiations: the template is defined in this file, so emit both the non-debug and debug variants here
+template void RunInterpreter(UnitState<false>& state);
+template void RunInterpreter(UnitState<true>& state);
 } // namespace
 } // namespace
diff --git a/src/video_core/shader/shader_interpreter.h b/src/video_core/shader/shader_interpreter.h
index ad6e58e391..71bcad5acb 100644
--- a/src/video_core/shader/shader_interpreter.h
+++ b/src/video_core/shader/shader_interpreter.h
@@ -12,7 +12,8 @@ namespace Pica {
 namespace Shader {
-void RunInterpreter(UnitState& state);
+template<bool Debug>
+void RunInterpreter(UnitState<Debug>& state);
 } // namespace
diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp
index ce47774d52..836942c6bb 100644
--- a/src/video_core/shader/shader_jit_x64.cpp
+++ b/src/video_core/shader/shader_jit_x64.cpp
@@ -141,7 +141,7 @@ void JitCompiler::Compile_SwizzleSrc(Instruction instr, unsigned src_num, Source
         src_offset = src_reg.GetIndex() * sizeof(float24) * 4;
     } else {
         src_ptr = REGISTERS;
-        src_offset = UnitState::InputOffset(src_reg);
+        src_offset = UnitState<false>::InputOffset(src_reg);
     unsigned operand_desc_id;
@@ -217,11 +217,11 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {
     // If all components are enabled, write the result to the destination register
     if (swiz.dest_mask == NO_DEST_REG_MASK) {
         // Store dest back to memory
-        MOVAPS(MDisp(REGISTERS, UnitState::OutputOffset(dest)), src);
+        MOVAPS(MDisp(REGISTERS, UnitState<false>::OutputOffset(dest)), src);
     } else {
         // Not all components are enabled, so mask the result when storing to the destination register...
-        MOVAPS(SCRATCH, MDisp(REGISTERS, UnitState::OutputOffset(dest)));
+        MOVAPS(SCRATCH, MDisp(REGISTERS, UnitState<false>::OutputOffset(dest)));
         if (Common::GetCPUCaps().sse4_1) {
             u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1);
@@ -240,7 +240,7 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {
         // Store dest back to memory
-        MOVAPS(MDisp(REGISTERS, UnitState::OutputOffset(dest)), SCRATCH);
+        MOVAPS(MDisp(REGISTERS, UnitState<false>::OutputOffset(dest)), SCRATCH);