From cd1fbfcf1b70e365d81480ec0f56db19ed02454f Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Thu, 5 Feb 2015 14:53:25 -0200
Subject: [PATCH 1/2] Add profiling infrastructure and widget

---
 src/citra_qt/CMakeLists.txt                   |   3 +
 src/citra_qt/debugger/profiler.cpp            | 138 +++++++++++++++
 src/citra_qt/debugger/profiler.h              |  50 ++++++
 src/citra_qt/debugger/profiler.ui             |  33 ++++
 src/citra_qt/main.cpp                         |   6 +
 src/citra_qt/main.h                           |   2 +
 src/common/CMakeLists.txt                     |   4 +
 src/common/profiler.cpp                       | 159 ++++++++++++++++++
 src/common/profiler.h                         | 134 +++++++++++++++
 src/common/profiler_reporting.h               | 108 ++++++++++++
 src/common/synchronized_wrapper.h             |  69 ++++++++
 src/common/thread.h                           |  19 +++
 .../arm/dyncom/arm_dyncom_interpreter.cpp     |   8 +
 src/core/hle/hle.cpp                          |   6 +
 src/video_core/command_processor.cpp          |   6 +
 .../renderer_opengl/renderer_opengl.cpp       |  12 ++
 16 files changed, 757 insertions(+)
 create mode 100644 src/citra_qt/debugger/profiler.cpp
 create mode 100644 src/citra_qt/debugger/profiler.h
 create mode 100644 src/citra_qt/debugger/profiler.ui
 create mode 100644 src/common/profiler.cpp
 create mode 100644 src/common/profiler.h
 create mode 100644 src/common/profiler_reporting.h
 create mode 100644 src/common/synchronized_wrapper.h

diff --git a/src/citra_qt/CMakeLists.txt b/src/citra_qt/CMakeLists.txt
index 586bc84b08..a1ad00f574 100644
--- a/src/citra_qt/CMakeLists.txt
+++ b/src/citra_qt/CMakeLists.txt
@@ -13,6 +13,7 @@ set(SRCS
             debugger/graphics_cmdlists.cpp
             debugger/graphics_framebuffer.cpp
             debugger/graphics_vertex_shader.cpp
+            debugger/profiler.cpp
             debugger/ramview.cpp
             debugger/registers.cpp
             util/spinbox.cpp
@@ -35,6 +36,7 @@ set(HEADERS
             debugger/graphics_cmdlists.h
             debugger/graphics_framebuffer.h
             debugger/graphics_vertex_shader.h
+            debugger/profiler.h
             debugger/ramview.h
             debugger/registers.h
             util/spinbox.h
@@ -48,6 +50,7 @@ set(UIS
             config/controller_config.ui
             debugger/callstack.ui
             debugger/disassembler.ui
+            debugger/profiler.ui
             debugger/registers.ui
             hotkeys.ui
             main.ui
diff --git a/src/citra_qt/debugger/profiler.cpp b/src/citra_qt/debugger/profiler.cpp
new file mode 100644
index 0000000000..ae0568b6a2
--- /dev/null
+++ b/src/citra_qt/debugger/profiler.cpp
@@ -0,0 +1,138 @@
+// Copyright 2015 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "profiler.h"
+
+#include "common/profiler_reporting.h"
+
+using namespace Common::Profiling;
+
+static QVariant GetDataForColumn(int col, const AggregatedDuration& duration)
+{
+    static auto duration_to_float = [](Duration dur) -> float {
+        using FloatMs = std::chrono::duration<float, std::chrono::milliseconds::period>;
+        return std::chrono::duration_cast<FloatMs>(dur).count();
+    };
+
+    switch (col) {
+    case 1: return duration_to_float(duration.avg);
+    case 2: return duration_to_float(duration.min);
+    case 3: return duration_to_float(duration.max);
+    default: return QVariant();
+    }
+}
+
+static const TimingCategoryInfo* GetCategoryInfo(int id)
+{
+    const auto& categories = GetProfilingManager().GetTimingCategoriesInfo();
+    if (id >= categories.size()) {
+        return nullptr;
+    } else {
+        return &categories[id];
+    }
+}
+
+ProfilerModel::ProfilerModel(QObject* parent) : QAbstractItemModel(parent)
+{
+    updateProfilingInfo();
+    const auto& categories = GetProfilingManager().GetTimingCategoriesInfo();
+    results.time_per_category.resize(categories.size());
+}
+
+QVariant ProfilerModel::headerData(int section, Qt::Orientation orientation, int role) const
+{
+    if (orientation == Qt::Horizontal && role == Qt::DisplayRole) {
+        switch (section) {
+        case 0: return tr("Category");
+        case 1: return tr("Avg");
+        case 2: return tr("Min");
+        case 3: return tr("Max");
+        }
+    }
+
+    return QVariant();
+}
+
+QModelIndex ProfilerModel::index(int row, int column, const QModelIndex& parent) const
+{
+    return createIndex(row, column);
+}
+
+QModelIndex ProfilerModel::parent(const QModelIndex& child) const
+{
+    return QModelIndex();
+}
+
+int ProfilerModel::columnCount(const QModelIndex& parent) const
+{
+    return 4;
+}
+
+int ProfilerModel::rowCount(const QModelIndex& parent) const
+{
+    if (parent.isValid()) {
+        return 0;
+    } else {
+        return results.time_per_category.size() + 2;
+    }
+}
+
+QVariant ProfilerModel::data(const QModelIndex& index, int role) const
+{
+    if (role == Qt::DisplayRole) {
+        if (index.row() == 0) {
+            if (index.column() == 0) {
+                return tr("Frame");
+            } else {
+                return GetDataForColumn(index.column(), results.frame_time);
+            }
+        } else if (index.row() == 1) {
+            if (index.column() == 0) {
+                return tr("Frame (with swapping)");
+            } else {
+                return GetDataForColumn(index.column(), results.interframe_time);
+            }
+        } else {
+            if (index.column() == 0) {
+                const TimingCategoryInfo* info = GetCategoryInfo(index.row() - 2);
+                return info != nullptr ? QString(info->name) : QVariant();
+            } else {
+                if (index.row() - 2 < results.time_per_category.size()) {
+                    return GetDataForColumn(index.column(), results.time_per_category[index.row() - 2]);
+                } else {
+                    return QVariant();
+                }
+            }
+        }
+    }
+
+    return QVariant();
+}
+
+void ProfilerModel::updateProfilingInfo()
+{
+    results = GetTimingResultsAggregator()->GetAggregatedResults();
+    emit dataChanged(createIndex(0, 1), createIndex(rowCount() - 1, 3));
+}
+
+ProfilerWidget::ProfilerWidget(QWidget* parent) : QDockWidget(parent)
+{
+    ui.setupUi(this);
+
+    model = new ProfilerModel(this);
+    ui.treeView->setModel(model);
+
+    connect(this, SIGNAL(visibilityChanged(bool)), SLOT(setProfilingInfoUpdateEnabled(bool)));
+    connect(&update_timer, SIGNAL(timeout()), model, SLOT(updateProfilingInfo()));
+}
+
+void ProfilerWidget::setProfilingInfoUpdateEnabled(bool enable)
+{
+    if (enable) {
+        update_timer.start(100);
+        model->updateProfilingInfo();
+    } else {
+        update_timer.stop();
+    }
+}
diff --git a/src/citra_qt/debugger/profiler.h b/src/citra_qt/debugger/profiler.h
new file mode 100644
index 0000000000..a6d87aa0f7
--- /dev/null
+++ b/src/citra_qt/debugger/profiler.h
@@ -0,0 +1,50 @@
+// Copyright 2015 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <QAbstractItemModel>
+#include <QDockWidget>
+#include <QTimer>
+#include "ui_profiler.h"
+
+#include "common/profiler_reporting.h"
+
+class ProfilerModel : public QAbstractItemModel
+{
+    Q_OBJECT
+
+public:
+    ProfilerModel(QObject* parent);
+
+    QVariant headerData(int section, Qt::Orientation orientation, int role = Qt::DisplayRole) const;
+    QModelIndex index(int row, int column, const QModelIndex& parent = QModelIndex()) const override;
+    QModelIndex parent(const QModelIndex& child) const override;
+    int columnCount(const QModelIndex& parent = QModelIndex()) const override;
+    int rowCount(const QModelIndex& parent = QModelIndex()) const override;
+    QVariant data(const QModelIndex& index, int role = Qt::DisplayRole) const override;
+
+public slots:
+    void updateProfilingInfo();
+
+private:
+    Common::Profiling::AggregatedFrameResult results;
+};
+
+class ProfilerWidget : public QDockWidget
+{
+    Q_OBJECT
+
+public:
+    ProfilerWidget(QWidget* parent = 0);
+
+private slots:
+    void setProfilingInfoUpdateEnabled(bool enable);
+
+private:
+    Ui::Profiler ui;
+    ProfilerModel* model;
+
+    QTimer update_timer;
+};
diff --git a/src/citra_qt/debugger/profiler.ui b/src/citra_qt/debugger/profiler.ui
new file mode 100644
index 0000000000..d3c9a9a1fb
--- /dev/null
+++ b/src/citra_qt/debugger/profiler.ui
@@ -0,0 +1,33 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<ui version="4.0">
+ <class>Profiler</class>
+ <widget class="QDockWidget" name="Profiler">
+  <property name="geometry">
+   <rect>
+    <x>0</x>
+    <y>0</y>
+    <width>400</width>
+    <height>300</height>
+   </rect>
+  </property>
+  <property name="windowTitle">
+   <string>Profiler</string>
+  </property>
+  <widget class="QWidget" name="dockWidgetContents">
+   <layout class="QVBoxLayout" name="verticalLayout">
+    <item>
+     <widget class="QTreeView" name="treeView">
+      <property name="alternatingRowColors">
+       <bool>true</bool>
+      </property>
+      <property name="uniformRowHeights">
+       <bool>true</bool>
+      </property>
+     </widget>
+    </item>
+   </layout>
+  </widget>
+ </widget>
+ <resources/>
+ <connections/>
+</ui>
diff --git a/src/citra_qt/main.cpp b/src/citra_qt/main.cpp
index 881c7d3378..e3db3c20ac 100644
--- a/src/citra_qt/main.cpp
+++ b/src/citra_qt/main.cpp
@@ -35,6 +35,7 @@
 #include "debugger/graphics_cmdlists.h"
 #include "debugger/graphics_framebuffer.h"
 #include "debugger/graphics_vertex_shader.h"
+#include "debugger/profiler.h"
 
 #include "core/settings.h"
 #include "core/system.h"
@@ -57,6 +58,10 @@ GMainWindow::GMainWindow()
     render_window = new GRenderWindow;
     render_window->hide();
 
+    profilerWidget = new ProfilerWidget(this);
+    addDockWidget(Qt::BottomDockWidgetArea, profilerWidget);
+    profilerWidget->hide();
+
     disasmWidget = new DisassemblerWidget(this, render_window->GetEmuThread());
     addDockWidget(Qt::BottomDockWidgetArea, disasmWidget);
     disasmWidget->hide();
@@ -90,6 +95,7 @@ GMainWindow::GMainWindow()
     graphicsVertexShaderWidget->hide();
 
     QMenu* debug_menu = ui.menu_View->addMenu(tr("Debugging"));
+    debug_menu->addAction(profilerWidget->toggleViewAction());
     debug_menu->addAction(disasmWidget->toggleViewAction());
     debug_menu->addAction(registersWidget->toggleViewAction());
     debug_menu->addAction(callstackWidget->toggleViewAction());
diff --git a/src/citra_qt/main.h b/src/citra_qt/main.h
index dd53489dd6..9b57c57729 100644
--- a/src/citra_qt/main.h
+++ b/src/citra_qt/main.h
@@ -11,6 +11,7 @@
 
 class GImageInfo;
 class GRenderWindow;
+class ProfilerWidget;
 class DisassemblerWidget;
 class RegistersWidget;
 class CallstackWidget;
@@ -54,6 +55,7 @@ private:
 
     GRenderWindow* render_window;
 
+    ProfilerWidget* profilerWidget;
     DisassemblerWidget* disasmWidget;
     RegistersWidget* registersWidget;
     CallstackWidget* callstackWidget;
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index b05c355460..daa2d59de2 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -14,6 +14,7 @@ set(SRCS
             mem_arena.cpp
             memory_util.cpp
             misc.cpp
+            profiler.cpp
             scm_rev.cpp
             string_util.cpp
             symbols.cpp
@@ -48,11 +49,14 @@ set(HEADERS
             mem_arena.h
             memory_util.h
             platform.h
+            profiler.h
+            profiler_reporting.h
             scm_rev.h
             scope_exit.h
             string_util.h
             swap.h
             symbols.h
+            synchronized_wrapper.h
             thread.h
             thread_queue_list.h
             thunk.h
diff --git a/src/common/profiler.cpp b/src/common/profiler.cpp
new file mode 100644
index 0000000000..c37546af00
--- /dev/null
+++ b/src/common/profiler.cpp
@@ -0,0 +1,159 @@
+// Copyright 2015 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/profiler.h"
+#include "common/profiler_reporting.h"
+#include "common/assert.h"
+
+namespace Common {
+namespace Profiling {
+
+#if ENABLE_PROFILING
+thread_local Timer* Timer::current_timer = nullptr;
+#endif
+
+TimingCategory::TimingCategory(const char* name, TimingCategory* parent)
+        : accumulated_duration(0) {
+
+    ProfilingManager& manager = GetProfilingManager();
+    category_id = manager.RegisterTimingCategory(this, name);
+    if (parent != nullptr)
+        manager.SetTimingCategoryParent(category_id, parent->category_id);
+}
+
+ProfilingManager::ProfilingManager()
+        : last_frame_end(Clock::now()), this_frame_start(Clock::now()) {
+}
+
+unsigned int ProfilingManager::RegisterTimingCategory(TimingCategory* category, const char* name) {
+    TimingCategoryInfo info;
+    info.category = category;
+    info.name = name;
+    info.parent = TimingCategoryInfo::NO_PARENT;
+
+    unsigned int id = (unsigned int)timing_categories.size();
+    timing_categories.push_back(std::move(info));
+
+    return id;
+}
+
+void ProfilingManager::SetTimingCategoryParent(unsigned int category, unsigned int parent) {
+    ASSERT(category < timing_categories.size());
+    ASSERT(parent < timing_categories.size());
+
+    timing_categories[category].parent = parent;
+}
+
+void ProfilingManager::BeginFrame() {
+    this_frame_start = Clock::now();
+}
+
+void ProfilingManager::FinishFrame() {
+    Clock::time_point now = Clock::now();
+
+    results.interframe_time = now - last_frame_end;
+    results.frame_time = now - this_frame_start;
+
+    results.time_per_category.resize(timing_categories.size());
+    for (size_t i = 0; i < timing_categories.size(); ++i) {
+        results.time_per_category[i] = timing_categories[i].category->GetAccumulatedTime();
+    }
+
+    last_frame_end = now;
+}
+
+TimingResultsAggregator::TimingResultsAggregator(size_t window_size)
+        : max_window_size(window_size), window_size(0) {
+    interframe_times.resize(window_size, Duration::zero());
+    frame_times.resize(window_size, Duration::zero());
+}
+
+void TimingResultsAggregator::Clear() {
+    window_size = cursor = 0;
+}
+
+void TimingResultsAggregator::SetNumberOfCategories(size_t n) {
+    size_t old_size = times_per_category.size();
+    if (n == old_size)
+        return;
+
+    times_per_category.resize(n);
+
+    for (size_t i = old_size; i < n; ++i) {
+        times_per_category[i].resize(max_window_size, Duration::zero());
+    }
+}
+
+void TimingResultsAggregator::AddFrame(const ProfilingFrameResult& frame_result) {
+    SetNumberOfCategories(frame_result.time_per_category.size());
+
+    interframe_times[cursor] = frame_result.interframe_time;
+    frame_times[cursor] = frame_result.frame_time;
+    for (size_t i = 0; i < frame_result.time_per_category.size(); ++i) {
+        times_per_category[i][cursor] = frame_result.time_per_category[i];
+    }
+
+    ++cursor;
+    if (cursor == max_window_size)
+        cursor = 0;
+    if (window_size < max_window_size)
+        ++window_size;
+}
+
+static AggregatedDuration AggregateField(const std::vector<Duration>& v, size_t len) {
+    AggregatedDuration result;
+    result.avg = Duration::zero();
+
+    result.min = result.max = (len == 0 ? Duration::zero() : v[0]);
+
+    for (size_t i = 1; i < len; ++i) {
+        Duration value = v[i];
+        result.avg += value;
+        result.min = std::min(result.min, value);
+        result.max = std::max(result.max, value);
+    }
+    if (len != 0)
+        result.avg /= len;
+
+    return result;
+}
+
+static float tof(Common::Profiling::Duration dur) {
+    using FloatMs = std::chrono::duration<float, std::chrono::milliseconds::period>;
+    return std::chrono::duration_cast<FloatMs>(dur).count();
+}
+
+AggregatedFrameResult TimingResultsAggregator::GetAggregatedResults() const {
+    AggregatedFrameResult result;
+
+    result.interframe_time = AggregateField(interframe_times, window_size);
+    result.frame_time = AggregateField(frame_times, window_size);
+
+    if (result.interframe_time.avg != Duration::zero()) {
+        result.fps = 1000.0f / tof(result.interframe_time.avg);
+    } else {
+        result.fps = 0.0f;
+    }
+
+    result.time_per_category.resize(times_per_category.size());
+    for (size_t i = 0; i < times_per_category.size(); ++i) {
+        result.time_per_category[i] = AggregateField(times_per_category[i], window_size);
+    }
+
+    return result;
+}
+
+ProfilingManager& GetProfilingManager() {
+    // Takes advantage of "magic" static initialization for race-free initialization.
+    static ProfilingManager manager;
+    return manager;
+}
+
+SynchronizedRef<TimingResultsAggregator> GetTimingResultsAggregator() {
+    static SynchronizedWrapper<TimingResultsAggregator> aggregator(30);
+    return SynchronizedRef<TimingResultsAggregator>(aggregator);
+}
+
+} // namespace Profiling
+} // namespace Common
diff --git a/src/common/profiler.h b/src/common/profiler.h
new file mode 100644
index 0000000000..53c4f6eaf2
--- /dev/null
+++ b/src/common/profiler.h
@@ -0,0 +1,134 @@
+// Copyright 2015 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <atomic>
+#include <chrono>
+
+#include "common/assert.h"
+#include "common/thread.h"
+
+namespace Common {
+namespace Profiling {
+
+// If this is defined to 0, it turns all Timers into no-ops.
+#ifndef ENABLE_PROFILING
+#define ENABLE_PROFILING 1
+#endif
+
+using Duration = std::chrono::nanoseconds;
+using Clock = std::chrono::high_resolution_clock;
+
+/**
+ * Represents a timing category that measured time can be accounted towards. Should be declared as a
+ * global variable and passed to Timers.
+ */
+class TimingCategory final {
+public:
+    TimingCategory(const char* name, TimingCategory* parent = nullptr);
+
+    unsigned int GetCategoryId() const {
+        return category_id;
+    }
+
+    /// Adds some time to this category. Can safely be called from multiple threads at the same time.
+    void AddTime(Duration amount) {
+        std::atomic_fetch_add_explicit(
+                &accumulated_duration, amount.count(),
+                std::memory_order_relaxed);
+    }
+
+    /**
+     * Atomically retrieves the accumulated measured time for this category and resets the counter
+     * to zero. Can be safely called concurrently with AddTime.
+     */
+    Duration GetAccumulatedTime() {
+        return Duration(std::atomic_exchange_explicit(
+                &accumulated_duration, (Duration::rep)0,
+                std::memory_order_relaxed));
+    }
+
+private:
+    unsigned int category_id;
+    std::atomic<Duration::rep> accumulated_duration;
+};
+
+/**
+ * Measures time elapsed between a call to Start and a call to Stop and attributes it to the given
+ * TimingCategory. Start/Stop can be called multiple times on the same timer, but each call must be
+ * appropriately paired.
+ *
+ * When a Timer is started, it automatically pauses a previously running timer on the same thread,
+ * which is resumed when it is stopped. As such, no special action needs to be taken to avoid
+ * double-accounting of time on two categories.
+ */
+class Timer {
+public:
+    Timer(TimingCategory& category) : category(category) {
+    }
+
+    void Start() {
+#if ENABLE_PROFILING
+        ASSERT(!running);
+        previous_timer = current_timer;
+        current_timer = this;
+        if (previous_timer != nullptr)
+            previous_timer->StopTiming();
+
+        StartTiming();
+#endif
+    }
+
+    void Stop() {
+#if ENABLE_PROFILING
+        ASSERT(running);
+        StopTiming();
+
+        if (previous_timer != nullptr)
+            previous_timer->StartTiming();
+        current_timer = previous_timer;
+#endif
+    }
+
+private:
+#if ENABLE_PROFILING
+    void StartTiming() {
+        start = Clock::now();
+        running = true;
+    }
+
+    void StopTiming() {
+        auto duration = Clock::now() - start;
+        running = false;
+        category.AddTime(std::chrono::duration_cast<Duration>(duration));
+    }
+
+    Clock::time_point start;
+    bool running = false;
+
+    Timer* previous_timer;
+    static thread_local Timer* current_timer;
+#endif
+
+    TimingCategory& category;
+};
+
+/**
+ * A Timer that automatically starts timing when created and stops at the end of the scope. Should
+ * be used in the majority of cases.
+ */
+class ScopeTimer : public Timer {
+public:
+    ScopeTimer(TimingCategory& category) : Timer(category) {
+        Start();
+    }
+
+    ~ScopeTimer() {
+        Stop();
+    }
+};
+
+} // namespace Profiling
+} // namespace Common
diff --git a/src/common/profiler_reporting.h b/src/common/profiler_reporting.h
new file mode 100644
index 0000000000..3abb733155
--- /dev/null
+++ b/src/common/profiler_reporting.h
@@ -0,0 +1,108 @@
+// Copyright 2015 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <chrono>
+#include <mutex>
+#include <utility>
+#include <vector>
+
+#include "common/profiler.h"
+#include "common/synchronized_wrapper.h"
+
+namespace Common {
+namespace Profiling {
+
+struct TimingCategoryInfo {
+    static const unsigned int NO_PARENT = -1;
+
+    TimingCategory* category;
+    const char* name;
+    unsigned int parent;
+};
+
+struct ProfilingFrameResult {
+    /// Time since the last delivered frame
+    Duration interframe_time;
+
+    /// Time spent processing a frame, excluding VSync
+    Duration frame_time;
+
+    /// Total amount of time spent inside each category in this frame. Indexed by the category id
+    std::vector<Duration> time_per_category;
+};
+
+class ProfilingManager final {
+public:
+    ProfilingManager();
+
+    unsigned int RegisterTimingCategory(TimingCategory* category, const char* name);
+    void SetTimingCategoryParent(unsigned int category, unsigned int parent);
+
+    const std::vector<TimingCategoryInfo>& GetTimingCategoriesInfo() const {
+        return timing_categories;
+    }
+
+    /// This should be called after swapping screen buffers.
+    void BeginFrame();
+    /// This should be called before swapping screen buffers.
+    void FinishFrame();
+
+    /// Get the timing results from the previous frame. This is updated when you call FinishFrame().
+    const ProfilingFrameResult& GetPreviousFrameResults() const {
+        return results;
+    }
+
+private:
+    std::vector<TimingCategoryInfo> timing_categories;
+    Clock::time_point last_frame_end;
+    Clock::time_point this_frame_start;
+
+    ProfilingFrameResult results;
+};
+
+struct AggregatedDuration {
+    Duration avg, min, max;
+};
+
+struct AggregatedFrameResult {
+    /// Time since the last delivered frame
+    AggregatedDuration interframe_time;
+
+    /// Time spent processing a frame, excluding VSync
+    AggregatedDuration frame_time;
+
+    float fps;
+
+    /// Total amount of time spent inside each category in this frame. Indexed by the category id
+    std::vector<AggregatedDuration> time_per_category;
+};
+
+class TimingResultsAggregator final {
+public:
+    TimingResultsAggregator(size_t window_size);
+
+    void Clear();
+    void SetNumberOfCategories(size_t n);
+
+    void AddFrame(const ProfilingFrameResult& frame_result);
+
+    AggregatedFrameResult GetAggregatedResults() const;
+
+    size_t max_window_size;
+    size_t window_size;
+    size_t cursor;
+
+    std::vector<Duration> interframe_times;
+    std::vector<Duration> frame_times;
+    std::vector<std::vector<Duration>> times_per_category;
+};
+
+ProfilingManager& GetProfilingManager();
+SynchronizedRef<TimingResultsAggregator> GetTimingResultsAggregator();
+
+} // namespace Profiling
+} // namespace Common
diff --git a/src/common/synchronized_wrapper.h b/src/common/synchronized_wrapper.h
new file mode 100644
index 0000000000..946252b8c1
--- /dev/null
+++ b/src/common/synchronized_wrapper.h
@@ -0,0 +1,69 @@
+// Copyright 2015 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <mutex>
+
+namespace Common {
+
+/**
+ * Wraps an object, only allowing access to it via a locking reference wrapper. Good to ensure no
+ * one forgets to lock a mutex before acessing an object. To access the wrapped object construct a
+ * SyncronizedRef on this wrapper. Inspired by Rust's Mutex type (http://doc.rust-lang.org/std/sync/struct.Mutex.html).
+ */
+template <typename T>
+class SynchronizedWrapper {
+public:
+    template <typename... Args>
+    SynchronizedWrapper(Args&&... args) :
+        data(std::forward<Args>(args)...) {
+    }
+
+private:
+    template <typename U>
+    friend class SynchronizedRef;
+
+    std::mutex mutex;
+    T data;
+};
+
+/**
+ * Synchronized reference, that keeps a SynchronizedWrapper's mutex locked during its lifetime. This
+ * greatly reduces the chance that someone will access the wrapped resource without locking the
+ * mutex.
+ */
+template <typename T>
+class SynchronizedRef {
+public:
+    SynchronizedRef(SynchronizedWrapper<T>& wrapper) : wrapper(&wrapper) {
+        wrapper.mutex.lock();
+    }
+
+    SynchronizedRef(SynchronizedRef&) = delete;
+    SynchronizedRef(SynchronizedRef&& o) : wrapper(o.wrapper) {
+        o.wrapper = nullptr;
+    }
+
+    ~SynchronizedRef() {
+        if (wrapper)
+            wrapper->mutex.unlock();
+    }
+
+    SynchronizedRef& operator=(SynchronizedRef&) = delete;
+    SynchronizedRef& operator=(SynchronizedRef&& o) {
+        std::swap(wrapper, o.wrapper);
+    }
+
+    T& operator*() { return wrapper->data; }
+    const T& operator*() const { return wrapper->data; }
+
+    T* operator->() { return &wrapper->data; }
+    const T* operator->() const { return &wrapper->data; }
+
+private:
+    SynchronizedWrapper<T>* wrapper;
+};
+
+} // namespace Common
diff --git a/src/common/thread.h b/src/common/thread.h
index eaf1ba00cc..a45728e1e9 100644
--- a/src/common/thread.h
+++ b/src/common/thread.h
@@ -24,6 +24,25 @@
 #include <unistd.h>
 #endif
 
+// Support for C++11's thread_local keyword was surprisingly spotty in compilers until very
+// recently. Fortunately, thread local variables have been well supported for compilers for a while,
+// but with semantics supporting only POD types, so we can use a few defines to get some amount of
+// backwards compat support.
+// WARNING: This only works correctly with POD types.
+#if defined(__clang__)
+#   if !__has_feature(cxx_thread_local)
+#       define thread_local __thread
+#   endif
+#elif defined(__GNUC__)
+#   if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 8)
+#       define thread_local __thread
+#   endif
+#elif defined(_MSC_VER)
+#   if _MSC_VER < 1900
+#       define thread_local __declspec(thread)
+#   endif
+#endif
+
 namespace Common
 {
 
diff --git a/src/core/arm/dyncom/arm_dyncom_interpreter.cpp b/src/core/arm/dyncom/arm_dyncom_interpreter.cpp
index d8a708b9e9..c3dba88821 100644
--- a/src/core/arm/dyncom/arm_dyncom_interpreter.cpp
+++ b/src/core/arm/dyncom/arm_dyncom_interpreter.cpp
@@ -9,6 +9,7 @@
 #include <unordered_map>
 
 #include "common/logging/log.h"
+#include "common/profiler.h"
 
 #include "core/mem_map.h"
 #include "core/hle/hle.h"
@@ -20,6 +21,9 @@
 #include "core/arm/skyeye_common/armmmu.h"
 #include "core/arm/skyeye_common/vfp/vfp.h"
 
+Common::Profiling::TimingCategory profile_execute("DynCom::Execute");
+Common::Profiling::TimingCategory profile_decode("DynCom::Decode");
+
 enum {
     COND            = (1 << 0),
     NON_BRANCH      = (1 << 1),
@@ -3569,6 +3573,8 @@ typedef struct instruction_set_encoding_item ISEITEM;
 extern const ISEITEM arm_instruction[];
 
 static int InterpreterTranslate(ARMul_State* cpu, int& bb_start, addr_t addr) {
+    Common::Profiling::ScopeTimer timer_decode(profile_decode);
+
     // Decode instruction, get index
     // Allocate memory and init InsCream
     // Go on next, until terminal instruction
@@ -3641,6 +3647,8 @@ static bool InAPrivilegedMode(ARMul_State* core) {
 }
 
 unsigned InterpreterMainLoop(ARMul_State* state) {
+    Common::Profiling::ScopeTimer timer_execute(profile_execute);
+
     #undef RM
     #undef RS
 
diff --git a/src/core/hle/hle.cpp b/src/core/hle/hle.cpp
index b0066e15ea..c925279da7 100644
--- a/src/core/hle/hle.cpp
+++ b/src/core/hle/hle.cpp
@@ -4,6 +4,8 @@
 
 #include <vector>
 
+#include "common/profiler.h"
+
 #include "core/arm/arm_interface.h"
 #include "core/mem_map.h"
 #include "core/hle/hle.h"
@@ -19,6 +21,8 @@
 
 namespace HLE {
 
+Common::Profiling::TimingCategory profiler_svc("SVC Calls");
+
 static std::vector<ModuleDef> g_module_db;
 
 bool g_reschedule = false;  ///< If true, immediately reschedules the CPU to a new thread
@@ -33,6 +37,8 @@ static const FunctionDef* GetSVCInfo(u32 opcode) {
 }
 
 void CallSVC(u32 opcode) {
+    Common::Profiling::ScopeTimer timer_svc(profiler_svc);
+
     const FunctionDef *info = GetSVCInfo(opcode);
 
     if (!info) {
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index 586ad62b6d..e031871e81 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -4,6 +4,8 @@
 
 #include <boost/range/algorithm/fill.hpp>
 
+#include "common/profiler.h"
+
 #include "clipper.h"
 #include "command_processor.h"
 #include "math.h"
@@ -25,6 +27,8 @@ static int float_regs_counter = 0;
 
 static u32 uniform_write_buffer[4];
 
+Common::Profiling::TimingCategory category_drawing("Drawing");
+
 static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
 
     if (id >= registers.NumIds())
@@ -53,6 +57,8 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
         case PICA_REG_INDEX(trigger_draw):
         case PICA_REG_INDEX(trigger_draw_indexed):
         {
+            Common::Profiling::ScopeTimer scope_timer(category_drawing);
+
             DebugUtils::DumpTevStageConfig(registers.GetTevStages());
 
             if (g_debug_context)
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index 2726951740..db7538ddd8 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -4,7 +4,10 @@
 
 #include "core/hw/gpu.h"
 #include "core/mem_map.h"
+
 #include "common/emu_window.h"
+#include "common/profiler_reporting.h"
+
 #include "video_core/video_core.h"
 #include "video_core/renderer_opengl/renderer_opengl.h"
 #include "video_core/renderer_opengl/gl_shader_util.h"
@@ -75,9 +78,18 @@ void RendererOpenGL::SwapBuffers() {
 
     DrawScreens();
 
+    auto& profiler = Common::Profiling::GetProfilingManager();
+    profiler.FinishFrame();
+    {
+        auto aggregator = Common::Profiling::GetTimingResultsAggregator();
+        aggregator->AddFrame(profiler.GetPreviousFrameResults());
+    }
+
     // Swap buffers
     render_window->PollEvents();
     render_window->SwapBuffers();
+
+    profiler.BeginFrame();
 }
 
 /**

From dc8a3f8bc842df1b3eeeb5a283556ac644ab3183 Mon Sep 17 00:00:00 2001
From: Yuri Kunde Schlesner <yuriks@yuriks.net>
Date: Sun, 15 Feb 2015 15:49:27 -0200
Subject: [PATCH 2/2] Profiler: Implement QPCClock to get better precision on
 Win32

MSVC 2013 (at least) doesn't use QueryPerformanceCounter to implement
std::chrono::high_resolution_clock, so it has bad precision. Manually
implementing our own clock type using it works around this for now.
---
 src/common/profiler.cpp | 23 +++++++++++++++++++++++
 src/common/profiler.h   | 20 +++++++++++++++++++-
 2 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/src/common/profiler.cpp b/src/common/profiler.cpp
index c37546af00..65c3df167d 100644
--- a/src/common/profiler.cpp
+++ b/src/common/profiler.cpp
@@ -6,6 +6,12 @@
 #include "common/profiler_reporting.h"
 #include "common/assert.h"
 
+#if defined(_MSC_VER) && _MSC_VER <= 1800 // MSVC 2013.
+#define NOMINMAX
+#define WIN32_LEAN_AND_MEAN
+#include <Windows.h> // For QueryPerformanceCounter/Frequency
+#endif
+
 namespace Common {
 namespace Profiling {
 
@@ -13,6 +19,23 @@ namespace Profiling {
 thread_local Timer* Timer::current_timer = nullptr;
 #endif
 
+#if defined(_MSC_VER) && _MSC_VER <= 1800 // MSVC 2013
+QPCClock::time_point QPCClock::now() {
+    static LARGE_INTEGER freq;
+    // Use this dummy local static to ensure this gets initialized once.
+    static BOOL dummy = QueryPerformanceFrequency(&freq);
+
+    LARGE_INTEGER ticks;
+    QueryPerformanceCounter(&ticks);
+
+    // This is prone to overflow when multiplying, which is why I'm using micro instead of nano. The
+    // correct way to approach this would be to just return ticks as a time_point and then subtract
+    // and do this conversion when creating a duration from two time_points, however, as far as I
+    // could tell the C++ requirements for these types are incompatible with this approach.
+    return time_point(duration(ticks.QuadPart * std::micro::den / freq.QuadPart));
+}
+#endif
+
 TimingCategory::TimingCategory(const char* name, TimingCategory* parent)
         : accumulated_duration(0) {
 
diff --git a/src/common/profiler.h b/src/common/profiler.h
index 53c4f6eaf2..3e967b4bce 100644
--- a/src/common/profiler.h
+++ b/src/common/profiler.h
@@ -18,8 +18,26 @@ namespace Profiling {
 #define ENABLE_PROFILING 1
 #endif
 
-using Duration = std::chrono::nanoseconds;
+#if defined(_MSC_VER) && _MSC_VER <= 1800 // MSVC 2013
+// MSVC up to 2013 doesn't use QueryPerformanceCounter for high_resolution_clock, so it has bad
+// precision. We manually implement a clock based on QPC to get good results.
+
+struct QPCClock {
+    using duration = std::chrono::microseconds;
+    using time_point = std::chrono::time_point<QPCClock>;
+    using rep = duration::rep;
+    using period = duration::period;
+    static const bool is_steady = false;
+
+    static time_point now();
+};
+
+using Clock = QPCClock;
+#else
 using Clock = std::chrono::high_resolution_clock;
+#endif
+
+using Duration = Clock::duration;
 
 /**
  * Represents a timing category that measured time can be accounted towards. Should be declared as a