diff --git a/src/citra/default_ini.h b/src/citra/default_ini.h
index 7352c70c24..557da881b1 100644
--- a/src/citra/default_ini.h
+++ b/src/citra/default_ini.h
@@ -28,7 +28,7 @@ pad_sright =
 
 [Core]
 cpu_core = ## 0: Interpreter (default), 1: FastInterpreter (experimental)
-gpu_refresh_rate = ## 60 (default), 1024 or 2048 may work better on the FastInterpreter
+gpu_refresh_rate = ## 60 (default)
 
 [Data Storage]
 use_virtual_sd =
diff --git a/src/citra_qt/bootmanager.cpp b/src/citra_qt/bootmanager.cpp
index 8f3799351b..20824692d8 100644
--- a/src/citra_qt/bootmanager.cpp
+++ b/src/citra_qt/bootmanager.cpp
@@ -33,19 +33,16 @@ void EmuThread::run()
     stop_run = false;
     while (!stop_run)
     {
-        for (int tight_loop = 0; tight_loop < 10000; ++tight_loop)
+        if (cpu_running)
         {
-            if (cpu_running || exec_cpu_step)
-            {
-                if (exec_cpu_step)
-                    exec_cpu_step = false;
-
-                Core::SingleStep();
-                if (!cpu_running) {
-                    emit CPUStepped();
-                    yieldCurrentThread();
-                }
-            }
+            Core::RunLoop();
+        } 
+        else if (exec_cpu_step)
+        {
+            exec_cpu_step = false;
+            Core::SingleStep();
+            emit CPUStepped();
+            yieldCurrentThread();
         }
     }
     render_window->moveContext();
diff --git a/src/core/arm/dyncom/arm_dyncom.cpp b/src/core/arm/dyncom/arm_dyncom.cpp
index 669b612fc4..a3ed3e31e5 100644
--- a/src/core/arm/dyncom/arm_dyncom.cpp
+++ b/src/core/arm/dyncom/arm_dyncom.cpp
@@ -60,7 +60,7 @@ void ARM_DynCom::SetPC(u32 pc) {
  * @return Returns current PC
  */
 u32 ARM_DynCom::GetPC() const {
-    return state->pc;
+    return state->Reg[15];
 }
 
 /**
@@ -110,9 +110,12 @@ u64 ARM_DynCom::GetTicks() const {
  * @param num_instructions Number of instructions to executes
  */
 void ARM_DynCom::ExecuteInstructions(int num_instructions) {
-    ticks += num_instructions;
     state->NumInstrsToExecute = num_instructions;
-    InterpreterMainLoop(state.get());
+
+    // Dyncom only breaks on instruction dispatch. This only happens on every instruction when
+    // executing one instruction at a time. Otherwise, if a block is being executed, more 
+    // instructions may actually be executed than specified.
+    ticks += InterpreterMainLoop(state.get());
 }
 
 /**
@@ -126,7 +129,7 @@ void ARM_DynCom::SaveContext(ThreadContext& ctx) {
 
     ctx.sp = state->Reg[13];
     ctx.lr = state->Reg[14];
-    ctx.pc = state->pc;
+    ctx.pc = state->Reg[15];
     ctx.cpsr = state->Cpsr;
 
     ctx.fpscr = state->VFP[1];
diff --git a/src/core/arm/dyncom/arm_dyncom_interpreter.cpp b/src/core/arm/dyncom/arm_dyncom_interpreter.cpp
index fe1501b59c..f899e2e8a3 100644
--- a/src/core/arm/dyncom/arm_dyncom_interpreter.cpp
+++ b/src/core/arm/dyncom/arm_dyncom_interpreter.cpp
@@ -3718,7 +3718,7 @@ static bool InAPrivilegedMode(arm_core_t *core)
 }
 
 /* r15 = r15 + 8 */
-void InterpreterMainLoop(ARMul_State* state)
+unsigned InterpreterMainLoop(ARMul_State* state)
 {
 	#define CRn				inst_cream->crn
 	#define OPCODE_2			inst_cream->opcode_2
@@ -3747,16 +3747,22 @@ void InterpreterMainLoop(ARMul_State* state)
 	#endif
 
 	#define FETCH_INST			if (inst_base->br != NON_BRANCH)                                   \
-							goto PROFILING;                                             \
+							goto DISPATCH;                                             \
 						inst_base = (arm_inst *)&inst_buf[ptr]                             
 #define INC_PC(l)			ptr += sizeof(arm_inst) + l
 
 // GCC and Clang have a C++ extension to support a lookup table of labels. Otherwise, fallback to a
 // clunky switch statement.
 #if defined __GNUC__ || defined __clang__
-#define GOTO_NEXT_INST			goto *InstLabel[inst_base->idx]
+#define GOTO_NEXT_INST \
+    if (num_instrs >= cpu->NumInstrsToExecute) goto END; \
+    num_instrs++; \
+    goto *InstLabel[inst_base->idx]
 #else
-#define GOTO_NEXT_INST switch(inst_base->idx) { \
+#define GOTO_NEXT_INST \
+    if (num_instrs >= cpu->NumInstrsToExecute) goto END; \
+    num_instrs++; \
+    switch(inst_base->idx) { \
     case 0: goto VMLA_INST; \
     case 1: goto VMLS_INST; \
     case 2: goto VNMLA_INST; \
@@ -4028,20 +4034,15 @@ void InterpreterMainLoop(ARMul_State* state)
 	unsigned int addr;
 	unsigned int phys_addr;
 	unsigned int last_pc = 0;
+	unsigned int num_instrs = 0;
 	fault_t fault;
 	static unsigned int last_physical_base = 0, last_logical_base = 0;
 	int ptr;
+	bool single_step = (cpu->NumInstrsToExecute == 1);
 
 	LOAD_NZCVT;
 	DISPATCH:
 	{
-        if (cpu->NumInstrsToExecute == 0)
-            return;
-
-        cpu->NumInstrsToExecute--;
-
-        //NOTICE_LOG(ARM11, "instr!");
-
 		if (!cpu->NirqSig) {
                 	if (!(cpu->Cpsr & 0x80)) {
 				goto END;
@@ -4179,10 +4180,6 @@ void InterpreterMainLoop(ARMul_State* state)
 		inst_base = (arm_inst *)&inst_buf[ptr];
 		GOTO_NEXT_INST;
 	}
-	PROFILING:
-	{
-		goto DISPATCH;
-	}
 	ADC_INST:
 	{
 		INC_ICOUNTER;
@@ -4207,7 +4204,7 @@ void InterpreterMainLoop(ARMul_State* state)
 			}
 			if (inst_cream->Rd == 15) {
 				INC_PC(sizeof(adc_inst));
-				goto PROFILING;
+				goto DISPATCH;
 			}
 		}
 		cpu->Reg[15] += GET_INST_SIZE(cpu);
@@ -4241,7 +4238,7 @@ void InterpreterMainLoop(ARMul_State* state)
 			}
 			if (inst_cream->Rd == 15) {
 				INC_PC(sizeof(add_inst));
-				goto PROFILING;
+				goto DISPATCH;
 			}
 		}
 		cpu->Reg[15] += GET_INST_SIZE(cpu);
@@ -4272,7 +4269,7 @@ void InterpreterMainLoop(ARMul_State* state)
 			}
 			if (inst_cream->Rd == 15) {
 				INC_PC(sizeof(and_inst));
-				goto PROFILING;
+				goto DISPATCH;
 			}
 		}
 		cpu->Reg[15] += GET_INST_SIZE(cpu);
@@ -4290,11 +4287,11 @@ void InterpreterMainLoop(ARMul_State* state)
 			}
 			SET_PC;
 			INC_PC(sizeof(bbl_inst));
-			goto PROFILING;
+			goto DISPATCH;
 		}
 		cpu->Reg[15] += GET_INST_SIZE(cpu);
 		INC_PC(sizeof(bbl_inst));
-		goto PROFILING;
+		goto DISPATCH;
 	}
 	BIC_INST:
 	{
@@ -4322,7 +4319,7 @@ void InterpreterMainLoop(ARMul_State* state)
 			}
 			if (inst_cream->Rd == 15) {
 				INC_PC(sizeof(bic_inst));
-				goto PROFILING;
+				goto DISPATCH;
 			}
 		}
 		cpu->Reg[15] += GET_INST_SIZE(cpu);
@@ -4358,12 +4355,12 @@ void InterpreterMainLoop(ARMul_State* state)
 				//DEBUG_MSG;
 			}
 			INC_PC(sizeof(blx_inst));
-			goto PROFILING;
+			goto DISPATCH;
 		}
 		cpu->Reg[15] += GET_INST_SIZE(cpu);
 //		INC_PC(sizeof(bx_inst));
 		INC_PC(sizeof(blx_inst));
-		goto PROFILING;
+		goto DISPATCH;
 	}
 	BX_INST:
 	{
@@ -4376,12 +4373,12 @@ void InterpreterMainLoop(ARMul_State* state)
 			cpu->Reg[15] = cpu->Reg[inst_cream->Rm] & 0xfffffffe;
 //			cpu->TFlag = cpu->Reg[inst_cream->Rm] & 0x1;
 			INC_PC(sizeof(bx_inst));
-			goto PROFILING;
+			goto DISPATCH;
 		}
 		cpu->Reg[15] += GET_INST_SIZE(cpu);
 //		INC_PC(sizeof(bx_inst));
 		INC_PC(sizeof(bx_inst));
-		goto PROFILING;
+		goto DISPATCH;
 	}
 	BXJ_INST:
 	CDP_INST:
@@ -4393,7 +4390,8 @@ void InterpreterMainLoop(ARMul_State* state)
 			#define CP_ACCESS_ALLOW 0
 			if(CP_ACCESS_ALLOW){
 				/* undefined instruction here */
-				return;
+				cpu->NumInstrsToExecute = 0;
+				return num_instrs;
 			}
 			ERROR_LOG(ARM11, "CDP insn inst=0x%x, pc=0x%x\n", inst_cream->inst, cpu->Reg[15]);
 			unsigned cpab = (cpu->CDP[inst_cream->cp_num]) (cpu, ARMul_FIRST, inst_cream->inst);
@@ -4522,7 +4520,7 @@ void InterpreterMainLoop(ARMul_State* state)
 //			RD = RM;
 			if ((inst_cream->Rd == 15)) {
 				INC_PC(sizeof(mov_inst));
-				goto PROFILING;
+				goto DISPATCH;
 			}
 		}
 //		DEBUG_LOG(ARM11, "cpy inst %x\n", cpu->Reg[15]);
@@ -4558,7 +4556,7 @@ void InterpreterMainLoop(ARMul_State* state)
 			}
 			if (inst_cream->Rd == 15) {
 				INC_PC(sizeof(eor_inst));
-				goto PROFILING;
+				goto DISPATCH;
 			}
 		}
 		cpu->Reg[15] += GET_INST_SIZE(cpu);
@@ -4717,7 +4715,7 @@ void InterpreterMainLoop(ARMul_State* state)
  			}
 			if (BIT(inst, 15)) {
 				INC_PC(sizeof(ldst_inst));
-				goto PROFILING;
+				goto DISPATCH;
 			}
 		}
 		cpu->Reg[15] += GET_INST_SIZE(cpu);
@@ -4764,7 +4762,7 @@ void InterpreterMainLoop(ARMul_State* state)
 				cpu->TFlag = value & 0x1;
 				cpu->Reg[15] &= 0xFFFFFFFE;
 				INC_PC(sizeof(ldst_inst));
-				goto PROFILING;
+				goto DISPATCH;
 			}
 		//}
 		cpu->Reg[15] += GET_INST_SIZE(cpu);
@@ -4794,7 +4792,7 @@ void InterpreterMainLoop(ARMul_State* state)
 				cpu->TFlag = value & 0x1;
 				cpu->Reg[15] &= 0xFFFFFFFE;
 				INC_PC(sizeof(ldst_inst));
-				goto PROFILING;
+				goto DISPATCH;
 			}
 		}
 		cpu->Reg[15] += GET_INST_SIZE(cpu);
@@ -4848,7 +4846,7 @@ void InterpreterMainLoop(ARMul_State* state)
 			cpu->Reg[BITS(inst_cream->inst, 12, 15)] = value;
 			if (BITS(inst_cream->inst, 12, 15) == 15) {
 				INC_PC(sizeof(ldst_inst));
-				goto PROFILING;
+				goto DISPATCH;
 			}
 		}
 		cpu->Reg[15] += GET_INST_SIZE(cpu);
@@ -4869,7 +4867,7 @@ void InterpreterMainLoop(ARMul_State* state)
 			cpu->Reg[BITS(inst_cream->inst, 12, 15)] = value;
 			if (BITS(inst_cream->inst, 12, 15) == 15) {
 				INC_PC(sizeof(ldst_inst));
-				goto PROFILING;
+				goto DISPATCH;
 			}
 		}
 		cpu->Reg[15] += GET_INST_SIZE(cpu);
@@ -4926,7 +4924,7 @@ void InterpreterMainLoop(ARMul_State* state)
 			cpu->Reg[BITS(inst_cream->inst, 12, 15)] = value;
 			if (BITS(inst_cream->inst, 12, 15) == 15) {
 				INC_PC(sizeof(ldst_inst));
-				goto PROFILING;
+				goto DISPATCH;
 			}
 		}
 		cpu->Reg[15] += GET_INST_SIZE(cpu);
@@ -4953,7 +4951,7 @@ void InterpreterMainLoop(ARMul_State* state)
 			cpu->Reg[BITS(inst_cream->inst, 12, 15)] = value;
 			if (BITS(inst_cream->inst, 12, 15) == 15) {
 				INC_PC(sizeof(ldst_inst));
-				goto PROFILING;
+				goto DISPATCH;
 			}
 		}
 		cpu->Reg[15] += GET_INST_SIZE(cpu);
@@ -4980,7 +4978,7 @@ void InterpreterMainLoop(ARMul_State* state)
 			cpu->Reg[BITS(inst_cream->inst, 12, 15)] = value;
 			if (BITS(inst_cream->inst, 12, 15) == 15) {
 				INC_PC(sizeof(ldst_inst));
-				goto PROFILING;
+				goto DISPATCH;
 			}
 		}
 		cpu->Reg[15] += GET_INST_SIZE(cpu);
@@ -5006,7 +5004,7 @@ void InterpreterMainLoop(ARMul_State* state)
 			cpu->Reg[BITS(inst_cream->inst, 12, 15)] = value;
 			if (BITS(inst_cream->inst, 12, 15) == 15) {
 				INC_PC(sizeof(ldst_inst));
-				goto PROFILING;
+				goto DISPATCH;
 			}
 		}
 		cpu->Reg[15] += GET_INST_SIZE(cpu);
@@ -5031,7 +5029,7 @@ void InterpreterMainLoop(ARMul_State* state)
 			cpu->Reg[BITS(inst_cream->inst, 12, 15)] = value;
 			if (BITS(inst_cream->inst, 12, 15) == 15) {
 				INC_PC(sizeof(ldst_inst));
-				goto PROFILING;
+				goto DISPATCH;
 			}
 		}
 		cpu->Reg[15] += GET_INST_SIZE(cpu);
@@ -5058,7 +5056,7 @@ void InterpreterMainLoop(ARMul_State* state)
 
 			if (BITS(inst_cream->inst, 12, 15) == 15) {
 				INC_PC(sizeof(ldst_inst));
-				goto PROFILING;
+				goto DISPATCH;
 			}
 		}
 		cpu->Reg[15] += GET_INST_SIZE(cpu);
@@ -5228,7 +5226,7 @@ void InterpreterMainLoop(ARMul_State* state)
 			}
 			if (inst_cream->Rd == 15) {
 				INC_PC(sizeof(mla_inst));
-				goto PROFILING;
+				goto DISPATCH;
 			}
 		}
 		cpu->Reg[15] += GET_INST_SIZE(cpu);
@@ -5260,7 +5258,7 @@ void InterpreterMainLoop(ARMul_State* state)
 			}
 			if (inst_cream->Rd == 15) {
 				INC_PC(sizeof(mov_inst));
-				goto PROFILING;
+				goto DISPATCH;
 			}
 //				return;
 		}
@@ -5422,7 +5420,7 @@ void InterpreterMainLoop(ARMul_State* state)
 			}
 			if (inst_cream->Rd == 15) {
 				INC_PC(sizeof(mul_inst));
-				goto PROFILING;
+				goto DISPATCH;
 			}
 		}
 		cpu->Reg[15] += GET_INST_SIZE(cpu);
@@ -5451,7 +5449,7 @@ void InterpreterMainLoop(ARMul_State* state)
 			}
 			if (inst_cream->Rd == 15) {
 				INC_PC(sizeof(mvn_inst));
-				goto PROFILING;
+				goto DISPATCH;
 			}
 		}
 		cpu->Reg[15] += GET_INST_SIZE(cpu);
@@ -5483,7 +5481,7 @@ void InterpreterMainLoop(ARMul_State* state)
 			}
 			if (inst_cream->Rd == 15) {
 				INC_PC(sizeof(orr_inst));
-				goto PROFILING;
+				goto DISPATCH;
 			}
 		}
 		cpu->Reg[15] += GET_INST_SIZE(cpu);
@@ -5575,7 +5573,7 @@ void InterpreterMainLoop(ARMul_State* state)
 			}
 			if (inst_cream->Rd == 15) {
 				INC_PC(sizeof(rsb_inst));
-				goto PROFILING;
+				goto DISPATCH;
 			}
 		}
 		cpu->Reg[15] += GET_INST_SIZE(cpu);
@@ -5612,7 +5610,7 @@ void InterpreterMainLoop(ARMul_State* state)
 			}
 			if (inst_cream->Rd == 15) {
 				INC_PC(sizeof(rsc_inst));
-				goto PROFILING;
+				goto DISPATCH;
 			}
 		}
 		cpu->Reg[15] += GET_INST_SIZE(cpu);
@@ -5653,7 +5651,7 @@ void InterpreterMainLoop(ARMul_State* state)
 			}
 			if (inst_cream->Rd == 15) {
 				INC_PC(sizeof(sbc_inst));
-				goto PROFILING;
+				goto DISPATCH;
 			}
 		}
 		cpu->Reg[15] += GET_INST_SIZE(cpu);
@@ -6066,7 +6064,7 @@ void InterpreterMainLoop(ARMul_State* state)
 		}
 		cpu->Reg[15] += GET_INST_SIZE(cpu);
 		//if (BITS(inst_cream->inst, 12, 15) == 15)
-		//	goto PROFILING;
+		//	goto DISPATCH;
 		INC_PC(sizeof(ldst_inst));
 		FETCH_INST;
 		GOTO_NEXT_INST;
@@ -6175,7 +6173,7 @@ void InterpreterMainLoop(ARMul_State* state)
 		}
 		cpu->Reg[15] += GET_INST_SIZE(cpu);
 		//if (BITS(inst_cream->inst, 12, 15) == 15)
-		//	goto PROFILING;
+		//	goto DISPATCH;
 		INC_PC(sizeof(ldst_inst));
 		FETCH_INST;
 		GOTO_NEXT_INST;
@@ -6225,7 +6223,7 @@ void InterpreterMainLoop(ARMul_State* state)
 			}
 			if (inst_cream->Rd == 15) {
 				INC_PC(sizeof(sub_inst));
-				goto PROFILING;
+				goto DISPATCH;
 			}
 		}
 		cpu->Reg[15] += GET_INST_SIZE(cpu);
@@ -6449,7 +6447,7 @@ void InterpreterMainLoop(ARMul_State* state)
 		cpu->Reg[15] = cpu->Reg[15] + 4 + inst_cream->imm;
 		//DEBUG_LOG(ARM11, " BL_1_THUMB: imm=0x%x, r14=0x%x, r15=0x%x\n", inst_cream->imm, cpu->Reg[14], cpu->Reg[15]);
 		INC_PC(sizeof(b_2_thumb));
-		goto PROFILING;
+		goto DISPATCH;
 	}
 	B_COND_THUMB:
 	{
@@ -6461,7 +6459,7 @@ void InterpreterMainLoop(ARMul_State* state)
 			cpu->Reg[15] += 2;
 		//DEBUG_LOG(ARM11, " B_COND_THUMB: imm=0x%x, r15=0x%x\n", inst_cream->imm, cpu->Reg[15]);
 		INC_PC(sizeof(b_cond_thumb));
-		goto PROFILING;
+		goto DISPATCH;
 	}
 	BL_1_THUMB:
 	{
@@ -6487,7 +6485,7 @@ void InterpreterMainLoop(ARMul_State* state)
 		cpu->Reg[14] = tmp;
 		//DEBUG_LOG(ARM11, " BL_2_THUMB: imm=0x%x, r14=0x%x, r15=0x%x\n", inst_cream->imm, cpu->Reg[14], cpu->Reg[15]);
 		INC_PC(sizeof(bl_2_thumb));
-		goto PROFILING;
+		goto DISPATCH;
 	}
 	BLX_1_THUMB:
 	{	
@@ -6503,7 +6501,7 @@ void InterpreterMainLoop(ARMul_State* state)
 		cpu->TFlag = 0;
 		//DEBUG_LOG(ARM11, "In BLX_1_THUMB, BLX(1),imm=0x%x,r14=0x%x, r15=0x%x, \n", inst_cream->imm, cpu->Reg[14], cpu->Reg[15]);
 		INC_PC(sizeof(blx_1_thumb));
-		goto PROFILING;
+		goto DISPATCH;
 	}
 
 	UQADD16_INST:
@@ -6532,12 +6530,14 @@ void InterpreterMainLoop(ARMul_State* state)
 		cpu->AbortAddr = addr;
 		cpu->CP15[CP15(CP15_FAULT_STATUS)] = fault & 0xff;
 		cpu->CP15[CP15(CP15_FAULT_ADDRESS)] = addr;
-		return;
+		cpu->NumInstrsToExecute = 0;
+		return num_instrs;
 	}
 	END:
 	{
 		SAVE_NZCVT;
-		return;
+		cpu->NumInstrsToExecute = 0;
+		return num_instrs;
 	}
 	INIT_INST_LENGTH:
 	{
@@ -6557,7 +6557,8 @@ void InterpreterMainLoop(ARMul_State* state)
 	DEBUG_LOG(ARM11, "%llx\n", InstLabel[1]);
 	DEBUG_LOG(ARM11, "%lld\n", (char *)InstEndLabel[1] - (char *)InstLabel[1]);
 #endif
-	return;
+	cpu->NumInstrsToExecute = 0;
+	return num_instrs;
 	}
 }
 
diff --git a/src/core/arm/dyncom/arm_dyncom_interpreter.h b/src/core/arm/dyncom/arm_dyncom_interpreter.h
index d73f8f65f9..c65eb23f73 100644
--- a/src/core/arm/dyncom/arm_dyncom_interpreter.h
+++ b/src/core/arm/dyncom/arm_dyncom_interpreter.h
@@ -4,4 +4,4 @@
 
 #pragma once
 
-void InterpreterMainLoop(ARMul_State* state);
+unsigned InterpreterMainLoop(ARMul_State* state);
diff --git a/src/core/core.h b/src/core/core.h
index 872dc0cd1d..850bb0ab42 100644
--- a/src/core/core.h
+++ b/src/core/core.h
@@ -26,13 +26,13 @@ void Start();
 
 /**
  * Run the core CPU loop
- * This function loops for 100 instructions in the CPU before trying to update hardware. This is a
- * little bit faster than SingleStep, and should be pretty much equivalent. The number of
- * instructions chosen is fairly arbitrary, however a large number will more drastically affect the
- * frequency of GSP interrupts and likely break things. The point of this is to just loop in the CPU
- * for more than 1 instruction to reduce overhead and make it a little bit faster...
+ * This function runs the core for the specified number of CPU instructions before trying to update
+ * hardware. This is much faster than SingleStep (and should be equivalent), as the CPU is not
+ * required to do a full dispatch with each instruction. NOTE: the number of instructions requested
+ * is not guaranteed to run, as this will be interrupted preemptively if a hardware update is
+ * requested (e.g. on a thread switch).
  */
-void RunLoop(int tight_loop=100);
+void RunLoop(int tight_loop=1000);
 
 /// Step the CPU one instruction
 void SingleStep();