diff --git a/qemu/accel/tcg/cpu-exec.c b/qemu/accel/tcg/cpu-exec.c
index ff618cc0..74d5db29 100644
--- a/qemu/accel/tcg/cpu-exec.c
+++ b/qemu/accel/tcg/cpu-exec.c
@@ -211,13 +211,12 @@ static inline void tb_add_jump(TranslationBlock *tb, int n,
 
 static inline TranslationBlock *tb_find(CPUState *cpu,
                                         TranslationBlock *last_tb,
-                                        int tb_exit)
+                                        int tb_exit, uint32_t cf_mask)
 {
     TranslationBlock *tb;
     target_ulong cs_base, pc;
     uint32_t flags;
     bool acquired_tb_lock = false;
-    uint32_t cf_mask = curr_cflags(cpu->uc);
 
     tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
     if (tb == NULL) {
@@ -565,7 +564,21 @@ int cpu_exec(struct uc_struct *uc, CPUState *cpu)
         int tb_exit = 0;
 
         while (!cpu_handle_interrupt(cpu, &last_tb)) {
-            TranslationBlock *tb = tb_find(cpu, last_tb, tb_exit);
+            uint32_t cflags = cpu->cflags_next_tb;
+            TranslationBlock *tb;
+
+            /* When requested, use an exact setting for cflags for the next
+               execution. This is used for icount, precise smc, and stop-
+               after-access watchpoints. Since this request should never
+               have CF_INVALID set, -1 is a convenient invalid value that
+               does not require tcg headers for cpu_common_reset. */
+            if (cflags == -1) {
+                cflags = curr_cflags(cpu->uc);
+            } else {
+                cpu->cflags_next_tb = -1;
+            }
+
+            tb = tb_find(cpu, last_tb, tb_exit, cflags);
             if (!tb) {   // invalid TB due to invalid code?
                 uc->invalid_error = UC_ERR_FETCH_UNMAPPED;
                 ret = EXCP_HLT;
diff --git a/qemu/accel/tcg/translate-all.c b/qemu/accel/tcg/translate-all.c
index 0137c4f1..17a46d45 100644
--- a/qemu/accel/tcg/translate-all.c
+++ b/qemu/accel/tcg/translate-all.c
@@ -1519,13 +1519,12 @@ void tb_invalidate_phys_page_range(struct uc_struct *uc, tb_page_addr_t start, t
                                    int is_cpu_write_access)
 {
     TranslationBlock *tb, *tb_next;
-#if defined(TARGET_HAS_PRECISE_SMC)
-    CPUArchState *env = NULL;
-#endif
     tb_page_addr_t tb_start, tb_end;
     PageDesc *p;
     int n;
 #ifdef TARGET_HAS_PRECISE_SMC
+    CPUState *cpu = current_cpu;
+    CPUArchState *env = NULL;
     int current_tb_not_found = is_cpu_write_access;
     TranslationBlock *current_tb = NULL;
     int current_tb_modified = 0;
@@ -1603,11 +1602,8 @@ void tb_invalidate_phys_page_range(struct uc_struct *uc, tb_page_addr_t start, t
 #endif
 #ifdef TARGET_HAS_PRECISE_SMC
     if (current_tb_modified) {
-        /* we generate a block containing just the instruction
-           modifying the memory. It will ensure that it cannot modify
-           itself */
-        tb_gen_code(cpu, current_pc, current_cs_base, current_flags,
-                    1 | curr_cflags(uc));
+        /* Force execution of one insn next time. */
+        cpu->cflags_next_tb = 1 | curr_cflags(uc);
         cpu_loop_exit_noexc(uc->cpu);
     }
 #endif
@@ -1716,11 +1712,8 @@ static bool tb_invalidate_phys_page(tb_page_addr_t addr, uintptr_t pc)
     p->first_tb = NULL;
 #ifdef TARGET_HAS_PRECISE_SMC
     if (current_tb_modified) {
-        /* we generate a block containing just the instruction
-           modifying the memory. It will ensure that it cannot modify
-           itself */
-        tb_gen_code(cpu, current_pc, current_cs_base, current_flags,
-                    1 | curr_cflags(uc));
+        /* Force execution of one insn next time. */
+        cpu->cflags_next_tb = 1 | curr_cflags(uc);
         return true;
     }
 #endif
@@ -1814,9 +1807,7 @@ void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr)
 {
     CPUArchState *env = cpu->env_ptr;
     TranslationBlock *tb;
-    uint32_t n, cflags;
-    target_ulong pc, cs_base;
-    uint32_t flags;
+    uint32_t n;
 
     tb = tb_find_pc(env->uc, retaddr);
     if (!tb) {
@@ -1853,12 +1844,9 @@ void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr)
         cpu_abort(cpu, "TB too big during recompile");
     }
 
-    cflags = n | CF_LAST_IO;
-    cflags |= curr_cflags(cpu->uc);
-    pc = tb->pc;
-    cs_base = tb->cs_base;
-    flags = tb->flags;
-    tb_phys_invalidate(cpu->uc, tb, -1);
+    /* Adjust the execution state of the next TB. */
+    cpu->cflags_next_tb = curr_cflags(cpu->uc) | CF_LAST_IO | n;
+
     if (tb->cflags & CF_NOCACHE) {
         if (tb->orig_tb) {
             /* Invalidate original TB if this TB was generated in
@@ -1867,9 +1855,7 @@
         }
         tb_free(env->uc, tb);
     }
-    /* FIXME: In theory this could raise an exception. In practice
-       we have already translated the block once so it's probably ok. */
-    tb_gen_code(cpu, pc, cs_base, (int)flags, cflags);
+
     /* TODO: If env->pc != tb->pc (i.e. the faulting instruction was not
        the first in the TB) then we end up generating a whole new TB and
        repeating the fault, which is horribly inefficient.
diff --git a/qemu/include/qom/cpu.h b/qemu/include/qom/cpu.h
index afdce6b8..7f3f1dd4 100644
--- a/qemu/include/qom/cpu.h
+++ b/qemu/include/qom/cpu.h
@@ -278,6 +278,7 @@ struct CPUState {
     bool stop;
     bool stopped;
     bool crash_occurred;
+    uint32_t cflags_next_tb;
     bool tb_flushed;
     volatile sig_atomic_t exit_request;
    uint32_t interrupt_request;
diff --git a/qemu/qom/cpu.c b/qemu/qom/cpu.c
index f44a09af..992c1a40 100644
--- a/qemu/qom/cpu.c
+++ b/qemu/qom/cpu.c
@@ -176,6 +176,7 @@ static void cpu_common_reset(CPUState *cpu)
     cpu->can_do_io = 0;
     cpu->exception_index = -1;
     cpu->crash_occurred = false;
+    cpu->cflags_next_tb = -1;
     // TODO: Should be uncommented, but good 'ol
     // unicorn's crappy symbol deduplication