diff --git a/qemu/accel/tcg/cputlb.c b/qemu/accel/tcg/cputlb.c
index 661cfe66..80ccb6ea 100644
--- a/qemu/accel/tcg/cputlb.c
+++ b/qemu/accel/tcg/cputlb.c
@@ -60,81 +60,208 @@
     } \
 } while (0)
 
-static void tlb_flush_entry(CPUTLBEntry *tlb_entry, target_ulong addr);
-static bool tlb_is_dirty_ram(CPUTLBEntry *tlbe);
-static ram_addr_t qemu_ram_addr_from_host_nofail(struct uc_struct *uc, void *ptr);
-static void tlb_add_large_page(CPUArchState *env, target_ulong vaddr,
-                               target_ulong size);
-static void tlb_set_dirty1(CPUTLBEntry *tlb_entry, target_ulong vaddr);
+#define ALL_MMUIDX_BITS ((1 << NB_MMU_MODES) - 1)
 
 void tlb_init(CPUState *cpu)
 {
 }
 
-/* This is OK because CPU architectures generally permit an
- * implementation to drop entries from the TLB at any time, so
- * flushing more entries than required is only an efficiency issue,
- * not a correctness issue.
- */
-void tlb_flush(CPUState *cpu)
+static void tlb_flush_one_mmuidx_locked(CPUArchState *env, int mmu_idx)
 {
-    CPUArchState *env = cpu->env_ptr;
-
-    memset(env->tlb_table, -1, sizeof(env->tlb_table));
-    memset(env->tlb_v_table, -1, sizeof(env->tlb_v_table));
-    cpu_tb_jmp_cache_clear(cpu);
-
-    env->vtlb_index = 0;
-    env->tlb_flush_addr = -1;
-    env->tlb_flush_mask = 0;
+    memset(env->tlb_table[mmu_idx], -1, sizeof(env->tlb_table[0]));
+    memset(env->tlb_v_table[mmu_idx], -1, sizeof(env->tlb_v_table[0]));
+    env->tlb_d[mmu_idx].large_page_addr = -1;
+    env->tlb_d[mmu_idx].large_page_mask = -1;
+    env->tlb_d[mmu_idx].vindex = 0;
 }
 
-void tlb_flush_page(CPUState *cpu, target_ulong addr)
+static void tlb_flush_by_mmuidx_async_work(CPUState *cpu, run_on_cpu_data data)
 {
     CPUArchState *env = cpu->env_ptr;
+    unsigned long mmu_idx_bitmask = data.host_int;
     int mmu_idx;
 
-    tlb_debug("page :" TARGET_FMT_lx "\n", addr);
+    tlb_debug("mmu_idx:0x%04lx\n", mmu_idx_bitmask);
+
+    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
+        if (test_bit(mmu_idx, &mmu_idx_bitmask)) {
+            tlb_flush_one_mmuidx_locked(env, mmu_idx);
+        }
+    }
+
+    cpu_tb_jmp_cache_clear(cpu);
+}
+
+void tlb_flush_by_mmuidx(CPUState *cpu, uint16_t idxmap)
+{
+    tlb_debug("mmu_idx: 0x%" PRIx16 "\n", idxmap);
+
+    tlb_flush_by_mmuidx_async_work(cpu, RUN_ON_CPU_HOST_INT(idxmap));
+}
+
+void tlb_flush(CPUState *cpu)
+{
+    tlb_flush_by_mmuidx(cpu, ALL_MMUIDX_BITS);
+}
+
+static inline bool tlb_hit_page_anyprot(CPUTLBEntry *tlb_entry,
+                                        target_ulong page)
+{
+    return tlb_hit_page(tlb_entry->addr_read, page) ||
+           tlb_hit_page(tlb_addr_write(tlb_entry), page) ||
+           tlb_hit_page(tlb_entry->addr_code, page);
+}
+
+/* Called with tlb_c.lock held */
+static inline bool tlb_flush_entry_locked(CPUTLBEntry *tlb_entry,
+                                          target_ulong page)
+{
+    if (tlb_hit_page_anyprot(tlb_entry, page)) {
+        memset(tlb_entry, -1, sizeof(*tlb_entry));
+        return true;
+    }
+    return false;
+}
+
+/* Called with tlb_c.lock held */
+static inline void tlb_flush_vtlb_page_locked(CPUArchState *env, int mmu_idx,
+                                              target_ulong page)
+{
+    int k;
+    //assert_cpu_is_self(ENV_GET_CPU(env));
+    for (k = 0; k < CPU_VTLB_SIZE; k++) {
+        tlb_flush_entry_locked(&env->tlb_v_table[mmu_idx][k], page);
+    }
+}
+
+static void tlb_flush_page_locked(CPUArchState *env, int midx,
+                                  target_ulong page)
+{
+    target_ulong lp_addr = env->tlb_d[midx].large_page_addr;
+    target_ulong lp_mask = env->tlb_d[midx].large_page_mask;
 
     /* Check if we need to flush due to large pages.  */
-    if ((addr & env->tlb_flush_mask) == env->tlb_flush_addr) {
-        tlb_debug("forcing full flush ("
+    if ((page & lp_mask) == lp_addr) {
+        tlb_debug("forcing full flush midx %d ("
                   TARGET_FMT_lx "/" TARGET_FMT_lx ")\n",
-                  env->tlb_flush_addr, env->tlb_flush_mask);
-
-        tlb_flush(cpu);
-        return;
+                  midx, lp_addr, lp_mask);
+        tlb_flush_one_mmuidx_locked(env, midx);
+    } else {
+        tlb_flush_entry_locked(tlb_entry(env, midx, page), page);
+        tlb_flush_vtlb_page_locked(env, midx, page);
     }
+}
+
+static void tlb_flush_page_async_work(CPUState *cpu, run_on_cpu_data data)
+{
+    CPUArchState *env = cpu->env_ptr;
+    target_ulong addr = (target_ulong) data.target_ptr;
+    int mmu_idx;
+
+    tlb_debug("page addr:" TARGET_FMT_lx "\n", addr);
 
     addr &= TARGET_PAGE_MASK;
     for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
-        tlb_flush_entry(tlb_entry(env, mmu_idx, addr), addr);
+        tlb_flush_page_locked(env, mmu_idx, addr);
     }
 
-    /* check whether there are entries that need to be flushed in the vtlb */
+    tb_flush_jmp_cache(cpu, addr);
+}
+
+/* As we are going to hijack the bottom bits of the page address for a
+ * mmuidx bit mask we need to fail to build if we can't do that
+ */
+QEMU_BUILD_BUG_ON(NB_MMU_MODES > TARGET_PAGE_BITS_MIN);
+
+static void tlb_flush_page_by_mmuidx_async_work(CPUState *cpu,
+                                                run_on_cpu_data data)
+{
+    CPUArchState *env = cpu->env_ptr;
+    target_ulong addr_and_mmuidx = (target_ulong) data.target_ptr;
+    target_ulong addr = addr_and_mmuidx & TARGET_PAGE_MASK;
+    unsigned long mmu_idx_bitmap = addr_and_mmuidx & ALL_MMUIDX_BITS;
+    int mmu_idx;
+
+    tlb_debug("page addr:" TARGET_FMT_lx " mmu_map:0x%lx\n",
+              addr, mmu_idx_bitmap);
+
     for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
-        int k;
-        for (k = 0; k < CPU_VTLB_SIZE; k++) {
-            tlb_flush_entry(&env->tlb_v_table[mmu_idx][k], addr);
+        if (test_bit(mmu_idx, &mmu_idx_bitmap)) {
+            tlb_flush_page_locked(env, mmu_idx, addr);
         }
     }
 
     tb_flush_jmp_cache(cpu, addr);
 }
 
-void tlb_reset_dirty_range(CPUTLBEntry *tlb_entry, uintptr_t start,
-                           uintptr_t length)
+void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, uint16_t idxmap)
 {
-    uintptr_t addr;
+    target_ulong addr_and_mmu_idx;
 
-    if (tlb_is_dirty_ram(tlb_entry)) {
-        addr = (tlb_addr_write(tlb_entry) & TARGET_PAGE_MASK) + tlb_entry->addend;
+    tlb_debug("addr: "TARGET_FMT_lx" mmu_idx:%" PRIx16 "\n", addr, idxmap);
+
+    /* This should already be page aligned */
+    addr_and_mmu_idx = addr & TARGET_PAGE_MASK;
+    addr_and_mmu_idx |= idxmap;
+
+    tlb_flush_page_by_mmuidx_async_work(
+        cpu, RUN_ON_CPU_TARGET_PTR(addr_and_mmu_idx));
+}
+
+void tlb_flush_page(CPUState *cpu, target_ulong addr)
+{
+    tlb_flush_page_async_work(cpu, RUN_ON_CPU_TARGET_PTR(addr));
+}
+
+/*
+ * Dirty write flag handling
+ *
+ * When the TCG code writes to a location it looks up the address in
+ * the TLB and uses that data to compute the final address. If any of
+ * the lower bits of the address are set then the slow path is forced.
+ * There are a number of reasons to do this but for normal RAM the
+ * most usual is detecting writes to code regions which may invalidate
+ * generated code.
+ *
+ * Other vCPUs might be reading their TLBs during guest execution, so we update
+ * te->addr_write with atomic_set. We don't need to worry about this for
+ * oversized guests as MTTCG is disabled for them.
+ *
+ * Called with tlb_c.lock held.
+ */
+static void tlb_reset_dirty_range_locked(CPUTLBEntry *tlb_entry,
+                                         uintptr_t start, uintptr_t length)
+{
+    uintptr_t addr = tlb_entry->addr_write;
+
+    if ((addr & (TLB_INVALID_MASK | TLB_MMIO | TLB_NOTDIRTY)) == 0) {
+        addr &= TARGET_PAGE_MASK;
+        addr += tlb_entry->addend;
         if ((addr - start) < length) {
+#if TCG_OVERSIZED_GUEST
             tlb_entry->addr_write |= TLB_NOTDIRTY;
+#else
+            atomic_set(&tlb_entry->addr_write,
+                       tlb_entry->addr_write | TLB_NOTDIRTY);
+#endif
         }
     }
 }
 
+/*
+ * Called with tlb_c.lock held.
+ * Called only from the vCPU context, i.e. the TLB's owner thread.
+ */
+static inline void copy_tlb_helper_locked(CPUTLBEntry *d, const CPUTLBEntry *s)
+{
+    *d = *s;
+}
+
+/* This is a cross vCPU call (i.e. another vCPU resetting the flags of
+ * the target vCPU).
+ * We must take tlb_c.lock to avoid racing with another vCPU update. The only
+ * thing actually updated is the target TLB entry ->addr_write flags.
+ */
 void tlb_reset_dirty(CPUState *cpu, ram_addr_t start1, ram_addr_t length)
 {
     CPUArchState *env;
@@ -146,17 +273,26 @@ void tlb_reset_dirty(CPUState *cpu, ram_addr_t start1, ram_addr_t length)
         unsigned int i;
 
         for (i = 0; i < CPU_TLB_SIZE; i++) {
-            tlb_reset_dirty_range(&env->tlb_table[mmu_idx][i],
-                                  start1, length);
+            tlb_reset_dirty_range_locked(&env->tlb_table[mmu_idx][i], start1,
+                                         length);
         }
 
         for (i = 0; i < CPU_VTLB_SIZE; i++) {
-            tlb_reset_dirty_range(&env->tlb_v_table[mmu_idx][i],
-                                  start1, length);
+            tlb_reset_dirty_range_locked(&env->tlb_v_table[mmu_idx][i], start1,
+                                         length);
         }
     }
 }
 
+/* Called with tlb_c.lock held */
+static inline void tlb_set_dirty1_locked(CPUTLBEntry *tlb_entry,
+                                         target_ulong vaddr)
+{
+    if (tlb_entry->addr_write == (vaddr | TLB_NOTDIRTY)) {
+        tlb_entry->addr_write = vaddr;
+    }
+}
+
 /* update the TLB corresponding to virtual page vaddr
    so that it is no longer dirty */
 void tlb_set_dirty(CPUState *cpu, target_ulong vaddr)
@@ -166,21 +302,48 @@ void tlb_set_dirty(CPUState *cpu, target_ulong vaddr)
 
     vaddr &= TARGET_PAGE_MASK;
     for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
-        tlb_set_dirty1(tlb_entry(env, mmu_idx, vaddr), vaddr);
+        tlb_set_dirty1_locked(tlb_entry(env, mmu_idx, vaddr), vaddr);
     }
 
     for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
         int k;
         for (k = 0; k < CPU_VTLB_SIZE; k++) {
-            tlb_set_dirty1(&env->tlb_v_table[mmu_idx][k], vaddr);
+            tlb_set_dirty1_locked(&env->tlb_v_table[mmu_idx][k], vaddr);
         }
     }
 }
 
+/* Our TLB does not support large pages, so remember the area covered by
+   large pages and trigger a full TLB flush if these are invalidated.  */
+static void tlb_add_large_page(CPUArchState *env, int mmu_idx,
+                               target_ulong vaddr, target_ulong size)
+{
+    target_ulong lp_addr = env->tlb_d[mmu_idx].large_page_addr;
+    target_ulong lp_mask = ~(size - 1);
+
+    if (lp_addr == (target_ulong)-1) {
+        /* No previous large page.  */
+        lp_addr = vaddr;
+    } else {
+        /* Extend the existing region to include the new page.
+           This is a compromise between unnecessary flushes and
+           the cost of maintaining a full variable size TLB.  */
+        lp_mask &= env->tlb_d[mmu_idx].large_page_mask;
+        while (((lp_addr ^ vaddr) & lp_mask) != 0) {
+            lp_mask <<= 1;
+        }
+    }
+    env->tlb_d[mmu_idx].large_page_addr = lp_addr & lp_mask;
+    env->tlb_d[mmu_idx].large_page_mask = lp_mask;
+}
+
 /* Add a new TLB entry. At most one entry for a given virtual address
-   is permitted. Only a single TARGET_PAGE_SIZE region is mapped, the
-   supplied size is only used by tlb_flush_page. */
+ * is permitted. Only a single TARGET_PAGE_SIZE region is mapped, the
+ * supplied size is only used by tlb_flush_page.
+ *
+ * Called from TCG-generated code, which is under an RCU read-side
+ * critical section.
+ */
 void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
                              hwaddr paddr, MemTxAttrs attrs, int prot,
                              int mmu_idx, target_ulong size)
@@ -194,15 +357,13 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
     CPUTLBEntry *te;
     hwaddr iotlb, xlat, sz, paddr_page;
     target_ulong vaddr_page;
-    unsigned vidx = env->vtlb_index++ % CPU_VTLB_SIZE;
+    unsigned vidx = env->tlb_d[mmu_idx].vindex++ % CPU_VTLB_SIZE;
     int asidx = cpu_asidx_from_attrs(cpu, attrs);
 
-    if (size < TARGET_PAGE_SIZE) {
+    if (size <= TARGET_PAGE_SIZE) {
         sz = TARGET_PAGE_SIZE;
     } else {
-        if (size > TARGET_PAGE_SIZE) {
-            tlb_add_large_page(env, vaddr, size);
-        }
+        tlb_add_large_page(env, mmu_idx, vaddr, size);
         sz = size;
     }
     vaddr_page = vaddr & TARGET_PAGE_MASK;
@@ -238,8 +399,8 @@ void tlb_set_page_with_attrs(CPUState *cpu, target_ulong vaddr,
     iotlb = memory_region_section_get_iotlb(cpu, section, vaddr_page,
                                             paddr_page, xlat, prot, &address);
 
-    index = (vaddr_page >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
-    te = &env->tlb_table[mmu_idx][index];
+    index = tlb_index(env, mmu_idx, vaddr_page);
+    te = tlb_entry(env, mmu_idx, vaddr_page);
 
     /* do not discard the translation in te, evict it into a victim tlb */
     env->tlb_v_table[mmu_idx][vidx] = *te;
@@ -298,7 +459,7 @@ void tlb_set_page(CPUState *cpu, target_ulong vaddr,
                             prot, mmu_idx, size);
 }
 
-static ram_addr_t qemu_ram_addr_from_host_nofail(struct uc_struct *uc, void *ptr)
+static inline ram_addr_t qemu_ram_addr_from_host_nofail(struct uc_struct *uc, void *ptr)
 {
     ram_addr_t ram_addr;
 
@@ -403,112 +564,10 @@ tb_page_addr_t get_page_addr_code(CPUArchState *env, target_ulong addr)
     }
 }
 
-static void tlb_set_dirty1(CPUTLBEntry *tlb_entry, target_ulong vaddr)
-{
-    if (tlb_addr_write(tlb_entry) == (vaddr | TLB_NOTDIRTY)) {
-        tlb_entry->addr_write = vaddr;
-    }
-}
-
-/* Our TLB does not support large pages, so remember the area covered by
-   large pages and trigger a full TLB flush if these are invalidated. */
-static void tlb_add_large_page(CPUArchState *env, target_ulong vaddr,
-                               target_ulong size)
-{
-    target_ulong mask = ~(size - 1);
-
-    if (env->tlb_flush_addr == (target_ulong)-1) {
-        env->tlb_flush_addr = vaddr & mask;
-        env->tlb_flush_mask = mask;
-        return;
-    }
-    /* Extend the existing region to include the new page.
-       This is a compromise between unnecessary flushes and the cost
-       of maintaining a full variable size TLB.  */
-    mask &= env->tlb_flush_mask;
-    while (((env->tlb_flush_addr ^ vaddr) & mask) != 0) {
-        mask <<= 1;
-    }
-    env->tlb_flush_addr &= mask;
-    env->tlb_flush_mask = mask;
-}
-
-static bool tlb_is_dirty_ram(CPUTLBEntry *tlbe)
-{
-    return (tlb_addr_write(tlbe) & (TLB_INVALID_MASK|TLB_MMIO|TLB_NOTDIRTY)) == 0;
-}
-
-static inline void v_tlb_flush_by_mmuidx(CPUState *cpu, uint16_t idxmap)
-{
-    CPUArchState *env = cpu->env_ptr;
-    unsigned long mmu_idx_bitmask = idxmap;
-    int mmu_idx;
-
-    tlb_debug("start\n");
-
-    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
-        if (test_bit(mmu_idx, &mmu_idx_bitmask)) {
-            tlb_debug("%d\n", mmu_idx);
-
-            memset(env->tlb_table[mmu_idx], -1, sizeof(env->tlb_table[0]));
-            memset(env->tlb_v_table[mmu_idx], -1, sizeof(env->tlb_v_table[0]));
-        }
-    }
-
-    cpu_tb_jmp_cache_clear(cpu);
-}
-
-void tlb_flush_by_mmuidx(CPUState *cpu, uint16_t idxmap)
-{
-    v_tlb_flush_by_mmuidx(cpu, idxmap);
-}
-
-static inline void tlb_flush_entry(CPUTLBEntry *tlb_entry, target_ulong addr)
-{
-    if (tlb_hit_page(tlb_entry->addr_read, addr) ||
-        tlb_hit_page(tlb_addr_write(tlb_entry), addr) ||
-        tlb_hit_page(tlb_entry->addr_code, addr)) {
-        memset(tlb_entry, -1, sizeof(*tlb_entry));
-    }
-}
-
-void tlb_flush_page_by_mmuidx(CPUState *cpu, target_ulong addr, uint16_t idxmap)
-{
-    CPUArchState *env = cpu->env_ptr;
-    unsigned long mmu_idx_bitmap = idxmap;
-    int i, page, mmu_idx;
-
-    tlb_debug("addr "TARGET_FMT_lx"\n", addr);
-
-    /* Check if we need to flush due to large pages. */
-    if ((addr & env->tlb_flush_mask) == env->tlb_flush_addr) {
-        tlb_debug("forced full flush ("
-                  TARGET_FMT_lx "/" TARGET_FMT_lx ")\n",
-                  env->tlb_flush_addr, env->tlb_flush_mask);
-
-        v_tlb_flush_by_mmuidx(cpu, idxmap);
-        return;
-    }
-
-    addr &= TARGET_PAGE_MASK;
-    page = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
-    for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) {
-        if (test_bit(mmu_idx, &mmu_idx_bitmap)) {
-            tlb_flush_entry(&env->tlb_table[mmu_idx][page], addr);
-            /* check whether there are vltb entries that need to be flushed */
-            for (i = 0; i < CPU_VTLB_SIZE; i++) {
-                tlb_flush_entry(&env->tlb_v_table[mmu_idx][i], addr);
-            }
-        }
-    }
-
-    tb_flush_jmp_cache(cpu, addr);
-}
-
 static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
                          int mmu_idx, target_ulong addr, uintptr_t retaddr,
-                         bool recheck, int size)
+                         bool recheck, MMUAccessType access_type, int size)
 {
     CPUState *cpu = ENV_GET_CPU(env);
     hwaddr mr_offset;
@@ -524,16 +583,16 @@ static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
          * repeat the MMU check here. This tlb_fill() call might
          * longjump out if this access should cause a guest exception.
          */
-        int index;
+        CPUTLBEntry *entry;
         target_ulong tlb_addr;
 
         tlb_fill(cpu, addr, size, MMU_DATA_LOAD, mmu_idx, retaddr);
 
-        index = tlb_index(env, mmu_idx, addr);
-        tlb_addr = env->tlb_table[mmu_idx][index].addr_read;
+        entry = tlb_entry(env, mmu_idx, addr);
+        tlb_addr = entry->addr_read;
         if (!(tlb_addr & ~(TARGET_PAGE_MASK | TLB_RECHECK))) {
             /* RAM access */
-            uintptr_t haddr = addr + env->tlb_table[mmu_idx][index].addend;
+            uintptr_t haddr = addr + entry->addend;
 
             return ldn_p((void *)haddr, size);
         }
@@ -549,6 +608,8 @@ static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
     }
 
     cpu->mem_io_vaddr = addr;
+    cpu->mem_io_access_type = access_type;
+
     r = memory_region_dispatch_read(mr, mr_offset,
                                     &val, size, iotlbentry->attrs);
     if (r != MEMTX_OK) {
@@ -556,7 +617,7 @@ static uint64_t io_readx(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
             section->offset_within_address_space -
             section->offset_within_region;
 
-        cpu_transaction_failed(cpu, physaddr, addr, size, MMU_DATA_LOAD,
+        cpu_transaction_failed(cpu, physaddr, addr, size, access_type,
                                mmu_idx, iotlbentry->attrs, r, retaddr);
     }
     return val;
@@ -580,16 +641,16 @@ static void io_writex(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
          * repeat the MMU check here. This tlb_fill() call might
          * longjump out if this access should cause a guest exception.
          */
-        int index;
+        CPUTLBEntry *entry;
         target_ulong tlb_addr;
 
        tlb_fill(cpu, addr, size, MMU_DATA_STORE, mmu_idx, retaddr);
 
-        index = tlb_index(env, mmu_idx, addr);
-        tlb_addr = env->tlb_table[mmu_idx][index].addr_write;
+        entry = tlb_entry(env, mmu_idx, addr);
+        tlb_addr = tlb_addr_write(entry);
         if (!(tlb_addr & ~(TARGET_PAGE_MASK | TLB_RECHECK))) {
             /* RAM access */
-            uintptr_t haddr = addr + env->tlb_table[mmu_idx][index].addend;
+            uintptr_t haddr = addr + entry->addend;
 
             stn_p((void *)haddr, size, val);
             return;
@@ -603,7 +664,6 @@ static void io_writex(CPUArchState *env, CPUIOTLBEntry *iotlbentry,
     if (mr != &cpu->uc->io_mem_rom && mr != &cpu->uc->io_mem_notdirty && !cpu->can_do_io) {
         cpu_io_recompile(cpu, retaddr);
     }
-    cpu->mem_io_vaddr = addr;
     cpu->mem_io_pc = retaddr;
 
     r = memory_region_dispatch_write(mr, mr_offset,
@@ -638,10 +698,13 @@ static bool victim_tlb_hit(CPUArchState *env, size_t mmu_idx, size_t index,
         if (cmp == page) {
            /* Found entry in victim tlb, swap tlb and iotlb.  */
             CPUTLBEntry tmptlb, *tlb = &env->tlb_table[mmu_idx][index];
+
+            copy_tlb_helper_locked(&tmptlb, tlb);
+            copy_tlb_helper_locked(tlb, vtlb);
+            copy_tlb_helper_locked(vtlb, &tmptlb);
+
             CPUIOTLBEntry tmpio, *io = &env->iotlb[mmu_idx][index];
             CPUIOTLBEntry *vio = &env->iotlb_v[mmu_idx][vidx];
-
-            tmptlb = *tlb; *tlb = *vtlb; *vtlb = tmptlb;
             tmpio = *io; *io = *vio; *vio = tmpio;
             return true;
         }
@@ -651,7 +714,7 @@ static bool victim_tlb_hit(CPUArchState *env, size_t mmu_idx, size_t index,
 
 /* Macro to call the above, with local variables from the use context.  */
 #define VICTIM_TLB_HIT(TY, ADDR) \
-  victim_tlb_hit(env, mmu_idx, index, offsetof(CPUTLBEntry, TY), \
+  victim_tlb_hit(env, mmu_idx, index, offsetof(CPUTLBEntry, TY),   \
                  (ADDR) & TARGET_PAGE_MASK)
 
 /* Probe for whether the specified guest write access is permitted.
@@ -715,7 +778,7 @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
             index = tlb_index(env, mmu_idx, addr);
             tlbe = tlb_entry(env, mmu_idx, addr);
         }
-        tlb_addr = tlb_addr_write(tlbe);
+        tlb_addr = tlb_addr_write(tlbe) & ~TLB_INVALID_MASK;
     }
 
     /* Check notdirty */
@@ -732,7 +795,7 @@ static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
     }
 
     /* Let the guest notice RMW on a write-only page.  */
-    if (unlikely(tlbe->addr_read != tlb_addr)) {
+    if (unlikely(tlbe->addr_read != (tlb_addr & ~TLB_NOTDIRTY))) {
         tlb_fill(ENV_GET_CPU(env), addr, 1 << s_bits, MMU_DATA_LOAD,
                  mmu_idx, retaddr);
         /* Since we don't support reads and writes to different addresses,
diff --git a/qemu/accel/tcg/softmmu_template.h b/qemu/accel/tcg/softmmu_template.h
index a5915950..0229590b 100644
--- a/qemu/accel/tcg/softmmu_template.h
+++ b/qemu/accel/tcg/softmmu_template.h
@@ -101,11 +101,12 @@ static inline DATA_TYPE glue(io_read, SUFFIX)(CPUArchState *env,
                                               size_t mmu_idx, size_t index,
                                               target_ulong addr,
                                               uintptr_t retaddr,
-                                              bool recheck)
+                                              bool recheck,
+                                              MMUAccessType access_type)
 {
     CPUIOTLBEntry *iotlbentry = &env->iotlb[mmu_idx][index];
     return io_readx(env, iotlbentry, mmu_idx, addr, retaddr, recheck,
-                    DATA_SIZE);
+                    access_type, DATA_SIZE);
 }
 
 #endif
@@ -252,7 +253,8 @@ WORD_TYPE helper_le_ld_name(CPUArchState *env, target_ulong addr,
         /* ??? Note that the io helpers always read data in the target
            byte ordering.  We should push the LE/BE request down into io.  */
         res = glue(io_read, SUFFIX)(env, mmu_idx, index, addr, retaddr,
-                                    tlb_addr & TLB_RECHECK);
+                                    tlb_addr & TLB_RECHECK,
+                                    READ_ACCESS_TYPE);
         res = TGT_LE(res);
         goto _out;
     }
@@ -439,7 +441,8 @@ WORD_TYPE helper_be_ld_name(CPUArchState *env, target_ulong addr,
         /* ??? Note that the io helpers always read data in the target
            byte ordering.  We should push the LE/BE request down into io.  */
         res = glue(io_read, SUFFIX)(env, mmu_idx, index, addr, retaddr,
-                                    tlb_addr & TLB_RECHECK);
+                                    tlb_addr & TLB_RECHECK,
+                                    READ_ACCESS_TYPE);
         res = TGT_BE(res);
         goto _out;
     }
@@ -592,7 +595,7 @@ void helper_le_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
             index = tlb_index(env, mmu_idx, addr);
             entry = tlb_entry(env, mmu_idx, addr);
         }
-        tlb_addr = tlb_addr_write(entry);
+        tlb_addr = tlb_addr_write(entry) & ~TLB_INVALID_MASK;
     }
 
     /* Handle an IO access.  */
@@ -740,7 +743,7 @@ void helper_be_st_name(CPUArchState *env, target_ulong addr, DATA_TYPE val,
             index = tlb_index(env, mmu_idx, addr);
             entry = tlb_entry(env, mmu_idx, addr);
         }
-        tlb_addr = tlb_addr_write(entry);
+        tlb_addr = tlb_addr_write(entry) & ~TLB_INVALID_MASK;
     }
 
     /* Handle an IO access.  */
diff --git a/qemu/include/exec/cpu-defs.h b/qemu/include/exec/cpu-defs.h
index 5b989bbc..1400f85f 100644
--- a/qemu/include/exec/cpu-defs.h
+++ b/qemu/include/exec/cpu-defs.h
@@ -142,15 +142,26 @@ typedef struct CPUIOTLBEntry {
     MemTxAttrs attrs;
 } CPUIOTLBEntry;
 
+typedef struct CPUTLBDesc {
+    /*
+     * Describe a region covering all of the large pages allocated
+     * into the tlb. When any page within this region is flushed,
+     * we must flush the entire tlb. The region is matched if
+     * (addr & large_page_mask) == large_page_addr.
+     */
+    target_ulong large_page_addr;
+    target_ulong large_page_mask;
+    /* The next index to use in the tlb victim table.  */
+    size_t vindex;
+} CPUTLBDesc;
+
 #define CPU_COMMON_TLB \
     /* The meaning of the MMU modes is defined in the target code. */ \
+    CPUTLBDesc tlb_d[NB_MMU_MODES];                                   \
     CPUTLBEntry tlb_table[NB_MMU_MODES][CPU_TLB_SIZE];                \
     CPUTLBEntry tlb_v_table[NB_MMU_MODES][CPU_VTLB_SIZE];             \
     CPUIOTLBEntry iotlb[NB_MMU_MODES][CPU_TLB_SIZE];                  \
-    CPUIOTLBEntry iotlb_v[NB_MMU_MODES][CPU_VTLB_SIZE];               \
-    target_ulong tlb_flush_addr;                                      \
-    target_ulong tlb_flush_mask;                                      \
-    target_ulong vtlb_index;                                          \
+    CPUIOTLBEntry iotlb_v[NB_MMU_MODES][CPU_VTLB_SIZE];
 
 #else
 
diff --git a/qemu/include/qom/cpu.h b/qemu/include/qom/cpu.h
index 2b1eebc4..6b5d6380 100644
--- a/qemu/include/qom/cpu.h
+++ b/qemu/include/qom/cpu.h
@@ -215,6 +215,22 @@ struct kvm_run;
 #define TB_JMP_CACHE_BITS 12
 #define TB_JMP_CACHE_SIZE (1 << TB_JMP_CACHE_BITS)
 
+/* The union type allows passing of 64 bit target pointers on 32 bit
+ * hosts in a single parameter
+ */
+typedef union {
+    int host_int;
+    unsigned long host_ulong;
+    void *host_ptr;
+    vaddr target_ptr;
+} run_on_cpu_data;
+
+#define RUN_ON_CPU_HOST_PTR(p)    ((run_on_cpu_data){.host_ptr = (p)})
+#define RUN_ON_CPU_HOST_INT(i)    ((run_on_cpu_data){.host_int = (i)})
+#define RUN_ON_CPU_HOST_ULONG(ul) ((run_on_cpu_data){.host_ulong = (ul)})
+#define RUN_ON_CPU_TARGET_PTR(v)  ((run_on_cpu_data){.target_ptr = (v)})
+#define RUN_ON_CPU_NULL           RUN_ON_CPU_HOST_PTR(NULL)
+
 typedef void (*run_on_cpu_func)(CPUState *cpu, void *data);
 
 // Unicorn: Moved CPUAddressSpace here from exec.c
@@ -325,6 +341,12 @@ struct CPUState {
      */
     uintptr_t mem_io_pc;
     vaddr mem_io_vaddr;
+    /*
+     * This is only needed for the legacy cpu_unassigned_access() hook;
+     * when all targets using it have been converted to use
+     * cpu_transaction_failed() instead it can be removed.
+     */
+    MMUAccessType mem_io_access_type;
 
     int kvm_fd;
     bool kvm_vcpu_dirty;
diff --git a/qemu/memory.c b/qemu/memory.c
index 97088259..f1a977a2 100644
--- a/qemu/memory.c
+++ b/qemu/memory.c
@@ -1136,7 +1136,8 @@ static uint64_t unassigned_mem_read(struct uc_struct* uc, hwaddr addr, unsigned
     printf("Unassigned mem read " TARGET_FMT_plx "\n", addr);
 #endif
     if (uc->current_cpu != NULL) {
-        cpu_unassigned_access(uc->current_cpu, addr, false, false, 0, size);
+        bool is_exec = uc->current_cpu->mem_io_access_type == MMU_INST_FETCH;
+        cpu_unassigned_access(uc->current_cpu, addr, false, is_exec, 0, size);
     }
     return 0;
 }