diff --git a/qemu/aarch64.h b/qemu/aarch64.h
index 1fa7a425..2bb7ce26 100644
--- a/qemu/aarch64.h
+++ b/qemu/aarch64.h
@@ -2886,6 +2886,7 @@
 #define tcg_reg_sync tcg_reg_sync_aarch64
 #define tcg_region_init tcg_region_init_aarch64
 #define tcg_region_reset_all tcg_region_reset_all_aarch64
+#define tcg_register_thread tcg_register_thread_aarch64
 #define tcg_set_frame tcg_set_frame_aarch64
 #define tcg_set_nop tcg_set_nop_aarch64
 #define tcg_swap_cond tcg_swap_cond_aarch64
diff --git a/qemu/aarch64eb.h b/qemu/aarch64eb.h
index ad62fec1..cebfc7a8 100644
--- a/qemu/aarch64eb.h
+++ b/qemu/aarch64eb.h
@@ -2886,6 +2886,7 @@
 #define tcg_reg_sync tcg_reg_sync_aarch64eb
 #define tcg_region_init tcg_region_init_aarch64eb
 #define tcg_region_reset_all tcg_region_reset_all_aarch64eb
+#define tcg_register_thread tcg_register_thread_aarch64eb
 #define tcg_set_frame tcg_set_frame_aarch64eb
 #define tcg_set_nop tcg_set_nop_aarch64eb
 #define tcg_swap_cond tcg_swap_cond_aarch64eb
diff --git a/qemu/accel/tcg/translate-all.c b/qemu/accel/tcg/translate-all.c
index 2b3caafc..357fe069 100644
--- a/qemu/accel/tcg/translate-all.c
+++ b/qemu/accel/tcg/translate-all.c
@@ -836,6 +836,19 @@ void tcg_exec_init(struct uc_struct *uc, unsigned long tb_size)
        initialize the prologue now.  */
     tcg_prologue_init(tcg_ctx);
 #endif
+
+    /*
+     * Initialize TCG regions--once. Now is a good time, because:
+     * (1) TCG's init context, prologue and target globals have been set up.
+     * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
+     *     -accel flag is processed, so the check doesn't work then).
+     */
+    if (!uc->tcg_region_inited) {
+        uc->tcg_region_inited = 1;
+        tcg_region_init(uc);
+    }
+
+    tcg_register_thread(uc);
 }
 
 bool tcg_enabled(struct uc_struct *uc)
diff --git a/qemu/arm.h b/qemu/arm.h
index a3635c95..4fa5e15b 100644
--- a/qemu/arm.h
+++ b/qemu/arm.h
@@ -2886,6 +2886,7 @@
 #define tcg_reg_sync tcg_reg_sync_arm
 #define tcg_region_init tcg_region_init_arm
 #define tcg_region_reset_all tcg_region_reset_all_arm
+#define tcg_register_thread tcg_register_thread_arm
 #define tcg_set_frame tcg_set_frame_arm
 #define tcg_set_nop tcg_set_nop_arm
 #define tcg_swap_cond tcg_swap_cond_arm
diff --git a/qemu/armeb.h b/qemu/armeb.h
index 2bbbeaa3..6ebac398 100644
--- a/qemu/armeb.h
+++ b/qemu/armeb.h
@@ -2886,6 +2886,7 @@
 #define tcg_reg_sync tcg_reg_sync_armeb
 #define tcg_region_init tcg_region_init_armeb
 #define tcg_region_reset_all tcg_region_reset_all_armeb
+#define tcg_register_thread tcg_register_thread_armeb
 #define tcg_set_frame tcg_set_frame_armeb
 #define tcg_set_nop tcg_set_nop_armeb
 #define tcg_swap_cond tcg_swap_cond_armeb
diff --git a/qemu/cpus.c b/qemu/cpus.c
index 9542c0b2..c444af12 100644
--- a/qemu/cpus.c
+++ b/qemu/cpus.c
@@ -135,19 +135,6 @@ static void *qemu_tcg_cpu_loop(struct uc_struct *uc)
 
 static int qemu_tcg_init_vcpu(CPUState *cpu)
 {
-    struct uc_struct *uc = cpu->uc;
-
-    /*
-     * Initialize TCG regions--once. Now is a good time, because:
-     * (1) TCG's init context, prologue and target globals have been set up.
-     * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
-     *     -accel flag is processed, so the check doesn't work then).
-     */
-    if (!uc->tcg_region_inited) {
-        uc->tcg_region_inited = 1;
-        tcg_region_init(uc);
-    }
-
     return 0;
 }
 
@@ -160,6 +147,7 @@ static bool tcg_exec_all(struct uc_struct* uc)
 {
     int r;
     bool finish = false;
+
     while (!uc->cpu->exit_request) {
         CPUState *cpu = uc->cpu;
         CPUArchState *env = cpu->env_ptr;
diff --git a/qemu/header_gen.py b/qemu/header_gen.py
index e24ef8f2..7ba28009 100644
--- a/qemu/header_gen.py
+++ b/qemu/header_gen.py
@@ -2892,6 +2892,7 @@ symbols = (
     'tcg_reg_sync',
     'tcg_region_init',
     'tcg_region_reset_all',
+    'tcg_register_thread',
     'tcg_set_frame',
     'tcg_set_nop',
     'tcg_swap_cond',
diff --git a/qemu/m68k.h b/qemu/m68k.h
index e37a9741..ba6fb2b2 100644
--- a/qemu/m68k.h
+++ b/qemu/m68k.h
@@ -2886,6 +2886,7 @@
 #define tcg_reg_sync tcg_reg_sync_m68k
 #define tcg_region_init tcg_region_init_m68k
 #define tcg_region_reset_all tcg_region_reset_all_m68k
+#define tcg_register_thread tcg_register_thread_m68k
 #define tcg_set_frame tcg_set_frame_m68k
 #define tcg_set_nop tcg_set_nop_m68k
 #define tcg_swap_cond tcg_swap_cond_m68k
diff --git a/qemu/mips.h b/qemu/mips.h
index 3b891af1..5ce86978 100644
--- a/qemu/mips.h
+++ b/qemu/mips.h
@@ -2886,6 +2886,7 @@
 #define tcg_reg_sync tcg_reg_sync_mips
 #define tcg_region_init tcg_region_init_mips
 #define tcg_region_reset_all tcg_region_reset_all_mips
+#define tcg_register_thread tcg_register_thread_mips
 #define tcg_set_frame tcg_set_frame_mips
 #define tcg_set_nop tcg_set_nop_mips
 #define tcg_swap_cond tcg_swap_cond_mips
diff --git a/qemu/mips64.h b/qemu/mips64.h
index fd2b23af..e4a74586 100644
--- a/qemu/mips64.h
+++ b/qemu/mips64.h
@@ -2886,6 +2886,7 @@
 #define tcg_reg_sync tcg_reg_sync_mips64
 #define tcg_region_init tcg_region_init_mips64
 #define tcg_region_reset_all tcg_region_reset_all_mips64
+#define tcg_register_thread tcg_register_thread_mips64
 #define tcg_set_frame tcg_set_frame_mips64
 #define tcg_set_nop tcg_set_nop_mips64
 #define tcg_swap_cond tcg_swap_cond_mips64
diff --git a/qemu/mips64el.h b/qemu/mips64el.h
index 56cf5b9b..c6d0783f 100644
--- a/qemu/mips64el.h
+++ b/qemu/mips64el.h
@@ -2886,6 +2886,7 @@
 #define tcg_reg_sync tcg_reg_sync_mips64el
 #define tcg_region_init tcg_region_init_mips64el
 #define tcg_region_reset_all tcg_region_reset_all_mips64el
+#define tcg_register_thread tcg_register_thread_mips64el
 #define tcg_set_frame tcg_set_frame_mips64el
 #define tcg_set_nop tcg_set_nop_mips64el
 #define tcg_swap_cond tcg_swap_cond_mips64el
diff --git a/qemu/mipsel.h b/qemu/mipsel.h
index 232f9dee..126b77a8 100644
--- a/qemu/mipsel.h
+++ b/qemu/mipsel.h
@@ -2886,6 +2886,7 @@
 #define tcg_reg_sync tcg_reg_sync_mipsel
 #define tcg_region_init tcg_region_init_mipsel
 #define tcg_region_reset_all tcg_region_reset_all_mipsel
+#define tcg_register_thread tcg_register_thread_mipsel
 #define tcg_set_frame tcg_set_frame_mipsel
 #define tcg_set_nop tcg_set_nop_mipsel
 #define tcg_swap_cond tcg_swap_cond_mipsel
diff --git a/qemu/powerpc.h b/qemu/powerpc.h
index df55d0a9..d8d57200 100644
--- a/qemu/powerpc.h
+++ b/qemu/powerpc.h
@@ -2886,6 +2886,7 @@
 #define tcg_reg_sync tcg_reg_sync_powerpc
 #define tcg_region_init tcg_region_init_powerpc
 #define tcg_region_reset_all tcg_region_reset_all_powerpc
+#define tcg_register_thread tcg_register_thread_powerpc
 #define tcg_set_frame tcg_set_frame_powerpc
 #define tcg_set_nop tcg_set_nop_powerpc
 #define tcg_swap_cond tcg_swap_cond_powerpc
diff --git a/qemu/sparc.h b/qemu/sparc.h
index 7d24c55a..b1065ed1 100644
--- a/qemu/sparc.h
+++ b/qemu/sparc.h
@@ -2886,6 +2886,7 @@
 #define tcg_reg_sync tcg_reg_sync_sparc
 #define tcg_region_init tcg_region_init_sparc
 #define tcg_region_reset_all tcg_region_reset_all_sparc
+#define tcg_register_thread tcg_register_thread_sparc
 #define tcg_set_frame tcg_set_frame_sparc
 #define tcg_set_nop tcg_set_nop_sparc
 #define tcg_swap_cond tcg_swap_cond_sparc
diff --git a/qemu/sparc64.h b/qemu/sparc64.h
index b34239e9..b39d45cc 100644
--- a/qemu/sparc64.h
+++ b/qemu/sparc64.h
@@ -2886,6 +2886,7 @@
 #define tcg_reg_sync tcg_reg_sync_sparc64
 #define tcg_region_init tcg_region_init_sparc64
 #define tcg_region_reset_all tcg_region_reset_all_sparc64
+#define tcg_register_thread tcg_register_thread_sparc64
 #define tcg_set_frame tcg_set_frame_sparc64
 #define tcg_set_nop tcg_set_nop_sparc64
 #define tcg_swap_cond tcg_swap_cond_sparc64
diff --git a/qemu/tcg/tcg.c b/qemu/tcg/tcg.c
index b7209fd9..4bd70bac 100644
--- a/qemu/tcg/tcg.c
+++ b/qemu/tcg/tcg.c
@@ -329,6 +329,7 @@ static inline bool tcg_region_initial_alloc__locked(struct uc_struct *uc, TCGCon
 /* Call from a safe-work context */
 void tcg_region_reset_all(struct uc_struct *uc)
 {
+    unsigned int n_ctxs = atomic_read(&uc->n_tcg_ctxs);
     unsigned int i;
     TCGContext **tcg_ctxs = uc->tcg_ctxs;
 
@@ -337,8 +338,9 @@ void tcg_region_reset_all(struct uc_struct *uc)
     uc->region.current = 0;
     uc->region.agg_size_full = 0;
 
-    for (i = 0; i < uc->n_tcg_ctxs; i++) {
-        bool err = tcg_region_initial_alloc__locked(uc, tcg_ctxs[i]);
+    for (i = 0; i < n_ctxs; i++) {
+        TCGContext *s = atomic_read(&tcg_ctxs[i]);
+        bool err = tcg_region_initial_alloc__locked(uc, s);
 
         g_assert(!err);
     }
@@ -346,11 +348,76 @@ void tcg_region_reset_all(struct uc_struct *uc)
     //qemu_mutex_unlock(&region.lock);
 }
 
+#ifdef CONFIG_USER_ONLY
+static size_t tcg_n_regions(struct uc_struct *uc)
+{
+    return 1;
+}
+#else
+/*
+ * It is likely that some vCPUs will translate more code than others, so we
+ * first try to set more regions than max_cpus, with those regions being of
+ * reasonable size. If that's not possible we make do by evenly dividing
+ * the code_gen_buffer among the vCPUs.
+ */
+static size_t tcg_n_regions(struct uc_struct *uc)
+{
+    //size_t i;
+
+    return 1;
+
+    // Unicorn: if'd out
+#if 0
+    /* Use a single region if all we have is one vCPU thread */
+    if (max_cpus == 1 || !qemu_tcg_mttcg_enabled(uc)) {
+        return 1;
+    }
+
+    /* Try to have more regions than max_cpus, with each region being >= 2 MB */
+    for (i = 8; i > 0; i--) {
+        size_t regions_per_thread = i;
+        size_t region_size;
+
+        region_size = tcg_init_ctx.code_gen_buffer_size;
+        region_size /= max_cpus * regions_per_thread;
+
+        if (region_size >= 2 * 1024u * 1024) {
+            return max_cpus * regions_per_thread;
+        }
+    }
+    /* If we can't, then just allocate one region per vCPU thread */
+    return max_cpus;
+#endif
+}
+#endif
+
 /*
  * Initializes region partitioning.
  *
  * Called at init time from the parent thread (i.e. the one calling
  * tcg_context_init), after the target's TCG globals have been set.
+ *
+ * Region partitioning works by splitting code_gen_buffer into separate regions,
+ * and then assigning regions to TCG threads so that the threads can translate
+ * code in parallel without synchronization.
+ *
+ * In softmmu the number of TCG threads is bounded by max_cpus, so we use at
+ * least max_cpus regions in MTTCG. In !MTTCG we use a single region.
+ * Note that the TCG options from the command-line (i.e. -accel accel=tcg,[...])
+ * must have been parsed before calling this function, since it calls
+ * qemu_tcg_mttcg_enabled().
+ *
+ * In user-mode we use a single region.  Having multiple regions in user-mode
+ * is not supported, because the number of vCPU threads (recall that each thread
+ * spawned by the guest corresponds to a vCPU thread) is only bounded by the
+ * OS, and usually this number is huge (tens of thousands is not uncommon).
+ * Thus, given this large bound on the number of vCPU threads and the fact
+ * that code_gen_buffer is allocated at compile-time, we cannot guarantee
+ * the availability of at least one region per vCPU thread.
+ *
+ * However, this user-mode limitation is unlikely to be a significant problem
+ * in practice. Multi-threaded guests share most if not all of their translated
+ * code, which makes parallel code generation less appealing than in softmmu.
  */
 void tcg_region_init(struct uc_struct *uc)
 {
@@ -365,7 +432,7 @@ void tcg_region_init(struct uc_struct *uc)
     size_t i;
 
     /* We do not yet support multiple TCG contexts, so use one region for now */
-    n_regions = 1;
+    n_regions = tcg_n_regions(uc);
 
     /* The first region will be 'aligned - buf' bytes larger than the others */
     aligned = QEMU_ALIGN_PTR_UP(buf, page_size);
@@ -404,14 +471,70 @@ void tcg_region_init(struct uc_struct *uc)
     }
 
     /* We do not yet support multiple TCG contexts so allocate the region now */
+#ifdef CONFIG_USER_ONLY
     {
         TCGContext *tcg_ctx = uc->tcg_ctx;
         bool err = tcg_region_initial_alloc__locked(uc, tcg_ctx);
 
         g_assert(!err);
     }
+#endif
 }
 
+/*
+ * All TCG threads except the parent (i.e. the one that called tcg_context_init
+ * and registered the target's TCG globals) must register with this function
+ * before initiating translation.
+ *
+ * In user-mode we just point tcg_ctx to tcg_init_ctx. See the documentation
+ * of tcg_region_init() for the reasoning behind this.
+ *
+ * In softmmu each caller registers its context in tcg_ctxs[]. Note that in
+ * softmmu tcg_ctxs[] does not track tcg_init_ctx, since the initial context
+ * is not used anymore for translation once this function is called.
+ *
+ * Not tracking tcg_init_ctx in tcg_ctxs[] in softmmu keeps code that iterates
+ * over the array (e.g. tcg_code_size()) the same for softmmu and user-mode.
+ */
+#ifdef CONFIG_USER_ONLY
+void tcg_register_thread(struct uc_struct *uc)
+{
+    uc->tcg_ctx = uc->tcg_init_ctx;
+}
+#else
+void tcg_register_thread(struct uc_struct *uc)
+{
+    TCGContext **tcg_ctxs = uc->tcg_ctxs;
+    TCGContext *tcg_init_ctx = uc->tcg_init_ctx;
+    TCGContext *s = g_malloc(sizeof(*s));
+    unsigned int i, n;
+    bool err;
+
+    *s = *tcg_init_ctx;
+
+    /* Relink mem_base.  */
+    for (i = 0, n = tcg_init_ctx->nb_globals; i < n; ++i) {
+        if (tcg_init_ctx->temps[i].mem_base) {
+            ptrdiff_t b = tcg_init_ctx->temps[i].mem_base - tcg_init_ctx->temps;
+            tcg_debug_assert(b >= 0 && b < n);
+            s->temps[i].mem_base = &s->temps[b];
+        }
+    }
+
+    /* Claim an entry in tcg_ctxs */
+    n = atomic_fetch_inc(&uc->n_tcg_ctxs);
+    // Unicorn: commented out
+    //g_assert(n < max_cpus);
+    atomic_set(&tcg_ctxs[n], s);
+
+    uc->tcg_ctx = s;
+    //qemu_mutex_lock(&region.lock);
+    err = tcg_region_initial_alloc__locked(uc, uc->tcg_ctx);
+    g_assert(!err);
+    //qemu_mutex_unlock(&region.lock);
+}
+#endif /* !CONFIG_USER_ONLY */
+
 /*
  * Returns the size (in bytes) of all translated code (i.e. from all regions)
  * currently in the cache.
@@ -421,15 +544,16 @@ void tcg_region_init(struct uc_struct *uc)
  */
 size_t tcg_code_size(struct uc_struct *uc)
 {
+    unsigned int n_ctxs = atomic_read(&uc->n_tcg_ctxs);
     unsigned int i;
     size_t total;
 
     // Unicorn: commented out
     //qemu_mutex_lock(&region.lock);
     total = uc->region.agg_size_full;
-    for (i = 0; i < uc->n_tcg_ctxs; i++) {
-        TCGContext **tcg_ctxs = uc->tcg_ctxs;
-        const TCGContext *s = tcg_ctxs[i];
+    for (i = 0; i < n_ctxs; i++) {
+        TCGContext **tcg_ctxs = atomic_read(&uc->tcg_ctxs);
+        const TCGContext *s = atomic_read(&tcg_ctxs[i]);
         size_t size;
 
         size = atomic_read(&s->code_gen_ptr) - s->code_gen_buffer;
@@ -588,8 +712,20 @@ void tcg_context_init(struct uc_struct *uc, TCGContext *s)
     }
 
     uc->tcg_ctx = s;
+    /*
+     * In user-mode we simply share the init context among threads, since we
+     * use a single region. See the documentation of tcg_region_init() for the
+     * reasoning behind this.
+     * In softmmu we will have at most max_cpus TCG threads.
+     */
+#ifdef CONFIG_USER_ONLY
     uc->tcg_ctxs = &uc->tcg_ctx;
     uc->n_tcg_ctxs = 1;
+#else
+    // Unicorn: modified
+    //uc->tcg_ctxs = g_new(TCGContext *, max_cpus);
+    uc->tcg_ctxs = g_new(TCGContext *, 1);
+#endif
 }
 
 /*
diff --git a/qemu/tcg/tcg.h b/qemu/tcg/tcg.h
index 7df4fa3c..3949ff07 100644
--- a/qemu/tcg/tcg.h
+++ b/qemu/tcg/tcg.h
@@ -634,13 +634,14 @@ QEMU_BUILD_BUG_ON(NB_OPS > (1 << 8));
 
 /* pool based memory allocation */
 
-/* tb_lock must be held for tcg_malloc_internal. */
+/* user-mode: tb_lock must be held for tcg_malloc_internal. */
 void *tcg_malloc_internal(TCGContext *s, int size);
 void tcg_pool_reset(TCGContext *s);
 TranslationBlock *tcg_tb_alloc(TCGContext *s);
 
 void tcg_context_init(struct uc_struct *uc, TCGContext *s);
 void tcg_context_free(void *s);   // free memory allocated for @s
+void tcg_register_thread(struct uc_struct *uc);
 void tcg_prologue_init(TCGContext *s);
 void tcg_func_start(TCGContext *s);
 
@@ -1148,7 +1149,7 @@ void tcg_region_reset_all(struct uc_struct *uc);
 size_t tcg_code_size(struct uc_struct *uc);
 size_t tcg_code_capacity(struct uc_struct *uc);
 
-/* Called with tb_lock held.  */
+/* user-mode: Called with tb_lock held.  */
 static inline void *tcg_malloc(TCGContext *s, int size)
 {
     uint8_t *ptr, *ptr_end;
diff --git a/qemu/unicorn_common.h b/qemu/unicorn_common.h
index 66cc7f0e..f562e26a 100644
--- a/qemu/unicorn_common.h
+++ b/qemu/unicorn_common.h
@@ -59,19 +59,13 @@ static inline void free_tcg_temp_names(TCGContext *s)
 #endif
 }
 
-/** Freeing common resources */
-static void release_common(void *t)
+static inline void free_tcg_context(TCGContext *s)
 {
-    TCGPool *po, *to;
-    TCGContext *s = (TCGContext *)t;
-    struct uc_struct *uc = s->uc;
-
-    // Clean TCG.
     TCGOpDef* def = &s->tcg_op_defs[0];
+    TCGPool *po, *to;
+
     g_free(def->args_ct);
     g_free(def->sorted_args);
-    g_tree_destroy(uc->tb_ctx.tb_tree);
-    qht_destroy(&uc->tb_ctx.htable);
     g_free(s->tcg_op_defs);
 
     for (po = s->pool_first; po; po = to) {
@@ -79,20 +73,46 @@ static void release_common(void *t)
         g_free(po);
     }
     tcg_pool_reset(s);
+
     g_hash_table_destroy(s->helpers);
+    free_tcg_temp_names(s);
+    g_free(s);
+}
+
+static inline void free_tcg_contexts(struct uc_struct *uc)
+{
+    int i;
+    TCGContext **tcg_ctxs = uc->tcg_ctxs;
+
+    for (i = 0; i < uc->n_tcg_ctxs; i++) {
+        free_tcg_context(tcg_ctxs[i]);
+    }
+
+    g_free(tcg_ctxs);
+}
+
+/** Freeing common resources */
+static void release_common(void *t)
+{
+    TCGContext *s = (TCGContext *)t;
+    struct uc_struct *uc = s->uc;
+
+    // Clean TCG.
+    free_tcg_contexts(uc);
+    g_tree_destroy(uc->tb_ctx.tb_tree);
+    qht_destroy(&uc->tb_ctx.htable);
 
     // Destory flat view hash table
-    g_hash_table_destroy(s->uc->flat_views);
-    unicorn_free_empty_flat_view(s->uc);
+    g_hash_table_destroy(uc->flat_views);
+    unicorn_free_empty_flat_view(uc);
 
     // TODO(danghvu): these function is not available outside qemu
     // so we keep them here instead of outside uc_close.
-    free_address_spaces(s->uc);
-    memory_free(s->uc);
-    tb_cleanup(s->uc);
-    free_code_gen_buffer(s->uc);
-    free_machine_class_name(s->uc);
-    free_tcg_temp_names(s);
+    free_address_spaces(uc);
+    memory_free(uc);
+    tb_cleanup(uc);
+    free_code_gen_buffer(uc);
+    free_machine_class_name(uc);
 }
 
 static inline void uc_common_init(struct uc_struct* uc)
diff --git a/qemu/x86_64.h b/qemu/x86_64.h
index 0296f0a2..3dda8a95 100644
--- a/qemu/x86_64.h
+++ b/qemu/x86_64.h
@@ -2886,6 +2886,7 @@
 #define tcg_reg_sync tcg_reg_sync_x86_64
 #define tcg_region_init tcg_region_init_x86_64
 #define tcg_region_reset_all tcg_region_reset_all_x86_64
+#define tcg_register_thread tcg_register_thread_x86_64
 #define tcg_set_frame tcg_set_frame_x86_64
 #define tcg_set_nop tcg_set_nop_x86_64
 #define tcg_swap_cond tcg_swap_cond_x86_64