diff --git a/qemu/aarch64.h b/qemu/aarch64.h index 1fa7a425..2bb7ce26 100644 --- a/qemu/aarch64.h +++ b/qemu/aarch64.h @@ -2886,6 +2886,7 @@ #define tcg_reg_sync tcg_reg_sync_aarch64 #define tcg_region_init tcg_region_init_aarch64 #define tcg_region_reset_all tcg_region_reset_all_aarch64 +#define tcg_register_thread tcg_register_thread_aarch64 #define tcg_set_frame tcg_set_frame_aarch64 #define tcg_set_nop tcg_set_nop_aarch64 #define tcg_swap_cond tcg_swap_cond_aarch64 diff --git a/qemu/aarch64eb.h b/qemu/aarch64eb.h index ad62fec1..cebfc7a8 100644 --- a/qemu/aarch64eb.h +++ b/qemu/aarch64eb.h @@ -2886,6 +2886,7 @@ #define tcg_reg_sync tcg_reg_sync_aarch64eb #define tcg_region_init tcg_region_init_aarch64eb #define tcg_region_reset_all tcg_region_reset_all_aarch64eb +#define tcg_register_thread tcg_register_thread_aarch64eb #define tcg_set_frame tcg_set_frame_aarch64eb #define tcg_set_nop tcg_set_nop_aarch64eb #define tcg_swap_cond tcg_swap_cond_aarch64eb diff --git a/qemu/accel/tcg/translate-all.c b/qemu/accel/tcg/translate-all.c index 2b3caafc..357fe069 100644 --- a/qemu/accel/tcg/translate-all.c +++ b/qemu/accel/tcg/translate-all.c @@ -836,6 +836,19 @@ void tcg_exec_init(struct uc_struct *uc, unsigned long tb_size) initialize the prologue now. */ tcg_prologue_init(tcg_ctx); #endif + + /* + * Initialize TCG regions--once. Now is a good time, because: + * (1) TCG's init context, prologue and target globals have been set up. + * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the + * -accel flag is processed, so the check doesn't work then). 
+ */ + if (!uc->tcg_region_inited) { + uc->tcg_region_inited = 1; + tcg_region_init(uc); + } + + tcg_register_thread(uc); } bool tcg_enabled(struct uc_struct *uc) diff --git a/qemu/arm.h b/qemu/arm.h index a3635c95..4fa5e15b 100644 --- a/qemu/arm.h +++ b/qemu/arm.h @@ -2886,6 +2886,7 @@ #define tcg_reg_sync tcg_reg_sync_arm #define tcg_region_init tcg_region_init_arm #define tcg_region_reset_all tcg_region_reset_all_arm +#define tcg_register_thread tcg_register_thread_arm #define tcg_set_frame tcg_set_frame_arm #define tcg_set_nop tcg_set_nop_arm #define tcg_swap_cond tcg_swap_cond_arm diff --git a/qemu/armeb.h b/qemu/armeb.h index 2bbbeaa3..6ebac398 100644 --- a/qemu/armeb.h +++ b/qemu/armeb.h @@ -2886,6 +2886,7 @@ #define tcg_reg_sync tcg_reg_sync_armeb #define tcg_region_init tcg_region_init_armeb #define tcg_region_reset_all tcg_region_reset_all_armeb +#define tcg_register_thread tcg_register_thread_armeb #define tcg_set_frame tcg_set_frame_armeb #define tcg_set_nop tcg_set_nop_armeb #define tcg_swap_cond tcg_swap_cond_armeb diff --git a/qemu/cpus.c b/qemu/cpus.c index 9542c0b2..c444af12 100644 --- a/qemu/cpus.c +++ b/qemu/cpus.c @@ -135,19 +135,6 @@ static void *qemu_tcg_cpu_loop(struct uc_struct *uc) static int qemu_tcg_init_vcpu(CPUState *cpu) { - struct uc_struct *uc = cpu->uc; - - /* - * Initialize TCG regions--once. Now is a good time, because: - * (1) TCG's init context, prologue and target globals have been set up. - * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the - * -accel flag is processed, so the check doesn't work then). 
- */ - if (!uc->tcg_region_inited) { - uc->tcg_region_inited = 1; - tcg_region_init(uc); - } - return 0; } @@ -160,6 +147,7 @@ static bool tcg_exec_all(struct uc_struct* uc) { int r; bool finish = false; + while (!uc->cpu->exit_request) { CPUState *cpu = uc->cpu; CPUArchState *env = cpu->env_ptr; diff --git a/qemu/header_gen.py b/qemu/header_gen.py index e24ef8f2..7ba28009 100644 --- a/qemu/header_gen.py +++ b/qemu/header_gen.py @@ -2892,6 +2892,7 @@ symbols = ( 'tcg_reg_sync', 'tcg_region_init', 'tcg_region_reset_all', + 'tcg_register_thread', 'tcg_set_frame', 'tcg_set_nop', 'tcg_swap_cond', diff --git a/qemu/m68k.h b/qemu/m68k.h index e37a9741..ba6fb2b2 100644 --- a/qemu/m68k.h +++ b/qemu/m68k.h @@ -2886,6 +2886,7 @@ #define tcg_reg_sync tcg_reg_sync_m68k #define tcg_region_init tcg_region_init_m68k #define tcg_region_reset_all tcg_region_reset_all_m68k +#define tcg_register_thread tcg_register_thread_m68k #define tcg_set_frame tcg_set_frame_m68k #define tcg_set_nop tcg_set_nop_m68k #define tcg_swap_cond tcg_swap_cond_m68k diff --git a/qemu/mips.h b/qemu/mips.h index 3b891af1..5ce86978 100644 --- a/qemu/mips.h +++ b/qemu/mips.h @@ -2886,6 +2886,7 @@ #define tcg_reg_sync tcg_reg_sync_mips #define tcg_region_init tcg_region_init_mips #define tcg_region_reset_all tcg_region_reset_all_mips +#define tcg_register_thread tcg_register_thread_mips #define tcg_set_frame tcg_set_frame_mips #define tcg_set_nop tcg_set_nop_mips #define tcg_swap_cond tcg_swap_cond_mips diff --git a/qemu/mips64.h b/qemu/mips64.h index fd2b23af..e4a74586 100644 --- a/qemu/mips64.h +++ b/qemu/mips64.h @@ -2886,6 +2886,7 @@ #define tcg_reg_sync tcg_reg_sync_mips64 #define tcg_region_init tcg_region_init_mips64 #define tcg_region_reset_all tcg_region_reset_all_mips64 +#define tcg_register_thread tcg_register_thread_mips64 #define tcg_set_frame tcg_set_frame_mips64 #define tcg_set_nop tcg_set_nop_mips64 #define tcg_swap_cond tcg_swap_cond_mips64 diff --git a/qemu/mips64el.h b/qemu/mips64el.h index 
56cf5b9b..c6d0783f 100644 --- a/qemu/mips64el.h +++ b/qemu/mips64el.h @@ -2886,6 +2886,7 @@ #define tcg_reg_sync tcg_reg_sync_mips64el #define tcg_region_init tcg_region_init_mips64el #define tcg_region_reset_all tcg_region_reset_all_mips64el +#define tcg_register_thread tcg_register_thread_mips64el #define tcg_set_frame tcg_set_frame_mips64el #define tcg_set_nop tcg_set_nop_mips64el #define tcg_swap_cond tcg_swap_cond_mips64el diff --git a/qemu/mipsel.h b/qemu/mipsel.h index 232f9dee..126b77a8 100644 --- a/qemu/mipsel.h +++ b/qemu/mipsel.h @@ -2886,6 +2886,7 @@ #define tcg_reg_sync tcg_reg_sync_mipsel #define tcg_region_init tcg_region_init_mipsel #define tcg_region_reset_all tcg_region_reset_all_mipsel +#define tcg_register_thread tcg_register_thread_mipsel #define tcg_set_frame tcg_set_frame_mipsel #define tcg_set_nop tcg_set_nop_mipsel #define tcg_swap_cond tcg_swap_cond_mipsel diff --git a/qemu/powerpc.h b/qemu/powerpc.h index df55d0a9..d8d57200 100644 --- a/qemu/powerpc.h +++ b/qemu/powerpc.h @@ -2886,6 +2886,7 @@ #define tcg_reg_sync tcg_reg_sync_powerpc #define tcg_region_init tcg_region_init_powerpc #define tcg_region_reset_all tcg_region_reset_all_powerpc +#define tcg_register_thread tcg_register_thread_powerpc #define tcg_set_frame tcg_set_frame_powerpc #define tcg_set_nop tcg_set_nop_powerpc #define tcg_swap_cond tcg_swap_cond_powerpc diff --git a/qemu/sparc.h b/qemu/sparc.h index 7d24c55a..b1065ed1 100644 --- a/qemu/sparc.h +++ b/qemu/sparc.h @@ -2886,6 +2886,7 @@ #define tcg_reg_sync tcg_reg_sync_sparc #define tcg_region_init tcg_region_init_sparc #define tcg_region_reset_all tcg_region_reset_all_sparc +#define tcg_register_thread tcg_register_thread_sparc #define tcg_set_frame tcg_set_frame_sparc #define tcg_set_nop tcg_set_nop_sparc #define tcg_swap_cond tcg_swap_cond_sparc diff --git a/qemu/sparc64.h b/qemu/sparc64.h index b34239e9..b39d45cc 100644 --- a/qemu/sparc64.h +++ b/qemu/sparc64.h @@ -2886,6 +2886,7 @@ #define tcg_reg_sync 
tcg_reg_sync_sparc64 #define tcg_region_init tcg_region_init_sparc64 #define tcg_region_reset_all tcg_region_reset_all_sparc64 +#define tcg_register_thread tcg_register_thread_sparc64 #define tcg_set_frame tcg_set_frame_sparc64 #define tcg_set_nop tcg_set_nop_sparc64 #define tcg_swap_cond tcg_swap_cond_sparc64 diff --git a/qemu/tcg/tcg.c b/qemu/tcg/tcg.c index b7209fd9..4bd70bac 100644 --- a/qemu/tcg/tcg.c +++ b/qemu/tcg/tcg.c @@ -329,6 +329,7 @@ static inline bool tcg_region_initial_alloc__locked(struct uc_struct *uc, TCGCon /* Call from a safe-work context */ void tcg_region_reset_all(struct uc_struct *uc) { + unsigned int n_ctxs = atomic_read(&uc->n_tcg_ctxs); unsigned int i; TCGContext **tcg_ctxs = uc->tcg_ctxs; @@ -337,8 +338,9 @@ void tcg_region_reset_all(struct uc_struct *uc) uc->region.current = 0; uc->region.agg_size_full = 0; - for (i = 0; i < uc->n_tcg_ctxs; i++) { - bool err = tcg_region_initial_alloc__locked(uc, tcg_ctxs[i]); + for (i = 0; i < n_ctxs; i++) { + TCGContext *s = atomic_read(&tcg_ctxs[i]); + bool err = tcg_region_initial_alloc__locked(uc, s); g_assert(!err); } @@ -346,11 +348,76 @@ void tcg_region_reset_all(struct uc_struct *uc) //qemu_mutex_unlock(&region.lock); } +#ifdef CONFIG_USER_ONLY +static size_t tcg_n_regions(struct uc_struct *uc) +{ + return 1; +} +#else +/* + * It is likely that some vCPUs will translate more code than others, so we + * first try to set more regions than max_cpus, with those regions being of + * reasonable size. If that's not possible we make do by evenly dividing + * the code_gen_buffer among the vCPUs. 
+ */ +static size_t tcg_n_regions(struct uc_struct *uc) +{ + //size_t i; + + return 1; + + // Unicorn: if'd out +#if 0 + /* Use a single region if all we have is one vCPU thread */ + if (max_cpus == 1 || !qemu_tcg_mttcg_enabled(uc)) { + return 1; + } + + /* Try to have more regions than max_cpus, with each region being >= 2 MB */ + for (i = 8; i > 0; i--) { + size_t regions_per_thread = i; + size_t region_size; + + region_size = tcg_init_ctx.code_gen_buffer_size; + region_size /= max_cpus * regions_per_thread; + + if (region_size >= 2 * 1024u * 1024) { + return max_cpus * regions_per_thread; + } + } + /* If we can't, then just allocate one region per vCPU thread */ + return max_cpus; +#endif +} +#endif + /* * Initializes region partitioning. * * Called at init time from the parent thread (i.e. the one calling * tcg_context_init), after the target's TCG globals have been set. + * + * Region partitioning works by splitting code_gen_buffer into separate regions, + * and then assigning regions to TCG threads so that the threads can translate + * code in parallel without synchronization. + * + * In softmmu the number of TCG threads is bounded by max_cpus, so we use at + * least max_cpus regions in MTTCG. In !MTTCG we use a single region. + * Note that the TCG options from the command-line (i.e. -accel accel=tcg,[...]) + * must have been parsed before calling this function, since it calls + * qemu_tcg_mttcg_enabled(). + * + * In user-mode we use a single region. Having multiple regions in user-mode + * is not supported, because the number of vCPU threads (recall that each thread + * spawned by the guest corresponds to a vCPU thread) is only bounded by the + * OS, and usually this number is huge (tens of thousands is not uncommon). + * Thus, given this large bound on the number of vCPU threads and the fact + * that code_gen_buffer is allocated at compile-time, we cannot guarantee + * the availability of at least one region per vCPU thread. 
+ * + * However, this user-mode limitation is unlikely to be a significant problem + * in practice. Multi-threaded guests share most if not all of their translated + * code, which makes parallel code generation less appealing than in softmmu. */ void tcg_region_init(struct uc_struct *uc) { @@ -365,7 +432,7 @@ void tcg_region_init(struct uc_struct *uc) size_t i; /* We do not yet support multiple TCG contexts, so use one region for now */ - n_regions = 1; + n_regions = tcg_n_regions(uc); /* The first region will be 'aligned - buf' bytes larger than the others */ aligned = QEMU_ALIGN_PTR_UP(buf, page_size); @@ -404,14 +471,70 @@ void tcg_region_init(struct uc_struct *uc) } /* We do not yet support multiple TCG contexts so allocate the region now */ +#ifdef CONFIG_USER_ONLY { TCGContext *tcg_ctx = uc->tcg_ctx; bool err = tcg_region_initial_alloc__locked(uc, tcg_ctx); g_assert(!err); } +#endif } +/* + * All TCG threads except the parent (i.e. the one that called tcg_context_init + * and registered the target's TCG globals) must register with this function + * before initiating translation. + * + * In user-mode we just point tcg_ctx to tcg_init_ctx. See the documentation + * of tcg_region_init() for the reasoning behind this. + * + * In softmmu each caller registers its context in tcg_ctxs[]. Note that in + * softmmu tcg_ctxs[] does not track tcg_init_ctx, since the initial context + * is not used anymore for translation once this function is called. + * + * Not tracking tcg_init_ctx in tcg_ctxs[] in softmmu keeps code that iterates + * over the array (e.g. tcg_code_size()) the same for both softmmu and user-mode. 
+ */ +#ifdef CONFIG_USER_ONLY +void tcg_register_thread(struct uc_struct *uc) +{ + uc->tcg_ctx = uc->tcg_init_ctx; +} +#else +void tcg_register_thread(struct uc_struct *uc) +{ + TCGContext **tcg_ctxs = uc->tcg_ctxs; + TCGContext *tcg_init_ctx = uc->tcg_init_ctx; + TCGContext *s = g_malloc(sizeof(*s)); + unsigned int i, n; + bool err; + + *s = *tcg_init_ctx; + + /* Relink mem_base. */ + for (i = 0, n = tcg_init_ctx->nb_globals; i < n; ++i) { + if (tcg_init_ctx->temps[i].mem_base) { + ptrdiff_t b = tcg_init_ctx->temps[i].mem_base - tcg_init_ctx->temps; + tcg_debug_assert(b >= 0 && b < n); + s->temps[i].mem_base = &s->temps[b]; + } + } + + /* Claim an entry in tcg_ctxs */ + n = atomic_fetch_inc(&uc->n_tcg_ctxs); + // Unicorn: commented out + //g_assert(n < max_cpus); + atomic_set(&tcg_ctxs[n], s); + + uc->tcg_ctx = s; + //qemu_mutex_lock(&region.lock); + err = tcg_region_initial_alloc__locked(uc, uc->tcg_ctx); + g_assert(!err); + //qemu_mutex_unlock(&region.lock); +} +#endif /* !CONFIG_USER_ONLY */ + /* * Returns the size (in bytes) of all translated code (i.e. from all regions) * currently in the cache. @@ -421,15 +544,16 @@ void tcg_region_init(struct uc_struct *uc) */ size_t tcg_code_size(struct uc_struct *uc) { + unsigned int n_ctxs = atomic_read(&uc->n_tcg_ctxs); unsigned int i; size_t total; // Unicorn: commented out //qemu_mutex_lock(&region.lock); total = uc->region.agg_size_full; - for (i = 0; i < uc->n_tcg_ctxs; i++) { - TCGContext **tcg_ctxs = uc->tcg_ctxs; - const TCGContext *s = tcg_ctxs[i]; + for (i = 0; i < n_ctxs; i++) { + TCGContext **tcg_ctxs = atomic_read(&uc->tcg_ctxs); + const TCGContext *s = atomic_read(&tcg_ctxs[i]); size_t size; size = atomic_read(&s->code_gen_ptr) - s->code_gen_buffer; @@ -588,8 +712,20 @@ void tcg_context_init(struct uc_struct *uc, TCGContext *s) } uc->tcg_ctx = s; + /* + * In user-mode we simply share the init context among threads, since we + * use a single region. 
See the documentation of tcg_region_init() for the + * reasoning behind this. + * In softmmu we will have at most max_cpus TCG threads. + */ +#ifdef CONFIG_USER_ONLY uc->tcg_ctxs = &uc->tcg_ctx; uc->n_tcg_ctxs = 1; +#else + // Unicorn: modified + //uc->tcg_ctxs = g_new(TCGContext *, max_cpus); + uc->tcg_ctxs = g_new(TCGContext *, 1); +#endif } /* diff --git a/qemu/tcg/tcg.h b/qemu/tcg/tcg.h index 7df4fa3c..3949ff07 100644 --- a/qemu/tcg/tcg.h +++ b/qemu/tcg/tcg.h @@ -634,13 +634,14 @@ QEMU_BUILD_BUG_ON(NB_OPS > (1 << 8)); /* pool based memory allocation */ -/* tb_lock must be held for tcg_malloc_internal. */ +/* user-mode: tb_lock must be held for tcg_malloc_internal. */ void *tcg_malloc_internal(TCGContext *s, int size); void tcg_pool_reset(TCGContext *s); TranslationBlock *tcg_tb_alloc(TCGContext *s); void tcg_context_init(struct uc_struct *uc, TCGContext *s); void tcg_context_free(void *s); // free memory allocated for @s +void tcg_register_thread(struct uc_struct *uc); void tcg_prologue_init(TCGContext *s); void tcg_func_start(TCGContext *s); @@ -1148,7 +1149,7 @@ void tcg_region_reset_all(struct uc_struct *uc); size_t tcg_code_size(struct uc_struct *uc); size_t tcg_code_capacity(struct uc_struct *uc); -/* Called with tb_lock held. */ +/* user-mode: Called with tb_lock held. */ static inline void *tcg_malloc(TCGContext *s, int size) { uint8_t *ptr, *ptr_end; diff --git a/qemu/unicorn_common.h b/qemu/unicorn_common.h index 66cc7f0e..f562e26a 100644 --- a/qemu/unicorn_common.h +++ b/qemu/unicorn_common.h @@ -59,19 +59,13 @@ static inline void free_tcg_temp_names(TCGContext *s) #endif } -/** Freeing common resources */ -static void release_common(void *t) +static inline void free_tcg_context(TCGContext *s) { - TCGPool *po, *to; - TCGContext *s = (TCGContext *)t; - struct uc_struct *uc = s->uc; - - // Clean TCG. 
TCGOpDef* def = &s->tcg_op_defs[0]; + TCGPool *po, *to; + g_free(def->args_ct); g_free(def->sorted_args); - g_tree_destroy(uc->tb_ctx.tb_tree); - qht_destroy(&uc->tb_ctx.htable); g_free(s->tcg_op_defs); for (po = s->pool_first; po; po = to) { @@ -79,20 +73,46 @@ static void release_common(void *t) g_free(po); } tcg_pool_reset(s); + g_hash_table_destroy(s->helpers); + free_tcg_temp_names(s); + g_free(s); +} + +static inline void free_tcg_contexts(struct uc_struct *uc) +{ + int i; + TCGContext **tcg_ctxs = uc->tcg_ctxs; + + for (i = 0; i < uc->n_tcg_ctxs; i++) { + free_tcg_context(tcg_ctxs[i]); + } + + g_free(tcg_ctxs); +} + +/** Freeing common resources */ +static void release_common(void *t) +{ + TCGContext *s = (TCGContext *)t; + struct uc_struct *uc = s->uc; + + // Clean TCG. + free_tcg_contexts(uc); + g_tree_destroy(uc->tb_ctx.tb_tree); + qht_destroy(&uc->tb_ctx.htable); // Destory flat view hash table - g_hash_table_destroy(s->uc->flat_views); - unicorn_free_empty_flat_view(s->uc); + g_hash_table_destroy(uc->flat_views); + unicorn_free_empty_flat_view(uc); // TODO(danghvu): these function is not available outside qemu // so we keep them here instead of outside uc_close. - free_address_spaces(s->uc); - memory_free(s->uc); - tb_cleanup(s->uc); - free_code_gen_buffer(s->uc); - free_machine_class_name(s->uc); - free_tcg_temp_names(s); + free_address_spaces(uc); + memory_free(uc); + tb_cleanup(uc); + free_code_gen_buffer(uc); + free_machine_class_name(uc); } static inline void uc_common_init(struct uc_struct* uc) diff --git a/qemu/x86_64.h b/qemu/x86_64.h index 0296f0a2..3dda8a95 100644 --- a/qemu/x86_64.h +++ b/qemu/x86_64.h @@ -2886,6 +2886,7 @@ #define tcg_reg_sync tcg_reg_sync_x86_64 #define tcg_region_init tcg_region_init_x86_64 #define tcg_region_reset_all tcg_region_reset_all_x86_64 +#define tcg_register_thread tcg_register_thread_x86_64 #define tcg_set_frame tcg_set_frame_x86_64 #define tcg_set_nop tcg_set_nop_x86_64 #define tcg_swap_cond tcg_swap_cond_x86_64