From d3ada2feb5810042f98b7edf3ea3e2ad1cd4f97e Mon Sep 17 00:00:00 2001 From: "Emilio G. Cota" Date: Sat, 3 Mar 2018 17:01:28 -0500 Subject: [PATCH] tcg: allocate TB structs before the corresponding translated code Allocating an arbitrarily-sized array of TBs results in either (a) a lot of memory wasted or (b) unnecessary flushes of the code cache when we run out of TB structs in the array. An obvious solution would be to just malloc a TB struct when needed, and keep the TB array as an array of pointers (recall that tb_find_pc() needs the TB array to run in O(log n)). Perhaps a better solution, which is implemented in this patch, is to allocate TBs right before the translated code they describe. This results in some memory waste because padding is needed to keep code and TBs in separate cache lines -- for instance, I measured 4.7% of padding in the used portion of code_gen_buffer when booting aarch64 Linux on a host with 64-byte cache lines. However, it can allow for optimizations in some host architectures, since TCG backends could safely assume that the TB and the corresponding translated code are very close to each other in memory. 
See this message by rth for a detailed explanation: https://lists.gnu.org/archive/html/qemu-devel/2017-03/msg05172.html Subject: Re: GSoC 2017 Proposal: TCG performance enhancements Backports commit 6e3b2bfd6af488a896f7936e99ef160f8f37e6f2 from qemu --- qemu/aarch64.h | 1 + qemu/aarch64eb.h | 1 + qemu/arm.h | 1 + qemu/armeb.h | 1 + qemu/header_gen.py | 1 + qemu/include/exec/tb-context.h | 3 ++- qemu/m68k.h | 1 + qemu/mips.h | 1 + qemu/mips64.h | 1 + qemu/mips64el.h | 1 + qemu/mipsel.h | 1 + qemu/powerpc.h | 1 + qemu/sparc.h | 1 + qemu/sparc64.h | 1 + qemu/tcg/tcg.c | 20 +++++++++++++++++++ qemu/tcg/tcg.h | 2 +- qemu/translate-all.c | 36 ++++++++++++++++++++++------------ qemu/x86_64.h | 1 + 18 files changed, 61 insertions(+), 14 deletions(-) diff --git a/qemu/aarch64.h b/qemu/aarch64.h index e852738e..5e72cbeb 100644 --- a/qemu/aarch64.h +++ b/qemu/aarch64.h @@ -3279,6 +3279,7 @@ #define tcg_target_init tcg_target_init_aarch64 #define tcg_target_qemu_prologue tcg_target_qemu_prologue_aarch64 #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_aarch64 +#define tcg_tb_alloc tcg_tb_alloc_aarch64 #define tcg_temp_alloc tcg_temp_alloc_aarch64 #define tcg_temp_free_i32 tcg_temp_free_i32_aarch64 #define tcg_temp_free_i64 tcg_temp_free_i64_aarch64 diff --git a/qemu/aarch64eb.h b/qemu/aarch64eb.h index 829f88e3..2980cd24 100644 --- a/qemu/aarch64eb.h +++ b/qemu/aarch64eb.h @@ -3279,6 +3279,7 @@ #define tcg_target_init tcg_target_init_aarch64eb #define tcg_target_qemu_prologue tcg_target_qemu_prologue_aarch64eb #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_aarch64eb +#define tcg_tb_alloc tcg_tb_alloc_aarch64eb #define tcg_temp_alloc tcg_temp_alloc_aarch64eb #define tcg_temp_free_i32 tcg_temp_free_i32_aarch64eb #define tcg_temp_free_i64 tcg_temp_free_i64_aarch64eb diff --git a/qemu/arm.h b/qemu/arm.h index 87e0158b..4d37a0ff 100644 --- a/qemu/arm.h +++ b/qemu/arm.h @@ -3279,6 +3279,7 @@ #define tcg_target_init tcg_target_init_arm #define 
tcg_target_qemu_prologue tcg_target_qemu_prologue_arm #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_arm +#define tcg_tb_alloc tcg_tb_alloc_arm #define tcg_temp_alloc tcg_temp_alloc_arm #define tcg_temp_free_i32 tcg_temp_free_i32_arm #define tcg_temp_free_i64 tcg_temp_free_i64_arm diff --git a/qemu/armeb.h b/qemu/armeb.h index 688bac5c..1b586568 100644 --- a/qemu/armeb.h +++ b/qemu/armeb.h @@ -3279,6 +3279,7 @@ #define tcg_target_init tcg_target_init_armeb #define tcg_target_qemu_prologue tcg_target_qemu_prologue_armeb #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_armeb +#define tcg_tb_alloc tcg_tb_alloc_armeb #define tcg_temp_alloc tcg_temp_alloc_armeb #define tcg_temp_free_i32 tcg_temp_free_i32_armeb #define tcg_temp_free_i64 tcg_temp_free_i64_armeb diff --git a/qemu/header_gen.py b/qemu/header_gen.py index 51dd01f2..6e152936 100644 --- a/qemu/header_gen.py +++ b/qemu/header_gen.py @@ -3285,6 +3285,7 @@ symbols = ( 'tcg_target_init', 'tcg_target_qemu_prologue', 'tcg_target_reg_alloc_order', + 'tcg_tb_alloc', 'tcg_temp_alloc', 'tcg_temp_free_i32', 'tcg_temp_free_i64', diff --git a/qemu/include/exec/tb-context.h b/qemu/include/exec/tb-context.h index 6680fc2f..7b9d0735 100644 --- a/qemu/include/exec/tb-context.h +++ b/qemu/include/exec/tb-context.h @@ -30,8 +30,9 @@ typedef struct TBContext TBContext; struct TBContext { - TranslationBlock *tbs; + TranslationBlock **tbs; TranslationBlock *tb_phys_hash[CODE_GEN_PHYS_HASH_SIZE]; + size_t tbs_size; int nb_tbs; /* statistics */ diff --git a/qemu/m68k.h b/qemu/m68k.h index 0853579c..7f4af686 100644 --- a/qemu/m68k.h +++ b/qemu/m68k.h @@ -3279,6 +3279,7 @@ #define tcg_target_init tcg_target_init_m68k #define tcg_target_qemu_prologue tcg_target_qemu_prologue_m68k #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_m68k +#define tcg_tb_alloc tcg_tb_alloc_m68k #define tcg_temp_alloc tcg_temp_alloc_m68k #define tcg_temp_free_i32 tcg_temp_free_i32_m68k #define tcg_temp_free_i64 
tcg_temp_free_i64_m68k diff --git a/qemu/mips.h b/qemu/mips.h index cb35d0fa..71ea7813 100644 --- a/qemu/mips.h +++ b/qemu/mips.h @@ -3279,6 +3279,7 @@ #define tcg_target_init tcg_target_init_mips #define tcg_target_qemu_prologue tcg_target_qemu_prologue_mips #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_mips +#define tcg_tb_alloc tcg_tb_alloc_mips #define tcg_temp_alloc tcg_temp_alloc_mips #define tcg_temp_free_i32 tcg_temp_free_i32_mips #define tcg_temp_free_i64 tcg_temp_free_i64_mips diff --git a/qemu/mips64.h b/qemu/mips64.h index a16255d5..672d6f97 100644 --- a/qemu/mips64.h +++ b/qemu/mips64.h @@ -3279,6 +3279,7 @@ #define tcg_target_init tcg_target_init_mips64 #define tcg_target_qemu_prologue tcg_target_qemu_prologue_mips64 #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_mips64 +#define tcg_tb_alloc tcg_tb_alloc_mips64 #define tcg_temp_alloc tcg_temp_alloc_mips64 #define tcg_temp_free_i32 tcg_temp_free_i32_mips64 #define tcg_temp_free_i64 tcg_temp_free_i64_mips64 diff --git a/qemu/mips64el.h b/qemu/mips64el.h index 7dd98b1a..4e040738 100644 --- a/qemu/mips64el.h +++ b/qemu/mips64el.h @@ -3279,6 +3279,7 @@ #define tcg_target_init tcg_target_init_mips64el #define tcg_target_qemu_prologue tcg_target_qemu_prologue_mips64el #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_mips64el +#define tcg_tb_alloc tcg_tb_alloc_mips64el #define tcg_temp_alloc tcg_temp_alloc_mips64el #define tcg_temp_free_i32 tcg_temp_free_i32_mips64el #define tcg_temp_free_i64 tcg_temp_free_i64_mips64el diff --git a/qemu/mipsel.h b/qemu/mipsel.h index aeb70ffe..c2739a08 100644 --- a/qemu/mipsel.h +++ b/qemu/mipsel.h @@ -3279,6 +3279,7 @@ #define tcg_target_init tcg_target_init_mipsel #define tcg_target_qemu_prologue tcg_target_qemu_prologue_mipsel #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_mipsel +#define tcg_tb_alloc tcg_tb_alloc_mipsel #define tcg_temp_alloc tcg_temp_alloc_mipsel #define tcg_temp_free_i32 tcg_temp_free_i32_mipsel 
#define tcg_temp_free_i64 tcg_temp_free_i64_mipsel diff --git a/qemu/powerpc.h b/qemu/powerpc.h index 5dc97b11..ada1e695 100644 --- a/qemu/powerpc.h +++ b/qemu/powerpc.h @@ -3279,6 +3279,7 @@ #define tcg_target_init tcg_target_init_powerpc #define tcg_target_qemu_prologue tcg_target_qemu_prologue_powerpc #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_powerpc +#define tcg_tb_alloc tcg_tb_alloc_powerpc #define tcg_temp_alloc tcg_temp_alloc_powerpc #define tcg_temp_free_i32 tcg_temp_free_i32_powerpc #define tcg_temp_free_i64 tcg_temp_free_i64_powerpc diff --git a/qemu/sparc.h b/qemu/sparc.h index 4ba5a8f0..79a1b824 100644 --- a/qemu/sparc.h +++ b/qemu/sparc.h @@ -3279,6 +3279,7 @@ #define tcg_target_init tcg_target_init_sparc #define tcg_target_qemu_prologue tcg_target_qemu_prologue_sparc #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_sparc +#define tcg_tb_alloc tcg_tb_alloc_sparc #define tcg_temp_alloc tcg_temp_alloc_sparc #define tcg_temp_free_i32 tcg_temp_free_i32_sparc #define tcg_temp_free_i64 tcg_temp_free_i64_sparc diff --git a/qemu/sparc64.h b/qemu/sparc64.h index 64f99e41..65de72a5 100644 --- a/qemu/sparc64.h +++ b/qemu/sparc64.h @@ -3279,6 +3279,7 @@ #define tcg_target_init tcg_target_init_sparc64 #define tcg_target_qemu_prologue tcg_target_qemu_prologue_sparc64 #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_sparc64 +#define tcg_tb_alloc tcg_tb_alloc_sparc64 #define tcg_temp_alloc tcg_temp_alloc_sparc64 #define tcg_temp_free_i32 tcg_temp_free_i32_sparc64 #define tcg_temp_free_i64 tcg_temp_free_i64_sparc64 diff --git a/qemu/tcg/tcg.c b/qemu/tcg/tcg.c index 4c09b750..5496f64e 100644 --- a/qemu/tcg/tcg.c +++ b/qemu/tcg/tcg.c @@ -374,6 +374,26 @@ void tcg_context_init(TCGContext *s) } } +/* + * Allocate TBs right before their corresponding translated code, making + * sure that TBs and code are on different cache lines. 
+ */ +TranslationBlock *tcg_tb_alloc(TCGContext *s) +{ + uintptr_t align = s->uc->qemu_icache_linesize; + TranslationBlock *tb; + void *next; + + tb = (void *)ROUND_UP((uintptr_t)s->code_gen_ptr, align); + next = (void *)ROUND_UP((uintptr_t)(tb + 1), align); + + if (unlikely(next > s->code_gen_highwater)) { + return NULL; + } + s->code_gen_ptr = next; + return tb; +} + void tcg_prologue_init(TCGContext *s) { size_t prologue_size, total_size; diff --git a/qemu/tcg/tcg.h b/qemu/tcg/tcg.h index 2c399b07..2b75ac68 100644 --- a/qemu/tcg/tcg.h +++ b/qemu/tcg/tcg.h @@ -649,6 +649,7 @@ QEMU_BUILD_BUG_ON(sizeof(TCGOp) > 8); /* tb_lock must be held for tcg_malloc_internal. */ void *tcg_malloc_internal(TCGContext *s, int size); void tcg_pool_reset(TCGContext *s); +TranslationBlock *tcg_tb_alloc(TCGContext *s); void tcg_context_init(TCGContext *s); void tcg_context_free(void *s); // free memory allocated for @s @@ -826,7 +827,6 @@ struct TCGContext { here, because there's too much arithmetic throughout that relies on addition and subtraction working on bytes. Rely on the GCC extension that allows arithmetic on void*. */ - int code_gen_max_blocks; void *code_gen_prologue; void *code_gen_epilogue; void *code_gen_buffer; diff --git a/qemu/translate-all.c b/qemu/translate-all.c index d267c8dd..0f4e64f3 100644 --- a/qemu/translate-all.c +++ b/qemu/translate-all.c @@ -823,9 +823,13 @@ static inline void code_gen_alloc(struct uc_struct *uc, size_t tb_size) /* Estimate a good size for the number of TBs we can support. We still haven't deducted the prologue from the buffer size here, but that's minimal and won't affect the estimate much. 
*/ - tcg_ctx->code_gen_max_blocks - = tcg_ctx->code_gen_buffer_size / CODE_GEN_AVG_BLOCK_SIZE; - tcg_ctx->tb_ctx.tbs = g_new(TranslationBlock, tcg_ctx->code_gen_max_blocks); + /* size this conservatively -- realloc later if needed */ + tcg_ctx->tb_ctx.tbs_size = + tcg_ctx->code_gen_buffer_size / CODE_GEN_AVG_BLOCK_SIZE / 8; + if (unlikely(!tcg_ctx->tb_ctx.tbs_size)) { + tcg_ctx->tb_ctx.tbs_size = 64 * 1024; + } + tcg_ctx->tb_ctx.tbs = g_new(TranslationBlock *, tcg_ctx->tb_ctx.tbs_size); } /* Must be called before using the QEMU cpus. 'tb_size' is the size @@ -861,13 +865,20 @@ bool tcg_enabled(struct uc_struct *uc) */ static TranslationBlock *tb_alloc(struct uc_struct *uc, target_ulong pc) { - TranslationBlock *tb; TCGContext *tcg_ctx = uc->tcg_ctx; + TranslationBlock *tb; + TBContext *ctx; - if (tcg_ctx->tb_ctx.nb_tbs >= tcg_ctx->code_gen_max_blocks) { + tb = tcg_tb_alloc(tcg_ctx); + if (unlikely(tb == NULL)) { return NULL; } - tb = &tcg_ctx->tb_ctx.tbs[tcg_ctx->tb_ctx.nb_tbs++]; + ctx = &tcg_ctx->tb_ctx; + if (unlikely(ctx->nb_tbs == ctx->tbs_size)) { + ctx->tbs_size *= 2; + ctx->tbs = g_renew(TranslationBlock *, ctx->tbs, ctx->tbs_size); + } + ctx->tbs[ctx->nb_tbs++] = tb; tb->pc = pc; tb->cflags = 0; tb->invalid = false; @@ -883,8 +894,10 @@ void tb_free(struct uc_struct *uc, TranslationBlock *tb) Ignore the hard cases and just back up if this TB happens to be the last one generated. 
*/ if (tcg_ctx->tb_ctx.nb_tbs > 0 && - tb == &tcg_ctx->tb_ctx.tbs[tcg_ctx->tb_ctx.nb_tbs - 1]) { - tcg_ctx->code_gen_ptr = tb->tc_ptr; + tb == tcg_ctx->tb_ctx.tbs[tcg_ctx->tb_ctx.nb_tbs - 1]) { + size_t struct_size = ROUND_UP(sizeof(*tb), uc->qemu_icache_linesize); + + tcg_ctx->code_gen_ptr = tb->tc_ptr - struct_size; tcg_ctx->tb_ctx.nb_tbs--; } } @@ -1722,7 +1735,7 @@ static TranslationBlock *tb_find_pc(struct uc_struct *uc, uintptr_t tc_ptr) m_max = tcg_ctx->tb_ctx.nb_tbs - 1; while (m_min <= m_max) { m = (m_min + m_max) >> 1; - tb = &tcg_ctx->tb_ctx.tbs[m]; + tb = tcg_ctx->tb_ctx.tbs[m]; v = (uintptr_t)tb->tc_ptr; if (v == tc_ptr) { return tb; @@ -1732,7 +1745,7 @@ static TranslationBlock *tb_find_pc(struct uc_struct *uc, uintptr_t tc_ptr) m_min = m + 1; } } - return &tcg_ctx->tb_ctx.tbs[m_max]; + return tcg_ctx->tb_ctx.tbs[m_max]; } #if !defined(CONFIG_USER_ONLY) @@ -1898,8 +1911,7 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf) cpu_fprintf(f, "gen code size %td/%zd\n", tcg_ctx->code_gen_ptr - tcg_ctx->code_gen_buffer, tcg_ctx->code_gen_highwater - tcg_ctx->code_gen_buffer); - cpu_fprintf(f, "TB count %d/%d\n", - tcg_ctx.tb_ctx.nb_tbs, tcg_ctx.code_gen_max_blocks); + cpu_fprintf(f, "TB count %d\n", tcg_ctx.tb_ctx.nb_tbs); cpu_fprintf(f, "TB avg target size %d max=%d bytes\n", tcg_ctx.tb_ctx.nb_tbs ? target_code_size / tcg_ctx.tb_ctx.nb_tbs : 0, diff --git a/qemu/x86_64.h b/qemu/x86_64.h index e5303da2..1091d9f2 100644 --- a/qemu/x86_64.h +++ b/qemu/x86_64.h @@ -3279,6 +3279,7 @@ #define tcg_target_init tcg_target_init_x86_64 #define tcg_target_qemu_prologue tcg_target_qemu_prologue_x86_64 #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_x86_64 +#define tcg_tb_alloc tcg_tb_alloc_x86_64 #define tcg_temp_alloc tcg_temp_alloc_x86_64 #define tcg_temp_free_i32 tcg_temp_free_i32_x86_64 #define tcg_temp_free_i64 tcg_temp_free_i64_x86_64