tcg: allocate TB structs before the corresponding translated code

Allocating an arbitrarily-sized array of TBs results in either
(a) a lot of wasted memory or (b) unnecessary flushes of the code
cache when we run out of TB structs in the array.

An obvious solution would be to just malloc a TB struct when needed,
and keep the TB array as an array of pointers (recall that tb_find_pc()
needs the TB array to run in O(log n)).

Perhaps a better solution, which is implemented in this patch, is to
allocate TBs right before the translated code they describe. This
results in some memory waste due to the padding needed to keep code and
TBs in separate cache lines; for instance, I measured 4.7% of padding in
the used portion of code_gen_buffer when booting aarch64 Linux on a
host with 64-byte cache lines. However, it allows for optimizations on
some host architectures, since TCG backends can safely assume that the
TB and the corresponding translated code are very close to each other
in memory. See this message by rth for a detailed explanation:

https://lists.gnu.org/archive/html/qemu-devel/2017-03/msg05172.html
Subject: Re: GSoC 2017 Proposal: TCG performance enhancements

Backports commit 6e3b2bfd6af488a896f7936e99ef160f8f37e6f2 from qemu
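
The scheme is easiest to see in a small, self-contained sketch (illustrative only, not part of this commit: the names MyTB, tb_bump_alloc and the fixed 64-byte cache line are assumptions; the real code uses TranslationBlock, tcg_tb_alloc() and the probed host icache line size):

#include <stdint.h>
#include <stddef.h>

#define CACHE_LINE     64u                 /* assumed host icache line size */
#define CODE_BUF_SIZE  (1u << 20)          /* stand-in for code_gen_buffer's size */
#define ROUND_UP(x, a) (((x) + (a) - 1) & ~((uintptr_t)(a) - 1))

typedef struct MyTB {                      /* stand-in for TranslationBlock */
    uint64_t pc;                           /* guest pc this block translates */
    void *tc_ptr;                          /* where its translated host code starts */
} MyTB;

static uint8_t code_buf[CODE_BUF_SIZE];
static uint8_t *code_ptr = code_buf;       /* bump pointer, like code_gen_ptr */

/* Reserve a TB header inside the code buffer; the host code for it is then
 * emitted immediately after, starting on the next cache line. Returns NULL
 * when the buffer is exhausted, which is the caller's cue to flush all
 * translations and start over. */
static MyTB *tb_bump_alloc(uint64_t pc)
{
    MyTB *tb = (MyTB *)ROUND_UP((uintptr_t)code_ptr, CACHE_LINE);
    uint8_t *next = (uint8_t *)ROUND_UP((uintptr_t)(tb + 1), CACHE_LINE);

    if (next > code_buf + CODE_BUF_SIZE) {
        return NULL;
    }
    tb->pc = pc;
    tb->tc_ptr = next;                     /* struct and code land on separate lines */
    code_ptr = next;                       /* translation now emits host code here */
    return tb;
}

The padding the message refers to is the gap between the end of the TB struct and the next cache-line boundary, paid once per translated block.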
Emilio G. Cota 2018-03-03 17:01:28 -05:00 committed by Lioncash
parent 8e58c67968
commit d3ada2feb5
18 changed files with 61 additions and 14 deletions

@@ -3279,6 +3279,7 @@
 #define tcg_target_init tcg_target_init_aarch64
 #define tcg_target_qemu_prologue tcg_target_qemu_prologue_aarch64
 #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_aarch64
+#define tcg_tb_alloc tcg_tb_alloc_aarch64
 #define tcg_temp_alloc tcg_temp_alloc_aarch64
 #define tcg_temp_free_i32 tcg_temp_free_i32_aarch64
 #define tcg_temp_free_i64 tcg_temp_free_i64_aarch64

@@ -3279,6 +3279,7 @@
 #define tcg_target_init tcg_target_init_aarch64eb
 #define tcg_target_qemu_prologue tcg_target_qemu_prologue_aarch64eb
 #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_aarch64eb
+#define tcg_tb_alloc tcg_tb_alloc_aarch64eb
 #define tcg_temp_alloc tcg_temp_alloc_aarch64eb
 #define tcg_temp_free_i32 tcg_temp_free_i32_aarch64eb
 #define tcg_temp_free_i64 tcg_temp_free_i64_aarch64eb

@@ -3279,6 +3279,7 @@
 #define tcg_target_init tcg_target_init_arm
 #define tcg_target_qemu_prologue tcg_target_qemu_prologue_arm
 #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_arm
+#define tcg_tb_alloc tcg_tb_alloc_arm
 #define tcg_temp_alloc tcg_temp_alloc_arm
 #define tcg_temp_free_i32 tcg_temp_free_i32_arm
 #define tcg_temp_free_i64 tcg_temp_free_i64_arm

@@ -3279,6 +3279,7 @@
 #define tcg_target_init tcg_target_init_armeb
 #define tcg_target_qemu_prologue tcg_target_qemu_prologue_armeb
 #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_armeb
+#define tcg_tb_alloc tcg_tb_alloc_armeb
 #define tcg_temp_alloc tcg_temp_alloc_armeb
 #define tcg_temp_free_i32 tcg_temp_free_i32_armeb
 #define tcg_temp_free_i64 tcg_temp_free_i64_armeb

@@ -3285,6 +3285,7 @@ symbols = (
     'tcg_target_init',
     'tcg_target_qemu_prologue',
     'tcg_target_reg_alloc_order',
+    'tcg_tb_alloc',
     'tcg_temp_alloc',
     'tcg_temp_free_i32',
     'tcg_temp_free_i64',

@@ -30,8 +30,9 @@ typedef struct TBContext TBContext;
 struct TBContext {
-    TranslationBlock *tbs;
+    TranslationBlock **tbs;
     TranslationBlock *tb_phys_hash[CODE_GEN_PHYS_HASH_SIZE];
+    size_t tbs_size;
     int nb_tbs;
     /* statistics */

@@ -3279,6 +3279,7 @@
 #define tcg_target_init tcg_target_init_m68k
 #define tcg_target_qemu_prologue tcg_target_qemu_prologue_m68k
 #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_m68k
+#define tcg_tb_alloc tcg_tb_alloc_m68k
 #define tcg_temp_alloc tcg_temp_alloc_m68k
 #define tcg_temp_free_i32 tcg_temp_free_i32_m68k
 #define tcg_temp_free_i64 tcg_temp_free_i64_m68k

@@ -3279,6 +3279,7 @@
 #define tcg_target_init tcg_target_init_mips
 #define tcg_target_qemu_prologue tcg_target_qemu_prologue_mips
 #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_mips
+#define tcg_tb_alloc tcg_tb_alloc_mips
 #define tcg_temp_alloc tcg_temp_alloc_mips
 #define tcg_temp_free_i32 tcg_temp_free_i32_mips
 #define tcg_temp_free_i64 tcg_temp_free_i64_mips

@@ -3279,6 +3279,7 @@
 #define tcg_target_init tcg_target_init_mips64
 #define tcg_target_qemu_prologue tcg_target_qemu_prologue_mips64
 #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_mips64
+#define tcg_tb_alloc tcg_tb_alloc_mips64
 #define tcg_temp_alloc tcg_temp_alloc_mips64
 #define tcg_temp_free_i32 tcg_temp_free_i32_mips64
 #define tcg_temp_free_i64 tcg_temp_free_i64_mips64

@@ -3279,6 +3279,7 @@
 #define tcg_target_init tcg_target_init_mips64el
 #define tcg_target_qemu_prologue tcg_target_qemu_prologue_mips64el
 #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_mips64el
+#define tcg_tb_alloc tcg_tb_alloc_mips64el
 #define tcg_temp_alloc tcg_temp_alloc_mips64el
 #define tcg_temp_free_i32 tcg_temp_free_i32_mips64el
 #define tcg_temp_free_i64 tcg_temp_free_i64_mips64el

@@ -3279,6 +3279,7 @@
 #define tcg_target_init tcg_target_init_mipsel
 #define tcg_target_qemu_prologue tcg_target_qemu_prologue_mipsel
 #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_mipsel
+#define tcg_tb_alloc tcg_tb_alloc_mipsel
 #define tcg_temp_alloc tcg_temp_alloc_mipsel
 #define tcg_temp_free_i32 tcg_temp_free_i32_mipsel
 #define tcg_temp_free_i64 tcg_temp_free_i64_mipsel

@@ -3279,6 +3279,7 @@
 #define tcg_target_init tcg_target_init_powerpc
 #define tcg_target_qemu_prologue tcg_target_qemu_prologue_powerpc
 #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_powerpc
+#define tcg_tb_alloc tcg_tb_alloc_powerpc
 #define tcg_temp_alloc tcg_temp_alloc_powerpc
 #define tcg_temp_free_i32 tcg_temp_free_i32_powerpc
 #define tcg_temp_free_i64 tcg_temp_free_i64_powerpc

@@ -3279,6 +3279,7 @@
 #define tcg_target_init tcg_target_init_sparc
 #define tcg_target_qemu_prologue tcg_target_qemu_prologue_sparc
 #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_sparc
+#define tcg_tb_alloc tcg_tb_alloc_sparc
 #define tcg_temp_alloc tcg_temp_alloc_sparc
 #define tcg_temp_free_i32 tcg_temp_free_i32_sparc
 #define tcg_temp_free_i64 tcg_temp_free_i64_sparc

@@ -3279,6 +3279,7 @@
 #define tcg_target_init tcg_target_init_sparc64
 #define tcg_target_qemu_prologue tcg_target_qemu_prologue_sparc64
 #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_sparc64
+#define tcg_tb_alloc tcg_tb_alloc_sparc64
 #define tcg_temp_alloc tcg_temp_alloc_sparc64
 #define tcg_temp_free_i32 tcg_temp_free_i32_sparc64
 #define tcg_temp_free_i64 tcg_temp_free_i64_sparc64

@@ -374,6 +374,26 @@ void tcg_context_init(TCGContext *s)
     }
 }
 
+/*
+ * Allocate TBs right before their corresponding translated code, making
+ * sure that TBs and code are on different cache lines.
+ */
+TranslationBlock *tcg_tb_alloc(TCGContext *s)
+{
+    uintptr_t align = s->uc->qemu_icache_linesize;
+    TranslationBlock *tb;
+    void *next;
+
+    tb = (void *)ROUND_UP((uintptr_t)s->code_gen_ptr, align);
+    next = (void *)ROUND_UP((uintptr_t)(tb + 1), align);
+
+    if (unlikely(next > s->code_gen_highwater)) {
+        return NULL;
+    }
+    s->code_gen_ptr = next;
+    return tb;
+}
+
 void tcg_prologue_init(TCGContext *s)
 {
     size_t prologue_size, total_size;

@@ -649,6 +649,7 @@ QEMU_BUILD_BUG_ON(sizeof(TCGOp) > 8);
 /* tb_lock must be held for tcg_malloc_internal. */
 void *tcg_malloc_internal(TCGContext *s, int size);
 void tcg_pool_reset(TCGContext *s);
+TranslationBlock *tcg_tb_alloc(TCGContext *s);
 
 void tcg_context_init(TCGContext *s);
 void tcg_context_free(void *s); // free memory allocated for @s
@@ -826,7 +827,6 @@ struct TCGContext {
        here, because there's too much arithmetic throughout that relies
       on addition and subtraction working on bytes. Rely on the GCC
       extension that allows arithmetic on void*. */
-    int code_gen_max_blocks;
     void *code_gen_prologue;
     void *code_gen_epilogue;
     void *code_gen_buffer;

@@ -823,9 +823,13 @@ static inline void code_gen_alloc(struct uc_struct *uc, size_t tb_size)
     /* Estimate a good size for the number of TBs we can support. We
        still haven't deducted the prologue from the buffer size here,
        but that's minimal and won't affect the estimate much. */
-    tcg_ctx->code_gen_max_blocks
-        = tcg_ctx->code_gen_buffer_size / CODE_GEN_AVG_BLOCK_SIZE;
-    tcg_ctx->tb_ctx.tbs = g_new(TranslationBlock, tcg_ctx->code_gen_max_blocks);
+    /* size this conservatively -- realloc later if needed */
+    tcg_ctx->tb_ctx.tbs_size =
+        tcg_ctx->code_gen_buffer_size / CODE_GEN_AVG_BLOCK_SIZE / 8;
+    if (unlikely(!tcg_ctx->tb_ctx.tbs_size)) {
+        tcg_ctx->tb_ctx.tbs_size = 64 * 1024;
+    }
+    tcg_ctx->tb_ctx.tbs = g_new(TranslationBlock *, tcg_ctx->tb_ctx.tbs_size);
 }
 
 /* Must be called before using the QEMU cpus. 'tb_size' is the size
@@ -861,13 +865,20 @@ bool tcg_enabled(struct uc_struct *uc)
  */
 static TranslationBlock *tb_alloc(struct uc_struct *uc, target_ulong pc)
 {
-    TranslationBlock *tb;
     TCGContext *tcg_ctx = uc->tcg_ctx;
+    TranslationBlock *tb;
+    TBContext *ctx;
 
-    if (tcg_ctx->tb_ctx.nb_tbs >= tcg_ctx->code_gen_max_blocks) {
+    tb = tcg_tb_alloc(tcg_ctx);
+    if (unlikely(tb == NULL)) {
         return NULL;
     }
-    tb = &tcg_ctx->tb_ctx.tbs[tcg_ctx->tb_ctx.nb_tbs++];
+    ctx = &tcg_ctx->tb_ctx;
+    if (unlikely(ctx->nb_tbs == ctx->tbs_size)) {
+        ctx->tbs_size *= 2;
+        ctx->tbs = g_renew(TranslationBlock *, ctx->tbs, ctx->tbs_size);
+    }
+    ctx->tbs[ctx->nb_tbs++] = tb;
     tb->pc = pc;
     tb->cflags = 0;
     tb->invalid = false;
@@ -883,8 +894,10 @@ void tb_free(struct uc_struct *uc, TranslationBlock *tb)
        Ignore the hard cases and just back up if this TB happens to
        be the last one generated. */
     if (tcg_ctx->tb_ctx.nb_tbs > 0 &&
-            tb == &tcg_ctx->tb_ctx.tbs[tcg_ctx->tb_ctx.nb_tbs - 1]) {
-        tcg_ctx->code_gen_ptr = tb->tc_ptr;
+            tb == tcg_ctx->tb_ctx.tbs[tcg_ctx->tb_ctx.nb_tbs - 1]) {
+        size_t struct_size = ROUND_UP(sizeof(*tb), uc->qemu_icache_linesize);
+
+        tcg_ctx->code_gen_ptr = tb->tc_ptr - struct_size;
         tcg_ctx->tb_ctx.nb_tbs--;
     }
 }
@@ -1722,7 +1735,7 @@ static TranslationBlock *tb_find_pc(struct uc_struct *uc, uintptr_t tc_ptr)
     m_max = tcg_ctx->tb_ctx.nb_tbs - 1;
     while (m_min <= m_max) {
         m = (m_min + m_max) >> 1;
-        tb = &tcg_ctx->tb_ctx.tbs[m];
+        tb = tcg_ctx->tb_ctx.tbs[m];
         v = (uintptr_t)tb->tc_ptr;
         if (v == tc_ptr) {
             return tb;
@@ -1732,7 +1745,7 @@ static TranslationBlock *tb_find_pc(struct uc_struct *uc, uintptr_t tc_ptr)
             m_min = m + 1;
         }
     }
-    return &tcg_ctx->tb_ctx.tbs[m_max];
+    return tcg_ctx->tb_ctx.tbs[m_max];
 }
 
 #if !defined(CONFIG_USER_ONLY)
@@ -1898,8 +1911,7 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf)
     cpu_fprintf(f, "gen code size %td/%zd\n",
                 tcg_ctx->code_gen_ptr - tcg_ctx->code_gen_buffer,
                 tcg_ctx->code_gen_highwater - tcg_ctx->code_gen_buffer);
-    cpu_fprintf(f, "TB count %d/%d\n",
-            tcg_ctx.tb_ctx.nb_tbs, tcg_ctx.code_gen_max_blocks);
+    cpu_fprintf(f, "TB count %d\n", tcg_ctx.tb_ctx.nb_tbs);
     cpu_fprintf(f, "TB avg target size %d max=%d bytes\n",
                 tcg_ctx.tb_ctx.nb_tbs ? target_code_size /
                 tcg_ctx.tb_ctx.nb_tbs : 0,
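
As the translate-all.c hunks above show, the TB structs now live inside the code buffer and tb_ctx.tbs only tracks pointers to them, grown by doubling; since TBs are carved out of the buffer in ascending address order, the pointer array stays sorted by tc_ptr and tb_find_pc() remains a binary search. A self-contained sketch of that bookkeeping (illustrative names such as MyTB and MyTBArray are assumptions; g_renew() is the GLib helper the patch itself uses):

#include <glib.h>
#include <stdint.h>

typedef struct MyTB { void *tc_ptr; } MyTB;   /* only the field the lookup needs */

typedef struct {
    MyTB **tbs;         /* pointers to TBs, in ascending tc_ptr order */
    size_t tbs_size;    /* current capacity; must start > 0 (cf. code_gen_alloc) */
    int nb_tbs;         /* number of TBs currently registered */
} MyTBArray;

/* Append a freshly allocated TB, doubling the pointer array when it fills up.
 * Appending keeps the array sorted because TBs are allocated in address order. */
static void tb_array_append(MyTBArray *a, MyTB *tb)
{
    if ((size_t)a->nb_tbs == a->tbs_size) {
        a->tbs_size *= 2;
        a->tbs = g_renew(MyTB *, a->tbs, a->tbs_size);
    }
    a->tbs[a->nb_tbs++] = tb;
}

/* Find the TB whose translated code contains host address tc_ptr,
 * mirroring the binary search in tb_find_pc() above. */
static MyTB *tb_array_find(MyTBArray *a, uintptr_t tc_ptr)
{
    int m_min = 0;
    int m_max = a->nb_tbs - 1;

    while (m_min <= m_max) {
        int m = (m_min + m_max) >> 1;
        uintptr_t v = (uintptr_t)a->tbs[m]->tc_ptr;

        if (v == tc_ptr) {
            return a->tbs[m];
        } else if (tc_ptr < v) {
            m_max = m - 1;
        } else {
            m_min = m + 1;
        }
    }
    return m_max >= 0 ? a->tbs[m_max] : NULL;   /* greatest tc_ptr <= target */
}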

@@ -3279,6 +3279,7 @@
 #define tcg_target_init tcg_target_init_x86_64
 #define tcg_target_qemu_prologue tcg_target_qemu_prologue_x86_64
 #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_x86_64
+#define tcg_tb_alloc tcg_tb_alloc_x86_64
 #define tcg_temp_alloc tcg_temp_alloc_x86_64
 #define tcg_temp_free_i32 tcg_temp_free_i32_x86_64
 #define tcg_temp_free_i64 tcg_temp_free_i64_x86_64