tcg: allocate TB structs before the corresponding translated code

Allocating an arbitrarily-sized array of TBs results in either
(a) a lot of wasted memory or (b) unnecessary flushes of the code
cache when we run out of TB structs in the array.

An obvious solution would be to just malloc a TB struct when needed,
and keep the TB array as an array of pointers (recall that tb_find_pc()
needs the TB array to run in O(log n)).

Perhaps a better solution, which is implemented in this patch, is to
allocate TBs right before the translated code they describe. This
results in some memory waste due to the padding needed to keep code and
TBs in separate cache lines; for instance, I measured 4.7% of padding in
the used portion of code_gen_buffer when booting aarch64 Linux on a
host with 64-byte cache lines. However, it allows for optimizations on
some host architectures, since TCG backends can safely assume that the
TB and the corresponding translated code are very close to each other
in memory. See this message by rth for a detailed explanation:

https://lists.gnu.org/archive/html/qemu-devel/2017-03/msg05172.html
Subject: Re: GSoC 2017 Proposal: TCG performance enhancements

Backports commit 6e3b2bfd6af488a896f7936e99ef160f8f37e6f2 from qemu
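
The scheme is easiest to see in a small, self-contained sketch (illustrative only, not part of this commit: the names MyTB, tb_bump_alloc and the fixed 64-byte cache line are assumptions; the real code uses TranslationBlock, tcg_tb_alloc() and the probed host icache line size):

#include <stdint.h>
#include <stddef.h>

#define CACHE_LINE     64u                 /* assumed host icache line size */
#define CODE_BUF_SIZE  (1u << 20)          /* stand-in for code_gen_buffer's size */
#define ROUND_UP(x, a) (((x) + (a) - 1) & ~((uintptr_t)(a) - 1))

typedef struct MyTB {                      /* stand-in for TranslationBlock */
    uint64_t pc;                           /* guest pc this block translates */
    void *tc_ptr;                          /* where its translated host code starts */
} MyTB;

static uint8_t code_buf[CODE_BUF_SIZE];
static uint8_t *code_ptr = code_buf;       /* bump pointer, like code_gen_ptr */

/* Reserve a TB header inside the code buffer; the host code for it is then
 * emitted immediately after, starting on the next cache line. Returns NULL
 * when the buffer is exhausted, which is the caller's cue to flush all
 * translations and start over. */
static MyTB *tb_bump_alloc(uint64_t pc)
{
    MyTB *tb = (MyTB *)ROUND_UP((uintptr_t)code_ptr, CACHE_LINE);
    uint8_t *next = (uint8_t *)ROUND_UP((uintptr_t)(tb + 1), CACHE_LINE);

    if (next > code_buf + CODE_BUF_SIZE) {
        return NULL;
    }
    tb->pc = pc;
    tb->tc_ptr = next;                     /* struct and code land on separate lines */
    code_ptr = next;                       /* translation now emits host code here */
    return tb;
}

The padding the message refers to is the gap between the end of the TB struct and the next cache-line boundary, paid once per translated block.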
Emilio G. Cota 2018-03-03 17:01:28 -05:00 committed by Lioncash
parent 8e58c67968
commit d3ada2feb5
18 changed files with 61 additions and 14 deletions

@@ -3279,6 +3279,7 @@
 #define tcg_target_init tcg_target_init_aarch64
 #define tcg_target_qemu_prologue tcg_target_qemu_prologue_aarch64
 #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_aarch64
+#define tcg_tb_alloc tcg_tb_alloc_aarch64
 #define tcg_temp_alloc tcg_temp_alloc_aarch64
 #define tcg_temp_free_i32 tcg_temp_free_i32_aarch64
 #define tcg_temp_free_i64 tcg_temp_free_i64_aarch64

@@ -3279,6 +3279,7 @@
 #define tcg_target_init tcg_target_init_aarch64eb
 #define tcg_target_qemu_prologue tcg_target_qemu_prologue_aarch64eb
 #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_aarch64eb
+#define tcg_tb_alloc tcg_tb_alloc_aarch64eb
 #define tcg_temp_alloc tcg_temp_alloc_aarch64eb
 #define tcg_temp_free_i32 tcg_temp_free_i32_aarch64eb
 #define tcg_temp_free_i64 tcg_temp_free_i64_aarch64eb

@@ -3279,6 +3279,7 @@
 #define tcg_target_init tcg_target_init_arm
 #define tcg_target_qemu_prologue tcg_target_qemu_prologue_arm
 #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_arm
+#define tcg_tb_alloc tcg_tb_alloc_arm
 #define tcg_temp_alloc tcg_temp_alloc_arm
 #define tcg_temp_free_i32 tcg_temp_free_i32_arm
 #define tcg_temp_free_i64 tcg_temp_free_i64_arm

@@ -3279,6 +3279,7 @@
 #define tcg_target_init tcg_target_init_armeb
 #define tcg_target_qemu_prologue tcg_target_qemu_prologue_armeb
 #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_armeb
+#define tcg_tb_alloc tcg_tb_alloc_armeb
 #define tcg_temp_alloc tcg_temp_alloc_armeb
 #define tcg_temp_free_i32 tcg_temp_free_i32_armeb
 #define tcg_temp_free_i64 tcg_temp_free_i64_armeb

@@ -3285,6 +3285,7 @@ symbols = (
     'tcg_target_init',
     'tcg_target_qemu_prologue',
     'tcg_target_reg_alloc_order',
+    'tcg_tb_alloc',
     'tcg_temp_alloc',
     'tcg_temp_free_i32',
     'tcg_temp_free_i64',

@@ -30,8 +30,9 @@ typedef struct TBContext TBContext;
 struct TBContext {
-    TranslationBlock *tbs;
+    TranslationBlock **tbs;
     TranslationBlock *tb_phys_hash[CODE_GEN_PHYS_HASH_SIZE];
+    size_t tbs_size;
     int nb_tbs;
     /* statistics */

@@ -3279,6 +3279,7 @@
 #define tcg_target_init tcg_target_init_m68k
 #define tcg_target_qemu_prologue tcg_target_qemu_prologue_m68k
 #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_m68k
+#define tcg_tb_alloc tcg_tb_alloc_m68k
 #define tcg_temp_alloc tcg_temp_alloc_m68k
 #define tcg_temp_free_i32 tcg_temp_free_i32_m68k
 #define tcg_temp_free_i64 tcg_temp_free_i64_m68k

@@ -3279,6 +3279,7 @@
 #define tcg_target_init tcg_target_init_mips
 #define tcg_target_qemu_prologue tcg_target_qemu_prologue_mips
 #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_mips
+#define tcg_tb_alloc tcg_tb_alloc_mips
 #define tcg_temp_alloc tcg_temp_alloc_mips
 #define tcg_temp_free_i32 tcg_temp_free_i32_mips
 #define tcg_temp_free_i64 tcg_temp_free_i64_mips

@@ -3279,6 +3279,7 @@
 #define tcg_target_init tcg_target_init_mips64
 #define tcg_target_qemu_prologue tcg_target_qemu_prologue_mips64
 #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_mips64
+#define tcg_tb_alloc tcg_tb_alloc_mips64
 #define tcg_temp_alloc tcg_temp_alloc_mips64
 #define tcg_temp_free_i32 tcg_temp_free_i32_mips64
 #define tcg_temp_free_i64 tcg_temp_free_i64_mips64

@@ -3279,6 +3279,7 @@
 #define tcg_target_init tcg_target_init_mips64el
 #define tcg_target_qemu_prologue tcg_target_qemu_prologue_mips64el
 #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_mips64el
+#define tcg_tb_alloc tcg_tb_alloc_mips64el
 #define tcg_temp_alloc tcg_temp_alloc_mips64el
 #define tcg_temp_free_i32 tcg_temp_free_i32_mips64el
 #define tcg_temp_free_i64 tcg_temp_free_i64_mips64el

@@ -3279,6 +3279,7 @@
 #define tcg_target_init tcg_target_init_mipsel
 #define tcg_target_qemu_prologue tcg_target_qemu_prologue_mipsel
 #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_mipsel
+#define tcg_tb_alloc tcg_tb_alloc_mipsel
 #define tcg_temp_alloc tcg_temp_alloc_mipsel
 #define tcg_temp_free_i32 tcg_temp_free_i32_mipsel
 #define tcg_temp_free_i64 tcg_temp_free_i64_mipsel

@@ -3279,6 +3279,7 @@
 #define tcg_target_init tcg_target_init_powerpc
 #define tcg_target_qemu_prologue tcg_target_qemu_prologue_powerpc
 #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_powerpc
+#define tcg_tb_alloc tcg_tb_alloc_powerpc
 #define tcg_temp_alloc tcg_temp_alloc_powerpc
 #define tcg_temp_free_i32 tcg_temp_free_i32_powerpc
 #define tcg_temp_free_i64 tcg_temp_free_i64_powerpc

@@ -3279,6 +3279,7 @@
 #define tcg_target_init tcg_target_init_sparc
 #define tcg_target_qemu_prologue tcg_target_qemu_prologue_sparc
 #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_sparc
+#define tcg_tb_alloc tcg_tb_alloc_sparc
 #define tcg_temp_alloc tcg_temp_alloc_sparc
 #define tcg_temp_free_i32 tcg_temp_free_i32_sparc
 #define tcg_temp_free_i64 tcg_temp_free_i64_sparc

@@ -3279,6 +3279,7 @@
 #define tcg_target_init tcg_target_init_sparc64
 #define tcg_target_qemu_prologue tcg_target_qemu_prologue_sparc64
 #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_sparc64
+#define tcg_tb_alloc tcg_tb_alloc_sparc64
 #define tcg_temp_alloc tcg_temp_alloc_sparc64
 #define tcg_temp_free_i32 tcg_temp_free_i32_sparc64
 #define tcg_temp_free_i64 tcg_temp_free_i64_sparc64

@@ -374,6 +374,26 @@ void tcg_context_init(TCGContext *s)
     }
 }
 
+/*
+ * Allocate TBs right before their corresponding translated code, making
+ * sure that TBs and code are on different cache lines.
+ */
+TranslationBlock *tcg_tb_alloc(TCGContext *s)
+{
+    uintptr_t align = s->uc->qemu_icache_linesize;
+    TranslationBlock *tb;
+    void *next;
+
+    tb = (void *)ROUND_UP((uintptr_t)s->code_gen_ptr, align);
+    next = (void *)ROUND_UP((uintptr_t)(tb + 1), align);
+
+    if (unlikely(next > s->code_gen_highwater)) {
+        return NULL;
+    }
+    s->code_gen_ptr = next;
+    return tb;
+}
+
 void tcg_prologue_init(TCGContext *s)
 {
     size_t prologue_size, total_size;

@@ -649,6 +649,7 @@ QEMU_BUILD_BUG_ON(sizeof(TCGOp) > 8);
 /* tb_lock must be held for tcg_malloc_internal. */
 void *tcg_malloc_internal(TCGContext *s, int size);
 void tcg_pool_reset(TCGContext *s);
+TranslationBlock *tcg_tb_alloc(TCGContext *s);
 
 void tcg_context_init(TCGContext *s);
 void tcg_context_free(void *s); // free memory allocated for @s
@@ -826,7 +827,6 @@ struct TCGContext {
        here, because there's too much arithmetic throughout that relies
       on addition and subtraction working on bytes. Rely on the GCC
       extension that allows arithmetic on void*. */
-    int code_gen_max_blocks;
     void *code_gen_prologue;
     void *code_gen_epilogue;
     void *code_gen_buffer;

@@ -823,9 +823,13 @@ static inline void code_gen_alloc(struct uc_struct *uc, size_t tb_size)
     /* Estimate a good size for the number of TBs we can support. We
        still haven't deducted the prologue from the buffer size here,
        but that's minimal and won't affect the estimate much. */
-    tcg_ctx->code_gen_max_blocks
-        = tcg_ctx->code_gen_buffer_size / CODE_GEN_AVG_BLOCK_SIZE;
-    tcg_ctx->tb_ctx.tbs = g_new(TranslationBlock, tcg_ctx->code_gen_max_blocks);
+    /* size this conservatively -- realloc later if needed */
+    tcg_ctx->tb_ctx.tbs_size =
+        tcg_ctx->code_gen_buffer_size / CODE_GEN_AVG_BLOCK_SIZE / 8;
+    if (unlikely(!tcg_ctx->tb_ctx.tbs_size)) {
+        tcg_ctx->tb_ctx.tbs_size = 64 * 1024;
+    }
+    tcg_ctx->tb_ctx.tbs = g_new(TranslationBlock *, tcg_ctx->tb_ctx.tbs_size);
 }
 
 /* Must be called before using the QEMU cpus. 'tb_size' is the size
@@ -861,13 +865,20 @@ bool tcg_enabled(struct uc_struct *uc)
  */
 static TranslationBlock *tb_alloc(struct uc_struct *uc, target_ulong pc)
 {
-    TranslationBlock *tb;
     TCGContext *tcg_ctx = uc->tcg_ctx;
+    TranslationBlock *tb;
+    TBContext *ctx;
 
-    if (tcg_ctx->tb_ctx.nb_tbs >= tcg_ctx->code_gen_max_blocks) {
+    tb = tcg_tb_alloc(tcg_ctx);
+    if (unlikely(tb == NULL)) {
         return NULL;
     }
-    tb = &tcg_ctx->tb_ctx.tbs[tcg_ctx->tb_ctx.nb_tbs++];
+    ctx = &tcg_ctx->tb_ctx;
+    if (unlikely(ctx->nb_tbs == ctx->tbs_size)) {
+        ctx->tbs_size *= 2;
+        ctx->tbs = g_renew(TranslationBlock *, ctx->tbs, ctx->tbs_size);
+    }
+    ctx->tbs[ctx->nb_tbs++] = tb;
     tb->pc = pc;
     tb->cflags = 0;
     tb->invalid = false;
@@ -883,8 +894,10 @@ void tb_free(struct uc_struct *uc, TranslationBlock *tb)
        Ignore the hard cases and just back up if this TB happens to
        be the last one generated. */
     if (tcg_ctx->tb_ctx.nb_tbs > 0 &&
-            tb == &tcg_ctx->tb_ctx.tbs[tcg_ctx->tb_ctx.nb_tbs - 1]) {
-        tcg_ctx->code_gen_ptr = tb->tc_ptr;
+            tb == tcg_ctx->tb_ctx.tbs[tcg_ctx->tb_ctx.nb_tbs - 1]) {
+        size_t struct_size = ROUND_UP(sizeof(*tb), uc->qemu_icache_linesize);
+
+        tcg_ctx->code_gen_ptr = tb->tc_ptr - struct_size;
         tcg_ctx->tb_ctx.nb_tbs--;
     }
 }
@@ -1722,7 +1735,7 @@ static TranslationBlock *tb_find_pc(struct uc_struct *uc, uintptr_t tc_ptr)
     m_max = tcg_ctx->tb_ctx.nb_tbs - 1;
     while (m_min <= m_max) {
         m = (m_min + m_max) >> 1;
-        tb = &tcg_ctx->tb_ctx.tbs[m];
+        tb = tcg_ctx->tb_ctx.tbs[m];
         v = (uintptr_t)tb->tc_ptr;
         if (v == tc_ptr) {
             return tb;
@@ -1732,7 +1745,7 @@ static TranslationBlock *tb_find_pc(struct uc_struct *uc, uintptr_t tc_ptr)
             m_min = m + 1;
         }
     }
-    return &tcg_ctx->tb_ctx.tbs[m_max];
+    return tcg_ctx->tb_ctx.tbs[m_max];
 }
 
 #if !defined(CONFIG_USER_ONLY)
@@ -1898,8 +1911,7 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf)
     cpu_fprintf(f, "gen code size %td/%zd\n",
                 tcg_ctx->code_gen_ptr - tcg_ctx->code_gen_buffer,
                 tcg_ctx->code_gen_highwater - tcg_ctx->code_gen_buffer);
-    cpu_fprintf(f, "TB count %d/%d\n",
-            tcg_ctx.tb_ctx.nb_tbs, tcg_ctx.code_gen_max_blocks);
+    cpu_fprintf(f, "TB count %d\n", tcg_ctx.tb_ctx.nb_tbs);
     cpu_fprintf(f, "TB avg target size %d max=%d bytes\n",
                 tcg_ctx.tb_ctx.nb_tbs ? target_code_size /
                 tcg_ctx.tb_ctx.nb_tbs : 0,
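
As the translate-all.c hunks above show, the TB structs now live inside the code buffer and tb_ctx.tbs only tracks pointers to them, grown by doubling; since TBs are carved out of the buffer in ascending address order, the pointer array stays sorted by tc_ptr and tb_find_pc() remains a binary search. A self-contained sketch of that bookkeeping (illustrative names such as MyTB and MyTBArray are assumptions; g_renew() is the GLib helper the patch itself uses):

#include <glib.h>
#include <stdint.h>

typedef struct MyTB { void *tc_ptr; } MyTB;   /* only the field the lookup needs */

typedef struct {
    MyTB **tbs;         /* pointers to TBs, in ascending tc_ptr order */
    size_t tbs_size;    /* current capacity; must start > 0 (cf. code_gen_alloc) */
    int nb_tbs;         /* number of TBs currently registered */
} MyTBArray;

/* Append a freshly allocated TB, doubling the pointer array when it fills up.
 * Appending keeps the array sorted because TBs are allocated in address order. */
static void tb_array_append(MyTBArray *a, MyTB *tb)
{
    if ((size_t)a->nb_tbs == a->tbs_size) {
        a->tbs_size *= 2;
        a->tbs = g_renew(MyTB *, a->tbs, a->tbs_size);
    }
    a->tbs[a->nb_tbs++] = tb;
}

/* Find the TB whose translated code contains host address tc_ptr,
 * mirroring the binary search in tb_find_pc() above. */
static MyTB *tb_array_find(MyTBArray *a, uintptr_t tc_ptr)
{
    int m_min = 0;
    int m_max = a->nb_tbs - 1;

    while (m_min <= m_max) {
        int m = (m_min + m_max) >> 1;
        uintptr_t v = (uintptr_t)a->tbs[m]->tc_ptr;

        if (v == tc_ptr) {
            return a->tbs[m];
        } else if (tc_ptr < v) {
            m_max = m - 1;
        } else {
            m_min = m + 1;
        }
    }
    return m_max >= 0 ? a->tbs[m_max] : NULL;   /* greatest tc_ptr <= target */
}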

@@ -3279,6 +3279,7 @@
 #define tcg_target_init tcg_target_init_x86_64
 #define tcg_target_qemu_prologue tcg_target_qemu_prologue_x86_64
 #define tcg_target_reg_alloc_order tcg_target_reg_alloc_order_x86_64
+#define tcg_tb_alloc tcg_tb_alloc_x86_64
 #define tcg_temp_alloc tcg_temp_alloc_x86_64
 #define tcg_temp_free_i32 tcg_temp_free_i32_x86_64
 #define tcg_temp_free_i64 tcg_temp_free_i64_x86_64