translate-all: use a binary search tree to track TBs in TBContext

This is a prerequisite for supporting multiple TCG contexts, since
we will have threads generating code in separate regions of
code_gen_buffer.

For this we need a new field (.size) in struct tb_tc to keep
track of the size of the translated code. This field uses a size_t
to avoid adding a hole to the struct, although really an unsigned
int would have been enough.

The comparison function we use is optimized for the common case:
insertions. Profiling shows that upon booting debian-arm, 98%
of comparisons are between existing tb's (i.e. a->size and b->size
are both !0), which happens during insertions (and removals, but
those are rare). The remaining cases are lookups. From reading the glib
sources we see that the first key is always the lookup key. The code
does not rely on this, because the behaviour is not guaranteed in the
glib docs; we only embed this knowledge in the code as a branch hint
for the compiler.

Note that tb_free does not free space in the code_gen_buffer anymore,
since we cannot easily know whether the tb is the last one inserted
in code_gen_buffer. The next patch in this series renames tb_free
to tb_remove to reflect this.

Performance-wise, lookups in tb_find_pc are the same as before:
O(log n). However, insertions are O(log n) instead of O(1), which
results in a small slowdown when booting debian-arm:

Performance counter stats for 'build/arm-softmmu/qemu-system-arm \
-machine type=virt -nographic -smp 1 -m 4096 \
-netdev user,id=unet,hostfwd=tcp::2222-:22 \
-device virtio-net-device,netdev=unet \
-drive file=img/arm/jessie-arm32.qcow2,id=myblock,index=0,if=none \
-device virtio-blk-device,drive=myblock \
-kernel img/arm/aarch32-current-linux-kernel-only.img \
-append console=ttyAMA0 root=/dev/vda1 \
-name arm,debug-threads=on -smp 1' (10 runs):

- Before:

8048.598422 task-clock (msec) # 0.931 CPUs utilized ( +- 0.28% )
16,974 context-switches # 0.002 M/sec ( +- 0.12% )
0 cpu-migrations # 0.000 K/sec
10,125 page-faults # 0.001 M/sec ( +- 1.23% )
35,144,901,879 cycles # 4.367 GHz ( +- 0.14% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
65,758,252,643 instructions # 1.87 insns per cycle ( +- 0.33% )
10,871,298,668 branches # 1350.707 M/sec ( +- 0.41% )
192,322,212 branch-misses # 1.77% of all branches ( +- 0.32% )

8.640869419 seconds time elapsed ( +- 0.57% )

- After:
8146.242027 task-clock (msec) # 0.923 CPUs utilized ( +- 1.23% )
17,016 context-switches # 0.002 M/sec ( +- 0.40% )
0 cpu-migrations # 0.000 K/sec
18,769 page-faults # 0.002 M/sec ( +- 0.45% )
35,660,956,120 cycles # 4.378 GHz ( +- 1.22% )
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
65,095,366,607 instructions # 1.83 insns per cycle ( +- 1.73% )
10,803,480,261 branches # 1326.192 M/sec ( +- 1.95% )
195,601,289 branch-misses # 1.81% of all branches ( +- 0.39% )

8.828660235 seconds time elapsed ( +- 0.38% )

Backports commit 2ac01d6dafabd4a726254eea98824c798d416ee4 from qemu
This commit is contained in:
Emilio G. Cota 2018-03-13 15:34:46 -04:00 committed by Lioncash
parent 35e551dc45
commit f7c984d21f
No known key found for this signature in database
GPG key ID: 4E3C3CC1031BA9C7
11 changed files with 1411 additions and 75 deletions

View file

@ -250,8 +250,6 @@ static int encode_search(TCGContext *tcg_ctx, TranslationBlock *tb, uint8_t *blo
uint8_t *p = block;
int i, j, n;
tb->tc.search = block;
for (i = 0, n = tb->icount; i < n; ++i) {
target_ulong prev;
@ -287,7 +285,7 @@ static int cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
target_ulong data[TARGET_INSN_START_WORDS] = { tb->pc };
uintptr_t host_pc = (uintptr_t)tb->tc.ptr;
CPUArchState *env = cpu->env_ptr;
uint8_t *p = tb->tc.search;
/* search data follows the translated code; cast first, void * arithmetic is a GNU extension */
uint8_t *p = (uint8_t *)tb->tc.ptr + tb->tc.size;
int i, j, num_insns = tb->icount;
#ifdef CONFIG_PROFILER
int64_t ti = profile_getclock();
@ -806,6 +804,48 @@ void free_code_gen_buffer(struct uc_struct *uc)
}
#endif /* USE_STATIC_CODE_GEN_BUFFER, USE_MMAP */
/*
 * Compare a pointer @ptr against the code range described by the tb_tc @s.
 *
 * Returns:
 *   -1 if @ptr lies before s->ptr,
 *    1 if @ptr lies at or past s->ptr + s->size,
 *    0 if @ptr falls inside [s->ptr, s->ptr + s->size).
 */
static int ptr_cmp_tb_tc(const void *ptr, const struct tb_tc *s)
{
    /*
     * Work on integer addresses: arithmetic on void * is a GNU extension
     * that other compilers reject, and relational operators on void * are
     * not required to be accepted either.
     */
    const uintptr_t addr = (uintptr_t)ptr;
    const uintptr_t start = (uintptr_t)s->ptr;

    if (addr < start) {
        return -1;
    }
    /* addr >= start here, so the subtraction cannot wrap */
    if (addr - start >= s->size) {
        return 1;
    }
    return 0;
}
/*
 * Key comparison function for the tree that tracks translated blocks.
 *
 * Both @ap and @bp point to a struct tb_tc. A key whose .size is 0 is a
 * lookup key: only its .ptr is meaningful, and it matches the entry whose
 * [ptr, ptr + size) range contains that pointer.
 *
 * Returns a negative value, zero or a positive value when @ap sorts before,
 * matches, or sorts after @bp, respectively.
 */
static gint tb_tc_cmp(gconstpointer ap, gconstpointer bp)
{
    const struct tb_tc *a = ap;
    const struct tb_tc *b = bp;

    /*
     * When both sizes are set, we know this isn't a lookup.
     * This is the most likely case: every TB must be inserted; lookups
     * are a lot less frequent.
     */
    if (likely(a->size && b->size)) {
        if (a->ptr > b->ptr) {
            return 1;
        } else if (a->ptr < b->ptr) {
            return -1;
        }
        /* a->ptr == b->ptr should happen only on deletions */
        g_assert(a->size == b->size);
        return 0;
    }
    /*
     * Lookup keys have their .size field set to 0.
     * From the glib sources we see that @ap is always the lookup key. However
     * the docs provide no guarantee, so we just mark this case as likely.
     */
    if (likely(a->size == 0)) {
        return ptr_cmp_tb_tc(a->ptr, b);
    }
    /*
     * Here @bp is the lookup key. ptr_cmp_tb_tc() reports where the key sits
     * relative to @a, whereas we must report where @a sits relative to the
     * key, so the sign has to be flipped to keep the ordering consistent.
     */
    return -ptr_cmp_tb_tc(b->ptr, a);
}
static inline void code_gen_alloc(struct uc_struct *uc, size_t tb_size)
{
TCGContext *tcg_ctx = uc->tcg_ctx;
@ -825,12 +865,7 @@ static inline void code_gen_alloc(struct uc_struct *uc, size_t tb_size)
still haven't deducted the prologue from the buffer size here,
but that's minimal and won't affect the estimate much. */
/* size this conservatively -- realloc later if needed */
tcg_ctx->tb_ctx.tbs_size =
tcg_ctx->code_gen_buffer_size / CODE_GEN_AVG_BLOCK_SIZE / 8;
if (unlikely(!tcg_ctx->tb_ctx.tbs_size)) {
tcg_ctx->tb_ctx.tbs_size = 64 * 1024;
}
tcg_ctx->tb_ctx.tbs = g_new(TranslationBlock *, tcg_ctx->tb_ctx.tbs_size);
tcg_ctx->tb_ctx.tb_tree = g_tree_new(tb_tc_cmp);
}
static void tb_htable_init(struct uc_struct *uc)
@ -877,18 +912,11 @@ static TranslationBlock *tb_alloc(struct uc_struct *uc, target_ulong pc)
{
TCGContext *tcg_ctx = uc->tcg_ctx;
TranslationBlock *tb;
TBContext *ctx;
tb = tcg_tb_alloc(tcg_ctx);
if (unlikely(tb == NULL)) {
return NULL;
}
ctx = &tcg_ctx->tb_ctx;
if (unlikely(ctx->nb_tbs == ctx->tbs_size)) {
ctx->tbs_size *= 2;
ctx->tbs = g_renew(TranslationBlock *, ctx->tbs, ctx->tbs_size);
}
ctx->tbs[ctx->nb_tbs++] = tb;
return tb;
}
@ -897,16 +925,7 @@ void tb_free(struct uc_struct *uc, TranslationBlock *tb)
{
TCGContext *tcg_ctx = uc->tcg_ctx;
/* In practice this is mostly used for single use temporary TB
Ignore the hard cases and just back up if this TB happens to
be the last one generated. */
if (tcg_ctx->tb_ctx.nb_tbs > 0 &&
tb == tcg_ctx->tb_ctx.tbs[tcg_ctx->tb_ctx.nb_tbs - 1]) {
size_t struct_size = ROUND_UP(sizeof(*tb), uc->qemu_icache_linesize);
tcg_ctx->code_gen_ptr = tb->tc.ptr - struct_size;
tcg_ctx->tb_ctx.nb_tbs--;
}
g_tree_remove(tcg_ctx->tb_ctx.tb_tree, &tb->tc);
}
static inline void invalidate_page_bitmap(PageDesc *p)
@ -963,11 +982,12 @@ void tb_flush(CPUState *cpu)
TCGContext *tcg_ctx = uc->tcg_ctx;
if (DEBUG_TB_FLUSH_GATE) {
printf("qemu: flush code_size=%td nb_tbs=%d avg_tb_size=%td\n",
tcg_ctx->code_gen_ptr - tcg_ctx->code_gen_buffer,
tcg_ctx->tb_ctx.nb_tbs, tcg_ctx->tb_ctx.nb_tbs > 0 ?
(tcg_ctx->code_gen_ptr - tcg_ctx->code_gen_buffer) /
tcg_ctx->tb_ctx.nb_tbs : 0);
size_t nb_tbs = g_tree_nnodes(tcg_ctx->tb_ctx.tb_tree);
/* average TB size = generated code size / number of TBs, or 0 if the tree is empty */
printf("qemu: flush code_size=%td nb_tbs=%zu avg_tb_size=%zu\n",
       tcg_ctx->code_gen_ptr - tcg_ctx->code_gen_buffer, nb_tbs,
       nb_tbs > 0 ?
       (size_t)(tcg_ctx->code_gen_ptr - tcg_ctx->code_gen_buffer) / nb_tbs : 0);
}
if ((unsigned long)((char*)tcg_ctx->code_gen_ptr - (char*)tcg_ctx->code_gen_buffer)
> tcg_ctx->code_gen_buffer_size) {
@ -977,7 +997,10 @@ void tb_flush(CPUState *cpu)
cpu_tb_jmp_cache_clear(cpu);
atomic_mb_set(&cpu->tb_flushed, true);
tcg_ctx->tb_ctx.nb_tbs = 0;
/* Increment the refcount first so that destroy acts as a reset */
g_tree_ref(tcg_ctx->tb_ctx.tb_tree);
g_tree_destroy(tcg_ctx->tb_ctx.tb_tree);
qht_reset_size(&tcg_ctx->tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
page_flush_tb(uc);
@ -1393,6 +1416,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
if (unlikely(search_size < 0)) {
goto buffer_overflow;
}
tb->tc.size = gen_code_size;
#ifdef CONFIG_PROFILER
tcg_ctx.code_time += profile_getclock();
@ -1464,6 +1488,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
* through the physical hash table and physical page list.
*/
tb_link_page(cpu->uc, tb, phys_pc, phys_page2);
g_tree_insert(tcg_ctx->tb_ctx.tb_tree, &tb->tc, tb);
return tb;
}
@ -1720,38 +1745,18 @@ static bool tb_invalidate_phys_page(tb_page_addr_t addr, uintptr_t pc)
}
#endif
/* find the TB 'tb' such that tb[0].tc.ptr <= tc_ptr <
tb[1].tc.ptr. Return NULL if not found */
/*
* Find the TB 'tb' such that
* tb->tc.ptr <= tc_ptr < tb->tc.ptr + tb->tc.size
* Return NULL if not found.
*/
static TranslationBlock *tb_find_pc(struct uc_struct *uc, uintptr_t tc_ptr)
{
TCGContext *tcg_ctx = uc->tcg_ctx;
int m_min, m_max, m;
uintptr_t v;
TranslationBlock *tb;
struct tb_tc s = {0};
s.ptr = (void *)tc_ptr;
if (tcg_ctx->tb_ctx.nb_tbs <= 0) {
return NULL;
}
if (tc_ptr < (uintptr_t)tcg_ctx->code_gen_buffer ||
tc_ptr >= (uintptr_t)tcg_ctx->code_gen_ptr) {
return NULL;
}
/* binary search (cf Knuth) */
m_min = 0;
m_max = tcg_ctx->tb_ctx.nb_tbs - 1;
while (m_min <= m_max) {
m = (m_min + m_max) >> 1;
tb = tcg_ctx->tb_ctx.tbs[m];
v = (uintptr_t)tb->tc.ptr;
if (v == tc_ptr) {
return tb;
} else if (tc_ptr < v) {
m_max = m - 1;
} else {
m_min = m + 1;
}
}
return tcg_ctx->tb_ctx.tbs[m_max];
return g_tree_lookup(tcg_ctx->tb_ctx.tb_tree, &s);
}
#if !defined(CONFIG_USER_ONLY)

File diff suppressed because it is too large Load diff

View file

@ -210,10 +210,14 @@ static inline void tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr)
/*
* Translation Cache-related fields of a TB.
* This struct exists just for convenience; we keep track of TB's in a binary
* search tree, and the only fields needed to compare TB's in the tree are
* @ptr and @size.
* Note: the address of search data can be obtained by adding @size to @ptr.
*/
struct tb_tc {
void *ptr; /* pointer to the translated code */
uint8_t *search; /* pointer to search data */
size_t size; /* size of the translated code; ptr + size is where the search data starts */
};
struct TranslationBlock {

View file

@ -31,10 +31,8 @@ typedef struct TBContext TBContext;
struct TBContext {
TranslationBlock **tbs;
GTree *tb_tree;
struct qht htable;
size_t tbs_size;
int nb_tbs;
/* statistics */
int tb_flush_count;

View file

@ -61,6 +61,10 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
typedef void* gpointer;
typedef const void *gconstpointer;
typedef int gint;
typedef signed char gint8;
typedef unsigned char guint8;
typedef signed short gint16;
typedef unsigned short guint16;
typedef uint32_t guint32;
typedef uint64_t guint64;
typedef unsigned int guint;
@ -71,12 +75,24 @@ typedef unsigned long gulong;
typedef unsigned long gsize;
typedef signed long gssize;
typedef gint (*GCompareDataFunc)(gconstpointer a,
gconstpointer b,
gpointer user_data);
typedef void (*GFunc)(gpointer data, gpointer user_data);
typedef gint (*GCompareFunc)(gconstpointer v1, gconstpointer v2);
typedef gint (*GCompareFunc)(gconstpointer a, gconstpointer b);
typedef gint (*GCompareDataFunc)(gconstpointer a, gconstpointer b,
gpointer user_data);
typedef gboolean (*GEqualFunc)(gconstpointer a, gconstpointer b);
typedef void (*GDestroyNotify)(gpointer data);
typedef void (*GFunc)(gpointer data, gpointer user_data);
typedef guint (*GHashFunc)(gconstpointer key);
typedef void (*GHFunc)(gpointer key, gpointer value, gpointer user_data);
typedef void (*GFreeFunc)(gpointer data);
/*
 * Tree traverse orders.
 * Passed as the traverse_type argument of g_tree_traverse() to select the
 * order in which the tree's nodes are visited. The enumerators carry the
 * default values 0..3 in declaration order.
 */
typedef enum
{
G_IN_ORDER,
G_PRE_ORDER,
G_POST_ORDER,
G_LEVEL_ORDER
} GTraverseType;
guint g_direct_hash(gconstpointer v);
gboolean g_direct_equal(gconstpointer v1, gconstpointer v2);
@ -178,6 +194,36 @@ GHashTable *g_hash_table_iter_get_hash_table(GHashTableIter *iter);
void g_hash_table_iter_remove(GHashTableIter *iter);
void g_hash_table_iter_steal(GHashTableIter *iter);
/* Tree code */
typedef struct _GTree GTree;
typedef gboolean (*GTraverseFunc) (gpointer key,
gpointer value,
gpointer data);
GTree *g_tree_new(GCompareFunc key_compare_func);
GTree *g_tree_new_with_data(GCompareDataFunc key_compare_func,
gpointer key_compare_data);
GTree *g_tree_new_full(GCompareDataFunc key_compare_func,
gpointer key_compare_data,
GDestroyNotify key_destroy_func,
GDestroyNotify value_destroy_func);
GTree *g_tree_ref(GTree *tree);
void g_tree_unref(GTree *tree);
void g_tree_destroy(GTree *tree);
void g_tree_insert(GTree *tree, gpointer key, gpointer value);
void g_tree_replace(GTree *tree, gpointer key, gpointer value);
gboolean g_tree_remove(GTree *tree, gconstpointer key);
gboolean g_tree_steal(GTree *tree, gconstpointer key);
gpointer g_tree_lookup(GTree *tree, gconstpointer key);
gboolean g_tree_lookup_extended(GTree *tree, gconstpointer lookup_key,
gpointer *orig_key, gpointer *value);
void g_tree_foreach(GTree *tree, GTraverseFunc func, gpointer user_data);
gpointer g_tree_search(GTree *tree, GCompareFunc search_func, gconstpointer user_data);
gint g_tree_height(GTree *tree);
gint g_tree_nnodes(GTree *tree);
void g_tree_traverse(GTree *tree, GTraverseFunc traverse_func, GTraverseType traverse_type, gpointer user_data);
/* replacement for g_malloc dependency */
void g_free(gpointer ptr);
gpointer g_malloc(size_t size);

View file

@ -27,7 +27,7 @@ void arm64_release(void* ctx)
struct uc_struct* uc = s->uc;
ARMCPU* cpu = ARM_CPU(uc, uc->cpu);
g_free(s->tb_ctx.tbs);
g_tree_destroy(s->tb_ctx.tb_tree);
g_free(cpu->cpreg_indexes);
g_free(cpu->cpreg_values);
g_free(cpu->cpreg_vmstate_indexes);

View file

@ -29,7 +29,7 @@ void arm_release(void* ctx)
ARMCPU* cpu = ARM_CPU(uc, uc->cpu);
CPUArchState *env = &cpu->env;
g_free(s->tb_ctx.tbs);
g_tree_destroy(s->tb_ctx.tb_tree);
g_free(cpu->cpreg_indexes);
g_free(cpu->cpreg_values);
g_free(cpu->cpreg_vmstate_indexes);

View file

@ -58,7 +58,7 @@ void x86_release(void *ctx)
release_common(ctx);
// arch specific
g_free(s->tb_ctx.tbs);
g_tree_destroy(s->tb_ctx.tb_tree);
}
void x86_reg_reset(struct uc_struct *uc)

View file

@ -25,7 +25,7 @@ void m68k_release(void* ctx)
TCGContext *tcg_ctx = ctx;;
release_common(ctx);
g_free(tcg_ctx->tb_ctx.tbs);
/* tear down the tree that tracks this context's TBs; the handle in scope here is tcg_ctx */
g_tree_destroy(tcg_ctx->tb_ctx.tb_tree);
}
void m68k_reg_reset(struct uc_struct *uc)

View file

@ -54,7 +54,7 @@ void mips_release(void *ctx)
release_common(ctx);
g_free(cpu->env.tlb);
g_free(cpu->env.mvp);
g_free(tcg_ctx->tb_ctx.tbs);
/* tear down the tree that tracks this context's TBs; use the same tcg_ctx handle as the g_free() it replaces */
g_tree_destroy(tcg_ctx->tb_ctx.tb_tree);
}
void mips_reg_reset(struct uc_struct *uc)

View file

@ -35,7 +35,7 @@ void sparc_release(void *ctx)
{
TCGContext *tcg_ctx = (TCGContext *) ctx;
release_common(ctx);
g_free(tcg_ctx->tb_ctx.tbs);
/* tear down the tree that tracks this context's TBs; tcg_ctx is the only handle declared in this function */
g_tree_destroy(tcg_ctx->tb_ctx.tb_tree);
}
void sparc_reg_reset(struct uc_struct *uc)