tcg: consolidate TB lookups in tb_lookup__cpu_state

This avoids duplicating code. cpu_exec_step will also use the
new common function once we integrate parallel_cpus into tb->cflags.

Note that in this commit we also fix a race, described by Richard Henderson
during review. Think of this scenario with threads A and B:

(A) Lookup succeeds for TB in hash without tb_lock
(B) Sets the TB's tb->invalid flag
(B) Removes the TB from tb_htable
(B) Clears all CPU's tb_jmp_cache
(A) Store TB into local tb_jmp_cache

Given that order of events, (A) will keep executing that invalid TB until
another flush of its tb_jmp_cache happens, which in theory might never happen.
We can fix this by checking the tb->invalid flag every time we look up a TB
from tb_jmp_cache, so that in the above scenario, next time we try to find
that TB in tb_jmp_cache, we won't, and will therefore be forced to look it
up in tb_htable.

Performance-wise, I measured a small improvement when booting debian-arm.
Note that inlining pays off:

Performance counter stats for 'taskset -c 0 qemu-system-arm \
-machine type=virt -nographic -smp 1 -m 4096 \
-netdev user,id=unet,hostfwd=tcp::2222-:22 \
-device virtio-net-device,netdev=unet \
-drive file=jessie.qcow2,id=myblock,index=0,if=none \
-device virtio-blk-device,drive=myblock \
-kernel kernel.img -append console=ttyAMA0 root=/dev/vda1 \
-name arm,debug-threads=on -smp 1' (10 runs):

Before:
18714.917392 task-clock # 0.952 CPUs utilized ( +- 0.95% )
23,142 context-switches # 0.001 M/sec ( +- 0.50% )
1 CPU-migrations # 0.000 M/sec
10,558 page-faults # 0.001 M/sec ( +- 0.95% )
53,957,727,252 cycles # 2.883 GHz ( +- 0.91% ) [83.33%]
24,440,599,852 stalled-cycles-frontend # 45.30% frontend cycles idle ( +- 1.20% ) [83.33%]
16,495,714,424 stalled-cycles-backend # 30.57% backend cycles idle ( +- 0.95% ) [66.66%]
76,267,572,582 instructions # 1.41 insns per cycle
12,692,186,323 branches # 678.186 M/sec ( +- 0.92% ) [83.35%]
263,486,879 branch-misses # 2.08% of all branches ( +- 0.73% ) [83.34%]

19.648474449 seconds time elapsed ( +- 0.82% )

After, w/ inline (this patch):
18471.376627 task-clock # 0.955 CPUs utilized ( +- 0.96% )
23,048 context-switches # 0.001 M/sec ( +- 0.48% )
1 CPU-migrations # 0.000 M/sec
10,708 page-faults # 0.001 M/sec ( +- 0.81% )
53,208,990,796 cycles # 2.881 GHz ( +- 0.98% ) [83.34%]
23,941,071,673 stalled-cycles-frontend # 44.99% frontend cycles idle ( +- 0.95% ) [83.34%]
16,161,773,848 stalled-cycles-backend # 30.37% backend cycles idle ( +- 0.76% ) [66.67%]
75,786,269,766 instructions # 1.42 insns per cycle
12,573,617,143 branches # 680.708 M/sec ( +- 1.34% ) [83.33%]
260,235,550 branch-misses # 2.07% of all branches ( +- 0.66% ) [83.33%]

19.340502161 seconds time elapsed ( +- 0.56% )

After, w/o inline:
18791.253967 task-clock # 0.954 CPUs utilized ( +- 0.78% )
23,230 context-switches # 0.001 M/sec ( +- 0.42% )
1 CPU-migrations # 0.000 M/sec
10,563 page-faults # 0.001 M/sec ( +- 1.27% )
54,168,674,622 cycles # 2.883 GHz ( +- 0.80% ) [83.34%]
24,244,712,629 stalled-cycles-frontend # 44.76% frontend cycles idle ( +- 1.37% ) [83.33%]
16,288,648,572 stalled-cycles-backend # 30.07% backend cycles idle ( +- 0.95% ) [66.66%]
77,659,755,503 instructions # 1.43 insns per cycle
12,922,780,045 branches # 687.702 M/sec ( +- 1.06% ) [83.34%]
261,962,386 branch-misses # 2.03% of all branches ( +- 0.71% ) [83.35%]

19.700174670 seconds time elapsed ( +- 0.56% )

Backports commit f6bb84d53110398f4899c19dab4e0fe9908ec060 from qemu
This commit is contained in:
Emilio G. Cota 2018-03-05 02:41:31 -05:00 committed by Lioncash
parent 5fae6dd433
commit 210d13ec49
No known key found for this signature in database
GPG key ID: 4E3C3CC1031BA9C7
3 changed files with 70 additions and 46 deletions

View file

@ -28,6 +28,7 @@
#include "sysemu/sysemu.h"
#include "exec/address-spaces.h"
#include "exec/tb-hash.h"
#include "exec/tb-lookup.h"
#include "uc_priv.h"
@ -201,44 +202,32 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
TranslationBlock *last_tb,
int tb_exit)
{
CPUArchState *env = (CPUArchState *)cpu->env_ptr;
TranslationBlock *tb;
target_ulong cs_base, pc;
uint32_t flags;
bool acquired_tb_lock = false;
/* we record a subset of the CPU state. It will
always be the same before a given translated block
is executed. */
cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
// Unicorn: atomic_read used instead of atomic_rcu_read
tb = atomic_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)]);
if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base ||
tb->flags != flags)) {
tb = tb_htable_lookup(cpu, pc, cs_base, flags);
if (!tb) {
/* mmap_lock is needed by tb_gen_code, and mmap_lock must be
* taken outside tb_lock. As system emulation is currently
* single threaded the locks are NOPs.
*/
mmap_lock();
// Unicorn: commented out
//tb_lock();
acquired_tb_lock = true;
tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags);
if (tb == NULL) {
/* mmap_lock is needed by tb_gen_code, and mmap_lock must be
* taken outside tb_lock. As system emulation is currently
* single threaded the locks are NOPs.
*/
mmap_lock();
//tb_lock();
acquired_tb_lock = true;
/* There's a chance that our desired tb has been translated while
* taking the locks so we check again inside the lock.
*/
tb = tb_htable_lookup(cpu, pc, cs_base, flags);
if (!tb) {
/* if no translated code available, then translate it now */
tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
}
// Unicorn: commented out
//tb_unlock();
mmap_unlock();
/* There's a chance that our desired tb has been translated while
* taking the locks so we check again inside the lock.
*/
tb = tb_htable_lookup(cpu, pc, cs_base, flags);
if (likely(tb == NULL)) {
/* if no translated code available, then translate it now */
tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
}
mmap_unlock();
/* We add the TB in the virtual pc hash table for the fast lookup */
atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
}

View file

@ -0,0 +1,46 @@
/*
* Copyright (C) 2017, Emilio G. Cota <cota@braap.org>
*
* License: GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*/
#ifndef EXEC_TB_LOOKUP_H
#define EXEC_TB_LOOKUP_H
#include "qemu/osdep.h"
#ifdef NEED_CPU_H
#include "cpu.h"
#endif
#include "exec/exec-all.h"
#include "exec/tb-hash.h"
/* Might cause an exception, so have a longjmp destination ready */
static inline TranslationBlock *
tb_lookup__cpu_state(CPUState *cpu, target_ulong *pc, target_ulong *cs_base,
uint32_t *flags)
{
CPUArchState *env = (CPUArchState *)cpu->env_ptr;
TranslationBlock *tb;
uint32_t hash;
cpu_get_tb_cpu_state(env, pc, cs_base, flags);
hash = tb_jmp_cache_hash_func(*pc);
tb = atomic_read(&cpu->tb_jmp_cache[hash]);
if (likely(tb &&
tb->pc == *pc &&
tb->cs_base == *cs_base &&
tb->flags == *flags &&
!atomic_read(&tb->invalid))) {
return tb;
}
tb = tb_htable_lookup(cpu, *pc, *cs_base, *flags);
if (tb == NULL) {
return NULL;
}
atomic_set(&cpu->tb_jmp_cache[hash], tb);
return tb;
}
#endif /* EXEC_TB_LOOKUP_H */

View file

@ -28,7 +28,7 @@
#include "exec/helper-proto.h"
#include "exec/cpu_ldst.h"
#include "exec/exec-all.h"
#include "exec/tb-hash.h"
#include "exec/tb-lookup.h"
/* 32-bit helpers */
@ -149,22 +149,11 @@ void *HELPER(lookup_tb_ptr)(CPUArchState *env)
CPUState *cpu = ENV_GET_CPU(env);
TranslationBlock *tb;
target_ulong cs_base, pc;
uint32_t flags, hash;
uint32_t flags;
cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
hash = tb_jmp_cache_hash_func(pc);
// Unicorn: atomic_read used instead of atomic_rcu_read
tb = atomic_read(&cpu->tb_jmp_cache[hash]);
if (unlikely(!(tb
&& tb->pc == pc
&& tb->cs_base == cs_base
&& tb->flags == flags))) {
tb = tb_htable_lookup(cpu, pc, cs_base, flags);
if (!tb) {
return tcg_ctx->code_gen_epilogue;
}
atomic_set(&cpu->tb_jmp_cache[hash], tb);
tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags);
if (tb == NULL) {
return tcg_ctx->code_gen_epilogue;
}
// Unicorn: commented out