2015-08-21 07:04:50 +00:00
|
|
|
/*
|
|
|
|
* emulator main execution loop
|
|
|
|
*
|
|
|
|
* Copyright (c) 2003-2005 Fabrice Bellard
|
|
|
|
*
|
|
|
|
* This library is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
|
|
* License as published by the Free Software Foundation; either
|
2019-02-03 22:30:53 +00:00
|
|
|
* version 2.1 of the License, or (at your option) any later version.
|
2015-08-21 07:04:50 +00:00
|
|
|
*
|
|
|
|
* This library is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
* Lesser General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
|
|
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Modified for Unicorn Engine by Nguyen Anh Quynh, 2015 */
|
|
|
|
|
2018-02-19 05:49:52 +00:00
|
|
|
#include "qemu/osdep.h"
|
2018-02-24 07:26:26 +00:00
|
|
|
#include "cpu.h"
|
|
|
|
#include "exec/exec-all.h"
|
2015-08-21 07:04:50 +00:00
|
|
|
#include "tcg.h"
|
2018-02-27 16:12:36 +00:00
|
|
|
#include "qemu/atomic.h"
|
|
|
|
#include "qemu/timer.h"
|
2015-08-21 07:04:50 +00:00
|
|
|
#include "sysemu/sysemu.h"
|
2018-02-12 20:20:15 +00:00
|
|
|
#include "exec/address-spaces.h"
|
2018-02-14 13:12:24 +00:00
|
|
|
#include "exec/tb-hash.h"
|
tcg: consolidate TB lookups in tb_lookup__cpu_state
This avoids duplicating code. cpu_exec_step will also use the
new common function once we integrate parallel_cpus into tb->cflags.
Note that in this commit we also fix a race, described by Richard Henderson
during review. Think of this scenario with threads A and B:
(A) Lookup succeeds for TB in hash without tb_lock
(B) Sets the TB's tb->invalid flag
(B) Removes the TB from tb_htable
(B) Clears all CPU's tb_jmp_cache
(A) Store TB into local tb_jmp_cache
Given that order of events, (A) will keep executing that invalid TB until
another flush of its tb_jmp_cache happens, which in theory might never happen.
We can fix this by checking the tb->invalid flag every time we look up a TB
from tb_jmp_cache, so that in the above scenario, next time we try to find
that TB in tb_jmp_cache, we won't, and will therefore be forced to look it
up in tb_htable.
Performance-wise, I measured a small improvement when booting debian-arm.
Note that inlining pays off:
Performance counter stats for 'taskset -c 0 qemu-system-arm \
-machine type=virt -nographic -smp 1 -m 4096 \
-netdev user,id=unet,hostfwd=tcp::2222-:22 \
-device virtio-net-device,netdev=unet \
-drive file=jessie.qcow2,id=myblock,index=0,if=none \
-device virtio-blk-device,drive=myblock \
-kernel kernel.img -append console=ttyAMA0 root=/dev/vda1 \
-name arm,debug-threads=on -smp 1' (10 runs):
Before:
18714.917392 task-clock # 0.952 CPUs utilized ( +- 0.95% )
23,142 context-switches # 0.001 M/sec ( +- 0.50% )
1 CPU-migrations # 0.000 M/sec
10,558 page-faults # 0.001 M/sec ( +- 0.95% )
53,957,727,252 cycles # 2.883 GHz ( +- 0.91% ) [83.33%]
24,440,599,852 stalled-cycles-frontend # 45.30% frontend cycles idle ( +- 1.20% ) [83.33%]
16,495,714,424 stalled-cycles-backend # 30.57% backend cycles idle ( +- 0.95% ) [66.66%]
76,267,572,582 instructions # 1.41 insns per cycle
12,692,186,323 branches # 678.186 M/sec ( +- 0.92% ) [83.35%]
263,486,879 branch-misses # 2.08% of all branches ( +- 0.73% ) [83.34%]
19.648474449 seconds time elapsed ( +- 0.82% )
After, w/ inline (this patch):
18471.376627 task-clock # 0.955 CPUs utilized ( +- 0.96% )
23,048 context-switches # 0.001 M/sec ( +- 0.48% )
1 CPU-migrations # 0.000 M/sec
10,708 page-faults # 0.001 M/sec ( +- 0.81% )
53,208,990,796 cycles # 2.881 GHz ( +- 0.98% ) [83.34%]
23,941,071,673 stalled-cycles-frontend # 44.99% frontend cycles idle ( +- 0.95% ) [83.34%]
16,161,773,848 stalled-cycles-backend # 30.37% backend cycles idle ( +- 0.76% ) [66.67%]
75,786,269,766 instructions # 1.42 insns per cycle
12,573,617,143 branches # 680.708 M/sec ( +- 1.34% ) [83.33%]
260,235,550 branch-misses # 2.07% of all branches ( +- 0.66% ) [83.33%]
19.340502161 seconds time elapsed ( +- 0.56% )
After, w/o inline:
18791.253967 task-clock # 0.954 CPUs utilized ( +- 0.78% )
23,230 context-switches # 0.001 M/sec ( +- 0.42% )
1 CPU-migrations # 0.000 M/sec
10,563 page-faults # 0.001 M/sec ( +- 1.27% )
54,168,674,622 cycles # 2.883 GHz ( +- 0.80% ) [83.34%]
24,244,712,629 stalled-cycles-frontend # 44.76% frontend cycles idle ( +- 1.37% ) [83.33%]
16,288,648,572 stalled-cycles-backend # 30.07% backend cycles idle ( +- 0.95% ) [66.66%]
77,659,755,503 instructions # 1.43 insns per cycle
12,922,780,045 branches # 687.702 M/sec ( +- 1.06% ) [83.34%]
261,962,386 branch-misses # 2.03% of all branches ( +- 0.71% ) [83.35%]
19.700174670 seconds time elapsed ( +- 0.56% )
Backports commit f6bb84d53110398f4899c19dab4e0fe9908ec060 from qemu
2018-03-05 07:41:31 +00:00
|
|
|
#include "exec/tb-lookup.h"
|
2015-08-21 07:04:50 +00:00
|
|
|
|
|
|
|
#include "uc_priv.h"
|
|
|
|
|
2018-02-24 04:50:26 +00:00
|
|
|
/* Execute a TB, and fix up the CPU state afterwards if necessary */
|
|
|
|
static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, TranslationBlock *itb)
|
|
|
|
{
|
|
|
|
CPUArchState *env = cpu->env_ptr;
|
|
|
|
TCGContext *tcg_ctx = env->uc->tcg_ctx;
|
|
|
|
uintptr_t ret;
|
|
|
|
TranslationBlock *last_tb;
|
|
|
|
int tb_exit;
|
2018-03-05 07:57:19 +00:00
|
|
|
uint8_t *tb_ptr = itb->tc.ptr;
|
2018-02-24 04:50:26 +00:00
|
|
|
|
|
|
|
ret = tcg_qemu_tb_exec(env, tb_ptr);
|
2019-04-26 20:02:57 +00:00
|
|
|
cpu->can_do_io = 1;
|
2018-02-24 04:50:26 +00:00
|
|
|
last_tb = (TranslationBlock *)(ret & ~TB_EXIT_MASK);
|
|
|
|
tb_exit = ret & TB_EXIT_MASK;
|
|
|
|
//trace_exec_tb_exit(last_tb, tb_exit);
|
|
|
|
|
|
|
|
if (tb_exit > TB_EXIT_IDX1) {
|
|
|
|
/* We didn't start executing this TB (eg because the instruction
|
|
|
|
* counter hit zero); we must restore the guest PC to the address
|
|
|
|
* of the start of the TB.
|
|
|
|
*/
|
|
|
|
CPUClass *cc = CPU_GET_CLASS(env->uc, cpu);
|
2020-01-14 14:49:41 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Both set_pc() & synchronize_fromtb() can be ignored when code tracing hook is installed,
|
|
|
|
* or timer mode is in effect, since these already fix the PC.
|
|
|
|
*/
|
|
|
|
if (!HOOK_EXISTS(env->uc, UC_HOOK_CODE) && !env->uc->timeout) {
|
2021-03-04 21:44:25 +00:00
|
|
|
if (cc->tcg_ops.synchronize_from_tb) {
|
2020-01-14 14:49:41 +00:00
|
|
|
// avoid sync twice when helper_uc_tracecode() already did this.
|
|
|
|
if (env->uc->emu_counter <= env->uc->emu_count &&
|
2021-03-04 21:44:25 +00:00
|
|
|
!env->uc->stop_request && !env->uc->quit_request) {
|
|
|
|
cc->tcg_ops.synchronize_from_tb(cpu, last_tb);
|
|
|
|
}
|
2020-01-14 14:49:41 +00:00
|
|
|
} else {
|
|
|
|
assert(cc->set_pc);
|
|
|
|
// avoid sync twice when helper_uc_tracecode() already did this.
|
|
|
|
if (env->uc->emu_counter <= env->uc->emu_count &&
|
2021-03-04 21:44:25 +00:00
|
|
|
!env->uc->stop_request && !env->uc->quit_request) {
|
2020-01-14 14:49:41 +00:00
|
|
|
cc->set_pc(cpu, last_tb->pc);
|
2021-03-04 21:44:25 +00:00
|
|
|
}
|
2018-02-24 04:50:26 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2020-01-14 14:49:41 +00:00
|
|
|
|
2018-02-24 04:50:26 +00:00
|
|
|
if (tb_exit == TB_EXIT_REQUESTED) {
|
|
|
|
/* We were asked to stop executing TBs (probably a pending
|
|
|
|
* interrupt. We've now stopped, so clear the flag.
|
|
|
|
*/
|
2018-02-26 10:11:14 +00:00
|
|
|
atomic_set(&cpu->tcg_exit_req, 0);
|
2018-02-24 04:50:26 +00:00
|
|
|
}
|
2020-01-14 14:49:41 +00:00
|
|
|
|
2018-02-24 04:50:26 +00:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2019-04-26 20:02:57 +00:00
|
|
|
#ifndef CONFIG_USER_ONLY
|
|
|
|
/* Execute the code without caching the generated code. An interpreter
|
|
|
|
could be used if available. */
|
2018-02-27 16:12:36 +00:00
|
|
|
static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
|
|
|
|
TranslationBlock *orig_tb, bool ignore_icount)
|
|
|
|
{
|
|
|
|
TranslationBlock *tb;
|
|
|
|
CPUArchState *env = (CPUArchState *)cpu->env_ptr;
|
2019-05-06 04:57:07 +00:00
|
|
|
uint32_t cflags = curr_cflags(cpu->uc) | CF_NOCACHE;
|
|
|
|
|
|
|
|
if (ignore_icount) {
|
|
|
|
cflags &= ~CF_USE_ICOUNT;
|
|
|
|
}
|
2018-02-27 16:12:36 +00:00
|
|
|
|
|
|
|
/* Should never happen.
|
|
|
|
We only end up here when an existing TB is too long. */
|
2019-05-06 04:57:07 +00:00
|
|
|
cflags |= MIN(max_cycles, CF_COUNT_MASK);
|
2018-02-27 16:12:36 +00:00
|
|
|
|
2019-04-26 20:02:57 +00:00
|
|
|
mmap_lock();
|
2019-05-06 04:57:07 +00:00
|
|
|
tb = tb_gen_code(cpu, orig_tb->pc, orig_tb->cs_base,
|
|
|
|
orig_tb->flags, cflags);
|
2018-02-27 16:12:36 +00:00
|
|
|
tb->orig_tb = orig_tb;
|
2019-04-26 20:02:57 +00:00
|
|
|
mmap_unlock();
|
|
|
|
|
2018-02-27 16:12:36 +00:00
|
|
|
/* execute the generated code */
|
|
|
|
cpu_tb_exec(cpu, tb);
|
2019-04-26 20:02:57 +00:00
|
|
|
|
|
|
|
mmap_lock();
|
2018-02-27 16:12:36 +00:00
|
|
|
tb_phys_invalidate(env->uc, tb, -1);
|
2019-04-26 20:02:57 +00:00
|
|
|
mmap_unlock();
|
2018-02-27 16:12:36 +00:00
|
|
|
tb_free(env->uc, tb);
|
|
|
|
}
|
2019-04-26 20:02:57 +00:00
|
|
|
#endif
|
2018-02-27 16:12:36 +00:00
|
|
|
|
2018-03-03 01:56:29 +00:00
|
|
|
TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,
|
2019-05-05 02:17:43 +00:00
|
|
|
target_ulong cs_base, uint32_t flags,
|
|
|
|
uint32_t cf_mask)
|
2018-02-24 04:50:26 +00:00
|
|
|
{
|
tb hash: hash phys_pc, pc, and flags with xxhash
For some workloads such as arm bootup, tb_phys_hash is performance-critical.
The is due to the high frequency of accesses to the hash table, originated
by (frequent) TLB flushes that wipe out the cpu-private tb_jmp_cache's.
More info:
https://lists.nongnu.org/archive/html/qemu-devel/2016-03/msg05098.html
To dig further into this I modified an arm image booting debian jessie to
immediately shut down after boot. Analysis revealed that quite a bit of time
is unnecessarily spent in tb_phys_hash: the cause is poor hashing that
results in very uneven loading of chains in the hash table's buckets;
the longest observed chain had ~550 elements.
The appended addresses this with two changes:
1) Use xxhash as the hash table's hash function. xxhash is a fast,
high-quality hashing function.
2) Feed the hashing function with not just tb_phys, but also pc and flags.
This improves performance over using just tb_phys for hashing, since that
resulted in some hash buckets having many TB's, while others getting very few;
with these changes, the longest observed chain on a single hash bucket is
brought down from ~550 to ~40.
Tests show that the other element checked for in tb_find_physical,
cs_base, is always a match when tb_phys+pc+flags are a match,
so hashing cs_base is wasteful. It could be that this is an ARM-only
thing, though. UPDATE:
On Tue, Apr 05, 2016 at 08:41:43 -0700, Richard Henderson wrote:
> The cs_base field is only used by i386 (in 16-bit modes), and sparc (for a TB
> consisting of only a delay slot).
> It may well still turn out to be reasonable to ignore cs_base for hashing.
BTW, after this change the hash table should not be called "tb_hash_phys"
anymore; this is addressed later in this series.
This change gives consistent bootup time improvements. I tested two
host machines:
- Intel Xeon E5-2690: 11.6% less time
- Intel i7-4790K: 19.2% less time
Increasing the number of hash buckets yields further improvements. However,
using a larger, fixed number of buckets can degrade performance for other
workloads that do not translate as many blocks (600K+ for debian-jessie arm
bootup). This is dealt with later in this series.
Backports commit 42bd32287f3a18d823f2258b813824a39ed7c6d9 from qemu
2018-02-24 22:45:39 +00:00
|
|
|
TCGContext *tcg_ctx = cpu->uc->tcg_ctx;
|
2018-02-24 04:50:26 +00:00
|
|
|
CPUArchState *env = (CPUArchState *)cpu->env_ptr;
|
tb hash: hash phys_pc, pc, and flags with xxhash
For some workloads such as arm bootup, tb_phys_hash is performance-critical.
The is due to the high frequency of accesses to the hash table, originated
by (frequent) TLB flushes that wipe out the cpu-private tb_jmp_cache's.
More info:
https://lists.nongnu.org/archive/html/qemu-devel/2016-03/msg05098.html
To dig further into this I modified an arm image booting debian jessie to
immediately shut down after boot. Analysis revealed that quite a bit of time
is unnecessarily spent in tb_phys_hash: the cause is poor hashing that
results in very uneven loading of chains in the hash table's buckets;
the longest observed chain had ~550 elements.
The appended addresses this with two changes:
1) Use xxhash as the hash table's hash function. xxhash is a fast,
high-quality hashing function.
2) Feed the hashing function with not just tb_phys, but also pc and flags.
This improves performance over using just tb_phys for hashing, since that
resulted in some hash buckets having many TB's, while others getting very few;
with these changes, the longest observed chain on a single hash bucket is
brought down from ~550 to ~40.
Tests show that the other element checked for in tb_find_physical,
cs_base, is always a match when tb_phys+pc+flags are a match,
so hashing cs_base is wasteful. It could be that this is an ARM-only
thing, though. UPDATE:
On Tue, Apr 05, 2016 at 08:41:43 -0700, Richard Henderson wrote:
> The cs_base field is only used by i386 (in 16-bit modes), and sparc (for a TB
> consisting of only a delay slot).
> It may well still turn out to be reasonable to ignore cs_base for hashing.
BTW, after this change the hash table should not be called "tb_hash_phys"
anymore; this is addressed later in this series.
This change gives consistent bootup time improvements. I tested two
host machines:
- Intel Xeon E5-2690: 11.6% less time
- Intel i7-4790K: 19.2% less time
Increasing the number of hash buckets yields further improvements. However,
using a larger, fixed number of buckets can degrade performance for other
workloads that do not translate as many blocks (600K+ for debian-jessie arm
bootup). This is dealt with later in this series.
Backports commit 42bd32287f3a18d823f2258b813824a39ed7c6d9 from qemu
2018-02-24 22:45:39 +00:00
|
|
|
TranslationBlock *tb, **tb_hash_head, **ptb1;
|
|
|
|
uint32_t h;
|
2018-02-24 04:50:26 +00:00
|
|
|
tb_page_addr_t phys_pc, phys_page1;
|
|
|
|
|
|
|
|
/* find translated block using physical mappings */
|
tb hash: hash phys_pc, pc, and flags with xxhash
For some workloads such as arm bootup, tb_phys_hash is performance-critical.
The is due to the high frequency of accesses to the hash table, originated
by (frequent) TLB flushes that wipe out the cpu-private tb_jmp_cache's.
More info:
https://lists.nongnu.org/archive/html/qemu-devel/2016-03/msg05098.html
To dig further into this I modified an arm image booting debian jessie to
immediately shut down after boot. Analysis revealed that quite a bit of time
is unnecessarily spent in tb_phys_hash: the cause is poor hashing that
results in very uneven loading of chains in the hash table's buckets;
the longest observed chain had ~550 elements.
The appended addresses this with two changes:
1) Use xxhash as the hash table's hash function. xxhash is a fast,
high-quality hashing function.
2) Feed the hashing function with not just tb_phys, but also pc and flags.
This improves performance over using just tb_phys for hashing, since that
resulted in some hash buckets having many TB's, while others getting very few;
with these changes, the longest observed chain on a single hash bucket is
brought down from ~550 to ~40.
Tests show that the other element checked for in tb_find_physical,
cs_base, is always a match when tb_phys+pc+flags are a match,
so hashing cs_base is wasteful. It could be that this is an ARM-only
thing, though. UPDATE:
On Tue, Apr 05, 2016 at 08:41:43 -0700, Richard Henderson wrote:
> The cs_base field is only used by i386 (in 16-bit modes), and sparc (for a TB
> consisting of only a delay slot).
> It may well still turn out to be reasonable to ignore cs_base for hashing.
BTW, after this change the hash table should not be called "tb_hash_phys"
anymore; this is addressed later in this series.
This change gives consistent bootup time improvements. I tested two
host machines:
- Intel Xeon E5-2690: 11.6% less time
- Intel i7-4790K: 19.2% less time
Increasing the number of hash buckets yields further improvements. However,
using a larger, fixed number of buckets can degrade performance for other
workloads that do not translate as many blocks (600K+ for debian-jessie arm
bootup). This is dealt with later in this series.
Backports commit 42bd32287f3a18d823f2258b813824a39ed7c6d9 from qemu
2018-02-24 22:45:39 +00:00
|
|
|
phys_pc = get_page_addr_code(env, pc);
|
2018-02-24 04:50:26 +00:00
|
|
|
phys_page1 = phys_pc & TARGET_PAGE_MASK;
|
2019-05-05 02:17:43 +00:00
|
|
|
h = tb_hash_func(phys_pc, pc, flags, cf_mask);
|
tb hash: hash phys_pc, pc, and flags with xxhash
For some workloads such as arm bootup, tb_phys_hash is performance-critical.
The is due to the high frequency of accesses to the hash table, originated
by (frequent) TLB flushes that wipe out the cpu-private tb_jmp_cache's.
More info:
https://lists.nongnu.org/archive/html/qemu-devel/2016-03/msg05098.html
To dig further into this I modified an arm image booting debian jessie to
immediately shut down after boot. Analysis revealed that quite a bit of time
is unnecessarily spent in tb_phys_hash: the cause is poor hashing that
results in very uneven loading of chains in the hash table's buckets;
the longest observed chain had ~550 elements.
The appended addresses this with two changes:
1) Use xxhash as the hash table's hash function. xxhash is a fast,
high-quality hashing function.
2) Feed the hashing function with not just tb_phys, but also pc and flags.
This improves performance over using just tb_phys for hashing, since that
resulted in some hash buckets having many TB's, while others getting very few;
with these changes, the longest observed chain on a single hash bucket is
brought down from ~550 to ~40.
Tests show that the other element checked for in tb_find_physical,
cs_base, is always a match when tb_phys+pc+flags are a match,
so hashing cs_base is wasteful. It could be that this is an ARM-only
thing, though. UPDATE:
On Tue, Apr 05, 2016 at 08:41:43 -0700, Richard Henderson wrote:
> The cs_base field is only used by i386 (in 16-bit modes), and sparc (for a TB
> consisting of only a delay slot).
> It may well still turn out to be reasonable to ignore cs_base for hashing.
BTW, after this change the hash table should not be called "tb_hash_phys"
anymore; this is addressed later in this series.
This change gives consistent bootup time improvements. I tested two
host machines:
- Intel Xeon E5-2690: 11.6% less time
- Intel i7-4790K: 19.2% less time
Increasing the number of hash buckets yields further improvements. However,
using a larger, fixed number of buckets can degrade performance for other
workloads that do not translate as many blocks (600K+ for debian-jessie arm
bootup). This is dealt with later in this series.
Backports commit 42bd32287f3a18d823f2258b813824a39ed7c6d9 from qemu
2018-02-24 22:45:39 +00:00
|
|
|
|
|
|
|
/* Start at head of the hash entry */
|
|
|
|
ptb1 = tb_hash_head = &tcg_ctx->tb_ctx.tb_phys_hash[h];
|
|
|
|
tb = *ptb1;
|
|
|
|
|
|
|
|
while (tb) {
|
2018-02-24 04:50:26 +00:00
|
|
|
if (tb->pc == pc &&
|
tb hash: hash phys_pc, pc, and flags with xxhash
For some workloads such as arm bootup, tb_phys_hash is performance-critical.
The is due to the high frequency of accesses to the hash table, originated
by (frequent) TLB flushes that wipe out the cpu-private tb_jmp_cache's.
More info:
https://lists.nongnu.org/archive/html/qemu-devel/2016-03/msg05098.html
To dig further into this I modified an arm image booting debian jessie to
immediately shut down after boot. Analysis revealed that quite a bit of time
is unnecessarily spent in tb_phys_hash: the cause is poor hashing that
results in very uneven loading of chains in the hash table's buckets;
the longest observed chain had ~550 elements.
The appended addresses this with two changes:
1) Use xxhash as the hash table's hash function. xxhash is a fast,
high-quality hashing function.
2) Feed the hashing function with not just tb_phys, but also pc and flags.
This improves performance over using just tb_phys for hashing, since that
resulted in some hash buckets having many TB's, while others getting very few;
with these changes, the longest observed chain on a single hash bucket is
brought down from ~550 to ~40.
Tests show that the other element checked for in tb_find_physical,
cs_base, is always a match when tb_phys+pc+flags are a match,
so hashing cs_base is wasteful. It could be that this is an ARM-only
thing, though. UPDATE:
On Tue, Apr 05, 2016 at 08:41:43 -0700, Richard Henderson wrote:
> The cs_base field is only used by i386 (in 16-bit modes), and sparc (for a TB
> consisting of only a delay slot).
> It may well still turn out to be reasonable to ignore cs_base for hashing.
BTW, after this change the hash table should not be called "tb_hash_phys"
anymore; this is addressed later in this series.
This change gives consistent bootup time improvements. I tested two
host machines:
- Intel Xeon E5-2690: 11.6% less time
- Intel i7-4790K: 19.2% less time
Increasing the number of hash buckets yields further improvements. However,
using a larger, fixed number of buckets can degrade performance for other
workloads that do not translate as many blocks (600K+ for debian-jessie arm
bootup). This is dealt with later in this series.
Backports commit 42bd32287f3a18d823f2258b813824a39ed7c6d9 from qemu
2018-02-24 22:45:39 +00:00
|
|
|
tb->page_addr[0] == phys_page1 &&
|
|
|
|
tb->cs_base == cs_base &&
|
2018-03-05 07:46:17 +00:00
|
|
|
tb->flags == flags &&
|
2019-05-05 02:17:43 +00:00
|
|
|
(tb_cflags(tb) & (CF_HASH_MASK | CF_INVALID)) == cf_mask) {
|
tb hash: hash phys_pc, pc, and flags with xxhash
For some workloads such as arm bootup, tb_phys_hash is performance-critical.
The is due to the high frequency of accesses to the hash table, originated
by (frequent) TLB flushes that wipe out the cpu-private tb_jmp_cache's.
More info:
https://lists.nongnu.org/archive/html/qemu-devel/2016-03/msg05098.html
To dig further into this I modified an arm image booting debian jessie to
immediately shut down after boot. Analysis revealed that quite a bit of time
is unnecessarily spent in tb_phys_hash: the cause is poor hashing that
results in very uneven loading of chains in the hash table's buckets;
the longest observed chain had ~550 elements.
The appended addresses this with two changes:
1) Use xxhash as the hash table's hash function. xxhash is a fast,
high-quality hashing function.
2) Feed the hashing function with not just tb_phys, but also pc and flags.
This improves performance over using just tb_phys for hashing, since that
resulted in some hash buckets having many TB's, while others getting very few;
with these changes, the longest observed chain on a single hash bucket is
brought down from ~550 to ~40.
Tests show that the other element checked for in tb_find_physical,
cs_base, is always a match when tb_phys+pc+flags are a match,
so hashing cs_base is wasteful. It could be that this is an ARM-only
thing, though. UPDATE:
On Tue, Apr 05, 2016 at 08:41:43 -0700, Richard Henderson wrote:
> The cs_base field is only used by i386 (in 16-bit modes), and sparc (for a TB
> consisting of only a delay slot).
> It may well still turn out to be reasonable to ignore cs_base for hashing.
BTW, after this change the hash table should not be called "tb_hash_phys"
anymore; this is addressed later in this series.
This change gives consistent bootup time improvements. I tested two
host machines:
- Intel Xeon E5-2690: 11.6% less time
- Intel i7-4790K: 19.2% less time
Increasing the number of hash buckets yields further improvements. However,
using a larger, fixed number of buckets can degrade performance for other
workloads that do not translate as many blocks (600K+ for debian-jessie arm
bootup). This is dealt with later in this series.
Backports commit 42bd32287f3a18d823f2258b813824a39ed7c6d9 from qemu
2018-02-24 22:45:39 +00:00
|
|
|
if (tb->page_addr[1] == -1) {
|
|
|
|
/* done, we have a match */
|
|
|
|
break;
|
2018-02-24 04:50:26 +00:00
|
|
|
} else {
|
tb hash: hash phys_pc, pc, and flags with xxhash
For some workloads such as arm bootup, tb_phys_hash is performance-critical.
The is due to the high frequency of accesses to the hash table, originated
by (frequent) TLB flushes that wipe out the cpu-private tb_jmp_cache's.
More info:
https://lists.nongnu.org/archive/html/qemu-devel/2016-03/msg05098.html
To dig further into this I modified an arm image booting debian jessie to
immediately shut down after boot. Analysis revealed that quite a bit of time
is unnecessarily spent in tb_phys_hash: the cause is poor hashing that
results in very uneven loading of chains in the hash table's buckets;
the longest observed chain had ~550 elements.
The appended addresses this with two changes:
1) Use xxhash as the hash table's hash function. xxhash is a fast,
high-quality hashing function.
2) Feed the hashing function with not just tb_phys, but also pc and flags.
This improves performance over using just tb_phys for hashing, since that
resulted in some hash buckets having many TB's, while others getting very few;
with these changes, the longest observed chain on a single hash bucket is
brought down from ~550 to ~40.
Tests show that the other element checked for in tb_find_physical,
cs_base, is always a match when tb_phys+pc+flags are a match,
so hashing cs_base is wasteful. It could be that this is an ARM-only
thing, though. UPDATE:
On Tue, Apr 05, 2016 at 08:41:43 -0700, Richard Henderson wrote:
> The cs_base field is only used by i386 (in 16-bit modes), and sparc (for a TB
> consisting of only a delay slot).
> It may well still turn out to be reasonable to ignore cs_base for hashing.
BTW, after this change the hash table should not be called "tb_hash_phys"
anymore; this is addressed later in this series.
This change gives consistent bootup time improvements. I tested two
host machines:
- Intel Xeon E5-2690: 11.6% less time
- Intel i7-4790K: 19.2% less time
Increasing the number of hash buckets yields further improvements. However,
using a larger, fixed number of buckets can degrade performance for other
workloads that do not translate as many blocks (600K+ for debian-jessie arm
bootup). This is dealt with later in this series.
Backports commit 42bd32287f3a18d823f2258b813824a39ed7c6d9 from qemu
2018-02-24 22:45:39 +00:00
|
|
|
/* check next page if needed */
|
|
|
|
target_ulong virt_page2 = (pc & TARGET_PAGE_MASK) +
|
|
|
|
TARGET_PAGE_SIZE;
|
|
|
|
tb_page_addr_t phys_page2 = get_page_addr_code(env, virt_page2);
|
|
|
|
|
|
|
|
if (tb->page_addr[1] == phys_page2) {
|
|
|
|
break;
|
|
|
|
}
|
2018-02-24 04:50:26 +00:00
|
|
|
}
|
|
|
|
}
|
tb hash: hash phys_pc, pc, and flags with xxhash
For some workloads such as arm bootup, tb_phys_hash is performance-critical.
The is due to the high frequency of accesses to the hash table, originated
by (frequent) TLB flushes that wipe out the cpu-private tb_jmp_cache's.
More info:
https://lists.nongnu.org/archive/html/qemu-devel/2016-03/msg05098.html
To dig further into this I modified an arm image booting debian jessie to
immediately shut down after boot. Analysis revealed that quite a bit of time
is unnecessarily spent in tb_phys_hash: the cause is poor hashing that
results in very uneven loading of chains in the hash table's buckets;
the longest observed chain had ~550 elements.
The appended addresses this with two changes:
1) Use xxhash as the hash table's hash function. xxhash is a fast,
high-quality hashing function.
2) Feed the hashing function with not just tb_phys, but also pc and flags.
This improves performance over using just tb_phys for hashing, since that
resulted in some hash buckets having many TB's, while others getting very few;
with these changes, the longest observed chain on a single hash bucket is
brought down from ~550 to ~40.
Tests show that the other element checked for in tb_find_physical,
cs_base, is always a match when tb_phys+pc+flags are a match,
so hashing cs_base is wasteful. It could be that this is an ARM-only
thing, though. UPDATE:
On Tue, Apr 05, 2016 at 08:41:43 -0700, Richard Henderson wrote:
> The cs_base field is only used by i386 (in 16-bit modes), and sparc (for a TB
> consisting of only a delay slot).
> It may well still turn out to be reasonable to ignore cs_base for hashing.
BTW, after this change the hash table should not be called "tb_hash_phys"
anymore; this is addressed later in this series.
This change gives consistent bootup time improvements. I tested two
host machines:
- Intel Xeon E5-2690: 11.6% less time
- Intel i7-4790K: 19.2% less time
Increasing the number of hash buckets yields further improvements. However,
using a larger, fixed number of buckets can degrade performance for other
workloads that do not translate as many blocks (600K+ for debian-jessie arm
bootup). This is dealt with later in this series.
Backports commit 42bd32287f3a18d823f2258b813824a39ed7c6d9 from qemu
2018-02-24 22:45:39 +00:00
|
|
|
|
2018-02-24 04:50:26 +00:00
|
|
|
ptb1 = &tb->phys_hash_next;
|
tb hash: hash phys_pc, pc, and flags with xxhash
For some workloads such as arm bootup, tb_phys_hash is performance-critical.
The is due to the high frequency of accesses to the hash table, originated
by (frequent) TLB flushes that wipe out the cpu-private tb_jmp_cache's.
More info:
https://lists.nongnu.org/archive/html/qemu-devel/2016-03/msg05098.html
To dig further into this I modified an arm image booting debian jessie to
immediately shut down after boot. Analysis revealed that quite a bit of time
is unnecessarily spent in tb_phys_hash: the cause is poor hashing that
results in very uneven loading of chains in the hash table's buckets;
the longest observed chain had ~550 elements.
The appended addresses this with two changes:
1) Use xxhash as the hash table's hash function. xxhash is a fast,
high-quality hashing function.
2) Feed the hashing function with not just tb_phys, but also pc and flags.
This improves performance over using just tb_phys for hashing, since that
resulted in some hash buckets having many TB's, while others getting very few;
with these changes, the longest observed chain on a single hash bucket is
brought down from ~550 to ~40.
Tests show that the other element checked for in tb_find_physical,
cs_base, is always a match when tb_phys+pc+flags are a match,
so hashing cs_base is wasteful. It could be that this is an ARM-only
thing, though. UPDATE:
On Tue, Apr 05, 2016 at 08:41:43 -0700, Richard Henderson wrote:
> The cs_base field is only used by i386 (in 16-bit modes), and sparc (for a TB
> consisting of only a delay slot).
> It may well still turn out to be reasonable to ignore cs_base for hashing.
BTW, after this change the hash table should not be called "tb_hash_phys"
anymore; this is addressed later in this series.
This change gives consistent bootup time improvements. I tested two
host machines:
- Intel Xeon E5-2690: 11.6% less time
- Intel i7-4790K: 19.2% less time
Increasing the number of hash buckets yields further improvements. However,
using a larger, fixed number of buckets can degrade performance for other
workloads that do not translate as many blocks (600K+ for debian-jessie arm
bootup). This is dealt with later in this series.
Backports commit 42bd32287f3a18d823f2258b813824a39ed7c6d9 from qemu
2018-02-24 22:45:39 +00:00
|
|
|
tb = *ptb1;
|
2018-02-24 04:50:26 +00:00
|
|
|
}
|
|
|
|
|
tb hash: hash phys_pc, pc, and flags with xxhash
For some workloads such as arm bootup, tb_phys_hash is performance-critical.
The is due to the high frequency of accesses to the hash table, originated
by (frequent) TLB flushes that wipe out the cpu-private tb_jmp_cache's.
More info:
https://lists.nongnu.org/archive/html/qemu-devel/2016-03/msg05098.html
To dig further into this I modified an arm image booting debian jessie to
immediately shut down after boot. Analysis revealed that quite a bit of time
is unnecessarily spent in tb_phys_hash: the cause is poor hashing that
results in very uneven loading of chains in the hash table's buckets;
the longest observed chain had ~550 elements.
The appended addresses this with two changes:
1) Use xxhash as the hash table's hash function. xxhash is a fast,
high-quality hashing function.
2) Feed the hashing function with not just tb_phys, but also pc and flags.
This improves performance over using just tb_phys for hashing, since that
resulted in some hash buckets having many TB's, while others getting very few;
with these changes, the longest observed chain on a single hash bucket is
brought down from ~550 to ~40.
Tests show that the other element checked for in tb_find_physical,
cs_base, is always a match when tb_phys+pc+flags are a match,
so hashing cs_base is wasteful. It could be that this is an ARM-only
thing, though. UPDATE:
On Tue, Apr 05, 2016 at 08:41:43 -0700, Richard Henderson wrote:
> The cs_base field is only used by i386 (in 16-bit modes), and sparc (for a TB
> consisting of only a delay slot).
> It may well still turn out to be reasonable to ignore cs_base for hashing.
BTW, after this change the hash table should not be called "tb_hash_phys"
anymore; this is addressed later in this series.
This change gives consistent bootup time improvements. I tested two
host machines:
- Intel Xeon E5-2690: 11.6% less time
- Intel i7-4790K: 19.2% less time
Increasing the number of hash buckets yields further improvements. However,
using a larger, fixed number of buckets can degrade performance for other
workloads that do not translate as many blocks (600K+ for debian-jessie arm
bootup). This is dealt with later in this series.
Backports commit 42bd32287f3a18d823f2258b813824a39ed7c6d9 from qemu
2018-02-24 22:45:39 +00:00
|
|
|
if (tb) {
|
|
|
|
/* Move the TB to the head of the list */
|
2018-02-24 04:50:26 +00:00
|
|
|
*ptb1 = tb->phys_hash_next;
|
tb hash: hash phys_pc, pc, and flags with xxhash
For some workloads such as arm bootup, tb_phys_hash is performance-critical.
The is due to the high frequency of accesses to the hash table, originated
by (frequent) TLB flushes that wipe out the cpu-private tb_jmp_cache's.
More info:
https://lists.nongnu.org/archive/html/qemu-devel/2016-03/msg05098.html
To dig further into this I modified an arm image booting debian jessie to
immediately shut down after boot. Analysis revealed that quite a bit of time
is unnecessarily spent in tb_phys_hash: the cause is poor hashing that
results in very uneven loading of chains in the hash table's buckets;
the longest observed chain had ~550 elements.
The appended addresses this with two changes:
1) Use xxhash as the hash table's hash function. xxhash is a fast,
high-quality hashing function.
2) Feed the hashing function with not just tb_phys, but also pc and flags.
This improves performance over using just tb_phys for hashing, since that
resulted in some hash buckets having many TB's, while others getting very few;
with these changes, the longest observed chain on a single hash bucket is
brought down from ~550 to ~40.
Tests show that the other element checked for in tb_find_physical,
cs_base, is always a match when tb_phys+pc+flags are a match,
so hashing cs_base is wasteful. It could be that this is an ARM-only
thing, though. UPDATE:
On Tue, Apr 05, 2016 at 08:41:43 -0700, Richard Henderson wrote:
> The cs_base field is only used by i386 (in 16-bit modes), and sparc (for a TB
> consisting of only a delay slot).
> It may well still turn out to be reasonable to ignore cs_base for hashing.
BTW, after this change the hash table should not be called "tb_hash_phys"
anymore; this is addressed later in this series.
This change gives consistent bootup time improvements. I tested two
host machines:
- Intel Xeon E5-2690: 11.6% less time
- Intel i7-4790K: 19.2% less time
Increasing the number of hash buckets yields further improvements. However,
using a larger, fixed number of buckets can degrade performance for other
workloads that do not translate as many blocks (600K+ for debian-jessie arm
bootup). This is dealt with later in this series.
Backports commit 42bd32287f3a18d823f2258b813824a39ed7c6d9 from qemu
2018-02-24 22:45:39 +00:00
|
|
|
tb->phys_hash_next = *tb_hash_head;
|
|
|
|
*tb_hash_head = tb;
|
|
|
|
}
|
|
|
|
return tb;
|
|
|
|
}
|
|
|
|
|
2018-03-05 02:47:57 +00:00
|
|
|
void tb_set_jmp_target(TranslationBlock *tb, int n, uintptr_t addr)
|
|
|
|
{
|
|
|
|
if (TCG_TARGET_HAS_direct_jump) {
|
|
|
|
uintptr_t offset = tb->jmp_target_arg[n];
|
2018-03-05 07:57:19 +00:00
|
|
|
uintptr_t tc_ptr = (uintptr_t)tb->tc.ptr;
|
2018-03-05 02:47:57 +00:00
|
|
|
tb_target_set_jmp_target(tc_ptr, tc_ptr + offset, addr);
|
|
|
|
} else {
|
|
|
|
tb->jmp_target_arg[n] = addr;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void tb_add_jump(TranslationBlock *tb, int n,
|
|
|
|
TranslationBlock *tb_next)
|
|
|
|
{
|
|
|
|
assert(n < ARRAY_SIZE(tb->jmp_list_next));
|
|
|
|
if (tb->jmp_list_next[n]) {
|
|
|
|
/* Another thread has already done this while we were
|
|
|
|
* outside of the lock; nothing to do in this case */
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
qemu_log_mask_and_addr(CPU_LOG_EXEC, tb->pc,
|
|
|
|
"Linking TBs %p [" TARGET_FMT_lx
|
|
|
|
"] index %d -> %p [" TARGET_FMT_lx "]\n",
|
2018-03-05 07:57:19 +00:00
|
|
|
tb->tc.ptr, tb->pc, n,
|
|
|
|
tb_next->tc.ptr, tb_next->pc);
|
2018-03-05 02:47:57 +00:00
|
|
|
|
|
|
|
/* patch the native jump address */
|
2018-03-05 07:57:19 +00:00
|
|
|
tb_set_jmp_target(tb, n, (uintptr_t)tb_next->tc.ptr);
|
2018-03-05 02:47:57 +00:00
|
|
|
|
|
|
|
/* add in TB jmp circular list */
|
|
|
|
tb->jmp_list_next[n] = tb_next->jmp_list_first;
|
|
|
|
tb_next->jmp_list_first = (uintptr_t)tb | n;
|
|
|
|
}
|
|
|
|
|
2018-02-26 07:05:14 +00:00
|
|
|
static inline TranslationBlock *tb_find(CPUState *cpu,
|
|
|
|
TranslationBlock *last_tb,
|
2019-05-05 02:30:16 +00:00
|
|
|
int tb_exit, uint32_t cf_mask)
|
2018-02-24 04:50:26 +00:00
|
|
|
{
|
|
|
|
TranslationBlock *tb;
|
|
|
|
target_ulong cs_base, pc;
|
|
|
|
uint32_t flags;
|
2018-03-05 07:09:40 +00:00
|
|
|
bool acquired_tb_lock = false;
|
2018-02-24 04:50:26 +00:00
|
|
|
|
2019-05-05 02:17:43 +00:00
|
|
|
tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
|
tcg: consolidate TB lookups in tb_lookup__cpu_state
This avoids duplicating code. cpu_exec_step will also use the
new common function once we integrate parallel_cpus into tb->cflags.
Note that in this commit we also fix a race, described by Richard Henderson
during review. Think of this scenario with threads A and B:
(A) Lookup succeeds for TB in hash without tb_lock
(B) Sets the TB's tb->invalid flag
(B) Removes the TB from tb_htable
(B) Clears all CPU's tb_jmp_cache
(A) Store TB into local tb_jmp_cache
Given that order of events, (A) will keep executing that invalid TB until
another flush of its tb_jmp_cache happens, which in theory might never happen.
We can fix this by checking the tb->invalid flag every time we look up a TB
from tb_jmp_cache, so that in the above scenario, next time we try to find
that TB in tb_jmp_cache, we won't, and will therefore be forced to look it
up in tb_htable.
Performance-wise, I measured a small improvement when booting debian-arm.
Note that inlining pays off:
Performance counter stats for 'taskset -c 0 qemu-system-arm \
-machine type=virt -nographic -smp 1 -m 4096 \
-netdev user,id=unet,hostfwd=tcp::2222-:22 \
-device virtio-net-device,netdev=unet \
-drive file=jessie.qcow2,id=myblock,index=0,if=none \
-device virtio-blk-device,drive=myblock \
-kernel kernel.img -append console=ttyAMA0 root=/dev/vda1 \
-name arm,debug-threads=on -smp 1' (10 runs):
Before:
18714.917392 task-clock # 0.952 CPUs utilized ( +- 0.95% )
23,142 context-switches # 0.001 M/sec ( +- 0.50% )
1 CPU-migrations # 0.000 M/sec
10,558 page-faults # 0.001 M/sec ( +- 0.95% )
53,957,727,252 cycles # 2.883 GHz ( +- 0.91% ) [83.33%]
24,440,599,852 stalled-cycles-frontend # 45.30% frontend cycles idle ( +- 1.20% ) [83.33%]
16,495,714,424 stalled-cycles-backend # 30.57% backend cycles idle ( +- 0.95% ) [66.66%]
76,267,572,582 instructions # 1.41 insns per cycle
12,692,186,323 branches # 678.186 M/sec ( +- 0.92% ) [83.35%]
263,486,879 branch-misses # 2.08% of all branches ( +- 0.73% ) [83.34%]
19.648474449 seconds time elapsed ( +- 0.82% )
After, w/ inline (this patch):
18471.376627 task-clock # 0.955 CPUs utilized ( +- 0.96% )
23,048 context-switches # 0.001 M/sec ( +- 0.48% )
1 CPU-migrations # 0.000 M/sec
10,708 page-faults # 0.001 M/sec ( +- 0.81% )
53,208,990,796 cycles # 2.881 GHz ( +- 0.98% ) [83.34%]
23,941,071,673 stalled-cycles-frontend # 44.99% frontend cycles idle ( +- 0.95% ) [83.34%]
16,161,773,848 stalled-cycles-backend # 30.37% backend cycles idle ( +- 0.76% ) [66.67%]
75,786,269,766 instructions # 1.42 insns per cycle
12,573,617,143 branches # 680.708 M/sec ( +- 1.34% ) [83.33%]
260,235,550 branch-misses # 2.07% of all branches ( +- 0.66% ) [83.33%]
19.340502161 seconds time elapsed ( +- 0.56% )
After, w/o inline:
18791.253967 task-clock # 0.954 CPUs utilized ( +- 0.78% )
23,230 context-switches # 0.001 M/sec ( +- 0.42% )
1 CPU-migrations # 0.000 M/sec
10,563 page-faults # 0.001 M/sec ( +- 1.27% )
54,168,674,622 cycles # 2.883 GHz ( +- 0.80% ) [83.34%]
24,244,712,629 stalled-cycles-frontend # 44.76% frontend cycles idle ( +- 1.37% ) [83.33%]
16,288,648,572 stalled-cycles-backend # 30.07% backend cycles idle ( +- 0.95% ) [66.66%]
77,659,755,503 instructions # 1.43 insns per cycle
12,922,780,045 branches # 687.702 M/sec ( +- 1.06% ) [83.34%]
261,962,386 branch-misses # 2.03% of all branches ( +- 0.71% ) [83.35%]
19.700174670 seconds time elapsed ( +- 0.56% )
Backports commit f6bb84d53110398f4899c19dab4e0fe9908ec060 from qemu
2018-03-05 07:41:31 +00:00
|
|
|
if (tb == NULL) {
|
|
|
|
mmap_lock();
|
|
|
|
//tb_lock();
|
|
|
|
acquired_tb_lock = true;
|
2018-02-26 07:05:14 +00:00
|
|
|
|
tcg: consolidate TB lookups in tb_lookup__cpu_state
This avoids duplicating code. cpu_exec_step will also use the
new common function once we integrate parallel_cpus into tb->cflags.
Note that in this commit we also fix a race, described by Richard Henderson
during review. Think of this scenario with threads A and B:
(A) Lookup succeeds for TB in hash without tb_lock
(B) Sets the TB's tb->invalid flag
(B) Removes the TB from tb_htable
(B) Clears all CPU's tb_jmp_cache
(A) Store TB into local tb_jmp_cache
Given that order of events, (A) will keep executing that invalid TB until
another flush of its tb_jmp_cache happens, which in theory might never happen.
We can fix this by checking the tb->invalid flag every time we look up a TB
from tb_jmp_cache, so that in the above scenario, next time we try to find
that TB in tb_jmp_cache, we won't, and will therefore be forced to look it
up in tb_htable.
Performance-wise, I measured a small improvement when booting debian-arm.
Note that inlining pays off:
Performance counter stats for 'taskset -c 0 qemu-system-arm \
-machine type=virt -nographic -smp 1 -m 4096 \
-netdev user,id=unet,hostfwd=tcp::2222-:22 \
-device virtio-net-device,netdev=unet \
-drive file=jessie.qcow2,id=myblock,index=0,if=none \
-device virtio-blk-device,drive=myblock \
-kernel kernel.img -append console=ttyAMA0 root=/dev/vda1 \
-name arm,debug-threads=on -smp 1' (10 runs):
Before:
18714.917392 task-clock # 0.952 CPUs utilized ( +- 0.95% )
23,142 context-switches # 0.001 M/sec ( +- 0.50% )
1 CPU-migrations # 0.000 M/sec
10,558 page-faults # 0.001 M/sec ( +- 0.95% )
53,957,727,252 cycles # 2.883 GHz ( +- 0.91% ) [83.33%]
24,440,599,852 stalled-cycles-frontend # 45.30% frontend cycles idle ( +- 1.20% ) [83.33%]
16,495,714,424 stalled-cycles-backend # 30.57% backend cycles idle ( +- 0.95% ) [66.66%]
76,267,572,582 instructions # 1.41 insns per cycle
12,692,186,323 branches # 678.186 M/sec ( +- 0.92% ) [83.35%]
263,486,879 branch-misses # 2.08% of all branches ( +- 0.73% ) [83.34%]
19.648474449 seconds time elapsed ( +- 0.82% )
After, w/ inline (this patch):
18471.376627 task-clock # 0.955 CPUs utilized ( +- 0.96% )
23,048 context-switches # 0.001 M/sec ( +- 0.48% )
1 CPU-migrations # 0.000 M/sec
10,708 page-faults # 0.001 M/sec ( +- 0.81% )
53,208,990,796 cycles # 2.881 GHz ( +- 0.98% ) [83.34%]
23,941,071,673 stalled-cycles-frontend # 44.99% frontend cycles idle ( +- 0.95% ) [83.34%]
16,161,773,848 stalled-cycles-backend # 30.37% backend cycles idle ( +- 0.76% ) [66.67%]
75,786,269,766 instructions # 1.42 insns per cycle
12,573,617,143 branches # 680.708 M/sec ( +- 1.34% ) [83.33%]
260,235,550 branch-misses # 2.07% of all branches ( +- 0.66% ) [83.33%]
19.340502161 seconds time elapsed ( +- 0.56% )
After, w/o inline:
18791.253967 task-clock # 0.954 CPUs utilized ( +- 0.78% )
23,230 context-switches # 0.001 M/sec ( +- 0.42% )
1 CPU-migrations # 0.000 M/sec
10,563 page-faults # 0.001 M/sec ( +- 1.27% )
54,168,674,622 cycles # 2.883 GHz ( +- 0.80% ) [83.34%]
24,244,712,629 stalled-cycles-frontend # 44.76% frontend cycles idle ( +- 1.37% ) [83.33%]
16,288,648,572 stalled-cycles-backend # 30.07% backend cycles idle ( +- 0.95% ) [66.66%]
77,659,755,503 instructions # 1.43 insns per cycle
12,922,780,045 branches # 687.702 M/sec ( +- 1.06% ) [83.34%]
261,962,386 branch-misses # 2.03% of all branches ( +- 0.71% ) [83.35%]
19.700174670 seconds time elapsed ( +- 0.56% )
Backports commit f6bb84d53110398f4899c19dab4e0fe9908ec060 from qemu
2018-03-05 07:41:31 +00:00
|
|
|
/* There's a chance that our desired tb has been translated while
|
|
|
|
* taking the locks so we check again inside the lock.
|
|
|
|
*/
|
2019-05-05 02:17:43 +00:00
|
|
|
tb = tb_htable_lookup(cpu, pc, cs_base, flags, cf_mask);
|
tcg: consolidate TB lookups in tb_lookup__cpu_state
This avoids duplicating code. cpu_exec_step will also use the
new common function once we integrate parallel_cpus into tb->cflags.
Note that in this commit we also fix a race, described by Richard Henderson
during review. Think of this scenario with threads A and B:
(A) Lookup succeeds for TB in hash without tb_lock
(B) Sets the TB's tb->invalid flag
(B) Removes the TB from tb_htable
(B) Clears all CPU's tb_jmp_cache
(A) Store TB into local tb_jmp_cache
Given that order of events, (A) will keep executing that invalid TB until
another flush of its tb_jmp_cache happens, which in theory might never happen.
We can fix this by checking the tb->invalid flag every time we look up a TB
from tb_jmp_cache, so that in the above scenario, next time we try to find
that TB in tb_jmp_cache, we won't, and will therefore be forced to look it
up in tb_htable.
Performance-wise, I measured a small improvement when booting debian-arm.
Note that inlining pays off:
Performance counter stats for 'taskset -c 0 qemu-system-arm \
-machine type=virt -nographic -smp 1 -m 4096 \
-netdev user,id=unet,hostfwd=tcp::2222-:22 \
-device virtio-net-device,netdev=unet \
-drive file=jessie.qcow2,id=myblock,index=0,if=none \
-device virtio-blk-device,drive=myblock \
-kernel kernel.img -append console=ttyAMA0 root=/dev/vda1 \
-name arm,debug-threads=on -smp 1' (10 runs):
Before:
18714.917392 task-clock # 0.952 CPUs utilized ( +- 0.95% )
23,142 context-switches # 0.001 M/sec ( +- 0.50% )
1 CPU-migrations # 0.000 M/sec
10,558 page-faults # 0.001 M/sec ( +- 0.95% )
53,957,727,252 cycles # 2.883 GHz ( +- 0.91% ) [83.33%]
24,440,599,852 stalled-cycles-frontend # 45.30% frontend cycles idle ( +- 1.20% ) [83.33%]
16,495,714,424 stalled-cycles-backend # 30.57% backend cycles idle ( +- 0.95% ) [66.66%]
76,267,572,582 instructions # 1.41 insns per cycle
12,692,186,323 branches # 678.186 M/sec ( +- 0.92% ) [83.35%]
263,486,879 branch-misses # 2.08% of all branches ( +- 0.73% ) [83.34%]
19.648474449 seconds time elapsed ( +- 0.82% )
After, w/ inline (this patch):
18471.376627 task-clock # 0.955 CPUs utilized ( +- 0.96% )
23,048 context-switches # 0.001 M/sec ( +- 0.48% )
1 CPU-migrations # 0.000 M/sec
10,708 page-faults # 0.001 M/sec ( +- 0.81% )
53,208,990,796 cycles # 2.881 GHz ( +- 0.98% ) [83.34%]
23,941,071,673 stalled-cycles-frontend # 44.99% frontend cycles idle ( +- 0.95% ) [83.34%]
16,161,773,848 stalled-cycles-backend # 30.37% backend cycles idle ( +- 0.76% ) [66.67%]
75,786,269,766 instructions # 1.42 insns per cycle
12,573,617,143 branches # 680.708 M/sec ( +- 1.34% ) [83.33%]
260,235,550 branch-misses # 2.07% of all branches ( +- 0.66% ) [83.33%]
19.340502161 seconds time elapsed ( +- 0.56% )
After, w/o inline:
18791.253967 task-clock # 0.954 CPUs utilized ( +- 0.78% )
23,230 context-switches # 0.001 M/sec ( +- 0.42% )
1 CPU-migrations # 0.000 M/sec
10,563 page-faults # 0.001 M/sec ( +- 1.27% )
54,168,674,622 cycles # 2.883 GHz ( +- 0.80% ) [83.34%]
24,244,712,629 stalled-cycles-frontend # 44.76% frontend cycles idle ( +- 1.37% ) [83.33%]
16,288,648,572 stalled-cycles-backend # 30.07% backend cycles idle ( +- 0.95% ) [66.66%]
77,659,755,503 instructions # 1.43 insns per cycle
12,922,780,045 branches # 687.702 M/sec ( +- 1.06% ) [83.34%]
261,962,386 branch-misses # 2.03% of all branches ( +- 0.71% ) [83.35%]
19.700174670 seconds time elapsed ( +- 0.56% )
Backports commit f6bb84d53110398f4899c19dab4e0fe9908ec060 from qemu
2018-03-05 07:41:31 +00:00
|
|
|
if (likely(tb == NULL)) {
|
|
|
|
/* if no translated code available, then translate it now */
|
2019-05-05 02:17:43 +00:00
|
|
|
tb = tb_gen_code(cpu, pc, cs_base, flags, cf_mask);
|
2018-02-26 07:05:14 +00:00
|
|
|
}
|
|
|
|
|
tcg: consolidate TB lookups in tb_lookup__cpu_state
This avoids duplicating code. cpu_exec_step will also use the
new common function once we integrate parallel_cpus into tb->cflags.
Note that in this commit we also fix a race, described by Richard Henderson
during review. Think of this scenario with threads A and B:
(A) Lookup succeeds for TB in hash without tb_lock
(B) Sets the TB's tb->invalid flag
(B) Removes the TB from tb_htable
(B) Clears all CPU's tb_jmp_cache
(A) Store TB into local tb_jmp_cache
Given that order of events, (A) will keep executing that invalid TB until
another flush of its tb_jmp_cache happens, which in theory might never happen.
We can fix this by checking the tb->invalid flag every time we look up a TB
from tb_jmp_cache, so that in the above scenario, next time we try to find
that TB in tb_jmp_cache, we won't, and will therefore be forced to look it
up in tb_htable.
Performance-wise, I measured a small improvement when booting debian-arm.
Note that inlining pays off:
Performance counter stats for 'taskset -c 0 qemu-system-arm \
-machine type=virt -nographic -smp 1 -m 4096 \
-netdev user,id=unet,hostfwd=tcp::2222-:22 \
-device virtio-net-device,netdev=unet \
-drive file=jessie.qcow2,id=myblock,index=0,if=none \
-device virtio-blk-device,drive=myblock \
-kernel kernel.img -append console=ttyAMA0 root=/dev/vda1 \
-name arm,debug-threads=on -smp 1' (10 runs):
Before:
18714.917392 task-clock # 0.952 CPUs utilized ( +- 0.95% )
23,142 context-switches # 0.001 M/sec ( +- 0.50% )
1 CPU-migrations # 0.000 M/sec
10,558 page-faults # 0.001 M/sec ( +- 0.95% )
53,957,727,252 cycles # 2.883 GHz ( +- 0.91% ) [83.33%]
24,440,599,852 stalled-cycles-frontend # 45.30% frontend cycles idle ( +- 1.20% ) [83.33%]
16,495,714,424 stalled-cycles-backend # 30.57% backend cycles idle ( +- 0.95% ) [66.66%]
76,267,572,582 instructions # 1.41 insns per cycle
12,692,186,323 branches # 678.186 M/sec ( +- 0.92% ) [83.35%]
263,486,879 branch-misses # 2.08% of all branches ( +- 0.73% ) [83.34%]
19.648474449 seconds time elapsed ( +- 0.82% )
After, w/ inline (this patch):
18471.376627 task-clock # 0.955 CPUs utilized ( +- 0.96% )
23,048 context-switches # 0.001 M/sec ( +- 0.48% )
1 CPU-migrations # 0.000 M/sec
10,708 page-faults # 0.001 M/sec ( +- 0.81% )
53,208,990,796 cycles # 2.881 GHz ( +- 0.98% ) [83.34%]
23,941,071,673 stalled-cycles-frontend # 44.99% frontend cycles idle ( +- 0.95% ) [83.34%]
16,161,773,848 stalled-cycles-backend # 30.37% backend cycles idle ( +- 0.76% ) [66.67%]
75,786,269,766 instructions # 1.42 insns per cycle
12,573,617,143 branches # 680.708 M/sec ( +- 1.34% ) [83.33%]
260,235,550 branch-misses # 2.07% of all branches ( +- 0.66% ) [83.33%]
19.340502161 seconds time elapsed ( +- 0.56% )
After, w/o inline:
18791.253967 task-clock # 0.954 CPUs utilized ( +- 0.78% )
23,230 context-switches # 0.001 M/sec ( +- 0.42% )
1 CPU-migrations # 0.000 M/sec
10,563 page-faults # 0.001 M/sec ( +- 1.27% )
54,168,674,622 cycles # 2.883 GHz ( +- 0.80% ) [83.34%]
24,244,712,629 stalled-cycles-frontend # 44.76% frontend cycles idle ( +- 1.37% ) [83.33%]
16,288,648,572 stalled-cycles-backend # 30.07% backend cycles idle ( +- 0.95% ) [66.66%]
77,659,755,503 instructions # 1.43 insns per cycle
12,922,780,045 branches # 687.702 M/sec ( +- 1.06% ) [83.34%]
261,962,386 branch-misses # 2.03% of all branches ( +- 0.71% ) [83.35%]
19.700174670 seconds time elapsed ( +- 0.56% )
Backports commit f6bb84d53110398f4899c19dab4e0fe9908ec060 from qemu
2018-03-05 07:41:31 +00:00
|
|
|
mmap_unlock();
|
2018-02-26 07:05:14 +00:00
|
|
|
/* We add the TB in the virtual pc hash table for the fast lookup */
|
|
|
|
atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
|
2018-02-24 04:50:26 +00:00
|
|
|
}
|
2018-02-24 08:24:40 +00:00
|
|
|
#ifndef CONFIG_USER_ONLY
|
|
|
|
/* We don't take care of direct jumps when address mapping changes in
|
|
|
|
* system emulation. So it's not safe to make a direct jump to a TB
|
|
|
|
* spanning two pages because the mapping for the second page can change.
|
|
|
|
*/
|
|
|
|
if (tb->page_addr[1] != -1) {
|
2018-02-26 04:23:14 +00:00
|
|
|
last_tb = NULL;
|
2018-02-24 08:24:40 +00:00
|
|
|
}
|
|
|
|
#endif
|
2018-02-24 04:50:26 +00:00
|
|
|
/* See if we can patch the calling TB. */
|
2018-10-23 18:35:03 +00:00
|
|
|
if (last_tb) {
|
2018-03-05 07:09:40 +00:00
|
|
|
if (!acquired_tb_lock) {
|
2018-02-26 07:01:33 +00:00
|
|
|
// Unicorn: commented out
|
|
|
|
//tb_lock();
|
2018-03-05 07:09:40 +00:00
|
|
|
acquired_tb_lock = true;
|
2018-02-26 07:01:33 +00:00
|
|
|
}
|
2018-02-26 04:33:51 +00:00
|
|
|
/* Check if translation buffer has been flushed */
|
|
|
|
if (cpu->tb_flushed) {
|
|
|
|
cpu->tb_flushed = false;
|
2019-04-22 10:07:36 +00:00
|
|
|
} else if (!(tb_cflags(tb) & CF_INVALID)) {
|
2018-02-26 04:33:51 +00:00
|
|
|
tb_add_jump(last_tb, tb_exit, tb);
|
|
|
|
}
|
2018-02-26 07:01:33 +00:00
|
|
|
}
|
2018-03-05 07:09:40 +00:00
|
|
|
if (acquired_tb_lock) {
|
2018-02-26 06:58:27 +00:00
|
|
|
// Unicorn: commented out
|
|
|
|
//tb_unlock();
|
2018-02-24 04:50:26 +00:00
|
|
|
}
|
|
|
|
return tb;
|
|
|
|
}
|
|
|
|
|
2018-02-24 04:53:15 +00:00
|
|
|
static inline bool cpu_handle_halt(CPUState *cpu)
|
|
|
|
{
|
|
|
|
if (cpu->halted) {
|
|
|
|
if (!cpu_has_work(cpu)) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
cpu->halted = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2018-02-24 05:02:28 +00:00
|
|
|
static inline void cpu_handle_debug_exception(CPUState *cpu)
|
2018-02-24 04:50:26 +00:00
|
|
|
{
|
|
|
|
CPUClass *cc = CPU_GET_CLASS(cpu->uc, cpu);
|
|
|
|
CPUWatchpoint *wp;
|
|
|
|
|
|
|
|
if (!cpu->watchpoint_hit) {
|
|
|
|
QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) {
|
|
|
|
wp->flags &= ~BP_WATCHPOINT_HIT;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-03-04 22:04:46 +00:00
|
|
|
if (cc->tcg_ops.debug_excp_handler) {
|
|
|
|
cc->tcg_ops.debug_excp_handler(cpu);
|
|
|
|
}
|
2018-02-24 04:50:26 +00:00
|
|
|
}
|
2015-08-21 07:04:50 +00:00
|
|
|
|
2018-02-24 05:02:28 +00:00
|
|
|
static inline bool cpu_handle_exception(struct uc_struct *uc, CPUState *cpu, int *ret)
|
|
|
|
{
|
|
|
|
struct hook *hook;
|
|
|
|
|
|
|
|
if (cpu->exception_index >= 0) {
|
|
|
|
if (cpu->exception_index >= EXCP_INTERRUPT) {
|
|
|
|
/* exit request from the cpu execution loop */
|
|
|
|
*ret = cpu->exception_index;
|
|
|
|
if (*ret == EXCP_DEBUG) {
|
|
|
|
cpu_handle_debug_exception(cpu);
|
|
|
|
}
|
|
|
|
cpu->exception_index = -1;
|
|
|
|
return true;
|
|
|
|
} else {
|
|
|
|
#if defined(CONFIG_USER_ONLY)
|
2019-04-26 20:02:57 +00:00
|
|
|
/* if user mode only, we simulate a fake exception
|
|
|
|
which will be handled outside the cpu execution
|
|
|
|
loop */
|
2018-02-24 05:02:28 +00:00
|
|
|
#if defined(TARGET_I386)
|
2018-03-12 18:58:03 +00:00
|
|
|
CPUClass *cc = CPU_GET_CLASS(uc, cpu);
|
2018-02-24 05:02:28 +00:00
|
|
|
cc->do_interrupt(cpu);
|
|
|
|
#endif
|
|
|
|
*ret = cpu->exception_index;
|
|
|
|
cpu->exception_index = -1;
|
|
|
|
return true;
|
|
|
|
#else
|
|
|
|
bool catched = false;
|
2020-01-14 14:15:46 +00:00
|
|
|
if (uc->stop_interrupt && uc->stop_interrupt(cpu->exception_index)) {
|
|
|
|
// Unicorn: call registered invalid instruction callbacks
|
|
|
|
HOOK_FOREACH_VAR_DECLARE;
|
|
|
|
HOOK_FOREACH(uc, hook, UC_HOOK_INSN_INVALID) {
|
|
|
|
catched = ((uc_cb_hookinsn_invalid_t)hook->callback)(uc, hook->user_data);
|
|
|
|
if (catched) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!catched) {
|
|
|
|
uc->invalid_error = UC_ERR_INSN_INVALID;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// Unicorn: call registered interrupt callbacks
|
|
|
|
HOOK_FOREACH_VAR_DECLARE;
|
|
|
|
HOOK_FOREACH(uc, hook, UC_HOOK_INTR) {
|
|
|
|
((uc_cb_hookintr_t)hook->callback)(uc, cpu->exception_index, hook->user_data);
|
|
|
|
catched = true;
|
|
|
|
}
|
|
|
|
if (!catched) {
|
|
|
|
uc->invalid_error = UC_ERR_EXCEPTION;
|
|
|
|
}
|
2018-02-24 05:02:28 +00:00
|
|
|
}
|
2020-01-14 14:15:46 +00:00
|
|
|
|
2018-02-24 05:02:28 +00:00
|
|
|
// Unicorn: If un-catched interrupt, stop executions.
|
|
|
|
if (!catched) {
|
|
|
|
cpu->halted = 1;
|
|
|
|
*ret = EXCP_HLT;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
cpu->exception_index = -1;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2018-03-02 13:03:17 +00:00
|
|
|
static inline bool cpu_handle_interrupt(CPUState *cpu,
|
2018-02-24 05:08:24 +00:00
|
|
|
TranslationBlock **last_tb)
|
|
|
|
{
|
|
|
|
CPUClass *cc = CPU_GET_CLASS(cpu->uc, cpu);
|
|
|
|
|
2019-04-26 20:02:57 +00:00
|
|
|
if (unlikely(atomic_read(&cpu->interrupt_request))) {
|
|
|
|
int interrupt_request = cpu->interrupt_request;
|
2018-02-24 05:08:24 +00:00
|
|
|
if (unlikely(cpu->singlestep_enabled & SSTEP_NOIRQ)) {
|
|
|
|
/* Mask out external interrupts for this step. */
|
|
|
|
interrupt_request &= ~CPU_INTERRUPT_SSTEP_MASK;
|
|
|
|
}
|
|
|
|
if (interrupt_request & CPU_INTERRUPT_DEBUG) {
|
|
|
|
cpu->interrupt_request &= ~CPU_INTERRUPT_DEBUG;
|
|
|
|
cpu->exception_index = EXCP_DEBUG;
|
2018-03-02 13:03:17 +00:00
|
|
|
return true;
|
2018-02-24 05:08:24 +00:00
|
|
|
}
|
|
|
|
if (interrupt_request & CPU_INTERRUPT_HALT) {
|
|
|
|
cpu->interrupt_request &= ~CPU_INTERRUPT_HALT;
|
|
|
|
cpu->halted = 1;
|
|
|
|
cpu->exception_index = EXCP_HLT;
|
2018-03-02 13:03:17 +00:00
|
|
|
return true;
|
2018-02-24 05:08:24 +00:00
|
|
|
}
|
|
|
|
#if defined(TARGET_I386)
|
2018-02-24 05:26:59 +00:00
|
|
|
else if (interrupt_request & CPU_INTERRUPT_INIT) {
|
2018-02-24 05:52:51 +00:00
|
|
|
X86CPU *x86_cpu = X86_CPU(cpu->uc, cpu);
|
|
|
|
CPUArchState *env = &x86_cpu->env;
|
2018-03-01 14:31:08 +00:00
|
|
|
cpu_svm_check_intercept_param(env, SVM_EXIT_INIT, 0, 0);
|
2018-02-24 05:08:24 +00:00
|
|
|
do_cpu_init(x86_cpu);
|
|
|
|
cpu->exception_index = EXCP_HALTED;
|
2018-03-02 13:03:17 +00:00
|
|
|
return true;
|
2018-02-24 05:08:24 +00:00
|
|
|
}
|
|
|
|
#else
|
2018-02-24 05:26:59 +00:00
|
|
|
else if (interrupt_request & CPU_INTERRUPT_RESET) {
|
2018-02-24 05:08:24 +00:00
|
|
|
cpu_reset(cpu);
|
2019-04-26 20:02:57 +00:00
|
|
|
return true;
|
2018-02-24 05:08:24 +00:00
|
|
|
}
|
|
|
|
#endif
|
2019-04-26 20:02:57 +00:00
|
|
|
/* The target hook has 3 exit conditions:
|
|
|
|
False when the interrupt isn't processed,
|
|
|
|
True when it is, and we should restart on a new TB,
|
|
|
|
and via longjmp via cpu_loop_exit. */
|
2018-02-24 05:26:59 +00:00
|
|
|
else {
|
2021-03-04 21:56:51 +00:00
|
|
|
if (cc->tcg_ops.cpu_exec_interrupt &&
|
|
|
|
cc->tcg_ops.cpu_exec_interrupt(cpu, interrupt_request)) {
|
2018-03-17 23:32:09 +00:00
|
|
|
cpu->exception_index = -1;
|
2018-02-24 05:26:59 +00:00
|
|
|
*last_tb = NULL;
|
|
|
|
}
|
|
|
|
/* The target hook may have updated the 'cpu->interrupt_request';
|
|
|
|
* reload the 'interrupt_request' value */
|
|
|
|
interrupt_request = cpu->interrupt_request;
|
2018-02-24 05:08:24 +00:00
|
|
|
}
|
2018-02-24 05:26:59 +00:00
|
|
|
if (interrupt_request & CPU_INTERRUPT_EXITTB) {
|
2018-02-24 05:08:24 +00:00
|
|
|
cpu->interrupt_request &= ~CPU_INTERRUPT_EXITTB;
|
|
|
|
/* ensure that no TB jump will be modified as
|
|
|
|
the program flow was changed */
|
|
|
|
*last_tb = NULL;
|
|
|
|
}
|
|
|
|
}
|
2019-04-26 20:02:57 +00:00
|
|
|
|
|
|
|
/* Finally, check if we need to exit to the main loop. */
|
|
|
|
if (unlikely(atomic_read(&cpu->exit_request))) {
|
2018-03-17 23:32:09 +00:00
|
|
|
atomic_set(&cpu->exit_request, 0);
|
|
|
|
if (cpu->exception_index == -1) {
|
|
|
|
cpu->exception_index = EXCP_INTERRUPT;
|
|
|
|
}
|
2018-03-02 13:03:17 +00:00
|
|
|
return true;
|
2018-02-24 05:08:24 +00:00
|
|
|
}
|
2019-04-26 20:02:57 +00:00
|
|
|
|
2018-03-02 13:03:17 +00:00
|
|
|
return false;
|
2018-02-24 05:08:24 +00:00
|
|
|
}
|
|
|
|
|
2018-02-24 05:14:29 +00:00
|
|
|
static inline void cpu_loop_exec_tb(CPUState *cpu, TranslationBlock *tb,
|
|
|
|
TranslationBlock **last_tb, int *tb_exit)
|
|
|
|
{
|
|
|
|
uintptr_t ret;
|
|
|
|
|
|
|
|
/* execute the generated code */
|
|
|
|
ret = cpu_tb_exec(cpu, tb);
|
2018-03-01 14:16:51 +00:00
|
|
|
tb = (TranslationBlock *)(ret & ~TB_EXIT_MASK);
|
2018-02-24 05:14:29 +00:00
|
|
|
*tb_exit = ret & TB_EXIT_MASK;
|
|
|
|
switch (*tb_exit) {
|
|
|
|
case TB_EXIT_REQUESTED:
|
2018-03-02 14:36:41 +00:00
|
|
|
/* Something asked us to stop executing chained TBs; just
|
|
|
|
* continue round the main loop. Whatever requested the exit
|
|
|
|
* will also have set something else (eg interrupt_request)
|
|
|
|
* which we will handle next time around the loop. But we
|
|
|
|
* need to ensure the tcg_exit_req read in generated code
|
|
|
|
* comes before the next read of cpu->exit_request or
|
|
|
|
* cpu->interrupt_request.
|
2018-02-24 05:14:29 +00:00
|
|
|
*/
|
2018-03-02 13:01:00 +00:00
|
|
|
smp_mb();
|
2018-02-24 05:14:29 +00:00
|
|
|
*last_tb = NULL;
|
|
|
|
break;
|
2018-02-27 16:12:36 +00:00
|
|
|
case TB_EXIT_ICOUNT_EXPIRED:
|
|
|
|
{
|
|
|
|
/* Instruction counter expired. */
|
|
|
|
#ifdef CONFIG_USER_ONLY
|
|
|
|
abort();
|
|
|
|
#else
|
2019-06-13 19:32:36 +00:00
|
|
|
int insns_left = atomic_read(&cpu_neg(cpu)->icount_decr.u32);
|
2018-03-01 14:16:51 +00:00
|
|
|
*last_tb = NULL;
|
2018-02-27 16:12:36 +00:00
|
|
|
if (cpu->icount_extra && insns_left >= 0) {
|
|
|
|
/* Refill decrementer and continue execution. */
|
|
|
|
cpu->icount_extra += insns_left;
|
|
|
|
insns_left = MIN(0xffff, cpu->icount_extra);
|
|
|
|
cpu->icount_extra -= insns_left;
|
2019-06-13 19:32:36 +00:00
|
|
|
cpu_neg(cpu)->icount_decr.u16.low = insns_left;
|
2018-02-27 16:12:36 +00:00
|
|
|
} else {
|
|
|
|
if (insns_left > 0) {
|
|
|
|
/* Execute remaining instructions. */
|
2018-03-01 14:16:51 +00:00
|
|
|
cpu_exec_nocache(cpu, insns_left, tb, false);
|
2018-02-27 16:12:36 +00:00
|
|
|
// Unicorn: commented out
|
|
|
|
//align_clocks(sc, cpu);
|
|
|
|
}
|
|
|
|
cpu->exception_index = EXCP_INTERRUPT;
|
|
|
|
cpu_loop_exit(cpu);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
#endif
|
|
|
|
}
|
2018-02-24 05:14:29 +00:00
|
|
|
default:
|
2018-03-01 14:16:51 +00:00
|
|
|
*last_tb = tb;
|
2018-02-24 05:14:29 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-05-06 04:52:39 +00:00
|
|
|
void cpu_exec_step_atomic(struct uc_struct *uc, CPUState *cpu)
|
2018-02-27 16:12:36 +00:00
|
|
|
{
|
|
|
|
CPUArchState *env = (CPUArchState *)cpu->env_ptr;
|
|
|
|
TranslationBlock *tb;
|
|
|
|
target_ulong cs_base, pc;
|
|
|
|
uint32_t flags;
|
2019-05-06 04:57:07 +00:00
|
|
|
uint32_t cflags = 1;
|
2019-05-06 04:52:39 +00:00
|
|
|
uint32_t cf_mask = cflags & CF_HASH_MASK;
|
2018-02-27 16:12:36 +00:00
|
|
|
|
|
|
|
cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
|
2018-03-02 14:56:35 +00:00
|
|
|
|
|
|
|
if (sigsetjmp(cpu->jmp_env, 0) == 0) {
|
2019-05-06 04:52:39 +00:00
|
|
|
tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
|
2019-05-05 02:17:43 +00:00
|
|
|
if (tb == NULL) {
|
|
|
|
mmap_lock();
|
2019-05-06 04:52:39 +00:00
|
|
|
tb = tb_htable_lookup(cpu, pc, cs_base, flags, cf_mask);
|
|
|
|
if (likely(tb == NULL)) {
|
|
|
|
tb = tb_gen_code(cpu, pc, cs_base, flags, cflags);
|
|
|
|
}
|
2019-05-05 02:17:43 +00:00
|
|
|
mmap_unlock();
|
|
|
|
}
|
2018-03-02 14:56:35 +00:00
|
|
|
|
2019-05-06 04:52:39 +00:00
|
|
|
uc->parallel_cpus = false;
|
2018-03-02 14:56:35 +00:00
|
|
|
/* execute the generated code */
|
|
|
|
cpu_tb_exec(cpu, tb);
|
2019-05-06 04:52:39 +00:00
|
|
|
uc->parallel_cpus = true;
|
2018-03-02 14:56:35 +00:00
|
|
|
} else {
|
|
|
|
/* We may have exited due to another problem here, so we need
|
|
|
|
* to reset any tb_locks we may have taken but didn't release.
|
|
|
|
* The mmap_lock is dropped by tb_gen_code if it runs out of
|
|
|
|
* memory.
|
|
|
|
*/
|
|
|
|
#ifndef CONFIG_SOFTMMU
|
|
|
|
// Unicorn: Commented out
|
|
|
|
//tcg_debug_assert(!have_mmap_lock());
|
|
|
|
#endif
|
|
|
|
// Unicorn: commented out
|
|
|
|
//tb_lock_reset();
|
|
|
|
}
|
2018-02-27 16:12:36 +00:00
|
|
|
}
|
|
|
|
|
2015-08-21 07:04:50 +00:00
|
|
|
/* main execution loop */
|
|
|
|
|
2018-02-14 19:59:56 +00:00
|
|
|
int cpu_exec(struct uc_struct *uc, CPUState *cpu)
|
2015-08-21 07:04:50 +00:00
|
|
|
{
|
2018-02-14 19:59:56 +00:00
|
|
|
CPUArchState *env = cpu->env_ptr;
|
2015-08-21 07:04:50 +00:00
|
|
|
CPUClass *cc = CPU_GET_CLASS(uc, cpu);
|
2018-02-24 05:08:24 +00:00
|
|
|
int ret;
|
2015-08-21 07:04:50 +00:00
|
|
|
|
2018-02-24 04:53:15 +00:00
|
|
|
if (cpu_handle_halt(cpu)) {
|
|
|
|
return EXCP_HALTED;
|
2015-08-21 07:04:50 +00:00
|
|
|
}
|
|
|
|
|
2018-03-02 14:26:31 +00:00
|
|
|
atomic_mb_set(&uc->current_cpu, cpu);
|
|
|
|
atomic_mb_set(&uc->tcg_current_rr_cpu, cpu);
|
2015-08-21 07:04:50 +00:00
|
|
|
|
2021-03-04 21:56:51 +00:00
|
|
|
if (cc->tcg_ops.cpu_exec_enter) {
|
|
|
|
cc->tcg_ops.cpu_exec_enter(cpu);
|
|
|
|
}
|
2015-08-21 07:04:50 +00:00
|
|
|
cpu->exception_index = -1;
|
2016-01-11 16:11:31 +00:00
|
|
|
env->invalid_error = UC_ERR_OK;
|
2015-08-21 07:04:50 +00:00
|
|
|
|
2018-03-02 13:13:38 +00:00
|
|
|
/* prepare setjmp context for exception handling */
|
|
|
|
if (sigsetjmp(cpu->jmp_env, 0) != 0) {
|
2021-03-04 00:14:50 +00:00
|
|
|
#if defined(__clang__)
|
2018-03-02 13:13:38 +00:00
|
|
|
/* Some compilers wrongly smash all local variables after
|
|
|
|
* siglongjmp. There were bug reports for gcc 4.5.0 and clang.
|
|
|
|
* Reload essential local variables here for those compilers.
|
|
|
|
* Newer versions of gcc would complain about this code (-Wclobbered). */
|
|
|
|
cpu = uc->current_cpu;
|
|
|
|
env = cpu->env_ptr;
|
|
|
|
cc = CPU_GET_CLASS(uc, cpu);
|
2018-02-17 20:07:38 +00:00
|
|
|
#else /* buggy compiler */
|
2018-03-02 13:13:38 +00:00
|
|
|
/* Assert that the compiler does not smash local variables. */
|
2018-03-12 18:58:03 +00:00
|
|
|
g_assert(cpu == uc->current_cpu);
|
|
|
|
g_assert(cc == CPU_GET_CLASS(uc, cpu));
|
2018-02-17 20:07:38 +00:00
|
|
|
#endif /* buggy compiler */
|
2018-03-02 13:13:38 +00:00
|
|
|
// Unicorn: commented out
|
|
|
|
//tb_lock_reset();
|
|
|
|
}
|
|
|
|
|
|
|
|
/* if an exception is pending, we execute it here */
|
|
|
|
while (!cpu_handle_exception(uc, cpu, &ret)) {
|
|
|
|
TranslationBlock *last_tb = NULL;
|
|
|
|
int tb_exit = 0;
|
|
|
|
|
|
|
|
while (!cpu_handle_interrupt(cpu, &last_tb)) {
|
2019-05-05 02:30:16 +00:00
|
|
|
uint32_t cflags = cpu->cflags_next_tb;
|
|
|
|
TranslationBlock *tb;
|
|
|
|
|
|
|
|
/* When requested, use an exact setting for cflags for the next
|
|
|
|
execution. This is used for icount, precise smc, and stop-
|
|
|
|
after-access watchpoints. Since this request should never
|
|
|
|
have CF_INVALID set, -1 is a convenient invalid value that
|
|
|
|
does not require tcg headers for cpu_common_reset. */
|
|
|
|
if (cflags == -1) {
|
|
|
|
cflags = curr_cflags(uc);
|
|
|
|
} else {
|
|
|
|
cpu->cflags_next_tb = -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
tb = tb_find(cpu, last_tb, tb_exit, cflags);
|
2018-03-02 13:13:38 +00:00
|
|
|
if (!tb) { // invalid TB due to invalid code?
|
|
|
|
uc->invalid_error = UC_ERR_FETCH_UNMAPPED;
|
|
|
|
ret = EXCP_HLT;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
cpu_loop_exec_tb(cpu, tb, &last_tb, &tb_exit);
|
2015-08-21 07:04:50 +00:00
|
|
|
}
|
2018-03-02 13:13:38 +00:00
|
|
|
}
|
2015-08-21 07:04:50 +00:00
|
|
|
|
2021-03-04 21:56:51 +00:00
|
|
|
if (cc->tcg_ops.cpu_exec_exit) {
|
|
|
|
cc->tcg_ops.cpu_exec_exit(cpu);
|
|
|
|
}
|
2015-08-21 07:04:50 +00:00
|
|
|
|
2015-11-13 15:57:03 +00:00
|
|
|
// Unicorn: flush JIT cache to because emulation might stop in
|
|
|
|
// the middle of translation, thus generate incomplete code.
|
|
|
|
// TODO: optimize this for better performance
|
2018-02-14 18:56:00 +00:00
|
|
|
tb_flush(cpu);
|
2015-11-13 15:57:03 +00:00
|
|
|
|
2015-08-21 07:04:50 +00:00
|
|
|
return ret;
|
|
|
|
}
|