Mirror of https://github.com/yuzu-emu/unicorn.git
target/arm: Promote consecutive memory ops for aa64
For a sequence of loads or stores from a single register, little-endian
operations can be promoted to an 8-byte op. This can reduce the number of
operations by a factor of 8.

Backports commit 87f9a7f0c8d5122c36743885158782c2348a6d21 from qemu
commit 931b49fb06
parent e6707b900c
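The idea behind the promotion, as a standalone C sketch (illustrative only, not unicorn/QEMU code): sixteen consecutive 1-byte little-endian element accesses deposit exactly the same bytes as two 8-byte little-endian accesses, so the translator may emit the wider op whenever all elements come from a single register and the element order is little-endian.

/* Illustrative sketch only -- not unicorn/QEMU code.  Models why
 * sixteen 1-byte element loads (size == 0) into a 128-bit vector
 * are equivalent to two 8-byte little-endian loads (size == 3),
 * which is the promotion this commit performs at translation time.
 */
#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
    uint8_t mem[16];
    uint8_t vec_a[16], vec_b[16];   /* two models of a 128-bit Vn */

    for (int i = 0; i < 16; i++) {
        mem[i] = (uint8_t)(0xA0 + i);
    }

    /* Unpromoted: one memory op per byte element. */
    for (int i = 0; i < 16; i++) {
        vec_a[i] = mem[i];
    }

    /* Promoted: two 8-byte ops.  A little-endian 64-bit load into a
     * little-endian register lane preserves byte order, so a raw
     * byte copy models it exactly. */
    uint64_t lane;
    memcpy(&lane, mem, 8);          /* first 8-byte load  */
    memcpy(vec_b, &lane, 8);
    memcpy(&lane, mem + 8, 8);      /* second 8-byte load */
    memcpy(vec_b + 8, &lane, 8);

    assert(memcmp(vec_a, vec_b, sizeof(vec_a)) == 0);
    return 0;
}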
@@ -1269,27 +1269,25 @@ static void write_vec_element_i32(DisasContext *s, TCGv_i32 tcg_src,
 /* Store from vector register to memory */
 static void do_vec_st(DisasContext *s, int srcidx, int element,
-                      TCGv_i64 tcg_addr, int size)
+                      TCGv_i64 tcg_addr, int size, TCGMemOp endian)
 {
     TCGContext *tcg_ctx = s->uc->tcg_ctx;
-    TCGMemOp memop = s->be_data + size;
     TCGv_i64 tcg_tmp = tcg_temp_new_i64(tcg_ctx);
 
     read_vec_element(s, tcg_tmp, srcidx, element, size);
-    tcg_gen_qemu_st_i64(s->uc, tcg_tmp, tcg_addr, get_mem_index(s), memop);
+    tcg_gen_qemu_st_i64(s->uc, tcg_tmp, tcg_addr, get_mem_index(s), endian | size);
 
     tcg_temp_free_i64(tcg_ctx, tcg_tmp);
 }
 
 /* Load from memory to vector register */
 static void do_vec_ld(DisasContext *s, int destidx, int element,
-                      TCGv_i64 tcg_addr, int size)
+                      TCGv_i64 tcg_addr, int size, TCGMemOp endian)
 {
     TCGContext *tcg_ctx = s->uc->tcg_ctx;
-    TCGMemOp memop = s->be_data + size;
     TCGv_i64 tcg_tmp = tcg_temp_new_i64(tcg_ctx);
 
-    tcg_gen_qemu_ld_i64(s->uc, tcg_tmp, tcg_addr, get_mem_index(s), memop);
+    tcg_gen_qemu_ld_i64(s->uc, tcg_tmp, tcg_addr, get_mem_index(s), endian | size);
     write_vec_element(s, tcg_tmp, destidx, element, size);
 
     tcg_temp_free_i64(tcg_ctx, tcg_tmp);
 }
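One subtlety in the new signature: the memory op is now composed as endian | size rather than the old s->be_data + size. Under the TCGMemOp encoding this code targets (operand size in the low bits, MO_BSWAP as a separate flag bit, with MO_LE/MO_BE resolving to 0 or MO_BSWAP depending on host byte order), the two compositions are identical. A minimal sketch, with the enum values restated here as assumptions rather than taken from the real headers:

/* Illustrative only.  Restates the assumed TCGMemOp encoding: the
 * operand size lives in the low two bits and the byte-swap flag in
 * a separate bit, so or-ing and adding build the same value. */
#include <assert.h>

enum {
    MO_8 = 0, MO_16 = 1, MO_32 = 2, MO_64 = 3, /* size field */
    MO_BSWAP = 8,                              /* swap flag  */
};

int main(void)
{
    for (int endian = 0; endian <= MO_BSWAP; endian += MO_BSWAP) {
        for (int size = MO_8; size <= MO_64; size++) {
            /* No bit overlap between the two fields... */
            assert((endian & size) == 0);
            /* ...so the old '+' and the new '|' agree. */
            assert((endian + size) == (endian | size));
        }
    }
    return 0;
}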
@@ -3088,9 +3086,10 @@ static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn)
     bool is_postidx = extract32(insn, 23, 1);
     bool is_q = extract32(insn, 30, 1);
     TCGv_i64 tcg_addr, tcg_rn, tcg_ebytes;
+    TCGMemOp endian = s->be_data;
 
-    int ebytes = 1 << size;
-    int elements = (is_q ? 128 : 64) / (8 << size);
+    int ebytes;   /* bytes per element */
+    int elements; /* elements per vector */
     int rpt;      /* num iterations */
     int selem;    /* structure elements */
     int r;
@@ -3149,6 +3148,20 @@ static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn)
         gen_check_sp_alignment(s);
     }
 
+    /* For our purposes, bytes are always little-endian.  */
+    if (size == 0) {
+        endian = MO_LE;
+    }
+
+    /* Consecutive little-endian elements from a single register
+     * can be promoted to a larger little-endian operation.
+     */
+    if (selem == 1 && endian == MO_LE) {
+        size = 3;
+    }
+    ebytes = 1 << size;
+    elements = (is_q ? 16 : 8) / ebytes;
+
     tcg_rn = cpu_reg_sp(s, rn);
     tcg_addr = tcg_temp_new_i64(tcg_ctx);
     tcg_gen_mov_i64(tcg_ctx, tcg_addr, tcg_rn);
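Worked example of the new arithmetic, using decode values this function derives (e.g. an LD1 of four registers of .16b elements: is_q = 1, size = 0, rpt = 4, selem = 1): before this change each register took elements = 128/8 = 16 one-byte loads. Now size is promoted to 3, so ebytes = 1 << 3 = 8 and elements = 16/8 = 2, and each register takes two 8-byte loads, which is the factor-of-8 reduction cited in the commit message.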
@@ -3157,32 +3170,33 @@ static void disas_ldst_multiple_struct(DisasContext *s, uint32_t insn)
     for (r = 0; r < rpt; r++) {
         int e;
         for (e = 0; e < elements; e++) {
-            int tt = (rt + r) % 32;
             int xs;
             for (xs = 0; xs < selem; xs++) {
+                int tt = (rt + r + xs) % 32;
                 if (is_store) {
-                    do_vec_st(s, tt, e, tcg_addr, size);
+                    do_vec_st(s, tt, e, tcg_addr, size, endian);
                 } else {
-                    do_vec_ld(s, tt, e, tcg_addr, size);
-
-                    /* For non-quad operations, setting a slice of the low
-                     * 64 bits of the register clears the high 64 bits (in
-                     * the ARM ARM pseudocode this is implicit in the fact
-                     * that 'rval' is a 64 bit wide variable).
-                     * For quad operations, we might still need to zero the
-                     * high bits of SVE.  We optimize by noticing that we only
-                     * need to do this the first time we touch a register.
-                     */
-                    if (e == 0 && (r == 0 || xs == selem - 1)) {
-                        clear_vec_high(s, is_q, tt);
-                    }
+                    do_vec_ld(s, tt, e, tcg_addr, size, endian);
                 }
                 tcg_gen_add_i64(tcg_ctx, tcg_addr, tcg_addr, tcg_ebytes);
-                tt = (tt + 1) % 32;
             }
         }
     }
 
+    if (!is_store) {
+        /* For non-quad operations, setting a slice of the low
+         * 64 bits of the register clears the high 64 bits (in
+         * the ARM ARM pseudocode this is implicit in the fact
+         * that 'rval' is a 64 bit wide variable).
+         * For quad operations, we might still need to zero the
+         * high bits of SVE.
+         */
+        for (r = 0; r < rpt * selem; r++) {
+            int tt = (rt + r) % 32;
+            clear_vec_high(s, is_q, tt);
+        }
+    }
+
     if (is_postidx) {
         int rm = extract32(insn, 16, 5);
         if (rm == 31) {
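Note that the load-side zeroing of the high vector bits now runs in a single pass after the element loops, once per written register, instead of inside the inner loop guarded by the first-touch test. Clearing the high bits after all lanes of a register have been written yields the same architectural result, and the separate pass no longer depends on the loop order that the promotion just changed.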
@@ -3304,9 +3318,9 @@ static void disas_ldst_single_struct(DisasContext *s, uint32_t insn)
     } else {
         /* Load/store one element per register */
         if (is_load) {
-            do_vec_ld(s, rt, index, tcg_addr, scale);
+            do_vec_ld(s, rt, index, tcg_addr, scale, s->be_data);
         } else {
-            do_vec_st(s, rt, index, tcg_addr, scale);
+            do_vec_st(s, rt, index, tcg_addr, scale, s->be_data);
         }
     }
     tcg_gen_add_i64(tcg_ctx, tcg_addr, tcg_addr, tcg_ebytes);
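The single-structure path is left unpromoted: do_vec_ld and do_vec_st simply receive s->be_data as the new endian argument, since a lone element per register offers nothing to coalesce.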