target/arm: Convert Neon 'load single structure to all lanes' to decodetree

Convert the Neon "load single structure to all lanes" insns to
decodetree.

Backports commit 3698747c48db871d876a398592c5a23d7580ed4a from qemu
Peter Maydell 2020-05-07 09:28:59 -04:00 committed by Lioncash
parent 7aad825fa6
commit 302506f2f6
3 changed files with 81 additions and 53 deletions

@@ -34,3 +34,8 @@
VLDST_multiple 1111 0100 0 . l:1 0 rn:4 .... itype:4 size:2 align:2 rm:4 \
               vd=%vd_dp

# Neon load single element to all lanes

VLD_all_lanes  1111 0100 1 . 1 0 rn:4 .... 11 n:2 size:2 t:1 a:1 rm:4 \
               vd=%vd_dp
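
(Aside, not part of the patch: the pattern above is all decodetree needs to generate the field extraction and the arg_VLD_all_lanes struct that trans_VLD_all_lanes() receives below. As a rough, standalone illustration of what the fields mean, the following hand-written C mirrors the shifts the legacy disas_neon_ls_insn() used; the bit positions come from the pattern, everything else here is hypothetical and not generated code.)

#include <stdint.h>
#include <stdio.h>

/*
 * Hand-written sketch of the VLD_all_lanes field layout above.
 * decodetree generates the real extraction; this only mirrors the
 * shifts the legacy disas_neon_ls_insn() used.
 */
static void decode_vld_all_lanes(uint32_t insn)
{
    int rn   = (insn >> 16) & 0xf;        /* rn:4  -- base address register      */
    int vd   = ((insn >> 18) & 0x10)      /* %vd_dp: D bit (insn[22]) ...        */
             | ((insn >> 12) & 0xf);      /* ... concatenated with Vd (insn[15:12]) */
    int n    = (insn >> 8) & 3;           /* n:2   -- nregs = n + 1              */
    int size = (insn >> 6) & 3;           /* size:2 -- element size              */
    int t    = (insn >> 5) & 1;           /* t:1   -- Dreg count / register stride */
    int a    = (insn >> 4) & 1;           /* a:1   -- alignment hint             */
    int rm   = insn & 0xf;                /* rm:4  -- writeback mode             */

    printf("VLD%d (all lanes): vd=%d rn=%d size=%d stride=%d a=%d rm=0x%x\n",
           n + 1, vd, rn, size, t ? 2 : 1, a, rm);
}

int main(void)
{
    /* Example word: 0xf4a20c0f is VLD1.8 {d0[]}, [r2] (rm == 0xf: no writeback) */
    decode_vld_all_lanes(0xf4a20c0f);
    return 0;
}

Running it on the sample word prints the nregs, stride and register numbers that the trans function below consumes.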

@@ -408,3 +408,77 @@ static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a)
    gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8);
    return true;
}

static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a)
{
    /* Neon load single structure to all lanes */
    int reg, stride, vec_size;
    int vd = a->vd;
    int size = a->size;
    int nregs = a->n + 1;
    TCGv_i32 addr, tmp;
    TCGContext *tcg_ctx = s->uc->tcg_ctx;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist */
    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
        return false;
    }

    if (size == 3) {
        if (nregs != 4 || a->a == 0) {
            return false;
        }
        /* For VLD4 size == 3 a == 1 means 32 bits at 16 byte alignment */
        size = 2;
    }
    if (nregs == 1 && a->a == 1 && size == 0) {
        return false;
    }
    if (nregs == 3 && a->a == 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * VLD1 to all lanes: T bit indicates how many Dregs to write.
     * VLD2/3/4 to all lanes: T bit indicates register stride.
     */
    stride = a->t ? 2 : 1;
    vec_size = nregs == 1 ? stride * 8 : 8;

    tmp = tcg_temp_new_i32(tcg_ctx);
    addr = tcg_temp_new_i32(tcg_ctx);
    load_reg_var(s, addr, a->rn);
    for (reg = 0; reg < nregs; reg++) {
        gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s),
                        s->be_data | size);
        if ((vd & 1) && vec_size == 16) {
            /*
             * We cannot write 16 bytes at once because the
             * destination is unaligned.
             */
            tcg_gen_gvec_dup_i32(tcg_ctx, size, neon_reg_offset(vd, 0),
                                 8, 8, tmp);
            tcg_gen_gvec_mov(tcg_ctx, 0, neon_reg_offset(vd + 1, 0),
                             neon_reg_offset(vd, 0), 8, 8);
        } else {
            tcg_gen_gvec_dup_i32(tcg_ctx, size, neon_reg_offset(vd, 0),
                                 vec_size, vec_size, tmp);
        }
        tcg_gen_addi_i32(tcg_ctx, addr, addr, 1 << size);
        vd += stride;
    }
    tcg_temp_free_i32(tcg_ctx, tmp);
    tcg_temp_free_i32(tcg_ctx, addr);

    gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << size) * nregs);

    return true;
}
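
(Aside, not part of the patch: at runtime the tcg_gen_gvec_dup_i32() calls broadcast the single loaded element across the whole 8-byte D register; the odd-vd branch only splits that into two aligned 8-byte writes. A minimal host-side model of the broadcast, with hypothetical names and assuming a little-endian host, looks like this.)

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * Host-side model of the "dup to all lanes" effect only; names are
 * hypothetical and this is not QEMU/TCG code. Assumes a little-endian
 * host so copying the low bytes of 'element' matches lane order.
 */
static void dup_to_all_lanes(uint8_t dreg[8], uint32_t element, int size)
{
    int esize = 1 << size;                      /* 1, 2 or 4 bytes per lane */
    for (int off = 0; off < 8; off += esize) {
        memcpy(dreg + off, &element, esize);    /* replicate into every lane */
    }
}

int main(void)
{
    uint8_t d0[8];

    dup_to_all_lanes(d0, 0xab, 0);              /* VLD1.8 {d0[]}: 0xab in every lane */
    for (int i = 0; i < 8; i++) {
        printf("%02x ", d0[i]);
    }
    printf("\n");
    return 0;
}

The writeback amount is unchanged from the legacy path: gen_neon_ldst_base_update() advances the base register by (1 << size) * nregs bytes, the same value the removed code computed into 'stride'.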

@@ -3342,7 +3342,6 @@ static int disas_neon_ls_insn(DisasContext *s, uint32_t insn)
    int size;
    int reg;
    int load;
    int vec_size;
    TCGv_i32 addr;
    TCGv_i32 tmp;
@@ -3372,58 +3371,8 @@ static int disas_neon_ls_insn(DisasContext *s, uint32_t insn)
    } else {
        size = (insn >> 10) & 3;
        if (size == 3) {
            /* Load single element to all lanes. */
            int a = (insn >> 4) & 1;
            if (!load) {
                return 1;
            }
            size = (insn >> 6) & 3;
            nregs = ((insn >> 8) & 3) + 1;
            if (size == 3) {
                if (nregs != 4 || a == 0) {
                    return 1;
                }
                /* For VLD4 size==3 a == 1 means 32 bits at 16 byte alignment */
                size = 2;
            }
            if (nregs == 1 && a == 1 && size == 0) {
                return 1;
            }
            if (nregs == 3 && a == 1) {
                return 1;
            }
            addr = tcg_temp_new_i32(tcg_ctx);
            load_reg_var(s, addr, rn);
            /* VLD1 to all lanes: bit 5 indicates how many Dregs to write.
             * VLD2/3/4 to all lanes: bit 5 indicates register stride.
             */
            stride = (insn & (1 << 5)) ? 2 : 1;
            vec_size = nregs == 1 ? stride * 8 : 8;
            tmp = tcg_temp_new_i32(tcg_ctx);
            for (reg = 0; reg < nregs; reg++) {
                gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s),
                                s->be_data | size);
                if ((rd & 1) && vec_size == 16) {
                    /* We cannot write 16 bytes at once because the
                     * destination is unaligned.
                     */
                    tcg_gen_gvec_dup_i32(tcg_ctx, size, neon_reg_offset(rd, 0),
                                         8, 8, tmp);
                    tcg_gen_gvec_mov(tcg_ctx, 0, neon_reg_offset(rd + 1, 0),
                                     neon_reg_offset(rd, 0), 8, 8);
                } else {
                    tcg_gen_gvec_dup_i32(tcg_ctx, size, neon_reg_offset(rd, 0),
                                         vec_size, vec_size, tmp);
                }
                tcg_gen_addi_i32(tcg_ctx, addr, addr, 1 << size);
                rd += stride;
            }
            tcg_temp_free_i32(tcg_ctx, tmp);
            tcg_temp_free_i32(tcg_ctx, addr);
            stride = (1 << size) * nregs;
            /* Load single element to all lanes -- handled by decodetree */
            return 1;
        } else {
            /* Single element. */
            int idx = (insn >> 4) & 0xf;