target/arm: Convert Neon 'load single structure to all lanes' to decodetree
Convert the Neon "load single structure to all lanes" insns to decodetree.

Backports commit 3698747c48db871d876a398592c5a23d7580ed4a from qemu
parent 7aad825fa6
commit 302506f2f6
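A note for readers new to decodetree: the pattern added in the first hunk below is fed to QEMU's decodetree generator, which emits an argument struct and a decoder that dispatches to the trans_VLD_all_lanes() hook added in the second hunk. A rough sketch of that generated interface follows; the field comments and layout here are illustrative, not the literal generated code:

#include <stdbool.h>

typedef struct DisasContext DisasContext;   /* opaque for this sketch */

typedef struct arg_VLD_all_lanes {
    int rn;   /* base address register */
    int rm;   /* 15: no writeback; 13: post-increment by transfer size;
               * otherwise post-increment by register rm */
    int n;    /* nregs - 1, i.e. VLD1..VLD4 */
    int size; /* element size: 0 = 8-bit, 1 = 16-bit, 2 = 32-bit */
    int t;    /* T bit: Dreg count (VLD1) or register stride (VLD2/3/4) */
    int a;    /* alignment bit */
    int vd;   /* destination D register, assembled by %vd_dp from D:Vd */
} arg_VLD_all_lanes;

/* The generated decoder calls this hook when the bit pattern matches: */
static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a);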
@@ -34,3 +34,8 @@
 VLDST_multiple 1111 0100 0 . l:1 0 rn:4 .... itype:4 size:2 align:2 rm:4 \
                vd=%vd_dp
+
+# Neon load single element to all lanes
+
+VLD_all_lanes  1111 0100 1 . 1 0 rn:4 .... 11 n:2 size:2 t:1 a:1 rm:4 \
+               vd=%vd_dp
 
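For illustration, here is a self-contained, hand-written equivalent of the match-and-extract step the generated decoder performs for this pattern. The mask and value are derived from the pattern's fixed bits (1111 0100 1 . 1 0 ... 11 ...); the sample encoding in main() is a hypothetical VLD1.32 to all lanes, chosen only to exercise the function:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Mirrors the arg-struct sketch earlier on this page. */
typedef struct {
    int rn, n, size, t, a, rm, vd;
} arg_VLD_all_lanes;

static bool decode_vld_all_lanes(uint32_t insn, arg_VLD_all_lanes *a)
{
    if ((insn & 0xffb00c00) != 0xf4a00c00) {
        return false;             /* fixed opcode bits do not match */
    }
    a->rn   = (insn >> 16) & 0xf;
    a->n    = (insn >> 8) & 0x3;  /* nregs - 1 */
    a->size = (insn >> 6) & 0x3;
    a->t    = (insn >> 5) & 0x1;
    a->a    = (insn >> 4) & 0x1;
    a->rm   = insn & 0xf;
    /* %vd_dp: assemble D (bit 22) and Vd (bits 15:12) into one index */
    a->vd   = (((insn >> 22) & 1) << 4) | ((insn >> 12) & 0xf);
    return true;
}

int main(void)
{
    /* VLD1.32 {d0[],d1[]}, [r0] -- sample encoding for illustration */
    arg_VLD_all_lanes a;
    if (decode_vld_all_lanes(0xf4a00caf, &a)) {
        printf("rn=%d vd=%d n=%d size=%d t=%d a=%d rm=%d\n",
               a.rn, a.vd, a.n, a.size, a.t, a.a, a.rm);
    }
    return 0;
}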
@@ -408,3 +408,77 @@ static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a)
     gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8);
     return true;
 }
+
+static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a)
+{
+    /* Neon load single structure to all lanes */
+    int reg, stride, vec_size;
+    int vd = a->vd;
+    int size = a->size;
+    int nregs = a->n + 1;
+    TCGv_i32 addr, tmp;
+    TCGContext *tcg_ctx = s->uc->tcg_ctx;
+
+    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+        return false;
+    }
+
+    /* UNDEF accesses to D16-D31 if they don't exist */
+    if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
+        return false;
+    }
+
+    if (size == 3) {
+        if (nregs != 4 || a->a == 0) {
+            return false;
+        }
+        /* For VLD4 size == 3 a == 1 means 32 bits at 16 byte alignment */
+        size = 2;
+    }
+    if (nregs == 1 && a->a == 1 && size == 0) {
+        return false;
+    }
+    if (nregs == 3 && a->a == 1) {
+        return false;
+    }
+
+    if (!vfp_access_check(s)) {
+        return true;
+    }
+
+    /*
+     * VLD1 to all lanes: T bit indicates how many Dregs to write.
+     * VLD2/3/4 to all lanes: T bit indicates register stride.
+     */
+    stride = a->t ? 2 : 1;
+    vec_size = nregs == 1 ? stride * 8 : 8;
+
+    tmp = tcg_temp_new_i32(tcg_ctx);
+    addr = tcg_temp_new_i32(tcg_ctx);
+    load_reg_var(s, addr, a->rn);
+    for (reg = 0; reg < nregs; reg++) {
+        gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s),
+                        s->be_data | size);
+        if ((vd & 1) && vec_size == 16) {
+            /*
+             * We cannot write 16 bytes at once because the
+             * destination is unaligned.
+             */
+            tcg_gen_gvec_dup_i32(tcg_ctx, size, neon_reg_offset(vd, 0),
+                                 8, 8, tmp);
+            tcg_gen_gvec_mov(tcg_ctx, 0, neon_reg_offset(vd + 1, 0),
+                             neon_reg_offset(vd, 0), 8, 8);
+        } else {
+            tcg_gen_gvec_dup_i32(tcg_ctx, size, neon_reg_offset(vd, 0),
+                                 vec_size, vec_size, tmp);
+        }
+        tcg_gen_addi_i32(tcg_ctx, addr, addr, 1 << size);
+        vd += stride;
+    }
+    tcg_temp_free_i32(tcg_ctx, tmp);
+    tcg_temp_free_i32(tcg_ctx, addr);
+
+    gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << size) * nregs);
+
+    return true;
+}
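The subtle part of the new handler is the T-bit handling described in its comment: for VLD1 the bit selects how many D registers receive the replicated element, while for VLD2/3/4 it selects the register stride. A standalone sketch (not part of the commit) that evaluates the same stride/vec_size expressions for a few representative cases:

#include <stdio.h>

/* Evaluates the stride/vec_size computation from trans_VLD_all_lanes
 * for representative (nregs, t) combinations. */
int main(void)
{
    static const int cases[][2] = {
        { 1, 0 }, { 1, 1 },  /* VLD1: t selects 1 or 2 Dregs */
        { 2, 0 }, { 2, 1 },  /* VLD2: t selects register stride */
        { 4, 1 },            /* VLD4 with stride 2 */
    };
    for (unsigned i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
        int nregs = cases[i][0], t = cases[i][1];
        int stride = t ? 2 : 1;                      /* as in the handler */
        int vec_size = nregs == 1 ? stride * 8 : 8;  /* bytes per dup */
        printf("VLD%d t=%d -> stride=%d, dup width=%d bytes\n",
               nregs, t, stride, vec_size);
    }
    return 0;
}

So VLD1 with t=1 performs a single 16-byte dup spanning d<vd> and d<vd+1> (split into two 8-byte writes when vd is odd, as the unaligned path in the handler shows), whereas VLD2 with t=1 performs two 8-byte dups into d<vd> and d<vd+2>.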
@@ -3342,7 +3342,6 @@ static int disas_neon_ls_insn(DisasContext *s, uint32_t insn)
     int size;
     int reg;
     int load;
-    int vec_size;
    TCGv_i32 addr;
     TCGv_i32 tmp;
 
@@ -3372,58 +3371,8 @@ static int disas_neon_ls_insn(DisasContext *s, uint32_t insn)
     } else {
         size = (insn >> 10) & 3;
         if (size == 3) {
-            /* Load single element to all lanes.  */
-            int a = (insn >> 4) & 1;
-            if (!load) {
-                return 1;
-            }
-            size = (insn >> 6) & 3;
-            nregs = ((insn >> 8) & 3) + 1;
-
-            if (size == 3) {
-                if (nregs != 4 || a == 0) {
-                    return 1;
-                }
-                /* For VLD4 size==3 a == 1 means 32 bits at 16 byte alignment */
-                size = 2;
-            }
-            if (nregs == 1 && a == 1 && size == 0) {
-                return 1;
-            }
-            if (nregs == 3 && a == 1) {
-                return 1;
-            }
-            addr = tcg_temp_new_i32(tcg_ctx);
-            load_reg_var(s, addr, rn);
-
-            /* VLD1 to all lanes: bit 5 indicates how many Dregs to write.
-             * VLD2/3/4 to all lanes: bit 5 indicates register stride.
-             */
-            stride = (insn & (1 << 5)) ? 2 : 1;
-            vec_size = nregs == 1 ? stride * 8 : 8;
-
-            tmp = tcg_temp_new_i32(tcg_ctx);
-            for (reg = 0; reg < nregs; reg++) {
-                gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s),
-                                s->be_data | size);
-                if ((rd & 1) && vec_size == 16) {
-                    /* We cannot write 16 bytes at once because the
-                     * destination is unaligned.
-                     */
-                    tcg_gen_gvec_dup_i32(tcg_ctx, size, neon_reg_offset(rd, 0),
-                                         8, 8, tmp);
-                    tcg_gen_gvec_mov(tcg_ctx, 0, neon_reg_offset(rd + 1, 0),
-                                     neon_reg_offset(rd, 0), 8, 8);
-                } else {
-                    tcg_gen_gvec_dup_i32(tcg_ctx, size, neon_reg_offset(rd, 0),
-                                         vec_size, vec_size, tmp);
-                }
-                tcg_gen_addi_i32(tcg_ctx, addr, addr, 1 << size);
-                rd += stride;
-            }
-            tcg_temp_free_i32(tcg_ctx, tmp);
-            tcg_temp_free_i32(tcg_ctx, addr);
-            stride = (1 << size) * nregs;
+            /* Load single element to all lanes -- handled by decodetree */
+            return 1;
         } else {
             /* Single element.  */
             int idx = (insn >> 4) & 0xf;