target/arm: Convert Neon VSHLL, VMOVL to decodetree

Convert the VSHLL and VMOVL insns from the 2-reg-shift group
to decodetree. Since the loop always has two passes, we unroll
it to avoid the awkward reassignment of one TCGv to another.

Backports commit 968bf842742a5ffbb0041cb31089e61a9f7a833d from qemu
Peter Maydell authored 2020-06-15 12:35:30 -04:00; committed by Lioncash
parent 6fc8fdaa2b
commit a5f903b2a5
3 changed files with 100 additions and 44 deletions

qemu/target/arm/neon-dp.decode

@@ -243,6 +243,14 @@ VMINNM_fp_3s 1111 001 1 0 . 1 . .... .... 1111 ... 1 .... @3same_fp
             &2reg_shift vm=%vm_dp vd=%vd_dp size=1 q=0 \
             shift=%neon_rshift_i3

# Long left shifts: again Q is part of opcode decode
@2reg_shll_s .... ... . . . 1 shift:5 .... .... 0 . . . .... \
             &2reg_shift vm=%vm_dp vd=%vd_dp size=2 q=0
@2reg_shll_h .... ... . . . 01 shift:4 .... .... 0 . . . .... \
             &2reg_shift vm=%vm_dp vd=%vd_dp size=1 q=0
@2reg_shll_b .... ... . . . 001 shift:3 .... .... 0 . . . .... \
             &2reg_shift vm=%vm_dp vd=%vd_dp size=0 q=0

VSHR_S_2sh 1111 001 0 1 . ...... .... 0000 . . . 1 .... @2reg_shr_d
VSHR_S_2sh 1111 001 0 1 . ...... .... 0000 . . . 1 .... @2reg_shr_s
VSHR_S_2sh 1111 001 0 1 . ...... .... 0000 . . . 1 .... @2reg_shr_h
@@ -348,3 +356,11 @@ VQSHRN_U16_2sh 1111 001 1 1 . ...... .... 1001 . 0 . 1 .... @2reg_shrn_h
VQRSHRN_U64_2sh 1111 001 1 1 . ...... .... 1001 . 1 . 1 .... @2reg_shrn_d
VQRSHRN_U32_2sh 1111 001 1 1 . ...... .... 1001 . 1 . 1 .... @2reg_shrn_s
VQRSHRN_U16_2sh 1111 001 1 1 . ...... .... 1001 . 1 . 1 .... @2reg_shrn_h

VSHLL_S_2sh 1111 001 0 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_s
VSHLL_S_2sh 1111 001 0 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_h
VSHLL_S_2sh 1111 001 0 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_b

VSHLL_U_2sh 1111 001 1 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_s
VSHLL_U_2sh 1111 001 1 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_h
VSHLL_U_2sh 1111 001 1 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_b
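
The @2reg_shll formats above make explicit what the old hand-written decoder
recovered arithmetically: in the long-shift encodings the position of the
leading 1 in the six-bit immediate selects the element size, and the bits
below it are the shift count (VMOVL is simply VSHLL with shift == 0). A
standalone C sketch of that relationship; the helper name is illustrative,
not from the patch:

    #include <stdint.h>
    #include <stdio.h>

    /* Mirror of the @2reg_shll_{s,h,b} formats: find the element size from
     * the leading 1 of imm6, then take the remaining bits as the shift. */
    static void decode_shll_imm6(uint32_t imm6, int *size, int *shift)
    {
        if (imm6 & 0x20) {          /* 1 shift:5  -> 32-bit elements */
            *size = 2;
            *shift = imm6 & 0x1f;
        } else if (imm6 & 0x10) {   /* 01 shift:4 -> 16-bit elements */
            *size = 1;
            *shift = imm6 & 0x0f;
        } else {                    /* 001 shift:3 -> 8-bit elements */
            *size = 0;
            *shift = imm6 & 0x07;
        }
    }

    int main(void)
    {
        int size, shift;
        decode_shll_imm6(0x0b, &size, &shift);     /* 0b001011 */
        printf("size=%d shift=%d\n", size, shift); /* size=0 shift=3 */
        return 0;
    }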

qemu/target/arm/translate-neon.inc.c

@@ -1599,3 +1599,85 @@ DO_2SN_32(VQSHRN_U16, gen_helper_neon_shl_u16, gen_helper_neon_narrow_sat_u8)
DO_2SN_64(VQRSHRN_U64, gen_helper_neon_rshl_u64, gen_helper_neon_narrow_sat_u32)
DO_2SN_32(VQRSHRN_U32, gen_helper_neon_rshl_u32, gen_helper_neon_narrow_sat_u16)
DO_2SN_32(VQRSHRN_U16, gen_helper_neon_rshl_u16, gen_helper_neon_narrow_sat_u8)

static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a,
                         NeonGenWidenFn *widenfn, bool u)
{
    TCGContext *tcg_ctx = s->uc->tcg_ctx;
    TCGv_i64 tmp;
    TCGv_i32 rm0, rm1;
    uint64_t widen_mask = 0;

    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
        return false;
    }

    /* UNDEF accesses to D16-D31 if they don't exist. */
    if (!dc_isar_feature(aa32_simd_r32, s) &&
        ((a->vd | a->vm) & 0x10)) {
        return false;
    }

    if (a->vd & 1) {
        return false;
    }

    if (!vfp_access_check(s)) {
        return true;
    }

    /*
     * This is a widen-and-shift operation. The shift is always less
     * than the width of the source type, so after widening the input
     * vector we can simply shift the whole 64-bit widened register,
     * and then clear the potential overflow bits resulting from left
     * bits of the narrow input appearing as right bits of the left
     * neighbour narrow input. Calculate a mask of bits to clear.
     */
    if ((a->shift != 0) && (a->size < 2 || u)) {
        int esize = 8 << a->size;
        widen_mask = MAKE_64BIT_MASK(0, esize);
        widen_mask >>= esize - a->shift;
        widen_mask = dup_const(a->size + 1, widen_mask);
    }

    rm0 = neon_load_reg(s, a->vm, 0);
    rm1 = neon_load_reg(s, a->vm, 1);

    tmp = tcg_temp_new_i64(tcg_ctx);
    widenfn(tcg_ctx, tmp, rm0);
    tcg_temp_free_i32(tcg_ctx, rm0);
    if (a->shift != 0) {
        tcg_gen_shli_i64(tcg_ctx, tmp, tmp, a->shift);
        tcg_gen_andi_i64(tcg_ctx, tmp, tmp, ~widen_mask);
    }
    neon_store_reg64(s, tmp, a->vd);

    widenfn(tcg_ctx, tmp, rm1);
    tcg_temp_free_i32(tcg_ctx, rm1);
    if (a->shift != 0) {
        tcg_gen_shli_i64(tcg_ctx, tmp, tmp, a->shift);
        tcg_gen_andi_i64(tcg_ctx, tmp, tmp, ~widen_mask);
    }
    neon_store_reg64(s, tmp, a->vd + 1);
    tcg_temp_free_i64(tcg_ctx, tmp);
    return true;
}

static bool trans_VSHLL_S_2sh(DisasContext *s, arg_2reg_shift *a)
{
    NeonGenWidenFn *widenfn[] = {
        gen_helper_neon_widen_s8,
        gen_helper_neon_widen_s16,
        tcg_gen_ext_i32_i64,
    };
    return do_vshll_2sh(s, a, widenfn[a->size], false);
}

static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a)
{
    NeonGenWidenFn *widenfn[] = {
        gen_helper_neon_widen_u8,
        gen_helper_neon_widen_u16,
        tcg_gen_extu_i32_i64,
    };
    return do_vshll_2sh(s, a, widenfn[a->size], true);
}
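
The widen_mask arithmetic above is compact, so here is a worked example.
Take VSHLL.S8 with shift == 3 (size == 0): when the widened register is
shifted left as a whole, the sign-extension bits of each byte bleed into
the low bits of its left neighbour's 16-bit lane, and widen_mask selects
exactly those bits. A standalone sketch, using simplified stand-ins for
QEMU's MAKE_64BIT_MASK and dup_const:

    #include <stdint.h>
    #include <stdio.h>

    #define MAKE_64BIT_MASK(shift, length) \
        ((~0ULL >> (64 - (length))) << (shift))

    /* Simplified dup_const: replicate an (8 << vece)-bit constant
     * across 64 bits. */
    static uint64_t dup_const(unsigned vece, uint64_t c)
    {
        switch (vece) {
        case 0: return 0x0101010101010101ULL * (uint8_t)c;
        case 1: return 0x0001000100010001ULL * (uint16_t)c;
        case 2: return 0x0000000100000001ULL * (uint32_t)c;
        default: return c;
        }
    }

    int main(void)
    {
        int size = 0, shift = 3;                      /* VSHLL.S8 #3 */
        int esize = 8 << size;
        uint64_t widen_mask = MAKE_64BIT_MASK(0, esize); /* 0xff */
        widen_mask >>= esize - shift;                 /* 0x07: low 3 bits */
        widen_mask = dup_const(size + 1, widen_mask); /* per 16-bit lane */
        printf("mask  = 0x%016llx\n", (unsigned long long)widen_mask);
        /* mask  = 0x0007000700070007 */

        /* Elements {1, -1} widened to 16-bit lanes, then shifted:
         * lane 0 = 0xffff, lane 1 = 0x0001. */
        uint64_t widened = 0x0001ffffULL;
        uint64_t shifted = widened << shift;    /* 0x000ffff8: lane 1 is
                                                 * 0x000f, not 1 << 3 */
        uint64_t fixed = shifted & ~widen_mask; /* 0x0008fff8: both lanes
                                                 * now correct */
        printf("fixed = 0x%016llx\n", (unsigned long long)fixed);
        return 0;
    }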

qemu/target/arm/translate.c

@@ -5373,6 +5373,7 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
        case 7: /* VQSHL */
        case 8: /* VSHRN, VRSHRN, VQSHRUN, VQRSHRUN */
        case 9: /* VQSHRN, VQRSHRN */
        case 10: /* VSHLL, including VMOVL */
            return 1; /* handled by decodetree */
        default:
            break;
@@ -5390,50 +5391,7 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
                size--;
            }
            shift = (insn >> 16) & ((1 << (3 + size)) - 1);
            if (op == 10) {
                /* VSHLL, VMOVL */
                if (q || (rd & 1)) {
                    return 1;
                }
                tmp = neon_load_reg(s, rm, 0);
                tmp2 = neon_load_reg(s, rm, 1);
                for (pass = 0; pass < 2; pass++) {
                    if (pass == 1)
                        tmp = tmp2;

                    gen_neon_widen(s, s->V0, tmp, size, u);

                    if (shift != 0) {
                        /* The shift is less than the width of the source
                           type, so we can just shift the whole register. */
                        tcg_gen_shli_i64(tcg_ctx, s->V0, s->V0, shift);
                        /* Widen the result of shift: we need to clear
                         * the potential overflow bits resulting from
                         * left bits of the narrow input appearing as
                         * right bits of the left neighbour narrow
                         * input. */
                        if (size < 2 || !u) {
                            uint64_t imm64;
                            if (size == 0) {
                                imm = (0xffu >> (8 - shift));
                                imm |= imm << 16;
                            } else if (size == 1) {
                                imm = 0xffff >> (16 - shift);
                            } else {
                                /* size == 2 */
                                imm = 0xffffffff >> (32 - shift);
                            }
                            if (size < 2) {
                                imm64 = imm | (((uint64_t)imm) << 32);
                            } else {
                                imm64 = imm;
                            }
                            tcg_gen_andi_i64(tcg_ctx, s->V0, s->V0, ~imm64);
                        }
                    }
                    neon_store_reg64(s, s->V0, rd + pass);
                }
            } else if (op >= 14) {
            if (op >= 14) {
                /* VCVT fixed-point. */
                TCGv_ptr fpst;
                TCGv_i32 shiftv;
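
For reference, the instruction both versions implement can be modelled in
plain integer code. The sketch below (the helper name is mine, not from
either file) follows the converted function's structure: each 32-bit half
of the input doubleword widens independently into one output doubleword,
which is why do_vshll_2sh can unroll the old two-pass loop into
straight-line code:

    #include <stdint.h>
    #include <stdio.h>

    /* Scalar model of VSHLL.S8 <Qd>, <Dm>, #shift (0 <= shift < 8);
     * shift == 0 is VMOVL. */
    static void vshll_s8_ref(uint64_t dm, int shift, uint64_t qd[2])
    {
        for (int half = 0; half < 2; half++) {  /* the two unrolled passes */
            uint32_t narrow = (uint32_t)(dm >> (32 * half));
            uint64_t wide = 0;
            for (int lane = 0; lane < 4; lane++) {
                int8_t e = (int8_t)(narrow >> (8 * lane));
                /* sign-extend to 16 bits, shift, truncate to the lane */
                uint16_t w = (uint16_t)((uint16_t)(int16_t)e << shift);
                wide |= (uint64_t)w << (16 * lane);
            }
            qd[half] = wide;
        }
    }

    int main(void)
    {
        uint64_t q[2];
        vshll_s8_ref(0x00000000000001ffULL, 3, q); /* elements 1 and -1 */
        printf("d0=0x%016llx d1=0x%016llx\n",
               (unsigned long long)q[0], (unsigned long long)q[1]);
        /* d0=0x000000000008fff8 d1=0x0000000000000000 */
        return 0;
    }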