target/arm: Convert Neon VSHLL, VMOVL to decodetree
Convert the VSHLL and VMOVL insns from the 2-reg-shift group to decodetree. Since the loop always has two passes, we unroll it to avoid the awkward reassignment of one TCGv to another.

Backports commit 968bf842742a5ffbb0041cb31089e61a9f7a833d from qemu
parent 6fc8fdaa2b
commit a5f903b2a5
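The core trick in the new do_vshll_2sh() below (and in the legacy block it replaces): widen all lanes of one input half into a single 64-bit value, left-shift that whole value once, then clear the bits each lane spilled into its left neighbour. Here is a minimal standalone sketch of one VSHLL.S8 pass done that way, checked against a per-lane reference; this is plain illustrative C, not QEMU code:

#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* One pass of VSHLL.S8 #shift over 4 input bytes: sign-extend each
 * byte into a 16-bit lane of one 64-bit value, shift the whole value,
 * then clear the low `shift` bits of every lane, which now hold spill
 * from the lane below (sign bits, for negative bytes). */
static uint64_t vshll_s8_pass(uint32_t in, int shift)
{
    uint64_t wide = 0;
    for (int i = 0; i < 4; i++) {
        int16_t lane = (int8_t)(in >> (8 * i));          /* widen */
        wide |= (uint64_t)(uint16_t)lane << (16 * i);
    }
    wide <<= shift;
    if (shift != 0) {
        uint64_t widen_mask = (0xffULL >> (8 - shift))   /* low bits */
                            * 0x0001000100010001ULL;     /* per lane */
        wide &= ~widen_mask;
    }
    return wide;
}

int main(void)
{
    uint32_t in = 0x80e17f02;                    /* arbitrary bytes */
    for (int shift = 0; shift < 8; shift++) {
        uint64_t ref = 0;
        for (int i = 0; i < 4; i++) {            /* per-lane reference */
            int16_t lane = (int16_t)(((int8_t)(in >> (8 * i))) << shift);
            ref |= (uint64_t)(uint16_t)lane << (16 * i);
        }
        assert(vshll_s8_pass(in, shift) == ref);
        printf("shift=%d -> %016" PRIx64 "\n", shift, vshll_s8_pass(in, shift));
    }
    return 0;
}

Without the mask, a negative byte such as 0x80 shifted by 1 would leak a sign bit into the bottom of the next 16-bit lane.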
qemu/target/arm/neon-dp.decode

@@ -243,6 +243,14 @@ VMINNM_fp_3s 1111 001 1 0 . 1 . .... .... 1111 ... 1 .... @3same_fp
              &2reg_shift vm=%vm_dp vd=%vd_dp size=1 q=0 \
              shift=%neon_rshift_i3
 
+# Long left shifts: again Q is part of opcode decode
+@2reg_shll_s .... ... . . . 1 shift:5  .... .... 0 . . . .... \
+             &2reg_shift vm=%vm_dp vd=%vd_dp size=2 q=0
+@2reg_shll_h .... ... . . . 01 shift:4  .... .... 0 . . . .... \
+             &2reg_shift vm=%vm_dp vd=%vd_dp size=1 q=0
+@2reg_shll_b .... ... . . . 001 shift:3 .... .... 0 . . . .... \
+             &2reg_shift vm=%vm_dp vd=%vd_dp size=0 q=0
+
 VSHR_S_2sh       1111 001 0 1 . ......    .... 0000 . . . 1 .... @2reg_shr_d
 VSHR_S_2sh       1111 001 0 1 . ......    .... 0000 . . . 1 .... @2reg_shr_s
 VSHR_S_2sh       1111 001 0 1 . ......    .... 0000 . . . 1 .... @2reg_shr_h
@@ -348,3 +356,11 @@ VQSHRN_U16_2sh 1111 001 1 1 . ...... .... 1001 . 0 . 1 .... @2reg_shrn_h
 VQRSHRN_U64_2sh  1111 001 1 1 . ...... .... 1001 . 1 . 1 .... @2reg_shrn_d
 VQRSHRN_U32_2sh  1111 001 1 1 . ...... .... 1001 . 1 . 1 .... @2reg_shrn_s
 VQRSHRN_U16_2sh  1111 001 1 1 . ...... .... 1001 . 1 . 1 .... @2reg_shrn_h
+
+VSHLL_S_2sh      1111 001 0 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_s
+VSHLL_S_2sh      1111 001 0 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_h
+VSHLL_S_2sh      1111 001 0 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_b
+
+VSHLL_U_2sh      1111 001 1 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_s
+VSHLL_U_2sh      1111 001 1 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_h
+VSHLL_U_2sh      1111 001 1 1 . ...... .... 1010 . 0 . 1 .... @2reg_shll_b
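The three @2reg_shll_* formats above encode both element size and shift amount in the same six bits (insn[21:16]): the position of the leading 1 selects the size, and the bits below it are the left-shift count, so shift 0 is a plain widen, which is how VMOVL falls out of VSHLL. Written out by hand as a sketch (hypothetical helper, illustration only):

/* What @2reg_shll_{s,h,b} express: the leading-one position in imm6
 * (insn[21:16]) selects the element size; the remaining bits are the
 * shift amount. */
static bool decode_shll_imm6(unsigned imm6, int *size, int *shift)
{
    if (imm6 & 0x20) {           /* 1 shift:5  -> 32-bit elements */
        *size = 2;
        *shift = imm6 & 0x1f;
    } else if (imm6 & 0x10) {    /* 01 shift:4 -> 16-bit elements */
        *size = 1;
        *shift = imm6 & 0x0f;
    } else if (imm6 & 0x08) {    /* 001 shift:3 -> 8-bit elements */
        *size = 0;
        *shift = imm6 & 0x07;
    } else {
        return false;            /* 000xxx: not a 2-reg-shift insn */
    }
    return true;
}

This matches the legacy decode in translate.c below, which derives size from the leading-one position and then extracts shift = (insn >> 16) & ((1 << (3 + size)) - 1).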
qemu/target/arm/translate-neon.inc.c

@@ -1599,3 +1599,85 @@ DO_2SN_32(VQSHRN_U16, gen_helper_neon_shl_u16, gen_helper_neon_narrow_sat_u8)
 DO_2SN_64(VQRSHRN_U64, gen_helper_neon_rshl_u64, gen_helper_neon_narrow_sat_u32)
 DO_2SN_32(VQRSHRN_U32, gen_helper_neon_rshl_u32, gen_helper_neon_narrow_sat_u16)
 DO_2SN_32(VQRSHRN_U16, gen_helper_neon_rshl_u16, gen_helper_neon_narrow_sat_u8)
+
+static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a,
+                         NeonGenWidenFn *widenfn, bool u)
+{
+    TCGContext *tcg_ctx = s->uc->tcg_ctx;
+    TCGv_i64 tmp;
+    TCGv_i32 rm0, rm1;
+    uint64_t widen_mask = 0;
+
+    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+        return false;
+    }
+
+    /* UNDEF accesses to D16-D31 if they don't exist. */
+    if (!dc_isar_feature(aa32_simd_r32, s) &&
+        ((a->vd | a->vm) & 0x10)) {
+        return false;
+    }
+
+    if (a->vd & 1) {
+        return false;
+    }
+
+    if (!vfp_access_check(s)) {
+        return true;
+    }
+
+    /*
+     * This is a widen-and-shift operation. The shift is always less
+     * than the width of the source type, so after widening the input
+     * vector we can simply shift the whole 64-bit widened register,
+     * and then clear the potential overflow bits resulting from left
+     * bits of the narrow input appearing as right bits of the left
+     * neighbour narrow input. Calculate a mask of bits to clear.
+     */
+    if ((a->shift != 0) && (a->size < 2 || u)) {
+        int esize = 8 << a->size;
+        widen_mask = MAKE_64BIT_MASK(0, esize);
+        widen_mask >>= esize - a->shift;
+        widen_mask = dup_const(a->size + 1, widen_mask);
+    }
+
+    rm0 = neon_load_reg(s, a->vm, 0);
+    rm1 = neon_load_reg(s, a->vm, 1);
+    tmp = tcg_temp_new_i64(tcg_ctx);
+
+    widenfn(tcg_ctx, tmp, rm0);
+    if (a->shift != 0) {
+        tcg_gen_shli_i64(tcg_ctx, tmp, tmp, a->shift);
+        tcg_gen_andi_i64(tcg_ctx, tmp, tmp, ~widen_mask);
+    }
+    neon_store_reg64(s, tmp, a->vd);
+
+    widenfn(tcg_ctx, tmp, rm1);
+    if (a->shift != 0) {
+        tcg_gen_shli_i64(tcg_ctx, tmp, tmp, a->shift);
+        tcg_gen_andi_i64(tcg_ctx, tmp, tmp, ~widen_mask);
+    }
+    neon_store_reg64(s, tmp, a->vd + 1);
+    tcg_temp_free_i64(tcg_ctx, tmp);
+    return true;
+}
+
+static bool trans_VSHLL_S_2sh(DisasContext *s, arg_2reg_shift *a)
+{
+    NeonGenWidenFn *widenfn[] = {
+        gen_helper_neon_widen_s8,
+        gen_helper_neon_widen_s16,
+        tcg_gen_ext_i32_i64,
+    };
+    return do_vshll_2sh(s, a, widenfn[a->size], false);
+}
+
+static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a)
+{
+    NeonGenWidenFn *widenfn[] = {
+        gen_helper_neon_widen_u8,
+        gen_helper_neon_widen_u16,
+        tcg_gen_extu_i32_i64,
+    };
+    return do_vshll_2sh(s, a, widenfn[a->size], true);
+}
qemu/target/arm/translate.c

@@ -5373,6 +5373,7 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
             case 7: /* VQSHL */
             case 8: /* VSHRN, VRSHRN, VQSHRUN, VQRSHRUN */
             case 9: /* VQSHRN, VQRSHRN */
+            case 10: /* VSHLL, including VMOVL */
                 return 1; /* handled by decodetree */
             default:
                 break;
@@ -5390,50 +5391,7 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
                    size--;
            }
            shift = (insn >> 16) & ((1 << (3 + size)) - 1);
-            if (op == 10) {
-                /* VSHLL, VMOVL */
-                if (q || (rd & 1)) {
-                    return 1;
-                }
-                tmp = neon_load_reg(s, rm, 0);
-                tmp2 = neon_load_reg(s, rm, 1);
-                for (pass = 0; pass < 2; pass++) {
-                    if (pass == 1)
-                        tmp = tmp2;
-
-                    gen_neon_widen(s, s->V0, tmp, size, u);
-
-                    if (shift != 0) {
-                        /* The shift is less than the width of the source
-                           type, so we can just shift the whole register.  */
-                        tcg_gen_shli_i64(tcg_ctx, s->V0, s->V0, shift);
-                        /* Widen the result of shift: we need to clear
-                         * the potential overflow bits resulting from
-                         * left bits of the narrow input appearing as
-                         * right bits of left the neighbour narrow
-                         * input.  */
-                        if (size < 2 || !u) {
-                            uint64_t imm64;
-                            if (size == 0) {
-                                imm = (0xffu >> (8 - shift));
-                                imm |= imm << 16;
-                            } else if (size == 1) {
-                                imm = 0xffff >> (16 - shift);
-                            } else {
-                                /* size == 2 */
-                                imm = 0xffffffff >> (32 - shift);
-                            }
-                            if (size < 2) {
-                                imm64 = imm | (((uint64_t)imm) << 32);
-                            } else {
-                                imm64 = imm;
-                            }
-                            tcg_gen_andi_i64(tcg_ctx, s->V0, s->V0, ~imm64);
-                        }
-                    }
-                    neon_store_reg64(s, s->V0, rd + pass);
-                }
-            } else if (op >= 14) {
+            if (op >= 14) {
                /* VCVT fixed-point. */
                TCGv_ptr fpst;
                TCGv_i32 shiftv;
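The widen_mask built from MAKE_64BIT_MASK() and dup_const() in do_vshll_2sh() is the same value the deleted block assembled by hand as imm64 for sizes 0 and 1. (The guard conditions differ at size 2, `u` in the new code versus `!u` in the old, but there the mask only covers bits below the shift amount, which the left shift has already zeroed, so either way it is a no-op.) A quick standalone check, using local stand-ins for the two QEMU macros rather than the real ones:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for QEMU's MAKE_64BIT_MASK(shift, length). */
static uint64_t make_mask(int shift, int length)
{
    return (~0ULL >> (64 - length)) << shift;
}

/* Stand-in for QEMU's dup_const(vece, x): replicate the low
 * (8 << vece) bits of x across all 64 bits. */
static uint64_t dup_elems(int vece, uint64_t x)
{
    int esize = 8 << vece;
    uint64_t r = 0;
    for (int i = 0; i < 64; i += esize) {
        r |= (x & make_mask(0, esize)) << i;
    }
    return r;
}

/* Mask as computed in the new do_vshll_2sh(). */
static uint64_t new_mask(int size, int shift)
{
    int esize = 8 << size;
    uint64_t m = make_mask(0, esize);
    m >>= esize - shift;
    return dup_elems(size + 1, m);
}

/* Mask as computed in the deleted op == 10 block (sizes 0 and 1). */
static uint64_t legacy_mask(int size, int shift)
{
    uint32_t imm;
    if (size == 0) {
        imm = 0xffu >> (8 - shift);
        imm |= imm << 16;
    } else {
        imm = 0xffff >> (16 - shift);
    }
    return imm | ((uint64_t)imm << 32);
}

int main(void)
{
    for (int size = 0; size < 2; size++) {
        for (int shift = 1; shift < (8 << size); shift++) {
            assert(new_mask(size, shift) == legacy_mask(size, shift));
        }
    }
    printf("masks agree\n");
    return 0;
}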