target/arm: Convert Neon 2-reg-scalar integer multiplies to decodetree

Convert the VMLA, VMLS and VMUL insns in the Neon "2 registers and a
scalar" group to decodetree. These are 32x32->32 operations where
one of the inputs is the scalar, followed by a possible accumulate
operation of the 32-bit result.
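
As a reading aid (not part of the commit itself): a minimal C sketch of the
per-element semantics of the three insns. Lane width is 16 or 32 bits
depending on the size field; the helper names here are illustrative only.

    #include <stdint.h>

    /* VMUL by scalar: Vd[i] = Vn[i] * scalar */
    static uint32_t vmul_lane(uint32_t n, uint32_t scalar)
    {
        return n * scalar;
    }

    /* VMLA by scalar: Vd[i] = Vd[i] + Vn[i] * scalar */
    static uint32_t vmla_lane(uint32_t d, uint32_t n, uint32_t scalar)
    {
        return d + n * scalar;
    }

    /* VMLS by scalar: Vd[i] = Vd[i] - Vn[i] * scalar */
    static uint32_t vmls_lane(uint32_t d, uint32_t n, uint32_t scalar)
    {
        return d - n * scalar;
    }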

The refactoring removes some of the oddities of the old decoder:
 * operands to the operation and accumulation were often
   reversed (taking advantage of the fact that most of these ops
   are commutative); the new code follows the pseudocode order
   (see the operand-order sketch after this list)
 * the Q bit in the insn was in a local variable 'u'; in the
   new code it is decoded into a->q
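
To illustrate the operand-order point (a sketch using names from the diff
below, not additional commit content): the old decoder used a reversed
subtract, while the new do_2scalar() passes the accumulator and product in
the order the Arm pseudocode uses:

    /* Old decoder: gen_neon_rsb computed t1 - t0, i.e. operands reversed */
    gen_neon_rsb(s, size, tmp, tmp2);   /* tmp = tmp2 - tmp */

    /* New code: accfn(result, Rd, product) follows the pseudocode order */
    accfn(tcg_ctx, tmp, rd, tmp);       /* for VMLS: tmp = rd - tmp */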

Backports commit 96fc80f5f186decd1a649f6c04252faceb057ad2 from qemu
Authored by Peter Maydell on 2020-06-17 00:02:01 -04:00; committed by Lioncash
parent 1817f28afd
commit bf1b0374b9
3 changed files with 158 additions and 77 deletions

target/arm/neon-dp.decode:

@@ -467,5 +467,20 @@ Vimm_1r          1111 001 . 1 . 000 ... .... cmode:4 0 . op:1 1 .... @1reg_imm
   VQDMULL_3d   1111 001 0 1 . .. .... .... 1101 . 0 . 0 .... @3diff
 
   VMULL_P_3d   1111 001 0 1 . .. .... .... 1110 . 0 . 0 .... @3diff
+
+  ##################################################################
+  # 2-regs-plus-scalar grouping:
+  # 1111 001 Q 1 D sz!=11 Vn:4 Vd:4 opc:4 N 1 M 0 Vm:4
+  ##################################################################
+  &2scalar       vm vn vd size q
+
+  @2scalar       .... ... q:1 . . size:2 .... .... .... . . . . ....  \
+                 &2scalar vm=%vm_dp vn=%vn_dp vd=%vd_dp
+
+  VMLA_2sc       1111 001 . 1 . .. .... .... 0000 . 1 . 0 .... @2scalar
+
+  VMLS_2sc       1111 001 . 1 . .. .... .... 0100 . 1 . 0 .... @2scalar
+
+  VMUL_2sc       1111 001 . 1 . .. .... .... 1000 . 1 . 0 .... @2scalar
 ]
 }
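
For context (an approximation, not part of the commit): from the &2scalar
argument-set line, the decodetree script generates an argument struct and
dispatches each matched pattern to a hand-written trans_ function, roughly:

    /* Sketch of the generated code; exact names and field order may differ */
    typedef struct {
        int vm;
        int vn;
        int vd;
        int size;
        int q;
    } arg_2scalar;

    static bool trans_VMLA_2sc(DisasContext *s, arg_2scalar *a);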

target/arm/translate-neon.inc.c:

@@ -1667,7 +1667,7 @@ static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a,
 
 static bool trans_VSHLL_S_2sh(DisasContext *s, arg_2reg_shift *a)
 {
-    static const NeonGenWidenFn *widenfn[] = {
+    static NeonGenWidenFn * const widenfn[] = {
         gen_helper_neon_widen_s8,
         gen_helper_neon_widen_s16,
         tcg_gen_ext_i32_i64,
@@ -1677,7 +1677,7 @@ static bool trans_VSHLL_S_2sh(DisasContext *s, arg_2reg_shift *a)
 
 static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a)
 {
-    static const NeonGenWidenFn *widenfn[] = {
+    static NeonGenWidenFn * const widenfn[] = {
         gen_helper_neon_widen_u8,
         gen_helper_neon_widen_u16,
         tcg_gen_extu_i32_i64,
@@ -2376,3 +2376,138 @@ static bool trans_VMULL_P_3d(DisasContext *s, arg_3diff *a)
                        16, 16, 0, fn_gvec);
     return true;
 }
+
+static void gen_neon_dup_low16(TCGContext *s, TCGv_i32 var)
+{
+    TCGv_i32 tmp = tcg_temp_new_i32(s);
+    tcg_gen_ext16u_i32(s, var, var);
+    tcg_gen_shli_i32(s, tmp, var, 16);
+    tcg_gen_or_i32(s, var, var, tmp);
+    tcg_temp_free_i32(s, tmp);
+}
+
+static void gen_neon_dup_high16(TCGContext *s, TCGv_i32 var)
+{
+    TCGv_i32 tmp = tcg_temp_new_i32(s);
+    tcg_gen_andi_i32(s, var, var, 0xffff0000);
+    tcg_gen_shri_i32(s, tmp, var, 16);
+    tcg_gen_or_i32(s, var, var, tmp);
+    tcg_temp_free_i32(s, tmp);
+}
+
+static inline TCGv_i32 neon_get_scalar(DisasContext *s, int size, int reg)
+{
+    TCGContext *tcg_ctx = s->uc->tcg_ctx;
+    TCGv_i32 tmp;
+    if (size == 1) {
+        tmp = neon_load_reg(s, reg & 7, reg >> 4);
+        if (reg & 8) {
+            gen_neon_dup_high16(tcg_ctx, tmp);
+        } else {
+            gen_neon_dup_low16(tcg_ctx, tmp);
+        }
+    } else {
+        tmp = neon_load_reg(s, reg & 15, reg >> 4);
+    }
+    return tmp;
+}
+
+static bool do_2scalar(DisasContext *s, arg_2scalar *a,
+                       NeonGenTwoOpFn *opfn, NeonGenTwoOpFn *accfn)
+{
+    /*
+     * Two registers and a scalar: perform an operation between
+     * the input elements and the scalar, and then possibly
+     * perform an accumulation operation of that result into the
+     * destination.
+     */
+    TCGContext *tcg_ctx = s->uc->tcg_ctx;
+    TCGv_i32 scalar;
+    int pass;
+
+    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+        return false;
+    }
+
+    /* UNDEF accesses to D16-D31 if they don't exist. */
+    if (!dc_isar_feature(aa32_simd_r32, s) &&
+        ((a->vd | a->vn | a->vm) & 0x10)) {
+        return false;
+    }
+
+    if (!opfn) {
+        /* Bad size (including size == 3, which is a different insn group) */
+        return false;
+    }
+
+    if (a->q && ((a->vd | a->vn) & 1)) {
+        return false;
+    }
+
+    if (!vfp_access_check(s)) {
+        return true;
+    }
+
+    scalar = neon_get_scalar(s, a->size, a->vm);
+
+    for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
+        TCGv_i32 tmp = neon_load_reg(s, a->vn, pass);
+        opfn(tcg_ctx, tmp, tmp, scalar);
+        if (accfn) {
+            TCGv_i32 rd = neon_load_reg(s, a->vd, pass);
+            accfn(tcg_ctx, tmp, rd, tmp);
+            tcg_temp_free_i32(tcg_ctx, rd);
+        }
+        neon_store_reg(s, a->vd, pass, tmp);
+    }
+    tcg_temp_free_i32(tcg_ctx, scalar);
+    return true;
+}
+
+static bool trans_VMUL_2sc(DisasContext *s, arg_2scalar *a)
+{
+    static NeonGenTwoOpFn * const opfn[] = {
+        NULL,
+        gen_helper_neon_mul_u16,
+        tcg_gen_mul_i32,
+        NULL,
+    };
+
+    return do_2scalar(s, a, opfn[a->size], NULL);
+}
+
+static bool trans_VMLA_2sc(DisasContext *s, arg_2scalar *a)
+{
+    static NeonGenTwoOpFn * const opfn[] = {
+        NULL,
+        gen_helper_neon_mul_u16,
+        tcg_gen_mul_i32,
+        NULL,
+    };
+    static NeonGenTwoOpFn * const accfn[] = {
+        NULL,
+        gen_helper_neon_add_u16,
+        tcg_gen_add_i32,
+        NULL,
+    };
+
+    return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
+}
+
+static bool trans_VMLS_2sc(DisasContext *s, arg_2scalar *a)
+{
+    static NeonGenTwoOpFn * const opfn[] = {
+        NULL,
+        gen_helper_neon_mul_u16,
+        tcg_gen_mul_i32,
+        NULL,
+    };
+    static NeonGenTwoOpFn * const accfn[] = {
+        NULL,
+        gen_helper_neon_sub_u16,
+        tcg_gen_sub_i32,
+        NULL,
+    };
+
+    return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
+}
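
A note on neon_get_scalar() above (restating what the hunk implements): for a
32-bit scalar, the four low bits of the register field select the D register
and the remaining bit picks which 32-bit element; for a 16-bit scalar, the
low three bits select the register and the remaining bits pick one of four
16-bit elements, which is then duplicated into both halves of a 32-bit value
so a single 32-bit multiply helper can be reused. The effect of the dup
helpers, as a plain C sketch:

    /* Equivalent of gen_neon_dup_low16 / gen_neon_dup_high16 (sketch) */
    static uint32_t dup_low16(uint32_t x)
    {
        x &= 0xffff;            /* keep the low 16-bit element  */
        return x | (x << 16);   /* replicate into both halves   */
    }

    static uint32_t dup_high16(uint32_t x)
    {
        x &= 0xffff0000;        /* keep the high 16-bit element */
        return x | (x >> 16);   /* replicate into both halves   */
    }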

target/arm/translate.c:

@@ -2703,26 +2703,6 @@ static int disas_dsp_insn(DisasContext *s, uint32_t insn)
 #define VFP_DREG_N(reg, insn) VFP_DREG(reg, insn, 16, 7)
 #define VFP_DREG_M(reg, insn) VFP_DREG(reg, insn, 0, 5)
 
-static void gen_neon_dup_low16(DisasContext *s, TCGv_i32 var)
-{
-    TCGContext *tcg_ctx = s->uc->tcg_ctx;
-    TCGv_i32 tmp = tcg_temp_new_i32(tcg_ctx);
-    tcg_gen_ext16u_i32(tcg_ctx, var, var);
-    tcg_gen_shli_i32(tcg_ctx, tmp, var, 16);
-    tcg_gen_or_i32(tcg_ctx, var, var, tmp);
-    tcg_temp_free_i32(tcg_ctx, tmp);
-}
-
-static void gen_neon_dup_high16(DisasContext *s, TCGv_i32 var)
-{
-    TCGContext *tcg_ctx = s->uc->tcg_ctx;
-    TCGv_i32 tmp = tcg_temp_new_i32(tcg_ctx);
-    tcg_gen_andi_i32(tcg_ctx, var, var, 0xffff0000);
-    tcg_gen_shri_i32(tcg_ctx, tmp, var, 16);
-    tcg_gen_or_i32(tcg_ctx, var, var, tmp);
-    tcg_temp_free_i32(tcg_ctx, tmp);
-}
-
 static inline bool use_goto_tb(DisasContext *s, target_ulong dest)
 {
 #ifndef CONFIG_USER_ONLY
@@ -3099,28 +3079,6 @@ static void gen_exception_return(DisasContext *s, TCGv_i32 pc)
 
 #define CPU_V001 s->V0, s->V0, s->V1
 
-static inline void gen_neon_add(DisasContext *s, int size, TCGv_i32 t0, TCGv_i32 t1)
-{
-    TCGContext *tcg_ctx = s->uc->tcg_ctx;
-    switch (size) {
-    case 0: gen_helper_neon_add_u8(tcg_ctx, t0, t0, t1); break;
-    case 1: gen_helper_neon_add_u16(tcg_ctx, t0, t0, t1); break;
-    case 2: tcg_gen_add_i32(tcg_ctx, t0, t0, t1); break;
-    default: abort();
-    }
-}
-
-static inline void gen_neon_rsb(DisasContext *s, int size, TCGv_i32 t0, TCGv_i32 t1)
-{
-    TCGContext *tcg_ctx = s->uc->tcg_ctx;
-    switch (size) {
-    case 0: gen_helper_neon_sub_u8(tcg_ctx, t0, t1, t0); break;
-    case 1: gen_helper_neon_sub_u16(tcg_ctx, t0, t1, t0); break;
-    case 2: tcg_gen_sub_i32(tcg_ctx, t0, t1, t0); break;
-    default: return;
-    }
-}
-
 static TCGv_i32 neon_load_scratch(DisasContext *s, int scratch)
 {
     TCGContext *tcg_ctx = s->uc->tcg_ctx;
@@ -3136,22 +3094,6 @@ static void neon_store_scratch(DisasContext *s, int scratch, TCGv_i32 var)
     tcg_temp_free_i32(tcg_ctx, var);
 }
 
-static inline TCGv_i32 neon_get_scalar(DisasContext *s, int size, int reg)
-{
-    TCGv_i32 tmp;
-    if (size == 1) {
-        tmp = neon_load_reg(s, reg & 7, reg >> 4);
-        if (reg & 8) {
-            gen_neon_dup_high16(s, tmp);
-        } else {
-            gen_neon_dup_low16(s, tmp);
-        }
-    } else {
-        tmp = neon_load_reg(s, reg & 15, reg >> 4);
-    }
-    return tmp;
-}
-
 static int gen_neon_unzip(DisasContext *s, int rd, int rm, int size, int q)
 {
     TCGContext *tcg_ctx = s->uc->tcg_ctx;
@@ -5362,6 +5304,11 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
                 return 1;
             }
             switch (op) {
+            case 0: /* Integer VMLA scalar */
+            case 4: /* Integer VMLS scalar */
+            case 8: /* Integer VMUL scalar */
+                return 1; /* handled by decodetree */
+
             case 1: /* Float VMLA scalar */
             case 5: /* Floating point VMLS scalar */
             case 9: /* Floating point VMUL scalar */
@@ -5369,9 +5316,6 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
                     return 1;
                 }
                 /* fall through */
-            case 0: /* Integer VMLA scalar */
-            case 4: /* Integer VMLS scalar */
-            case 8: /* Integer VMUL scalar */
             case 12: /* VQDMULH scalar */
             case 13: /* VQRDMULH scalar */
                 if (u && ((rd | rn) & 1)) {
@@ -5394,26 +5338,16 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
                     } else {
                         gen_helper_neon_qrdmulh_s32(tcg_ctx, tmp, tcg_ctx->cpu_env, tmp, tmp2);
                     }
-                } else if (op & 1) {
+                } else {
                     TCGv_ptr fpstatus = get_fpstatus_ptr(tcg_ctx, 1);
                     gen_helper_vfp_muls(tcg_ctx, tmp, tmp, tmp2, fpstatus);
                     tcg_temp_free_ptr(tcg_ctx, fpstatus);
-                } else {
-                    switch (size) {
-                    case 0: gen_helper_neon_mul_u8(tcg_ctx, tmp, tmp, tmp2); break;
-                    case 1: gen_helper_neon_mul_u16(tcg_ctx, tmp, tmp, tmp2); break;
-                    case 2: tcg_gen_mul_i32(tcg_ctx, tmp, tmp, tmp2); break;
-                    default: abort();
-                    }
                 }
                 tcg_temp_free_i32(tcg_ctx, tmp2);
                 if (op < 8) {
                     /* Accumulate. */
                     tmp2 = neon_load_reg(s, rd, pass);
                     switch (op) {
-                    case 0:
-                        gen_neon_add(s, size, tmp, tmp2);
-                        break;
                     case 1:
                     {
                         TCGv_ptr fpstatus = get_fpstatus_ptr(tcg_ctx, 1);
@@ -5421,9 +5355,6 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
                         tcg_temp_free_ptr(tcg_ctx, fpstatus);
                         break;
                     }
-                    case 4:
-                        gen_neon_rsb(s, size, tmp, tmp2);
-                        break;
                     case 5:
                     {
                         TCGv_ptr fpstatus = get_fpstatus_ptr(tcg_ctx, 1);