From d21fae82bac0dc11eafef9043a0c91826ef66990 Mon Sep 17 00:00:00 2001
From: Peter Maydell
Date: Thu, 25 Feb 2021 12:10:50 -0500
Subject: [PATCH] target/arm: Convert Neon 2-reg-misc pairwise ops to
 decodetree

Convert the pairwise ops VPADDL and VPADAL in the 2-reg-misc grouping
to decodetree.

At this point we can get rid of the weird CPU_V001 #define that was
used to avoid having to explicitly list all the arguments being
passed to some TCG gen/helper functions.

Backports commit 6106af3aa2304fccee91a3a90138352b0c2af998 from qemu
---
 qemu/target/arm/neon-dp.decode       |   6 ++
 qemu/target/arm/translate-neon.inc.c | 150 +++++++++++++++++++++++++++
 qemu/target/arm/translate.c          |  36 +------
 3 files changed, 158 insertions(+), 34 deletions(-)

diff --git a/qemu/target/arm/neon-dp.decode b/qemu/target/arm/neon-dp.decode
index e12fdf30..dd521baa 100644
--- a/qemu/target/arm/neon-dp.decode
+++ b/qemu/target/arm/neon-dp.decode
@@ -441,6 +441,12 @@ Vimm_1r 1111 001 . 1 . 000 ... .... cmode:4 0 . op:1 1 .... @1reg_imm
                  &2misc vm=%vm_dp vd=%vd_dp
 
     VREV64       1111 001 11 . 11 .. 00 .... 0 0000 . . 0 .... @2misc
+
+    VPADDL_S     1111 001 11 . 11 .. 00 .... 0 0100 . . 0 .... @2misc
+    VPADDL_U     1111 001 11 . 11 .. 00 .... 0 0101 . . 0 .... @2misc
+
+    VPADAL_S     1111 001 11 . 11 .. 00 .... 0 1100 . . 0 .... @2misc
+    VPADAL_U     1111 001 11 . 11 .. 00 .... 0 1101 . . 0 .... @2misc
   ]
 
   # Subgroup for size != 0b11
diff --git a/qemu/target/arm/translate-neon.inc.c b/qemu/target/arm/translate-neon.inc.c
index 70d29c98..5274c242 100644
--- a/qemu/target/arm/translate-neon.inc.c
+++ b/qemu/target/arm/translate-neon.inc.c
@@ -3058,3 +3058,153 @@ static bool trans_VREV64(DisasContext *s, arg_VREV64 *a)
     }
     return true;
 }
+
+static bool do_2misc_pairwise(DisasContext *s, arg_2misc *a,
+                              NeonGenWidenFn *widenfn,
+                              NeonGenTwo64OpFn *opfn,
+                              NeonGenTwo64OpFn *accfn)
+{
+    /*
+     * Pairwise long operations: widen both halves of the pair,
+     * combine the pairs with the opfn, and then possibly accumulate
+     * into the destination with the accfn.
+     */
+    int pass;
+    TCGContext *tcg_ctx = s->uc->tcg_ctx;
+
+    if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
+        return false;
+    }
+
+    /* UNDEF accesses to D16-D31 if they don't exist. */
+    if (!dc_isar_feature(aa32_simd_r32, s) &&
+        ((a->vd | a->vm) & 0x10)) {
+        return false;
+    }
+
+    if ((a->vd | a->vm) & a->q) {
+        return false;
+    }
+
+    if (!widenfn) {
+        return false;
+    }
+
+    if (!vfp_access_check(s)) {
+        return true;
+    }
+
+    for (pass = 0; pass < a->q + 1; pass++) {
+        TCGv_i32 tmp;
+        TCGv_i64 rm0_64, rm1_64, rd_64;
+
+        rm0_64 = tcg_temp_new_i64(tcg_ctx);
+        rm1_64 = tcg_temp_new_i64(tcg_ctx);
+        rd_64 = tcg_temp_new_i64(tcg_ctx);
+        tmp = neon_load_reg(s, a->vm, pass * 2);
+        widenfn(tcg_ctx, rm0_64, tmp);
+        tcg_temp_free_i32(tcg_ctx, tmp);
+        tmp = neon_load_reg(s, a->vm, pass * 2 + 1);
+        widenfn(tcg_ctx, rm1_64, tmp);
+        tcg_temp_free_i32(tcg_ctx, tmp);
+        opfn(tcg_ctx, rd_64, rm0_64, rm1_64);
+        tcg_temp_free_i64(tcg_ctx, rm0_64);
+        tcg_temp_free_i64(tcg_ctx, rm1_64);
+
+        if (accfn) {
+            TCGv_i64 tmp64 = tcg_temp_new_i64(tcg_ctx);
+            neon_load_reg64(s, tmp64, a->vd + pass);
+            accfn(tcg_ctx, rd_64, tmp64, rd_64);
+            tcg_temp_free_i64(tcg_ctx, tmp64);
+        }
+        neon_store_reg64(s, rd_64, a->vd + pass);
+        tcg_temp_free_i64(tcg_ctx, rd_64);
+    }
+    return true;
+}
+
+static bool trans_VPADDL_S(DisasContext *s, arg_2misc *a)
+{
+    static NeonGenWidenFn * const widenfn[] = {
+        gen_helper_neon_widen_s8,
+        gen_helper_neon_widen_s16,
+        tcg_gen_ext_i32_i64,
+        NULL,
+    };
+    static NeonGenTwo64OpFn * const opfn[] = {
+        gen_helper_neon_paddl_u16,
+        gen_helper_neon_paddl_u32,
+        tcg_gen_add_i64,
+        NULL,
+    };
+
+    return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
+}
+
+static bool trans_VPADDL_U(DisasContext *s, arg_2misc *a)
+{
+    static NeonGenWidenFn * const widenfn[] = {
+        gen_helper_neon_widen_u8,
+        gen_helper_neon_widen_u16,
+        tcg_gen_extu_i32_i64,
+        NULL,
+    };
+    static NeonGenTwo64OpFn * const opfn[] = {
+        gen_helper_neon_paddl_u16,
+        gen_helper_neon_paddl_u32,
+        tcg_gen_add_i64,
+        NULL,
+    };
+
+    return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
+}
+
+static bool trans_VPADAL_S(DisasContext *s, arg_2misc *a)
+{
+    static NeonGenWidenFn * const widenfn[] = {
+        gen_helper_neon_widen_s8,
+        gen_helper_neon_widen_s16,
+        tcg_gen_ext_i32_i64,
+        NULL,
+    };
+    static NeonGenTwo64OpFn * const opfn[] = {
+        gen_helper_neon_paddl_u16,
+        gen_helper_neon_paddl_u32,
+        tcg_gen_add_i64,
+        NULL,
+    };
+    static NeonGenTwo64OpFn * const accfn[] = {
+        gen_helper_neon_addl_u16,
+        gen_helper_neon_addl_u32,
+        tcg_gen_add_i64,
+        NULL,
+    };
+
+    return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
+                             accfn[a->size]);
+}
+
+static bool trans_VPADAL_U(DisasContext *s, arg_2misc *a)
+{
+    static NeonGenWidenFn * const widenfn[] = {
+        gen_helper_neon_widen_u8,
+        gen_helper_neon_widen_u16,
+        tcg_gen_extu_i32_i64,
+        NULL,
+    };
+    static NeonGenTwo64OpFn * const opfn[] = {
+        gen_helper_neon_paddl_u16,
+        gen_helper_neon_paddl_u32,
+        tcg_gen_add_i64,
+        NULL,
+    };
+    static NeonGenTwo64OpFn * const accfn[] = {
+        gen_helper_neon_addl_u16,
+        gen_helper_neon_addl_u32,
+        tcg_gen_add_i64,
+        NULL,
+    };
+
+    return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
+                             accfn[a->size]);
+}
diff --git a/qemu/target/arm/translate.c b/qemu/target/arm/translate.c
index 345f13d3..3eb26816 100644
--- a/qemu/target/arm/translate.c
+++ b/qemu/target/arm/translate.c
@@ -3038,8 +3038,6 @@ static void gen_exception_return(DisasContext *s, TCGv_i32 pc)
     gen_rfe(s, pc, load_cpu_field(s, spsr));
 }
 
-#define CPU_V001 s->V0, s->V0, s->V1
-
 static int gen_neon_unzip(DisasContext *s, int rd, int rm, int size, int q)
 {
     TCGContext *tcg_ctx = s->uc->tcg_ctx;
@@ -3230,17 +3228,6 @@ static inline void gen_neon_widen(DisasContext *s, TCGv_i64 dest, TCGv_i32 src,
     tcg_temp_free_i32(tcg_ctx, src);
 }
 
-static inline void gen_neon_addl(DisasContext *s, int size)
-{
-    TCGContext *tcg_ctx = s->uc->tcg_ctx;
-    switch (size) {
-    case 0: gen_helper_neon_addl_u16(tcg_ctx, CPU_V001); break;
-    case 1: gen_helper_neon_addl_u32(tcg_ctx, CPU_V001); break;
-    case 2: tcg_gen_add_i64(tcg_ctx, CPU_V001); break;
-    default: abort();
-    }
-}
-
 static void gen_neon_narrow_op(DisasContext *s, int op, int u, int size,
                                TCGv_i32 dest, TCGv_i64 src)
 {
@@ -5205,29 +5192,10 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
                 }
                 switch (op) {
                 case NEON_2RM_VREV64:
-                    /* handled by decodetree */
-                    return 1;
                 case NEON_2RM_VPADDL: case NEON_2RM_VPADDL_U:
                 case NEON_2RM_VPADAL: case NEON_2RM_VPADAL_U:
-                    for (pass = 0; pass < q + 1; pass++) {
-                        tmp = neon_load_reg(s, rm, pass * 2);
-                        gen_neon_widen(s, s->V0, tmp, size, op & 1);
-                        tmp = neon_load_reg(s, rm, pass * 2 + 1);
-                        gen_neon_widen(s, s->V1, tmp, size, op & 1);
-                        switch (size) {
-                        case 0: gen_helper_neon_paddl_u16(tcg_ctx, CPU_V001); break;
-                        case 1: gen_helper_neon_paddl_u32(tcg_ctx, CPU_V001); break;
-                        case 2: tcg_gen_add_i64(tcg_ctx, CPU_V001); break;
-                        default: abort();
-                        }
-                        if (op >= NEON_2RM_VPADAL) {
-                            /* Accumulate. */
-                            neon_load_reg64(s, s->V1, rd + pass);
-                            gen_neon_addl(s, size);
-                        }
-                        neon_store_reg64(s, s->V0, rd + pass);
-                    }
-                    break;
+                    /* handled by decodetree */
+                    return 1;
                 case NEON_2RM_VTRN:
                     if (size == 2) {
                         int n;