From 5d9c0e52bf1f0a57b00bb42954eb47fd666f470a Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Sat, 10 Nov 2018 10:58:31 -0500 Subject: [PATCH] target/arm: Use gvec for NEON_3R_VML Move mla_op and mls_op expanders from translate-a64.c. Backports commit 4a7832b095b9ce97a815749a13516f5cfb3c5dd4 from qemu --- qemu/aarch64.h | 2 + qemu/aarch64eb.h | 2 + qemu/arm.h | 2 + qemu/armeb.h | 2 + qemu/header_gen.py | 4 + qemu/target/arm/translate-a64.c | 106 ------------------------- qemu/target/arm/translate.c | 132 ++++++++++++++++++++++++++++---- qemu/target/arm/translate.h | 2 + 8 files changed, 131 insertions(+), 121 deletions(-) diff --git a/qemu/aarch64.h b/qemu/aarch64.h index 9e6962ad..12644408 100644 --- a/qemu/aarch64.h +++ b/qemu/aarch64.h @@ -4256,6 +4256,8 @@ #define helper_vfp_mulxd helper_vfp_mulxd_aarch64 #define helper_vfp_mulxs helper_vfp_mulxs_aarch64 #define logic_imm_decode_wmask logic_imm_decode_wmask_aarch64 +#define mla_op mla_op_aarch64 +#define mls_op mls_op_aarch64 #define new_tmp_a64 new_tmp_a64_aarch64 #define new_tmp_a64_zero new_tmp_a64_zero_aarch64 #define pred_esz_masks pred_esz_masks_aarch64 diff --git a/qemu/aarch64eb.h b/qemu/aarch64eb.h index 84e785f6..d4df6792 100644 --- a/qemu/aarch64eb.h +++ b/qemu/aarch64eb.h @@ -4256,6 +4256,8 @@ #define helper_vfp_mulxd helper_vfp_mulxd_aarch64eb #define helper_vfp_mulxs helper_vfp_mulxs_aarch64eb #define logic_imm_decode_wmask logic_imm_decode_wmask_aarch64eb +#define mla_op mla_op_aarch64eb +#define mls_op mls_op_aarch64eb #define new_tmp_a64 new_tmp_a64_aarch64eb #define new_tmp_a64_zero new_tmp_a64_zero_aarch64eb #define pred_esz_masks pred_esz_masks_aarch64eb diff --git a/qemu/arm.h b/qemu/arm.h index cd59f1bd..97173de2 100644 --- a/qemu/arm.h +++ b/qemu/arm.h @@ -3277,6 +3277,8 @@ #define arm_set_cpu_off arm_set_cpu_off_arm #define arm_set_cpu_on arm_set_cpu_on_arm #define fp_exception_el fp_exception_el_arm +#define mla_op mla_op_arm +#define mls_op mls_op_arm #define raise_exception raise_exception_arm #define sli_op sli_op_arm #define ssra_op ssra_op_arm diff --git a/qemu/armeb.h b/qemu/armeb.h index f5b653fc..1e7f448d 100644 --- a/qemu/armeb.h +++ b/qemu/armeb.h @@ -3277,6 +3277,8 @@ #define arm_set_cpu_off arm_set_cpu_off_armeb #define arm_set_cpu_on arm_set_cpu_on_armeb #define fp_exception_el fp_exception_el_armeb +#define mla_op mla_op_armeb +#define mls_op mls_op_armeb #define raise_exception raise_exception_armeb #define sli_op sli_op_armeb #define ssra_op ssra_op_armeb diff --git a/qemu/header_gen.py b/qemu/header_gen.py index d5048a4c..b160930d 100644 --- a/qemu/header_gen.py +++ b/qemu/header_gen.py @@ -3286,6 +3286,8 @@ arm_symbols = ( 'arm_set_cpu_off', 'arm_set_cpu_on', 'fp_exception_el', + 'mla_op', + 'mls_op', 'raise_exception', 'sli_op', 'ssra_op', @@ -4285,6 +4287,8 @@ aarch64_symbols = ( 'helper_vfp_mulxd', 'helper_vfp_mulxs', 'logic_imm_decode_wmask', + 'mla_op', + 'mls_op', 'new_tmp_a64', 'new_tmp_a64_zero', 'pred_esz_masks', diff --git a/qemu/target/arm/translate-a64.c b/qemu/target/arm/translate-a64.c index 5cc96337..3eb747fb 100644 --- a/qemu/target/arm/translate-a64.c +++ b/qemu/target/arm/translate-a64.c @@ -10569,66 +10569,6 @@ static void disas_simd_3same_float(DisasContext *s, uint32_t insn) } } -static void gen_mla8_i32(TCGContext *s, TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) -{ - gen_helper_neon_mul_u8(s, a, a, b); - gen_helper_neon_add_u8(s, d, d, a); -} - -static void gen_mla16_i32(TCGContext *s, TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) -{ - gen_helper_neon_mul_u16(s, a, a, b); - 
gen_helper_neon_add_u16(s, d, d, a); -} - -static void gen_mla32_i32(TCGContext *s, TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) -{ - tcg_gen_mul_i32(s, a, a, b); - tcg_gen_add_i32(s, d, d, a); -} - -static void gen_mla64_i64(TCGContext *s, TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) -{ - tcg_gen_mul_i64(s, a, a, b); - tcg_gen_add_i64(s, d, d, a); -} - -static void gen_mla_vec(TCGContext *s, unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) -{ - tcg_gen_mul_vec(s, vece, a, a, b); - tcg_gen_add_vec(s, vece, d, d, a); -} - -static void gen_mls8_i32(TCGContext *s, TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) -{ - gen_helper_neon_mul_u8(s, a, a, b); - gen_helper_neon_sub_u8(s, d, d, a); -} - -static void gen_mls16_i32(TCGContext *s, TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) -{ - gen_helper_neon_mul_u16(s, a, a, b); - gen_helper_neon_sub_u16(s, d, d, a); -} - -static void gen_mls32_i32(TCGContext *s, TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) -{ - tcg_gen_mul_i32(s, a, a, b); - tcg_gen_sub_i32(s, d, d, a); -} - -static void gen_mls64_i64(TCGContext *s, TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) -{ - tcg_gen_mul_i64(s, a, a, b); - tcg_gen_sub_i64(s, d, d, a); -} - -static void gen_mls_vec(TCGContext *s, unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) -{ - tcg_gen_mul_vec(s, vece, a, a, b); - tcg_gen_sub_vec(s, vece, d, d, a); -} - /* Integer op subgroup of C3.6.16. */ static void disas_simd_3same_int(DisasContext *s, uint32_t insn) { @@ -10647,52 +10587,6 @@ static void disas_simd_3same_int(DisasContext *s, uint32_t insn) .prefer_i64 = TCG_TARGET_REG_BITS == 64, .vece = MO_64 }, }; - static const GVecGen3 mla_op[4] = { - { .fni4 = gen_mla8_i32, - .fniv = gen_mla_vec, - .opc = INDEX_op_mul_vec, - .load_dest = true, - .vece = MO_8 }, - { .fni4 = gen_mla16_i32, - .fniv = gen_mla_vec, - .opc = INDEX_op_mul_vec, - .load_dest = true, - .vece = MO_16 }, - { .fni4 = gen_mla32_i32, - .fniv = gen_mla_vec, - .opc = INDEX_op_mul_vec, - .load_dest = true, - .vece = MO_32 }, - { .fni8 = gen_mla64_i64, - .fniv = gen_mla_vec, - .opc = INDEX_op_mul_vec, - .prefer_i64 = TCG_TARGET_REG_BITS == 64, - .load_dest = true, - .vece = MO_64 }, - }; - static const GVecGen3 mls_op[4] = { - { .fni4 = gen_mls8_i32, - .fniv = gen_mls_vec, - .opc = INDEX_op_mul_vec, - .load_dest = true, - .vece = MO_8 }, - { .fni4 = gen_mls16_i32, - .fniv = gen_mls_vec, - .opc = INDEX_op_mul_vec, - .load_dest = true, - .vece = MO_16 }, - { .fni4 = gen_mls32_i32, - .fniv = gen_mls_vec, - .opc = INDEX_op_mul_vec, - .load_dest = true, - .vece = MO_32 }, - { .fni8 = gen_mls64_i64, - .fniv = gen_mls_vec, - .opc = INDEX_op_mul_vec, - .prefer_i64 = TCG_TARGET_REG_BITS == 64, - .load_dest = true, - .vece = MO_64 }, - }; TCGContext *tcg_ctx = s->uc->tcg_ctx; int is_q = extract32(insn, 30, 1); diff --git a/qemu/target/arm/translate.c b/qemu/target/arm/translate.c index 1ae97e63..8c1c9833 100644 --- a/qemu/target/arm/translate.c +++ b/qemu/target/arm/translate.c @@ -6192,6 +6192,117 @@ const GVecGen2i sli_op[4] = { .vece = MO_64 }, }; +static void gen_mla8_i32(TCGContext *s, TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + gen_helper_neon_mul_u8(s, a, a, b); + gen_helper_neon_add_u8(s, d, d, a); +} + +static void gen_mla16_i32(TCGContext *s, TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + gen_helper_neon_mul_u16(s, a, a, b); + gen_helper_neon_add_u16(s, d, d, a); +} + +static void gen_mla32_i32(TCGContext *s, TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + tcg_gen_mul_i32(s, a, a, b); + tcg_gen_add_i32(s, d, d, a); +} + +static void gen_mla64_i64(TCGContext *s, TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + 
tcg_gen_mul_i64(s, a, a, b); + tcg_gen_add_i64(s, d, d, a); +} + +static void gen_mla_vec(TCGContext *s, unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) +{ + tcg_gen_mul_vec(s, vece, a, a, b); + tcg_gen_add_vec(s, vece, d, d, a); +} + +static void gen_mls8_i32(TCGContext *s, TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + gen_helper_neon_mul_u8(s, a, a, b); + gen_helper_neon_sub_u8(s, d, d, a); +} + +static void gen_mls16_i32(TCGContext *s, TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + gen_helper_neon_mul_u16(s, a, a, b); + gen_helper_neon_sub_u16(s, d, d, a); +} + +static void gen_mls32_i32(TCGContext *s, TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + tcg_gen_mul_i32(s, a, a, b); + tcg_gen_sub_i32(s, d, d, a); +} + +static void gen_mls64_i64(TCGContext *s, TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + tcg_gen_mul_i64(s, a, a, b); + tcg_gen_sub_i64(s, d, d, a); +} + +static void gen_mls_vec(TCGContext *s, unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) +{ + tcg_gen_mul_vec(s, vece, a, a, b); + tcg_gen_sub_vec(s, vece, d, d, a); +} + +/* Note that while NEON does not support VMLA and VMLS as 64-bit ops, + * these tables are shared with AArch64 which does support them. + */ +const GVecGen3 mla_op[4] = { + { .fni4 = gen_mla8_i32, + .fniv = gen_mla_vec, + .opc = INDEX_op_mul_vec, + .load_dest = true, + .vece = MO_8 }, + { .fni4 = gen_mla16_i32, + .fniv = gen_mla_vec, + .opc = INDEX_op_mul_vec, + .load_dest = true, + .vece = MO_16 }, + { .fni4 = gen_mla32_i32, + .fniv = gen_mla_vec, + .opc = INDEX_op_mul_vec, + .load_dest = true, + .vece = MO_32 }, + { .fni8 = gen_mla64_i64, + .fniv = gen_mla_vec, + .opc = INDEX_op_mul_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true, + .vece = MO_64 }, +}; + +const GVecGen3 mls_op[4] = { + { .fni4 = gen_mls8_i32, + .fniv = gen_mls_vec, + .opc = INDEX_op_mul_vec, + .load_dest = true, + .vece = MO_8 }, + { .fni4 = gen_mls16_i32, + .fniv = gen_mls_vec, + .opc = INDEX_op_mul_vec, + .load_dest = true, + .vece = MO_16 }, + { .fni4 = gen_mls32_i32, + .fniv = gen_mls_vec, + .opc = INDEX_op_mul_vec, + .load_dest = true, + .vece = MO_32 }, + { .fni8 = gen_mls64_i64, + .fniv = gen_mls_vec, + .opc = INDEX_op_mul_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true, + .vece = MO_64 }, +}; + /* Translate a NEON data processing instruction. Return nonzero if the instruction is invalid. We process data in a mixture of 32-bit and 64-bit chunks. @@ -6394,7 +6505,13 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) return 0; } break; + + case NEON_3R_VML: /* VMLA, VMLS */ + tcg_gen_gvec_3(tcg_ctx, rd_ofs, rn_ofs, rm_ofs, vec_size, vec_size, + u ? &mls_op[size] : &mla_op[size]); + return 0; } + if (size == 3) { /* 64-bit element instructions. */ for (pass = 0; pass < (q ? 2 : 1); pass++) { @@ -6596,21 +6713,6 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) } } break; - case NEON_3R_VML: /* VMLA, VMLAL, VMLS,VMLSL */ - switch (size) { - case 0: gen_helper_neon_mul_u8(tcg_ctx, tmp, tmp, tmp2); break; - case 1: gen_helper_neon_mul_u16(tcg_ctx, tmp, tmp, tmp2); break; - case 2: tcg_gen_mul_i32(tcg_ctx, tmp, tmp, tmp2); break; - default: abort(); - } - tcg_temp_free_i32(tcg_ctx, tmp2); - tmp2 = neon_load_reg(s, rd, pass); - if (u) { /* VMLS */ - gen_neon_rsb(s, size, tmp, tmp2); - } else { /* VMLA */ - gen_neon_add(s, size, tmp, tmp2); - } - break; case NEON_3R_VMUL: /* VMUL.P8; other cases already eliminated. 
*/ gen_helper_neon_mul_p8(tcg_ctx, tmp, tmp, tmp2); diff --git a/qemu/target/arm/translate.h b/qemu/target/arm/translate.h index 7bc45351..cb1cdc41 100644 --- a/qemu/target/arm/translate.h +++ b/qemu/target/arm/translate.h @@ -195,6 +195,8 @@ static inline TCGv_i32 get_ahp_flag(DisasContext *s) extern const GVecGen3 bsl_op; extern const GVecGen3 bit_op; extern const GVecGen3 bif_op; +extern const GVecGen3 mla_op[4]; +extern const GVecGen3 mls_op[4]; extern const GVecGen2i ssra_op[4]; extern const GVecGen2i usra_op[4]; extern const GVecGen2i sri_op[4];
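
Usage note: with the tables exported through translate.h, both the AArch32 and
AArch64 decoders can expand an integer multiply-accumulate through the generic
vector API. A minimal sketch of the call shape, assuming the caller has already
computed the register offsets (rd_ofs/rn_ofs/rm_ofs), the operation width in
bytes (vec_size), the element size (size), and the U bit (u) exactly as the
NEON_3R_VML case in this patch does:

    /* Illustrative sketch, mirroring the NEON_3R_VML case above: the U bit
     * selects multiply-subtract (VMLS) over multiply-accumulate (VMLA), and
     * `size` indexes the per-element-width entry (MO_8..MO_64) of the
     * shared GVecGen3 tables.
     */
    tcg_gen_gvec_3(tcg_ctx, rd_ofs, rn_ofs, rm_ofs, vec_size, vec_size,
                   u ? &mls_op[size] : &mla_op[size]);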