From 331aabddebb8e772e862a4fbe47e41929b6d677d Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Sun, 20 May 2018 02:25:06 -0400 Subject: [PATCH] target/arm: Implement SVE Predicate Misc Group Backports commit 028e2a7b876631eff165cac59eb43bdb2dcc213b and f97cfd596ed9bd38644323cb61d19b85ac703c81 from qemu --- qemu/aarch64.h | 67 ++++++++ qemu/aarch64eb.h | 67 ++++++++ qemu/header_gen.py | 67 ++++++++ qemu/target/arm/cpu.h | 4 + qemu/target/arm/helper-sve.h | 148 +++++++++++++++++ qemu/target/arm/sve.decode | 73 +++++++++ qemu/target/arm/sve_helper.c | 277 +++++++++++++++++++++++++++++++ qemu/target/arm/translate-sve.c | 280 ++++++++++++++++++++++++++++++++ 8 files changed, 983 insertions(+) diff --git a/qemu/aarch64.h b/qemu/aarch64.h index c7ed0e76..a65cd2c7 100644 --- a/qemu/aarch64.h +++ b/qemu/aarch64.h @@ -3274,16 +3274,82 @@ #define helper_sdiv64 helper_sdiv64_aarch64 #define helper_simd_tbl helper_simd_tbl_aarch64 #define helper_sqrt_f16 helper_sqrt_f16_aarch64 +#define helper_sve_add_zpzz_b helper_sve_add_zpzz_b_aarch64 +#define helper_sve_add_zpzz_d helper_sve_add_zpzz_d_aarch64 +#define helper_sve_add_zpzz_h helper_sve_add_zpzz_h_aarch64 +#define helper_sve_add_zpzz_s helper_sve_add_zpzz_s_aarch64 #define helper_sve_and_pppp helper_sve_and_pppp_aarch64 +#define helper_sve_and_zpzz_b helper_sve_and_zpzz_b_aarch64 +#define helper_sve_and_zpzz_d helper_sve_and_zpzz_d_aarch64 +#define helper_sve_and_zpzz_h helper_sve_and_zpzz_h_aarch64 +#define helper_sve_and_zpzz_s helper_sve_and_zpzz_s_aarch64 #define helper_sve_bic_pppp helper_sve_bic_pppp_aarch64 +#define helper_sve_bic_zpzz_b helper_sve_bic_zpzz_b_aarch64 +#define helper_sve_bic_zpzz_d helper_sve_bic_zpzz_d_aarch64 +#define helper_sve_bic_zpzz_h helper_sve_bic_zpzz_h_aarch64 +#define helper_sve_bic_zpzz_s helper_sve_bic_zpzz_s_aarch64 #define helper_sve_eor_pppp helper_sve_eor_pppp_aarch64 +#define helper_sve_eor_zpzz_b helper_sve_eor_zpzz_b_aarch64 +#define helper_sve_eor_zpzz_d helper_sve_eor_zpzz_d_aarch64 +#define helper_sve_eor_zpzz_h helper_sve_eor_zpzz_h_aarch64 +#define helper_sve_eor_zpzz_s helper_sve_eor_zpzz_s_aarch64 +#define helper_sve_mul_zpzz_b helper_sve_mul_zpzz_b_aarch64 +#define helper_sve_mul_zpzz_d helper_sve_mul_zpzz_d_aarch64 +#define helper_sve_mul_zpzz_h helper_sve_mul_zpzz_h_aarch64 +#define helper_sve_mul_zpzz_s helper_sve_mul_zpzz_s_aarch64 #define helper_sve_nand_pppp helper_sve_nand_pppp_aarch64 #define helper_sve_nor_pppp helper_sve_nor_pppp_aarch64 #define helper_sve_orn_pppp helper_sve_orn_pppp_aarch64 #define helper_sve_orr_pppp helper_sve_orr_pppp_aarch64 +#define helper_sve_orr_zpzz_b helper_sve_orr_zpzz_b_aarch64 +#define helper_sve_orr_zpzz_d helper_sve_orr_zpzz_d_aarch64 +#define helper_sve_orr_zpzz_h helper_sve_orr_zpzz_h_aarch64 +#define helper_sve_orr_zpzz_s helper_sve_orr_zpzz_s_aarch64 +#define helper_sve_sabd_zpzz_b helper_sve_sabd_zpzz_b_aarch64 +#define helper_sve_sabd_zpzz_d helper_sve_sabd_zpzz_d_aarch64 +#define helper_sve_sabd_zpzz_h helper_sve_sabd_zpzz_h_aarch64 +#define helper_sve_sabd_zpzz_s helper_sve_sabd_zpzz_s_aarch64 +#define helper_sve_sdiv_zpzz_d helper_sve_sdiv_zpzz_d_aarch64 +#define helper_sve_sdiv_zpzz_s helper_sve_sdiv_zpzz_s_aarch64 #define helper_sve_sel_pppp helper_sve_sel_pppp_aarch64 +#define helper_sve_smax_zpzz_b helper_sve_smax_zpzz_b_aarch64 +#define helper_sve_smax_zpzz_d helper_sve_smax_zpzz_d_aarch64 +#define helper_sve_smax_zpzz_h helper_sve_smax_zpzz_h_aarch64 +#define helper_sve_smax_zpzz_s helper_sve_smax_zpzz_s_aarch64 +#define helper_sve_smin_zpzz_b helper_sve_smin_zpzz_b_aarch64 +#define helper_sve_smin_zpzz_d helper_sve_smin_zpzz_d_aarch64 +#define helper_sve_smin_zpzz_h helper_sve_smin_zpzz_h_aarch64 +#define helper_sve_smin_zpzz_s helper_sve_smin_zpzz_s_aarch64 +#define helper_sve_smulh_zpzz_b helper_sve_smulh_zpzz_b_aarch64 +#define helper_sve_smulh_zpzz_d helper_sve_smulh_zpzz_d_aarch64 +#define helper_sve_smulh_zpzz_h helper_sve_smulh_zpzz_h_aarch64 +#define helper_sve_smulh_zpzz_s helper_sve_smulh_zpzz_s_aarch64 +#define helper_sve_sub_zpzz_b helper_sve_sub_zpzz_b_aarch64 +#define helper_sve_sub_zpzz_d helper_sve_sub_zpzz_d_aarch64 +#define helper_sve_sub_zpzz_h helper_sve_sub_zpzz_h_aarch64 +#define helper_sve_sub_zpzz_s helper_sve_sub_zpzz_s_aarch64 +#define helper_sve_pfirst helper_sve_pfirst_aarch64 +#define helper_sve_pnext helper_sve_pnext_aarch64 #define helper_sve_predtest helper_sve_predtest_aarch64 #define helper_sve_predtest1 helper_sve_predtest1_aarch64 +#define helper_sve_uabd_zpzz_b helper_sve_uabd_zpzz_b_aarch64 +#define helper_sve_uabd_zpzz_d helper_sve_uabd_zpzz_d_aarch64 +#define helper_sve_uabd_zpzz_h helper_sve_uabd_zpzz_h_aarch64 +#define helper_sve_uabd_zpzz_s helper_sve_uabd_zpzz_s_aarch64 +#define helper_sve_udiv_zpzz_d helper_sve_udiv_zpzz_d_aarch64 +#define helper_sve_udiv_zpzz_s helper_sve_udiv_zpzz_s_aarch64 +#define helper_sve_umax_zpzz_b helper_sve_umax_zpzz_b_aarch64 +#define helper_sve_umax_zpzz_d helper_sve_umax_zpzz_d_aarch64 +#define helper_sve_umax_zpzz_h helper_sve_umax_zpzz_h_aarch64 +#define helper_sve_umax_zpzz_s helper_sve_umax_zpzz_s_aarch64 +#define helper_sve_umin_zpzz_b helper_sve_umin_zpzz_b_aarch64 +#define helper_sve_umin_zpzz_d helper_sve_umin_zpzz_d_aarch64 +#define helper_sve_umin_zpzz_h helper_sve_umin_zpzz_h_aarch64 +#define helper_sve_umin_zpzz_s helper_sve_umin_zpzz_s_aarch64 +#define helper_sve_umulh_zpzz_b helper_sve_umulh_zpzz_b_aarch64 +#define helper_sve_umulh_zpzz_d helper_sve_umulh_zpzz_d_aarch64 +#define helper_sve_umulh_zpzz_h helper_sve_umulh_zpzz_h_aarch64 +#define helper_sve_umulh_zpzz_s helper_sve_umulh_zpzz_s_aarch64 #define helper_udiv64 helper_udiv64_aarch64 #define helper_vfp_cmpd_a64 helper_vfp_cmpd_a64_aarch64 #define helper_vfp_cmped_a64 helper_vfp_cmped_a64_aarch64 @@ -3294,6 +3360,7 @@ #define logic_imm_decode_wmask logic_imm_decode_wmask_aarch64 #define new_tmp_a64 new_tmp_a64_aarch64 #define new_tmp_a64_zero new_tmp_a64_zero_aarch64 +#define pred_esz_masks pred_esz_masks_aarch64 #define read_cpu_reg read_cpu_reg_aarch64 #define read_cpu_reg_sp read_cpu_reg_sp_aarch64 #define sve_access_check sve_access_check_aarch64 diff --git a/qemu/aarch64eb.h b/qemu/aarch64eb.h index 1a85e112..b3e25c67 100644 --- a/qemu/aarch64eb.h +++ b/qemu/aarch64eb.h @@ -3274,16 +3274,82 @@ #define helper_sdiv64 helper_sdiv64_aarch64eb #define helper_simd_tbl helper_simd_tbl_aarch64eb #define helper_sqrt_f16 helper_sqrt_f16_aarch64eb +#define helper_sve_add_zpzz_b helper_sve_add_zpzz_b_aarch64eb +#define helper_sve_add_zpzz_d helper_sve_add_zpzz_d_aarch64eb +#define helper_sve_add_zpzz_h helper_sve_add_zpzz_h_aarch64eb +#define helper_sve_add_zpzz_s helper_sve_add_zpzz_s_aarch64eb #define helper_sve_and_pppp helper_sve_and_pppp_aarch64eb +#define helper_sve_and_zpzz_b helper_sve_and_zpzz_b_aarch64eb +#define helper_sve_and_zpzz_d helper_sve_and_zpzz_d_aarch64eb +#define helper_sve_and_zpzz_h helper_sve_and_zpzz_h_aarch64eb +#define helper_sve_and_zpzz_s helper_sve_and_zpzz_s_aarch64eb #define helper_sve_bic_pppp helper_sve_bic_pppp_aarch64eb +#define helper_sve_bic_zpzz_b helper_sve_bic_zpzz_b_aarch64eb +#define helper_sve_bic_zpzz_d helper_sve_bic_zpzz_d_aarch64eb +#define helper_sve_bic_zpzz_h helper_sve_bic_zpzz_h_aarch64eb +#define helper_sve_bic_zpzz_s helper_sve_bic_zpzz_s_aarch64eb #define helper_sve_eor_pppp helper_sve_eor_pppp_aarch64eb +#define helper_sve_eor_zpzz_b helper_sve_eor_zpzz_b_aarch64eb +#define helper_sve_eor_zpzz_d helper_sve_eor_zpzz_d_aarch64eb +#define helper_sve_eor_zpzz_h helper_sve_eor_zpzz_h_aarch64eb +#define helper_sve_eor_zpzz_s helper_sve_eor_zpzz_s_aarch64eb +#define helper_sve_mul_zpzz_b helper_sve_mul_zpzz_b_aarch64eb +#define helper_sve_mul_zpzz_d helper_sve_mul_zpzz_d_aarch64eb +#define helper_sve_mul_zpzz_h helper_sve_mul_zpzz_h_aarch64eb +#define helper_sve_mul_zpzz_s helper_sve_mul_zpzz_s_aarch64eb #define helper_sve_nand_pppp helper_sve_nand_pppp_aarch64eb #define helper_sve_nor_pppp helper_sve_nor_pppp_aarch64eb #define helper_sve_orn_pppp helper_sve_orn_pppp_aarch64eb #define helper_sve_orr_pppp helper_sve_orr_pppp_aarch64eb +#define helper_sve_orr_zpzz_b helper_sve_orr_zpzz_b_aarch64eb +#define helper_sve_orr_zpzz_d helper_sve_orr_zpzz_d_aarch64eb +#define helper_sve_orr_zpzz_h helper_sve_orr_zpzz_h_aarch64eb +#define helper_sve_orr_zpzz_s helper_sve_orr_zpzz_s_aarch64eb +#define helper_sve_sabd_zpzz_b helper_sve_sabd_zpzz_b_aarch64eb +#define helper_sve_sabd_zpzz_d helper_sve_sabd_zpzz_d_aarch64eb +#define helper_sve_sabd_zpzz_h helper_sve_sabd_zpzz_h_aarch64eb +#define helper_sve_sabd_zpzz_s helper_sve_sabd_zpzz_s_aarch64eb +#define helper_sve_sdiv_zpzz_d helper_sve_sdiv_zpzz_d_aarch64eb +#define helper_sve_sdiv_zpzz_s helper_sve_sdiv_zpzz_s_aarch64eb #define helper_sve_sel_pppp helper_sve_sel_pppp_aarch64eb +#define helper_sve_smax_zpzz_b helper_sve_smax_zpzz_b_aarch64eb +#define helper_sve_smax_zpzz_d helper_sve_smax_zpzz_d_aarch64eb +#define helper_sve_smax_zpzz_h helper_sve_smax_zpzz_h_aarch64eb +#define helper_sve_smax_zpzz_s helper_sve_smax_zpzz_s_aarch64eb +#define helper_sve_smin_zpzz_b helper_sve_smin_zpzz_b_aarch64eb +#define helper_sve_smin_zpzz_d helper_sve_smin_zpzz_d_aarch64eb +#define helper_sve_smin_zpzz_h helper_sve_smin_zpzz_h_aarch64eb +#define helper_sve_smin_zpzz_s helper_sve_smin_zpzz_s_aarch64eb +#define helper_sve_smulh_zpzz_b helper_sve_smulh_zpzz_b_aarch64eb +#define helper_sve_smulh_zpzz_d helper_sve_smulh_zpzz_d_aarch64eb +#define helper_sve_smulh_zpzz_h helper_sve_smulh_zpzz_h_aarch64eb +#define helper_sve_smulh_zpzz_s helper_sve_smulh_zpzz_s_aarch64eb +#define helper_sve_sub_zpzz_b helper_sve_sub_zpzz_b_aarch64eb +#define helper_sve_sub_zpzz_d helper_sve_sub_zpzz_d_aarch64eb +#define helper_sve_sub_zpzz_h helper_sve_sub_zpzz_h_aarch64eb +#define helper_sve_sub_zpzz_s helper_sve_sub_zpzz_s_aarch64eb +#define helper_sve_pfirst helper_sve_pfirst_aarch64eb +#define helper_sve_pnext helper_sve_pnext_aarch64eb #define helper_sve_predtest helper_sve_predtest_aarch64eb #define helper_sve_predtest1 helper_sve_predtest1_aarch64eb +#define helper_sve_uabd_zpzz_b helper_sve_uabd_zpzz_b_aarch64eb +#define helper_sve_uabd_zpzz_d helper_sve_uabd_zpzz_d_aarch64eb +#define helper_sve_uabd_zpzz_h helper_sve_uabd_zpzz_h_aarch64eb +#define helper_sve_uabd_zpzz_s helper_sve_uabd_zpzz_s_aarch64eb +#define helper_sve_udiv_zpzz_d helper_sve_udiv_zpzz_d_aarch64eb +#define helper_sve_udiv_zpzz_s helper_sve_udiv_zpzz_s_aarch64eb +#define helper_sve_umax_zpzz_b helper_sve_umax_zpzz_b_aarch64eb +#define helper_sve_umax_zpzz_d helper_sve_umax_zpzz_d_aarch64eb +#define helper_sve_umax_zpzz_h helper_sve_umax_zpzz_h_aarch64eb +#define helper_sve_umax_zpzz_s helper_sve_umax_zpzz_s_aarch64eb +#define helper_sve_umin_zpzz_b helper_sve_umin_zpzz_b_aarch64eb +#define helper_sve_umin_zpzz_d helper_sve_umin_zpzz_d_aarch64eb +#define helper_sve_umin_zpzz_h helper_sve_umin_zpzz_h_aarch64eb +#define helper_sve_umin_zpzz_s helper_sve_umin_zpzz_s_aarch64eb +#define helper_sve_umulh_zpzz_b helper_sve_umulh_zpzz_b_aarch64eb +#define helper_sve_umulh_zpzz_d helper_sve_umulh_zpzz_d_aarch64eb +#define helper_sve_umulh_zpzz_h helper_sve_umulh_zpzz_h_aarch64eb +#define helper_sve_umulh_zpzz_s helper_sve_umulh_zpzz_s_aarch64eb #define helper_udiv64 helper_udiv64_aarch64eb #define helper_vfp_cmpd_a64 helper_vfp_cmpd_a64_aarch64eb #define helper_vfp_cmped_a64 helper_vfp_cmped_a64_aarch64eb @@ -3294,6 +3360,7 @@ #define logic_imm_decode_wmask logic_imm_decode_wmask_aarch64eb #define new_tmp_a64 new_tmp_a64_aarch64eb #define new_tmp_a64_zero new_tmp_a64_zero_aarch64eb +#define pred_esz_masks pred_esz_masks_aarch64eb #define read_cpu_reg read_cpu_reg_aarch64eb #define read_cpu_reg_sp read_cpu_reg_sp_aarch64eb #define sve_access_check sve_access_check_aarch64eb diff --git a/qemu/header_gen.py b/qemu/header_gen.py index f398d235..a5ca0696 100644 --- a/qemu/header_gen.py +++ b/qemu/header_gen.py @@ -3295,16 +3295,82 @@ aarch64_symbols = ( 'helper_sdiv64', 'helper_simd_tbl', 'helper_sqrt_f16', + 'helper_sve_add_zpzz_b', + 'helper_sve_add_zpzz_d', + 'helper_sve_add_zpzz_h', + 'helper_sve_add_zpzz_s', 'helper_sve_and_pppp', + 'helper_sve_and_zpzz_b', + 'helper_sve_and_zpzz_d', + 'helper_sve_and_zpzz_h', + 'helper_sve_and_zpzz_s', 'helper_sve_bic_pppp', + 'helper_sve_bic_zpzz_b', + 'helper_sve_bic_zpzz_d', + 'helper_sve_bic_zpzz_h', + 'helper_sve_bic_zpzz_s', 'helper_sve_eor_pppp', + 'helper_sve_eor_zpzz_b', + 'helper_sve_eor_zpzz_d', + 'helper_sve_eor_zpzz_h', + 'helper_sve_eor_zpzz_s', + 'helper_sve_mul_zpzz_b', + 'helper_sve_mul_zpzz_d', + 'helper_sve_mul_zpzz_h', + 'helper_sve_mul_zpzz_s', 'helper_sve_nand_pppp', 'helper_sve_nor_pppp', 'helper_sve_orn_pppp', 'helper_sve_orr_pppp', + 'helper_sve_orr_zpzz_b', + 'helper_sve_orr_zpzz_d', + 'helper_sve_orr_zpzz_h', + 'helper_sve_orr_zpzz_s', + 'helper_sve_sabd_zpzz_b', + 'helper_sve_sabd_zpzz_d', + 'helper_sve_sabd_zpzz_h', + 'helper_sve_sabd_zpzz_s', + 'helper_sve_sdiv_zpzz_d', + 'helper_sve_sdiv_zpzz_s', 'helper_sve_sel_pppp', + 'helper_sve_smax_zpzz_b', + 'helper_sve_smax_zpzz_d', + 'helper_sve_smax_zpzz_h', + 'helper_sve_smax_zpzz_s', + 'helper_sve_smin_zpzz_b', + 'helper_sve_smin_zpzz_d', + 'helper_sve_smin_zpzz_h', + 'helper_sve_smin_zpzz_s', + 'helper_sve_smulh_zpzz_b', + 'helper_sve_smulh_zpzz_d', + 'helper_sve_smulh_zpzz_h', + 'helper_sve_smulh_zpzz_s', + 'helper_sve_sub_zpzz_b', + 'helper_sve_sub_zpzz_d', + 'helper_sve_sub_zpzz_h', + 'helper_sve_sub_zpzz_s', + 'helper_sve_pfirst', + 'helper_sve_pnext', 'helper_sve_predtest', 'helper_sve_predtest1', + 'helper_sve_uabd_zpzz_b', + 'helper_sve_uabd_zpzz_d', + 'helper_sve_uabd_zpzz_h', + 'helper_sve_uabd_zpzz_s', + 'helper_sve_udiv_zpzz_d', + 'helper_sve_udiv_zpzz_s', + 'helper_sve_umax_zpzz_b', + 'helper_sve_umax_zpzz_d', + 'helper_sve_umax_zpzz_h', + 'helper_sve_umax_zpzz_s', + 'helper_sve_umin_zpzz_b', + 'helper_sve_umin_zpzz_d', + 'helper_sve_umin_zpzz_h', + 'helper_sve_umin_zpzz_s', + 'helper_sve_umulh_zpzz_b', + 'helper_sve_umulh_zpzz_d', + 'helper_sve_umulh_zpzz_h', + 'helper_sve_umulh_zpzz_s', 'helper_udiv64', 'helper_vfp_cmpd_a64', 'helper_vfp_cmped_a64', @@ -3315,6 +3381,7 @@ aarch64_symbols = ( 'logic_imm_decode_wmask', 'new_tmp_a64', 'new_tmp_a64_zero', + 'pred_esz_masks', 'read_cpu_reg', 'read_cpu_reg_sp', 'sve_access_check', diff --git a/qemu/target/arm/cpu.h b/qemu/target/arm/cpu.h index 71ac6cf8..32589bc9 100644 --- a/qemu/target/arm/cpu.h +++ b/qemu/target/arm/cpu.h @@ -531,6 +531,7 @@ typedef struct CPUARMState { #ifdef TARGET_AARCH64 /* Store FFR as pregs[16] to make it easier to treat as any other. */ +#define FFR_PRED_NUM 16 ARMPredicateReg pregs[17]; /* Scratch space for aa64 sve predicate temporary. */ ARMPredicateReg preg_tmp; @@ -2909,4 +2910,7 @@ static inline uint64_t *aa64_vfp_qreg(CPUARMState *env, unsigned regno) return &env->vfp.zregs[regno].d[0]; } +/* Shared between translate-sve.c and sve_helper.c. */ +extern const uint64_t pred_esz_masks[4]; + #endif diff --git a/qemu/target/arm/helper-sve.h b/qemu/target/arm/helper-sve.h index 57adc4d9..5b82ba15 100644 --- a/qemu/target/arm/helper-sve.h +++ b/qemu/target/arm/helper-sve.h @@ -20,6 +20,154 @@ DEF_HELPER_FLAGS_2(sve_predtest1, TCG_CALL_NO_WG, i32, i64, i64) DEF_HELPER_FLAGS_3(sve_predtest, TCG_CALL_NO_WG, i32, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sve_pfirst, TCG_CALL_NO_WG, i32, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(sve_pnext, TCG_CALL_NO_WG, i32, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_and_zpzz_b, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_and_zpzz_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_and_zpzz_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_and_zpzz_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_eor_zpzz_b, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_eor_zpzz_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_eor_zpzz_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_eor_zpzz_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_orr_zpzz_b, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_orr_zpzz_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_orr_zpzz_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_orr_zpzz_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_bic_zpzz_b, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_bic_zpzz_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_bic_zpzz_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_bic_zpzz_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_add_zpzz_b, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_add_zpzz_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_add_zpzz_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_add_zpzz_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_sub_zpzz_b, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_sub_zpzz_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_sub_zpzz_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_sub_zpzz_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_smax_zpzz_b, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_smax_zpzz_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_smax_zpzz_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_smax_zpzz_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_umax_zpzz_b, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_umax_zpzz_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_umax_zpzz_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_umax_zpzz_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_smin_zpzz_b, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_smin_zpzz_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_smin_zpzz_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_smin_zpzz_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_umin_zpzz_b, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_umin_zpzz_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_umin_zpzz_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_umin_zpzz_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_sabd_zpzz_b, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_sabd_zpzz_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_sabd_zpzz_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_sabd_zpzz_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_uabd_zpzz_b, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_uabd_zpzz_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_uabd_zpzz_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_uabd_zpzz_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_mul_zpzz_b, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_mul_zpzz_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_mul_zpzz_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_mul_zpzz_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_smulh_zpzz_b, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_smulh_zpzz_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_smulh_zpzz_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_smulh_zpzz_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_umulh_zpzz_b, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_umulh_zpzz_h, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_umulh_zpzz_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_umulh_zpzz_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_sdiv_zpzz_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_sdiv_zpzz_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(sve_udiv_zpzz_s, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(sve_udiv_zpzz_d, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + DEF_HELPER_FLAGS_5(sve_and_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_5(sve_bic_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_5(sve_eor_pppp, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) diff --git a/qemu/target/arm/sve.decode b/qemu/target/arm/sve.decode index f695dda3..c444357c 100644 --- a/qemu/target/arm/sve.decode +++ b/qemu/target/arm/sve.decode @@ -24,25 +24,43 @@ %imm9_16_10 16:s6 10:3 +# Either a copy of rd (at bit 0), or a different source +# as propagated via the MOVPRFX instruction. +%reg_movprfx 0:5 + ########################################################################### # Named attribute sets. These are used to make nice(er) names # when creating helpers common to those for the individual # instruction patterns. +&rr_esz rd rn esz &rri rd rn imm &rrr_esz rd rn rm esz &rprr_s rd pg rn rm s +&rprr_esz rd pg rn rm esz ########################################################################### # Named instruction formats. These are generally used to # reduce the amount of duplication between instruction patterns. +# Two operand with unused vector element size +@pd_pn_e0 ........ ........ ....... rn:4 . rd:4 &rr_esz esz=0 + +# Two operand +@pd_pn ........ esz:2 .. .... ....... rn:4 . rd:4 &rr_esz + # Three operand with unused vector element size @rd_rn_rm_e0 ........ ... rm:5 ... ... rn:5 rd:5 &rrr_esz esz=0 # Three predicate operand, with governing predicate, flag setting @pd_pg_pn_pm_s ........ . s:1 .. rm:4 .. pg:4 . rn:4 . rd:4 &rprr_s +# Two register operand, with governing predicate, vector element size +@rdn_pg_rm ........ esz:2 ... ... ... pg:3 rm:5 rd:5 \ + &rprr_esz rn=%reg_movprfx +@rdm_pg_rn ........ esz:2 ... ... ... pg:3 rn:5 rd:5 \ + &rprr_esz rm=%reg_movprfx + # Basic Load/Store with 9-bit immediate offset @pd_rn_i9 ........ ........ ...... rn:5 . rd:4 \ &rri imm=%imm9_16_10 @@ -52,6 +70,37 @@ ########################################################################### # Instruction patterns. Grouped according to the SVE encodingindex.xhtml. +### SVE Integer Arithmetic - Binary Predicated Group + +# SVE bitwise logical vector operations (predicated) +ORR_zpzz 00000100 .. 011 000 000 ... ..... ..... @rdn_pg_rm +EOR_zpzz 00000100 .. 011 001 000 ... ..... ..... @rdn_pg_rm +AND_zpzz 00000100 .. 011 010 000 ... ..... ..... @rdn_pg_rm +BIC_zpzz 00000100 .. 011 011 000 ... ..... ..... @rdn_pg_rm + +# SVE integer add/subtract vectors (predicated) +ADD_zpzz 00000100 .. 000 000 000 ... ..... ..... @rdn_pg_rm +SUB_zpzz 00000100 .. 000 001 000 ... ..... ..... @rdn_pg_rm +SUB_zpzz 00000100 .. 000 011 000 ... ..... ..... @rdm_pg_rn # SUBR + +# SVE integer min/max/difference (predicated) +SMAX_zpzz 00000100 .. 001 000 000 ... ..... ..... @rdn_pg_rm +UMAX_zpzz 00000100 .. 001 001 000 ... ..... ..... @rdn_pg_rm +SMIN_zpzz 00000100 .. 001 010 000 ... ..... ..... @rdn_pg_rm +UMIN_zpzz 00000100 .. 001 011 000 ... ..... ..... @rdn_pg_rm +SABD_zpzz 00000100 .. 001 100 000 ... ..... ..... @rdn_pg_rm +UABD_zpzz 00000100 .. 001 101 000 ... ..... ..... @rdn_pg_rm + +# SVE integer multiply/divide (predicated) +MUL_zpzz 00000100 .. 010 000 000 ... ..... ..... @rdn_pg_rm +SMULH_zpzz 00000100 .. 010 010 000 ... ..... ..... @rdn_pg_rm +UMULH_zpzz 00000100 .. 010 011 000 ... ..... ..... @rdn_pg_rm +# Note that divide requires size >= 2; below 2 is unallocated. +SDIV_zpzz 00000100 .. 010 100 000 ... ..... ..... @rdn_pg_rm +UDIV_zpzz 00000100 .. 010 101 000 ... ..... ..... @rdn_pg_rm +SDIV_zpzz 00000100 .. 010 110 000 ... ..... ..... @rdm_pg_rn # SDIVR +UDIV_zpzz 00000100 .. 010 111 000 ... ..... ..... @rdm_pg_rn # UDIVR + ### SVE Logical - Unpredicated Group # SVE bitwise logical operations (unpredicated) @@ -77,6 +126,30 @@ NAND_pppp 00100101 1. 00 .... 01 .... 1 .... 1 .... @pd_pg_pn_pm_s # SVE predicate test PTEST 00100101 01 010000 11 pg:4 0 rn:4 0 0000 +# SVE predicate initialize +PTRUE 00100101 esz:2 01100 s:1 111000 pat:5 0 rd:4 + +# SVE initialize FFR +SETFFR 00100101 0010 1100 1001 0000 0000 0000 + +# SVE zero predicate register +PFALSE 00100101 0001 1000 1110 0100 0000 rd:4 + +# SVE predicate read from FFR (predicated) +RDFFR_p 00100101 0 s:1 0110001111000 pg:4 0 rd:4 + +# SVE predicate read from FFR (unpredicated) +RDFFR 00100101 0001 1001 1111 0000 0000 rd:4 + +# SVE FFR write from predicate (WRFFR) +WRFFR 00100101 0010 1000 1001 000 rn:4 00000 + +# SVE predicate first active +PFIRST 00100101 01 011 000 11000 00 .... 0 .... @pd_pn_e0 + +# SVE predicate next active +PNEXT 00100101 .. 011 001 11000 10 .... 0 .... @pd_pn + ### SVE Memory - 32-bit Gather and Unsized Contiguous Group # SVE load predicate register diff --git a/qemu/target/arm/sve_helper.c b/qemu/target/arm/sve_helper.c index 2eda6f2e..d33bbc80 100644 --- a/qemu/target/arm/sve_helper.c +++ b/qemu/target/arm/sve_helper.c @@ -24,6 +24,21 @@ #include "exec/helper-proto.h" #include "tcg/tcg-gvec-desc.h" +/* Note that vector data is stored in host-endian 64-bit chunks, + so addressing units smaller than that needs a host-endian fixup. */ +#ifdef HOST_WORDS_BIGENDIAN +#define H1(x) ((x) ^ 7) +#define H1_2(x) ((x) ^ 6) +#define H1_4(x) ((x) ^ 4) +#define H2(x) ((x) ^ 3) +#define H4(x) ((x) ^ 1) +#else +#define H1(x) (x) +#define H1_2(x) (x) +#define H1_4(x) (x) +#define H2(x) (x) +#define H4(x) (x) +#endif /* Return a value for NZCV as per the ARM PredTest pseudofunction. * @@ -115,3 +130,265 @@ LOGICAL_PPPP(sve_nand_pppp, DO_NAND) #undef DO_NAND #undef DO_SEL #undef LOGICAL_PPPP + +/* Fully general three-operand expander, controlled by a predicate. + * This is complicated by the host-endian storage of the register file. + */ +/* ??? I don't expect the compiler could ever vectorize this itself. + * With some tables we can convert bit masks to byte masks, and with + * extra care wrt byte/word ordering we could use gcc generic vectors + * and do 16 bytes at a time. + */ +#define DO_ZPZZ(NAME, TYPE, H, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc); \ + for (i = 0; i < opr_sz; ) { \ + uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \ + do { \ + if (pg & 1) { \ + TYPE nn = *(TYPE *)(vn + H(i)); \ + TYPE mm = *(TYPE *)(vm + H(i)); \ + *(TYPE *)(vd + H(i)) = OP(nn, mm); \ + } \ + i += sizeof(TYPE), pg >>= sizeof(TYPE); \ + } while (i & 15); \ + } \ +} + +/* Similarly, specialized for 64-bit operands. */ +#define DO_ZPZZ_D(NAME, TYPE, OP) \ +void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \ +{ \ + intptr_t i, opr_sz = simd_oprsz(desc) / 8; \ + TYPE *d = vd, *n = vn, *m = vm; \ + uint8_t *pg = vg; \ + for (i = 0; i < opr_sz; i += 1) { \ + if (pg[H1(i)] & 1) { \ + TYPE nn = n[i], mm = m[i]; \ + d[i] = OP(nn, mm); \ + } \ + } \ +} + +#define DO_AND(N, M) (N & M) +#define DO_EOR(N, M) (N ^ M) +#define DO_ORR(N, M) (N | M) +#define DO_BIC(N, M) (N & ~M) +#define DO_ADD(N, M) (N + M) +#define DO_SUB(N, M) (N - M) +#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M)) +#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N)) +#define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N)) +#define DO_MUL(N, M) (N * M) +#define DO_DIV(N, M) (M ? N / M : 0) + +DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND) +DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND) +DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND) +DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND) + +DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR) +DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR) +DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR) +DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR) + +DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR) +DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR) +DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR) +DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR) + +DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC) +DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC) +DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC) +DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC) + +DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD) +DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD) +DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD) +DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD) + +DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB) +DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB) +DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB) +DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB) + +DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX) +DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX) +DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX) +DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX) + +DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX) +DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX) +DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX) +DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX) + +DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN) +DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN) +DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN) +DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN) + +DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN) +DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN) +DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN) +DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN) + +DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD) +DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD) +DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD) +DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD) + +DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD) +DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD) +DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD) +DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD) + +/* Because the computation type is at least twice as large as required, + these work for both signed and unsigned source types. */ +static inline uint8_t do_mulh_b(int32_t n, int32_t m) +{ + return (n * m) >> 8; +} + +static inline uint16_t do_mulh_h(int32_t n, int32_t m) +{ + return (n * m) >> 16; +} + +static inline uint32_t do_mulh_s(int64_t n, int64_t m) +{ + return (n * m) >> 32; +} + +static inline uint64_t do_smulh_d(uint64_t n, uint64_t m) +{ + uint64_t lo, hi; + muls64(&lo, &hi, n, m); + return hi; +} + +static inline uint64_t do_umulh_d(uint64_t n, uint64_t m) +{ + uint64_t lo, hi; + mulu64(&lo, &hi, n, m); + return hi; +} + +DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL) +DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL) +DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL) +DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL) + +DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b) +DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h) +DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s) +DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d) + +DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b) +DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h) +DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s) +DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d) + +DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_DIV) +DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_DIV) + +DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_DIV) +DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_DIV) + +#undef DO_ZPZZ +#undef DO_ZPZZ_D +#undef DO_AND +#undef DO_ORR +#undef DO_EOR +#undef DO_BIC +#undef DO_ADD +#undef DO_SUB +#undef DO_MAX +#undef DO_MIN +#undef DO_ABD +#undef DO_MUL +#undef DO_DIV + +/* Similar to the ARM LastActiveElement pseudocode function, except the + result is multiplied by the element size. This includes the not found + indication; e.g. not found for esz=3 is -8. */ +static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz) +{ + uint64_t mask = pred_esz_masks[esz]; + intptr_t i = words; + + do { + uint64_t this_g = g[--i] & mask; + if (this_g) { + return i * 64 + (63 - clz64(this_g)); + } + } while (i > 0); + return (intptr_t)-1 << esz; +} + +uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words) +{ + uint32_t flags = PREDTEST_INIT; + uint64_t *d = vd, *g = vg; + intptr_t i = 0; + + do { + uint64_t this_d = d[i]; + uint64_t this_g = g[i]; + + if (this_g) { + if (!(flags & 4)) { + /* Set in D the first bit of G. */ + this_d |= this_g & -this_g; + d[i] = this_d; + } + flags = iter_predtest_fwd(this_d, this_g, flags); + } + } while (++i < words); + + return flags; +} + +uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc) +{ + intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS); + intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2); + uint32_t flags = PREDTEST_INIT; + uint64_t *d = vd, *g = vg, esz_mask; + intptr_t i, next; + + next = last_active_element(vd, words, esz) + (1 << esz); + esz_mask = pred_esz_masks[esz]; + + /* Similar to the pseudocode for pnext, but scaled by ESZ + so that we find the correct bit. */ + if (next < words * 64) { + uint64_t mask = -1; + + if (next & 63) { + mask = ~((1ull << (next & 63)) - 1); + next &= -64; + } + do { + uint64_t this_g = g[next / 64] & esz_mask & mask; + if (this_g != 0) { + next = (next & -64) + ctz64(this_g); + break; + } + next += 64; + mask = -1; + } while (next < words * 64); + } + + i = 0; + do { + uint64_t this_d = 0; + if (i == next / 64) { + this_d = 1ull << (next & 63); + } + d[i] = this_d; + flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags); + } while (++i < words); + + return flags; +} diff --git a/qemu/target/arm/translate-sve.c b/qemu/target/arm/translate-sve.c index c305fff7..e708c881 100644 --- a/qemu/target/arm/translate-sve.c +++ b/qemu/target/arm/translate-sve.c @@ -22,6 +22,7 @@ #include "exec/exec-all.h" #include "tcg-op.h" #include "tcg-op-gvec.h" +#include "tcg-gvec-desc.h" #include "arm_ldst.h" #include "translate.h" #include "internals.h" @@ -197,6 +198,12 @@ static void do_predtest(DisasContext *s, int dofs, int gofs, int words) tcg_temp_free_i32(tcg_ctx, t); } +/* For each element size, the bits within a predicate word that are active. */ +const uint64_t pred_esz_masks[4] = { + 0xffffffffffffffffull, 0x5555555555555555ull, + 0x1111111111111111ull, 0x0101010101010101ull +}; + /* *** SVE Logical - Unpredicated Group */ @@ -225,6 +232,75 @@ static bool trans_BIC_zzz(DisasContext *s, arg_rrr_esz *a, uint32_t insn) return do_vector3_z(s, tcg_gen_gvec_andc, 0, a->rd, a->rn, a->rm); } +/* + *** SVE Integer Arithmetic - Binary Predicated Group + */ + +static bool do_zpzz_ool(DisasContext *s, arg_rprr_esz *a, gen_helper_gvec_4 *fn) +{ + unsigned vsz = vec_full_reg_size(s); + if (fn == NULL) { + return false; + } + if (sve_access_check(s)) { + TCGContext *tcg_ctx = s->uc->tcg_ctx; + tcg_gen_gvec_4_ool(tcg_ctx, vec_full_reg_offset(s, a->rd), + vec_full_reg_offset(s, a->rn), + vec_full_reg_offset(s, a->rm), + pred_full_reg_offset(s, a->pg), + vsz, vsz, 0, fn); + } + return true; +} + +#define DO_ZPZZ(NAME, name) \ +static bool trans_##NAME##_zpzz(DisasContext *s, arg_rprr_esz *a, \ + uint32_t insn) \ +{ \ + static gen_helper_gvec_4 * const fns[4] = { \ + gen_helper_sve_##name##_zpzz_b, gen_helper_sve_##name##_zpzz_h, \ + gen_helper_sve_##name##_zpzz_s, gen_helper_sve_##name##_zpzz_d, \ + }; \ + return do_zpzz_ool(s, a, fns[a->esz]); \ +} + +DO_ZPZZ(AND, and) +DO_ZPZZ(EOR, eor) +DO_ZPZZ(ORR, orr) +DO_ZPZZ(BIC, bic) + +DO_ZPZZ(ADD, add) +DO_ZPZZ(SUB, sub) + +DO_ZPZZ(SMAX, smax) +DO_ZPZZ(UMAX, umax) +DO_ZPZZ(SMIN, smin) +DO_ZPZZ(UMIN, umin) +DO_ZPZZ(SABD, sabd) +DO_ZPZZ(UABD, uabd) + +DO_ZPZZ(MUL, mul) +DO_ZPZZ(SMULH, smulh) +DO_ZPZZ(UMULH, umulh) + +static bool trans_SDIV_zpzz(DisasContext *s, arg_rprr_esz *a, uint32_t insn) +{ + static gen_helper_gvec_4 * const fns[4] = { + NULL, NULL, gen_helper_sve_sdiv_zpzz_s, gen_helper_sve_sdiv_zpzz_d + }; + return do_zpzz_ool(s, a, fns[a->esz]); +} + +static bool trans_UDIV_zpzz(DisasContext *s, arg_rprr_esz *a, uint32_t insn) +{ + static gen_helper_gvec_4 * const fns[4] = { + NULL, NULL, gen_helper_sve_udiv_zpzz_s, gen_helper_sve_udiv_zpzz_d + }; + return do_zpzz_ool(s, a, fns[a->esz]); +} + +#undef DO_ZPZZ + /* *** SVE Predicate Logical Operations Group */ @@ -580,6 +656,210 @@ static bool trans_PTEST(DisasContext *s, arg_PTEST *a, uint32_t insn) return true; } +/* See the ARM pseudocode DecodePredCount. */ +static unsigned decode_pred_count(unsigned fullsz, int pattern, int esz) +{ + unsigned elements = fullsz >> esz; + unsigned bound; + + switch (pattern) { + case 0x0: /* POW2 */ + return pow2floor(elements); + case 0x1: /* VL1 */ + case 0x2: /* VL2 */ + case 0x3: /* VL3 */ + case 0x4: /* VL4 */ + case 0x5: /* VL5 */ + case 0x6: /* VL6 */ + case 0x7: /* VL7 */ + case 0x8: /* VL8 */ + bound = pattern; + break; + case 0x9: /* VL16 */ + case 0xa: /* VL32 */ + case 0xb: /* VL64 */ + case 0xc: /* VL128 */ + case 0xd: /* VL256 */ + bound = 16 << (pattern - 9); + break; + case 0x1d: /* MUL4 */ + return elements - elements % 4; + case 0x1e: /* MUL3 */ + return elements - elements % 3; + case 0x1f: /* ALL */ + return elements; + default: /* #uimm5 */ + return 0; + } + return elements >= bound ? bound : 0; +} + +/* This handles all of the predicate initialization instructions, + * PTRUE, PFALSE, SETFFR. For PFALSE, we will have set PAT == 32 + * so that decode_pred_count returns 0. For SETFFR, we will have + * set RD == 16 == FFR. + */ +static bool do_predset(DisasContext *s, int esz, int rd, int pat, bool setflag) +{ + if (!sve_access_check(s)) { + return true; + } + + TCGContext *tcg_ctx = s->uc->tcg_ctx; + unsigned fullsz = vec_full_reg_size(s); + unsigned ofs = pred_full_reg_offset(s, rd); + unsigned numelem, setsz, i; + uint64_t word, lastword; + TCGv_i64 t; + + numelem = decode_pred_count(fullsz, pat, esz); + + /* Determine what we must store into each bit, and how many. */ + if (numelem == 0) { + lastword = word = 0; + setsz = fullsz; + } else { + setsz = numelem << esz; + lastword = word = pred_esz_masks[esz]; + if (setsz % 64) { + lastword &= ~(-1ull << (setsz % 64)); + } + } + + t = tcg_temp_new_i64(tcg_ctx); + if (fullsz <= 64) { + tcg_gen_movi_i64(tcg_ctx, t, lastword); + tcg_gen_st_i64(tcg_ctx, t, tcg_ctx->cpu_env, ofs); + goto done; + } + + if (word == lastword) { + unsigned maxsz = size_for_gvec(fullsz / 8); + unsigned oprsz = size_for_gvec(setsz / 8); + + if (oprsz * 8 == setsz) { + tcg_gen_gvec_dup64i(tcg_ctx, ofs, oprsz, maxsz, word); + goto done; + } + if (oprsz * 8 == setsz + 8) { + tcg_gen_gvec_dup64i(tcg_ctx, ofs, oprsz, maxsz, word); + tcg_gen_movi_i64(tcg_ctx, t, 0); + tcg_gen_st_i64(tcg_ctx, t, tcg_ctx->cpu_env, ofs + oprsz - 8); + goto done; + } + } + + setsz /= 8; + fullsz /= 8; + + tcg_gen_movi_i64(tcg_ctx, t, word); + for (i = 0; i < setsz; i += 8) { + tcg_gen_st_i64(tcg_ctx, t, tcg_ctx->cpu_env, ofs + i); + } + if (lastword != word) { + tcg_gen_movi_i64(tcg_ctx, t, lastword); + tcg_gen_st_i64(tcg_ctx, t, tcg_ctx->cpu_env, ofs + i); + i += 8; + } + if (i < fullsz) { + tcg_gen_movi_i64(tcg_ctx, t, 0); + for (; i < fullsz; i += 8) { + tcg_gen_st_i64(tcg_ctx, t, tcg_ctx->cpu_env, ofs + i); + } + } + + done: + tcg_temp_free_i64(tcg_ctx, t); + + /* PTRUES */ + if (setflag) { + tcg_gen_movi_i32(tcg_ctx, tcg_ctx->cpu_NF, -(word != 0)); + tcg_gen_movi_i32(tcg_ctx, tcg_ctx->cpu_CF, word == 0); + tcg_gen_movi_i32(tcg_ctx, tcg_ctx->cpu_VF, 0); + tcg_gen_mov_i32(tcg_ctx, tcg_ctx->cpu_ZF, tcg_ctx->cpu_NF); + } + return true; +} + +static bool trans_PTRUE(DisasContext *s, arg_PTRUE *a, uint32_t insn) +{ + return do_predset(s, a->esz, a->rd, a->pat, a->s); +} + +static bool trans_SETFFR(DisasContext *s, arg_SETFFR *a, uint32_t insn) +{ + /* Note pat == 31 is #all, to set all elements. */ + return do_predset(s, 0, FFR_PRED_NUM, 31, false); +} + +static bool trans_PFALSE(DisasContext *s, arg_PFALSE *a, uint32_t insn) +{ + /* Note pat == 32 is #unimp, to set no elements. */ + return do_predset(s, 0, a->rd, 32, false); +} + +static bool trans_RDFFR_p(DisasContext *s, arg_RDFFR_p *a, uint32_t insn) +{ + /* The path through do_pppp_flags is complicated enough to want to avoid + * duplication. Frob the arguments into the form of a predicated AND. + */ + arg_rprr_s alt_a = { + .rd = a->rd, .pg = a->pg, .s = a->s, + .rn = FFR_PRED_NUM, .rm = FFR_PRED_NUM, + }; + return trans_AND_pppp(s, &alt_a, insn); +} + +static bool trans_RDFFR(DisasContext *s, arg_RDFFR *a, uint32_t insn) +{ + return do_mov_p(s, a->rd, FFR_PRED_NUM); +} + +static bool trans_WRFFR(DisasContext *s, arg_WRFFR *a, uint32_t insn) +{ + return do_mov_p(s, FFR_PRED_NUM, a->rn); +} + +static bool do_pfirst_pnext(DisasContext *s, arg_rr_esz *a, + void (*gen_fn)(TCGContext *, TCGv_i32, TCGv_ptr, + TCGv_ptr, TCGv_i32)) +{ + if (!sve_access_check(s)) { + return true; + } + + TCGContext *tcg_ctx = s->uc->tcg_ctx; + TCGv_ptr t_pd = tcg_temp_new_ptr(tcg_ctx); + TCGv_ptr t_pg = tcg_temp_new_ptr(tcg_ctx); + TCGv_i32 t; + unsigned desc; + + desc = DIV_ROUND_UP(pred_full_reg_size(s), 8); + desc = deposit32(desc, SIMD_DATA_SHIFT, 2, a->esz); + + tcg_gen_addi_ptr(tcg_ctx, t_pd, tcg_ctx->cpu_env, pred_full_reg_offset(s, a->rd)); + tcg_gen_addi_ptr(tcg_ctx, t_pg, tcg_ctx->cpu_env, pred_full_reg_offset(s, a->rn)); + t = tcg_const_i32(tcg_ctx, desc); + + gen_fn(tcg_ctx, t, t_pd, t_pg, t); + tcg_temp_free_ptr(tcg_ctx, t_pd); + tcg_temp_free_ptr(tcg_ctx, t_pg); + + do_pred_flags(s, t); + tcg_temp_free_i32(tcg_ctx, t); + return true; +} + +static bool trans_PFIRST(DisasContext *s, arg_rr_esz *a, uint32_t insn) +{ + return do_pfirst_pnext(s, a, gen_helper_sve_pfirst); +} + +static bool trans_PNEXT(DisasContext *s, arg_rr_esz *a, uint32_t insn) +{ + return do_pfirst_pnext(s, a, gen_helper_sve_pnext); +} + /* *** SVE Memory - 32-bit Gather and Unsized Contiguous Group */