mirror of
https://github.com/yuzu-emu/unicorn.git
synced 2025-02-02 10:51:06 +00:00
target/arm: Convert VFP fused multiply-add insns to decodetree
Convert the VFP fused multiply-add instructions (VFNMA, VFNMS, VFMA, VFMS) to decodetree. Note that in the old decode structure we implemented these so that they honoured the VFP vector stride/length. These instructions were introduced in VFPv4; in the v7A architecture they are UNPREDICTABLE if the vector stride or length is non-zero, and in v8A they must UNDEF in that case, like all VFP instructions. We choose to UNDEF always.

Backports commit d4893b01d23060845ee3855bc96626e16aad9ab5 from qemu.
This commit is contained in:
parent
321bcc822b
commit
0ebb6b8b90
|
@ -1500,3 +1500,126 @@ static bool trans_VDIV_dp(DisasContext *s, arg_VDIV_sp *a)
|
|||
{
|
||||
return do_vfp_3op_dp(s, gen_helper_vfp_divd, a->vd, a->vn, a->vm, false);
|
||||
}
|
||||
|
||||
static bool trans_VFM_sp(DisasContext *s, arg_VFM_sp *a)
{
    /*
     * Single-precision fused multiply-add family.  The decoded o1/o2
     * fields select which inputs are negated before the multiply-add:
     *
     *   VFMA  (o1 even, o2 clear) : fd = muladd( fd,  fn, fm)
     *   VFMS  (o1 even, o2 set)   : fd = muladd( fd, -fn, fm)
     *   VFNMA (o1 odd,  o2 clear) : fd = muladd(-fd,  fn, fm)
     *   VFNMS (o1 odd,  o2 set)   : fd = muladd(-fd, -fn, fm)
     *
     * The multiply and the add have to be a single floating point
     * operation, with no rounding step in between.  Negating the
     * inputs as separate operations beforehand is still correct: a
     * NaN supplied on a negated input is expected to come out with
     * its sign bit flipped.
     *
     * Returns false to make the instruction UNDEF, true once it has
     * been dealt with.
     */
    TCGContext *tcg_ctx = s->uc->tcg_ctx;
    const bool negate_fn = a->o2;               /* VFNMS, VFMS */
    const bool negate_fd = (a->o1 & 1) != 0;    /* VFNMA, VFNMS */
    TCGv_i32 reg_n, reg_m, reg_d;
    TCGv_ptr fp_status;

    /* These instructions only exist from VFPv4 onwards. */
    if (!arm_dc_feature(s, ARM_FEATURE_VFP4)) {
        return false;
    }

    /*
     * A non-zero vector length or stride is UNPREDICTABLE in v7A and
     * must UNDEF from v8A; we UNDEF in both cases.
     */
    if (s->vec_len || s->vec_stride) {
        return false;
    }

    /*
     * Nothing more to generate if the access check fails; the
     * instruction is still reported as handled.
     */
    if (!vfp_access_check(s)) {
        return true;
    }

    reg_n = tcg_temp_new_i32(tcg_ctx);
    reg_m = tcg_temp_new_i32(tcg_ctx);
    reg_d = tcg_temp_new_i32(tcg_ctx);

    /* Fetch the two source registers, negating fn when requested. */
    neon_load_reg32(s, reg_n, a->vn);
    neon_load_reg32(s, reg_m, a->vm);
    if (negate_fn) {
        gen_helper_vfp_negs(tcg_ctx, reg_n, reg_n);
    }

    /* The destination register is also an input; negate it if needed. */
    neon_load_reg32(s, reg_d, a->vd);
    if (negate_fd) {
        gen_helper_vfp_negs(tcg_ctx, reg_d, reg_d);
    }

    /* One fused operation; the result overwrites the fd temporary. */
    fp_status = get_fpstatus_ptr(s, 0);
    gen_helper_vfp_muladds(tcg_ctx, reg_d, reg_n, reg_m, reg_d, fp_status);
    neon_store_reg32(s, reg_d, a->vd);

    tcg_temp_free_ptr(tcg_ctx, fp_status);
    tcg_temp_free_i32(tcg_ctx, reg_n);
    tcg_temp_free_i32(tcg_ctx, reg_m);
    tcg_temp_free_i32(tcg_ctx, reg_d);

    return true;
}
|
||||
|
||||
/*
 * NOTE(review): the argument type is spelled arg_VFM_sp although this is
 * the double-precision handler - presumably both pattern names share one
 * generated argument struct; confirm against the decodetree output.
 */
static bool trans_VFM_dp(DisasContext *s, arg_VFM_sp *a)
{
    /*
     * Double-precision fused multiply-add family.  The decoded o1/o2
     * fields select which inputs are negated before the multiply-add:
     *
     *   VFMA  (o1 even, o2 clear) : fd = muladd( fd,  fn, fm)
     *   VFMS  (o1 even, o2 set)   : fd = muladd( fd, -fn, fm)
     *   VFNMA (o1 odd,  o2 clear) : fd = muladd(-fd,  fn, fm)
     *   VFNMS (o1 odd,  o2 set)   : fd = muladd(-fd, -fn, fm)
     *
     * The multiply and the add have to be a single floating point
     * operation, with no rounding step in between.  Negating the
     * inputs as separate operations beforehand is still correct: a
     * NaN supplied on a negated input is expected to come out with
     * its sign bit flipped.
     *
     * Returns false to make the instruction UNDEF, true once it has
     * been dealt with.
     */
    TCGContext *tcg_ctx = s->uc->tcg_ctx;
    const bool negate_fn = a->o2;               /* VFNMS, VFMS */
    const bool negate_fd = (a->o1 & 1) != 0;    /* VFNMA, VFNMS */
    TCGv_i64 reg_n, reg_m, reg_d;
    TCGv_ptr fp_status;

    /* These instructions only exist from VFPv4 onwards. */
    if (!arm_dc_feature(s, ARM_FEATURE_VFP4)) {
        return false;
    }

    /*
     * A non-zero vector length or stride is UNPREDICTABLE in v7A and
     * must UNDEF from v8A; we UNDEF in both cases.
     */
    if (s->vec_len || s->vec_stride) {
        return false;
    }

    /* Any reference to D16-D31 UNDEFs when those registers are absent. */
    if (((a->vd | a->vn | a->vm) & 0x10) && !dc_isar_feature(aa32_fp_d32, s)) {
        return false;
    }

    /*
     * Nothing more to generate if the access check fails; the
     * instruction is still reported as handled.
     */
    if (!vfp_access_check(s)) {
        return true;
    }

    reg_n = tcg_temp_new_i64(tcg_ctx);
    reg_m = tcg_temp_new_i64(tcg_ctx);
    reg_d = tcg_temp_new_i64(tcg_ctx);

    /* Fetch the two source registers, negating fn when requested. */
    neon_load_reg64(s, reg_n, a->vn);
    neon_load_reg64(s, reg_m, a->vm);
    if (negate_fn) {
        gen_helper_vfp_negd(tcg_ctx, reg_n, reg_n);
    }

    /* The destination register is also an input; negate it if needed. */
    neon_load_reg64(s, reg_d, a->vd);
    if (negate_fd) {
        gen_helper_vfp_negd(tcg_ctx, reg_d, reg_d);
    }

    /* One fused operation; the result overwrites the fd temporary. */
    fp_status = get_fpstatus_ptr(s, 0);
    gen_helper_vfp_muladdd(tcg_ctx, reg_d, reg_n, reg_m, reg_d, fp_status);
    neon_store_reg64(s, reg_d, a->vd);

    tcg_temp_free_ptr(tcg_ctx, fp_status);
    tcg_temp_free_i64(tcg_ctx, reg_n);
    tcg_temp_free_i64(tcg_ctx, reg_m);
    tcg_temp_free_i64(tcg_ctx, reg_d);

    return true;
}
|
||||
|
|
|
@ -3194,7 +3194,7 @@ static int disas_vfp_insn(DisasContext *s, uint32_t insn)
|
|||
rn = VFP_SREG_N(insn);
|
||||
|
||||
switch (op) {
|
||||
case 0 ... 8:
|
||||
case 0 ... 13:
|
||||
/* Already handled by decodetree */
|
||||
return 1;
|
||||
default:
|
||||
|
@ -3380,57 +3380,6 @@ static int disas_vfp_insn(DisasContext *s, uint32_t insn)
|
|||
for (;;) {
|
||||
/* Perform the calculation. */
|
||||
switch (op) {
|
||||
case 10: /* VFNMA : fd = muladd(-fd, fn, fm) */
|
||||
case 11: /* VFNMS : fd = muladd(-fd, -fn, fm) */
|
||||
case 12: /* VFMA : fd = muladd( fd, fn, fm) */
|
||||
case 13: /* VFMS : fd = muladd( fd, -fn, fm) */
|
||||
/* These are fused multiply-add, and must be done as one
|
||||
* floating point operation with no rounding between the
|
||||
* multiplication and addition steps.
|
||||
* NB that doing the negations here as separate steps is
|
||||
* correct : an input NaN should come out with its sign bit
|
||||
* flipped if it is a negated-input.
|
||||
*/
|
||||
if (!arm_dc_feature(s, ARM_FEATURE_VFP4)) {
|
||||
return 1;
|
||||
}
|
||||
if (dp) {
|
||||
TCGv_ptr fpst;
|
||||
TCGv_i64 frd;
|
||||
if (op & 1) {
|
||||
/* VFNMS, VFMS */
|
||||
gen_helper_vfp_negd(tcg_ctx, s->F0d, s->F0d);
|
||||
}
|
||||
frd = tcg_temp_new_i64(tcg_ctx);
|
||||
tcg_gen_ld_f64(tcg_ctx, frd, tcg_ctx->cpu_env, vfp_reg_offset(dp, rd));
|
||||
if (op & 2) {
|
||||
/* VFNMA, VFNMS */
|
||||
gen_helper_vfp_negd(tcg_ctx, frd, frd);
|
||||
}
|
||||
fpst = get_fpstatus_ptr(s, 0);
|
||||
gen_helper_vfp_muladdd(tcg_ctx, s->F0d, s->F0d,
|
||||
s->F1d, frd, fpst);
|
||||
tcg_temp_free_ptr(tcg_ctx, fpst);
|
||||
tcg_temp_free_i64(tcg_ctx, frd);
|
||||
} else {
|
||||
TCGv_ptr fpst;
|
||||
TCGv_i32 frd;
|
||||
if (op & 1) {
|
||||
/* VFNMS, VFMS */
|
||||
gen_helper_vfp_negs(tcg_ctx, s->F0s, s->F0s);
|
||||
}
|
||||
frd = tcg_temp_new_i32(tcg_ctx);
|
||||
tcg_gen_ld_f32(tcg_ctx, frd, tcg_ctx->cpu_env, vfp_reg_offset(dp, rd));
|
||||
if (op & 2) {
|
||||
gen_helper_vfp_negs(tcg_ctx, frd, frd);
|
||||
}
|
||||
fpst = get_fpstatus_ptr(s, 0);
|
||||
gen_helper_vfp_muladds(tcg_ctx, s->F0s, s->F0s,
|
||||
s->F1s, frd, fpst);
|
||||
tcg_temp_free_ptr(tcg_ctx, fpst);
|
||||
tcg_temp_free_i32(tcg_ctx, frd);
|
||||
}
|
||||
break;
|
||||
case 14: /* fconst */
|
||||
if (!arm_dc_feature(s, ARM_FEATURE_VFP3)) {
|
||||
return 1;
|
||||
|
|
|
@ -142,3 +142,12 @@ VDIV_sp ---- 1110 1.00 .... .... 1010 .0.0 .... \
|
|||
vm=%vm_sp vn=%vn_sp vd=%vd_sp
|
||||
VDIV_dp ---- 1110 1.00 .... .... 1011 .0.0 .... \
|
||||
vm=%vm_dp vn=%vn_dp vd=%vd_dp
|
||||
|
||||
VFM_sp ---- 1110 1.01 .... .... 1010 . o2:1 . 0 .... \
|
||||
vm=%vm_sp vn=%vn_sp vd=%vd_sp o1=1
|
||||
VFM_dp ---- 1110 1.01 .... .... 1011 . o2:1 . 0 .... \
|
||||
vm=%vm_dp vn=%vn_dp vd=%vd_dp o1=1
|
||||
VFM_sp ---- 1110 1.10 .... .... 1010 . o2:1 . 0 .... \
|
||||
vm=%vm_sp vn=%vn_sp vd=%vd_sp o1=2
|
||||
VFM_dp ---- 1110 1.10 .... .... 1011 . o2:1 . 0 .... \
|
||||
vm=%vm_dp vn=%vn_dp vd=%vd_dp o1=2
|
||||
|
|
Loading…
Reference in a new issue