target/arm: Improve do_prewiden_3d

We can use proper widening loads to extend 32-bit inputs,
and skip the "widenfn" step.

Backports 8aab18a2c5209e4e48998a61fbc2d89f374331ed
This commit is contained in:
Richard Henderson 2021-03-02 12:54:28 -05:00 committed by Lioncash
parent 9263117d47
commit d473f66177
2 changed files with 43 additions and 29 deletions

View file

@@ -1816,12 +1816,11 @@ static bool trans_Vimm_1r(DisasContext *s, arg_1reg_imm *a)
static bool do_prewiden_3d(DisasContext *s, arg_3diff *a, static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
NeonGenWidenFn *widenfn, NeonGenWidenFn *widenfn,
NeonGenTwo64OpFn *opfn, NeonGenTwo64OpFn *opfn,
bool src1_wide) int src1_mop, int src2_mop)
{ {
/* 3-regs different lengths, prewidening case (VADDL/VSUBL/VADDW/VSUBW) */ /* 3-regs different lengths, prewidening case (VADDL/VSUBL/VADDW/VSUBW) */
TCGContext *tcg_ctx = s->uc->tcg_ctx; TCGContext *tcg_ctx = s->uc->tcg_ctx;
TCGv_i64 rn0_64, rn1_64, rm_64; TCGv_i64 rn0_64, rn1_64, rm_64;
TCGv_i32 rm;
if (!arm_dc_feature(s, ARM_FEATURE_NEON)) { if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
return false; return false;
@@ -1833,12 +1832,12 @@ static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
return false; return false;
} }
if (!widenfn || !opfn) { if (!opfn) {
/* size == 3 case, which is an entirely different insn group */ /* size == 3 case, which is an entirely different insn group */
return false; return false;
} }
if ((a->vd & 1) || (src1_wide && (a->vn & 1))) { if ((a->vd & 1) || (src1_mop == MO_Q && (a->vn & 1))) {
return false; return false;
} }
@@ -1850,40 +1849,48 @@ static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
rn1_64 = tcg_temp_new_i64(tcg_ctx); rn1_64 = tcg_temp_new_i64(tcg_ctx);
rm_64 = tcg_temp_new_i64(tcg_ctx); rm_64 = tcg_temp_new_i64(tcg_ctx);
if (src1_wide) { if (src1_mop >= 0) {
read_neon_element64(s, rn0_64, a->vn, 0, MO_64); read_neon_element64(s, rn0_64, a->vn, 0, src1_mop);
} else { } else {
TCGv_i32 tmp = tcg_temp_new_i32(tcg_ctx); TCGv_i32 tmp = tcg_temp_new_i32(tcg_ctx);
read_neon_element32(s, tmp, a->vn, 0, MO_32); read_neon_element32(s, tmp, a->vn, 0, MO_32);
widenfn(tcg_ctx, rn0_64, tmp); widenfn(tcg_ctx, rn0_64, tmp);
tcg_temp_free_i32(tcg_ctx, tmp); tcg_temp_free_i32(tcg_ctx, tmp);
} }
rm = tcg_temp_new_i32(tcg_ctx); if (src2_mop >= 0) {
read_neon_element32(s, rm, a->vm, 0, MO_32); read_neon_element64(s, rm_64, a->vm, 0, src2_mop);
} else {
TCGv_i32 tmp = tcg_temp_new_i32(tcg_ctx);
read_neon_element32(s, tmp, a->vm, 0, MO_32);
widenfn(tcg_ctx, rm_64, tmp);
tcg_temp_free_i32(tcg_ctx, tmp);
}
widenfn(tcg_ctx, rm_64, rm);
tcg_temp_free_i32(tcg_ctx, rm);
opfn(tcg_ctx, rn0_64, rn0_64, rm_64); opfn(tcg_ctx, rn0_64, rn0_64, rm_64);
/* /*
* Load second pass inputs before storing the first pass result, to * Load second pass inputs before storing the first pass result, to
* avoid incorrect results if a narrow input overlaps with the result. * avoid incorrect results if a narrow input overlaps with the result.
*/ */
if (src1_wide) { if (src1_mop >= 0) {
read_neon_element64(s, rn1_64, a->vn, 1, MO_64); read_neon_element64(s, rn1_64, a->vn, 1, src1_mop);
} else { } else {
TCGv_i32 tmp = tcg_temp_new_i32(tcg_ctx); TCGv_i32 tmp = tcg_temp_new_i32(tcg_ctx);
read_neon_element32(s, tmp, a->vn, 1, MO_32); read_neon_element32(s, tmp, a->vn, 1, MO_32);
widenfn(tcg_ctx, rn1_64, tmp); widenfn(tcg_ctx, rn1_64, tmp);
tcg_temp_free_i32(tcg_ctx, tmp); tcg_temp_free_i32(tcg_ctx, tmp);
} }
rm = tcg_temp_new_i32(tcg_ctx); if (src2_mop >= 0) {
read_neon_element32(s, rm, a->vm, 1, MO_32); read_neon_element64(s, rm_64, a->vm, 1, src2_mop);
} else {
TCGv_i32 tmp = tcg_temp_new_i32(tcg_ctx);
read_neon_element32(s, tmp, a->vm, 1, MO_32);
widenfn(tcg_ctx, rm_64, tmp);
tcg_temp_free_i32(tcg_ctx, tmp);
}
write_neon_element64(s, rn0_64, a->vd, 0, MO_64); write_neon_element64(s, rn0_64, a->vd, 0, MO_64);
widenfn(tcg_ctx, rm_64, rm);
tcg_temp_free_i32(tcg_ctx, rm);
opfn(tcg_ctx, rn1_64, rn1_64, rm_64); opfn(tcg_ctx, rn1_64, rn1_64, rm_64);
write_neon_element64(s, rn1_64, a->vd, 1, MO_64); write_neon_element64(s, rn1_64, a->vd, 1, MO_64);
@@ -1894,14 +1901,13 @@ static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
return true; return true;
} }
#define DO_PREWIDEN(INSN, S, EXT, OP, SRC1WIDE) \ #define DO_PREWIDEN(INSN, S, OP, SRC1WIDE, SIGN) \
static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a) \ static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a) \
{ \ { \
static NeonGenWidenFn * const widenfn[] = { \ static NeonGenWidenFn * const widenfn[] = { \
gen_helper_neon_widen_##S##8, \ gen_helper_neon_widen_##S##8, \
gen_helper_neon_widen_##S##16, \ gen_helper_neon_widen_##S##16, \
tcg_gen_##EXT##_i32_i64, \ NULL, NULL, \
NULL, \
}; \ }; \
static NeonGenTwo64OpFn * const addfn[] = { \ static NeonGenTwo64OpFn * const addfn[] = { \
gen_helper_neon_##OP##l_u16, \ gen_helper_neon_##OP##l_u16, \
@@ -1909,18 +1915,20 @@ static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
tcg_gen_##OP##_i64, \ tcg_gen_##OP##_i64, \
NULL, \ NULL, \
}; \ }; \
return do_prewiden_3d(s, a, widenfn[a->size], \ int narrow_mop = a->size == MO_32 ? MO_32 | SIGN : -1; \
addfn[a->size], SRC1WIDE); \ return do_prewiden_3d(s, a, widenfn[a->size], addfn[a->size], \
SRC1WIDE ? MO_Q : narrow_mop, \
narrow_mop); \
} }
DO_PREWIDEN(VADDL_S, s, ext, add, false) DO_PREWIDEN(VADDL_S, s, add, false, MO_SIGN)
DO_PREWIDEN(VADDL_U, u, extu, add, false) DO_PREWIDEN(VADDL_U, u, add, false, 0)
DO_PREWIDEN(VSUBL_S, s, ext, sub, false) DO_PREWIDEN(VSUBL_S, s, sub, false, MO_SIGN)
DO_PREWIDEN(VSUBL_U, u, extu, sub, false) DO_PREWIDEN(VSUBL_U, u, sub, false, 0)
DO_PREWIDEN(VADDW_S, s, ext, add, true) DO_PREWIDEN(VADDW_S, s, add, true, MO_SIGN)
DO_PREWIDEN(VADDW_U, u, extu, add, true) DO_PREWIDEN(VADDW_U, u, add, true, 0)
DO_PREWIDEN(VSUBW_S, s, ext, sub, true) DO_PREWIDEN(VSUBW_S, s, sub, true, MO_SIGN)
DO_PREWIDEN(VSUBW_U, u, extu, sub, true) DO_PREWIDEN(VSUBW_U, u, sub, true, 0)
static bool do_narrow_3d(DisasContext *s, arg_3diff *a, static bool do_narrow_3d(DisasContext *s, arg_3diff *a,
NeonGenTwo64OpFn *opfn, NeonGenNarrowFn *narrowfn) NeonGenTwo64OpFn *opfn, NeonGenNarrowFn *narrowfn)

View file

@@ -1233,6 +1233,12 @@ static void read_neon_element64(DisasContext *s, TCGv_i64 dest, int reg, int ele
long off = neon_element_offset(reg, ele, memop); long off = neon_element_offset(reg, ele, memop);
switch (memop) { switch (memop) {
case MO_SL:
tcg_gen_ld32s_i64(tcg_ctx, dest, tcg_ctx->cpu_env, off);
break;
case MO_UL:
tcg_gen_ld32u_i64(tcg_ctx, dest, tcg_ctx->cpu_env, off);
break;
case MO_Q: case MO_Q:
tcg_gen_ld_i64(tcg_ctx, dest, tcg_ctx->cpu_env, off); tcg_gen_ld_i64(tcg_ctx, dest, tcg_ctx->cpu_env, off);
break; break;