From 82ffaab7de2d00cba91e9aab7d1d5011bbb70dc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Thu, 8 Mar 2018 18:06:25 -0500 Subject: [PATCH] arm/translate-a64: add FP16 x2 ops for simd_indexed A bunch of the vectorised bitwise operations just operate on larger chunks at a time. We can do the same for the new half-precision operations by introducing some TWOHALFOP helpers which work on each half of a pair of half-precision operations at once. Hopefully all this hoop jumping will get simpler once we have generically vectorised helpers here. Backports commit 6089030c7322d8f96b54fb9904e53b0f464bb8fe from qemu --- qemu/aarch64.h | 10 +++++++ qemu/aarch64eb.h | 10 +++++++ qemu/header_gen.py | 10 +++++++ qemu/target/arm/helper-a64.c | 46 ++++++++++++++++++++++++++++++++- qemu/target/arm/helper-a64.h | 10 +++++++ qemu/target/arm/translate-a64.c | 26 +++++++++++++++---- 6 files changed, 106 insertions(+), 6 deletions(-) diff --git a/qemu/aarch64.h b/qemu/aarch64.h index cae89c5b..4acbea3b 100644 --- a/qemu/aarch64.h +++ b/qemu/aarch64.h @@ -3722,18 +3722,28 @@ #define gen_a64_set_pc_im gen_a64_set_pc_im_aarch64 #define helper_advsimd_acge_f16 helper_advsimd_acge_f16_aarch64 #define helper_advsimd_acgt_f16 helper_advsimd_acgt_f16_aarch64 +#define helper_advsimd_add2h helper_advsimd_add2h_aarch64 #define helper_advsimd_addh helper_advsimd_addh_aarch64 #define helper_advsimd_ceq_f16 helper_advsimd_ceq_f16_aarch64 #define helper_advsimd_cge_f16 helper_advsimd_cge_f16_aarch64 #define helper_advsimd_cgt_f16 helper_advsimd_cgt_f16_aarch64 +#define helper_advsimd_div2h helper_advsimd_div2h_aarch64 #define helper_advsimd_divh helper_advsimd_divh_aarch64 +#define helper_advsimd_max2h helper_advsimd_max2h_aarch64 #define helper_advsimd_maxh helper_advsimd_maxh_aarch64 +#define helper_advsimd_maxnum2h helper_advsimd_maxnum2h_aarch64 #define helper_advsimd_maxnumh helper_advsimd_maxnumh_aarch64 +#define helper_advsimd_min2h helper_advsimd_min2h_aarch64 #define helper_advsimd_minh helper_advsimd_minh_aarch64 +#define helper_advsimd_minnum2h helper_advsimd_minnum2h_aarch64 #define helper_advsimd_minnumh helper_advsimd_minnumh_aarch64 #define helper_advsimd_muladdh helper_advsimd_muladdh_aarch64 +#define helper_advsimd_muladd2h helper_advsimd_muladd2h_aarch64 +#define helper_advsimd_mul2h helper_advsimd_mul2h_aarch64 #define helper_advsimd_mulh helper_advsimd_mulh_aarch64 +#define helper_advsimd_mulx2h helper_advsimd_mulx2h_aarch64 #define helper_advsimd_mulxh helper_advsimd_mulxh_aarch64 +#define helper_advsimd_sub2h helper_advsimd_sub2h_aarch64 #define helper_advsimd_subh helper_advsimd_subh_aarch64 #define helper_crc32_64 helper_crc32_64_aarch64 #define helper_crc32c_64 helper_crc32c_64_aarch64 diff --git a/qemu/aarch64eb.h b/qemu/aarch64eb.h index 3ef3693b..5ed09652 100644 --- a/qemu/aarch64eb.h +++ b/qemu/aarch64eb.h @@ -3722,18 +3722,28 @@ #define gen_a64_set_pc_im gen_a64_set_pc_im_aarch64eb #define helper_advsimd_acge_f16 helper_advsimd_acge_f16_aarch64eb #define helper_advsimd_acgt_f16 helper_advsimd_acgt_f16_aarch64eb +#define helper_advsimd_add2h helper_advsimd_add2h_aarch64eb #define helper_advsimd_addh helper_advsimd_addh_aarch64eb #define helper_advsimd_ceq_f16 helper_advsimd_ceq_f16_aarch64eb #define helper_advsimd_cge_f16 helper_advsimd_cge_f16_aarch64eb #define helper_advsimd_cgt_f16 helper_advsimd_cgt_f16_aarch64eb +#define helper_advsimd_div2h helper_advsimd_div2h_aarch64eb #define helper_advsimd_divh helper_advsimd_divh_aarch64eb +#define helper_advsimd_max2h helper_advsimd_max2h_aarch64eb #define helper_advsimd_maxh helper_advsimd_maxh_aarch64eb +#define helper_advsimd_maxnum2h helper_advsimd_maxnum2h_aarch64eb #define helper_advsimd_maxnumh helper_advsimd_maxnumh_aarch64eb +#define helper_advsimd_min2h helper_advsimd_min2h_aarch64eb #define helper_advsimd_minh helper_advsimd_minh_aarch64eb +#define helper_advsimd_minnum2h helper_advsimd_minnum2h_aarch64eb #define helper_advsimd_minnumh helper_advsimd_minnumh_aarch64eb #define helper_advsimd_muladdh helper_advsimd_muladdh_aarch64eb +#define helper_advsimd_muladd2h helper_advsimd_muladd2h_aarch64eb +#define helper_advsimd_mul2h helper_advsimd_mul2h_aarch64eb #define helper_advsimd_mulh helper_advsimd_mulh_aarch64eb +#define helper_advsimd_mulx2h helper_advsimd_mulx2h_aarch64eb #define helper_advsimd_mulxh helper_advsimd_mulxh_aarch64eb +#define helper_advsimd_sub2h helper_advsimd_sub2h_aarch64eb #define helper_advsimd_subh helper_advsimd_subh_aarch64eb #define helper_crc32_64 helper_crc32_64_aarch64eb #define helper_crc32c_64 helper_crc32c_64_aarch64eb diff --git a/qemu/header_gen.py b/qemu/header_gen.py index f7950d49..46869e18 100644 --- a/qemu/header_gen.py +++ b/qemu/header_gen.py @@ -3742,18 +3742,28 @@ aarch64_symbols = ( 'gen_a64_set_pc_im', 'helper_advsimd_acge_f16', 'helper_advsimd_acgt_f16', + 'helper_advsimd_add2h', 'helper_advsimd_addh', 'helper_advsimd_ceq_f16', 'helper_advsimd_cge_f16', 'helper_advsimd_cgt_f16', + 'helper_advsimd_div2h', 'helper_advsimd_divh', + 'helper_advsimd_max2h', 'helper_advsimd_maxh', + 'helper_advsimd_maxnum2h', 'helper_advsimd_maxnumh', + 'helper_advsimd_min2h', 'helper_advsimd_minh', + 'helper_advsimd_minnum2h', 'helper_advsimd_minnumh', 'helper_advsimd_muladdh', + 'helper_advsimd_muladd2h', + 'helper_advsimd_mul2h', 'helper_advsimd_mulh', + 'helper_advsimd_mulx2h', 'helper_advsimd_mulxh', + 'helper_advsimd_sub2h', 'helper_advsimd_subh', 'helper_crc32_64', 'helper_crc32c_64', diff --git a/qemu/target/arm/helper-a64.c b/qemu/target/arm/helper-a64.c index b566473e..1754683c 100644 --- a/qemu/target/arm/helper-a64.c +++ b/qemu/target/arm/helper-a64.c @@ -675,8 +675,32 @@ ADVSIMD_HALFOP(max) ADVSIMD_HALFOP(minnum) ADVSIMD_HALFOP(maxnum) +#define ADVSIMD_TWOHALFOP(name) \ +uint32_t ADVSIMD_HELPER(name, 2h)(uint32_t two_a, uint32_t two_b, void *fpstp) \ +{ \ + float16 a1, a2, b1, b2; \ + uint32_t r1, r2; \ + float_status *fpst = fpstp; \ + a1 = extract32(two_a, 0, 16); \ + a2 = extract32(two_a, 16, 16); \ + b1 = extract32(two_b, 0, 16); \ + b2 = extract32(two_b, 16, 16); \ + r1 = float16_ ## name(a1, b1, fpst); \ + r2 = float16_ ## name(a2, b2, fpst); \ + return deposit32(r1, 16, 16, r2); \ +} + +ADVSIMD_TWOHALFOP(add) +ADVSIMD_TWOHALFOP(sub) +ADVSIMD_TWOHALFOP(mul) +ADVSIMD_TWOHALFOP(div) +ADVSIMD_TWOHALFOP(min) +ADVSIMD_TWOHALFOP(max) +ADVSIMD_TWOHALFOP(minnum) +ADVSIMD_TWOHALFOP(maxnum) + /* Data processing - scalar floating-point and advanced SIMD */ -float16 HELPER(advsimd_mulxh)(float16 a, float16 b, void *fpstp) +static float16 float16_mulx(float16 a, float16 b, void *fpstp) { float_status *fpst = fpstp; @@ -692,6 +716,9 @@ float16 HELPER(advsimd_mulxh)(float16 a, float16 b, void *fpstp) return float16_mul(a, b, fpst); } +ADVSIMD_HALFOP(mulx) +ADVSIMD_TWOHALFOP(mulx) + /* fused multiply-accumulate */ float16 HELPER(advsimd_muladdh)(float16 a, float16 b, float16 c, void *fpstp) { @@ -699,6 +726,23 @@ float16 HELPER(advsimd_muladdh)(float16 a, float16 b, float16 c, void *fpstp) return float16_muladd(a, b, c, 0, fpst); } +uint32_t HELPER(advsimd_muladd2h)(uint32_t two_a, uint32_t two_b, + uint32_t two_c, void *fpstp) +{ + float_status *fpst = fpstp; + float16 a1, a2, b1, b2, c1, c2; + uint32_t r1, r2; + a1 = extract32(two_a, 0, 16); + a2 = extract32(two_a, 16, 16); + b1 = extract32(two_b, 0, 16); + b2 = extract32(two_b, 16, 16); + c1 = extract32(two_c, 0, 16); + c2 = extract32(two_c, 16, 16); + r1 = float16_muladd(a1, b1, c1, 0, fpst); + r2 = float16_muladd(a2, b2, c2, 0, fpst); + return deposit32(r1, 16, 16, r2); +} + /* * Floating point comparisons produce an integer result. Softfloat * routines return float_relation types which we convert to the 0/-1 diff --git a/qemu/target/arm/helper-a64.h b/qemu/target/arm/helper-a64.h index 1ca2243c..8d3ea6b0 100644 --- a/qemu/target/arm/helper-a64.h +++ b/qemu/target/arm/helper-a64.h @@ -61,3 +61,13 @@ DEF_HELPER_3(advsimd_acge_f16, i32, f16, f16, ptr) DEF_HELPER_3(advsimd_acgt_f16, i32, f16, f16, ptr) DEF_HELPER_3(advsimd_mulxh, f16, f16, f16, ptr) DEF_HELPER_4(advsimd_muladdh, f16, f16, f16, f16, ptr) +DEF_HELPER_3(advsimd_add2h, i32, i32, i32, ptr) +DEF_HELPER_3(advsimd_sub2h, i32, i32, i32, ptr) +DEF_HELPER_3(advsimd_mul2h, i32, i32, i32, ptr) +DEF_HELPER_3(advsimd_div2h, i32, i32, i32, ptr) +DEF_HELPER_3(advsimd_max2h, i32, i32, i32, ptr) +DEF_HELPER_3(advsimd_min2h, i32, i32, i32, ptr) +DEF_HELPER_3(advsimd_maxnum2h, i32, i32, i32, ptr) +DEF_HELPER_3(advsimd_minnum2h, i32, i32, i32, ptr) +DEF_HELPER_3(advsimd_mulx2h, i32, i32, i32, ptr) +DEF_HELPER_4(advsimd_muladd2h, i32, i32, i32, i32, ptr) diff --git a/qemu/target/arm/translate-a64.c b/qemu/target/arm/translate-a64.c index 31f6ed73..9ce503b6 100644 --- a/qemu/target/arm/translate-a64.c +++ b/qemu/target/arm/translate-a64.c @@ -11574,8 +11574,13 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn) * multiply-add */ tcg_gen_xori_i32(tcg_ctx, tcg_op, tcg_op, 0x80008000); } - gen_helper_advsimd_muladdh(tcg_ctx, tcg_res, tcg_op, tcg_idx, - tcg_res, fpst); + if (is_scalar) { + gen_helper_advsimd_muladdh(tcg_ctx, tcg_res, tcg_op, tcg_idx, + tcg_res, fpst); + } else { + gen_helper_advsimd_muladd2h(tcg_ctx, tcg_res, tcg_op, tcg_idx, + tcg_res, fpst); + } break; case 2: if (opcode == 0x5) { @@ -11594,10 +11599,21 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn) switch (size) { case 1: if (u) { - gen_helper_advsimd_mulxh(tcg_ctx, tcg_res, tcg_op, tcg_idx, - fpst); + if (is_scalar) { + gen_helper_advsimd_mulxh(tcg_ctx, tcg_res, tcg_op, + tcg_idx, fpst); + } else { + gen_helper_advsimd_mulx2h(tcg_ctx, tcg_res, tcg_op, + tcg_idx, fpst); + } } else { - g_assert_not_reached(); + if (is_scalar) { + gen_helper_advsimd_mulh(tcg_ctx, tcg_res, tcg_op, + tcg_idx, fpst); + } else { + gen_helper_advsimd_mul2h(tcg_ctx, tcg_res, tcg_op, + tcg_idx, fpst); + } } break; case 2: