mirror of
https://github.com/yuzu-emu/unicorn.git
synced 2024-12-23 01:55:33 +00:00
arm/translate-a64: implement half-precision F(MIN|MAX)(V|NMV)
This implements the half-precision variants of the across-vector reduction operations. This involves a refactor of the reduction code that more closely matches the ARM ARM order (and handles 8-element reductions). Backports commit 807cdd504283c11addcd7ea95ba594bbddc86fe4 from qemu.
This commit is contained in:
parent
27d8d01566
commit
af75074fe7
|
@ -3720,6 +3720,10 @@
|
|||
#define arm_set_cpu_off arm_set_cpu_off_aarch64
|
||||
#define arm_set_cpu_on arm_set_cpu_on_aarch64
|
||||
#define gen_a64_set_pc_im gen_a64_set_pc_im_aarch64
|
||||
#define helper_advsimd_maxh helper_advsimd_maxh_aarch64
|
||||
#define helper_advsimd_maxnumh helper_advsimd_maxnumh_aarch64
|
||||
#define helper_advsimd_minh helper_advsimd_minh_aarch64
|
||||
#define helper_advsimd_minnumh helper_advsimd_minnumh_aarch64
|
||||
#define helper_crc32_64 helper_crc32_64_aarch64
|
||||
#define helper_crc32c_64 helper_crc32c_64_aarch64
|
||||
#define helper_fcvtx_f64_to_f32 helper_fcvtx_f64_to_f32_aarch64
|
||||
|
|
|
@ -3720,6 +3720,10 @@
|
|||
#define arm_set_cpu_off arm_set_cpu_off_aarch64eb
|
||||
#define arm_set_cpu_on arm_set_cpu_on_aarch64eb
|
||||
#define gen_a64_set_pc_im gen_a64_set_pc_im_aarch64eb
|
||||
#define helper_advsimd_maxh helper_advsimd_maxh_aarch64eb
|
||||
#define helper_advsimd_maxnumh helper_advsimd_maxnumh_aarch64eb
|
||||
#define helper_advsimd_minh helper_advsimd_minh_aarch64eb
|
||||
#define helper_advsimd_minnumh helper_advsimd_minnumh_aarch64eb
|
||||
#define helper_crc32_64 helper_crc32_64_aarch64eb
|
||||
#define helper_crc32c_64 helper_crc32c_64_aarch64eb
|
||||
#define helper_fcvtx_f64_to_f32 helper_fcvtx_f64_to_f32_aarch64eb
|
||||
|
|
|
@ -3740,6 +3740,10 @@ aarch64_symbols = (
|
|||
'arm_set_cpu_off',
|
||||
'arm_set_cpu_on',
|
||||
'gen_a64_set_pc_im',
|
||||
'helper_advsimd_maxh',
|
||||
'helper_advsimd_maxnumh',
|
||||
'helper_advsimd_minh',
|
||||
'helper_advsimd_minnumh',
|
||||
'helper_crc32_64',
|
||||
'helper_crc32c_64',
|
||||
'helper_fcvtx_f64_to_f32',
|
||||
|
|
|
@ -619,3 +619,20 @@ uint64_t HELPER(paired_cmpxchg64_be)(CPUARMState *env, uint64_t addr,
|
|||
return !success;
|
||||
}
|
||||
|
||||
/*
|
||||
* AdvSIMD half-precision
|
||||
*/
|
||||
|
||||
#define ADVSIMD_HELPER(name, suffix) HELPER(glue(glue(advsimd_, name), suffix))
|
||||
|
||||
#define ADVSIMD_HALFOP(name) \
|
||||
float16 ADVSIMD_HELPER(name, h)(float16 a, float16 b, void *fpstp) \
|
||||
{ \
|
||||
float_status *fpst = fpstp; \
|
||||
return float16_ ## name(a, b, fpst); \
|
||||
}
|
||||
|
||||
ADVSIMD_HALFOP(min)
|
||||
ADVSIMD_HALFOP(max)
|
||||
ADVSIMD_HALFOP(minnum)
|
||||
ADVSIMD_HALFOP(maxnum)
|
||||
|
|
|
@ -44,3 +44,8 @@ DEF_HELPER_FLAGS_3(crc32_64, TCG_CALL_NO_RWG_SE, i64, i64, i64, i32)
|
|||
DEF_HELPER_FLAGS_3(crc32c_64, TCG_CALL_NO_RWG_SE, i64, i64, i64, i32)
|
||||
DEF_HELPER_FLAGS_4(paired_cmpxchg64_le, TCG_CALL_NO_WG, i64, env, i64, i64, i64)
|
||||
DEF_HELPER_FLAGS_4(paired_cmpxchg64_be, TCG_CALL_NO_WG, i64, env, i64, i64, i64)
|
||||
DEF_HELPER_FLAGS_3(advsimd_maxh, TCG_CALL_NO_RWG, f16, f16, f16, ptr)
|
||||
DEF_HELPER_FLAGS_3(advsimd_minh, TCG_CALL_NO_RWG, f16, f16, f16, ptr)
|
||||
DEF_HELPER_FLAGS_3(advsimd_maxnumh, TCG_CALL_NO_RWG, f16, f16, f16, ptr)
|
||||
DEF_HELPER_FLAGS_3(advsimd_minnumh, TCG_CALL_NO_RWG, f16, f16, f16, ptr)
|
||||
|
||||
|
|
|
@ -5849,27 +5849,77 @@ static void disas_simd_zip_trn(DisasContext *s, uint32_t insn)
|
|||
tcg_temp_free_i64(tcg_ctx, tcg_resh);
|
||||
}
|
||||
|
||||
static void do_minmaxop(DisasContext *s, TCGv_i32 tcg_elt1, TCGv_i32 tcg_elt2,
|
||||
int opc, bool is_min, TCGv_ptr fpst)
|
||||
/*
|
||||
* do_reduction_op helper
|
||||
*
|
||||
* This mirrors the Reduce() pseudocode in the ARM ARM. It is
|
||||
* important for correct NaN propagation that we do these
|
||||
* operations in exactly the order specified by the pseudocode.
|
||||
*
|
||||
* This is a recursive function, TCG temps should be freed by the
|
||||
* calling function once it is done with the values.
|
||||
*/
|
||||
static TCGv_i32 do_reduction_op(DisasContext *s, int fpopcode, int rn,
|
||||
int esize, int size, int vmap, TCGv_ptr fpst)
|
||||
{
|
||||
TCGContext *tcg_ctx = s->uc->tcg_ctx;
|
||||
/* Helper function for disas_simd_across_lanes: do a single precision
|
||||
* min/max operation on the specified two inputs,
|
||||
* and return the result in tcg_elt1.
|
||||
*/
|
||||
if (opc == 0xc) {
|
||||
if (is_min) {
|
||||
gen_helper_vfp_minnums(tcg_ctx, tcg_elt1, tcg_elt1, tcg_elt2, fpst);
|
||||
} else {
|
||||
gen_helper_vfp_maxnums(tcg_ctx, tcg_elt1, tcg_elt1, tcg_elt2, fpst);
|
||||
}
|
||||
|
||||
if (esize == size) {
|
||||
int element;
|
||||
TCGMemOp msize = esize == 16 ? MO_16 : MO_32;
|
||||
TCGv_i32 tcg_elem;
|
||||
|
||||
/* We should have one register left here */
|
||||
assert(ctpop8(vmap) == 1);
|
||||
element = ctz32(vmap);
|
||||
assert(element < 8);
|
||||
|
||||
tcg_elem = tcg_temp_new_i32(tcg_ctx);
|
||||
read_vec_element_i32(s, tcg_elem, rn, element, msize);
|
||||
return tcg_elem;
|
||||
} else {
|
||||
assert(opc == 0xf);
|
||||
if (is_min) {
|
||||
gen_helper_vfp_mins(tcg_ctx, tcg_elt1, tcg_elt1, tcg_elt2, fpst);
|
||||
} else {
|
||||
gen_helper_vfp_maxs(tcg_ctx, tcg_elt1, tcg_elt1, tcg_elt2, fpst);
|
||||
int bits = size / 2;
|
||||
int shift = ctpop8(vmap) / 2;
|
||||
int vmap_lo = (vmap >> shift) & vmap;
|
||||
int vmap_hi = (vmap & ~vmap_lo);
|
||||
TCGv_i32 tcg_hi, tcg_lo, tcg_res;
|
||||
|
||||
tcg_hi = do_reduction_op(s, fpopcode, rn, esize, bits, vmap_hi, fpst);
|
||||
tcg_lo = do_reduction_op(s, fpopcode, rn, esize, bits, vmap_lo, fpst);
|
||||
tcg_res = tcg_temp_new_i32(tcg_ctx);
|
||||
|
||||
switch (fpopcode) {
|
||||
case 0x0c: /* fmaxnmv half-precision */
|
||||
gen_helper_advsimd_maxnumh(tcg_ctx, tcg_res, tcg_lo, tcg_hi, fpst);
|
||||
break;
|
||||
case 0x0f: /* fmaxv half-precision */
|
||||
gen_helper_advsimd_maxh(tcg_ctx, tcg_res, tcg_lo, tcg_hi, fpst);
|
||||
break;
|
||||
case 0x1c: /* fminnmv half-precision */
|
||||
gen_helper_advsimd_minnumh(tcg_ctx, tcg_res, tcg_lo, tcg_hi, fpst);
|
||||
break;
|
||||
case 0x1f: /* fminv half-precision */
|
||||
gen_helper_advsimd_minh(tcg_ctx, tcg_res, tcg_lo, tcg_hi, fpst);
|
||||
break;
|
||||
case 0x2c: /* fmaxnmv */
|
||||
gen_helper_vfp_maxnums(tcg_ctx, tcg_res, tcg_lo, tcg_hi, fpst);
|
||||
break;
|
||||
case 0x2f: /* fmaxv */
|
||||
gen_helper_vfp_maxs(tcg_ctx, tcg_res, tcg_lo, tcg_hi, fpst);
|
||||
break;
|
||||
case 0x3c: /* fminnmv */
|
||||
gen_helper_vfp_minnums(tcg_ctx, tcg_res, tcg_lo, tcg_hi, fpst);
|
||||
break;
|
||||
case 0x3f: /* fminv */
|
||||
gen_helper_vfp_mins(tcg_ctx, tcg_res, tcg_lo, tcg_hi, fpst);
|
||||
break;
|
||||
default:
|
||||
g_assert_not_reached();
|
||||
}
|
||||
|
||||
tcg_temp_free_i32(tcg_ctx, tcg_hi);
|
||||
tcg_temp_free_i32(tcg_ctx, tcg_lo);
|
||||
return tcg_res;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -5912,16 +5962,21 @@ static void disas_simd_across_lanes(DisasContext *s, uint32_t insn)
|
|||
break;
|
||||
case 0xc: /* FMAXNMV, FMINNMV */
|
||||
case 0xf: /* FMAXV, FMINV */
|
||||
if (!is_u || !is_q || extract32(size, 0, 1)) {
|
||||
unallocated_encoding(s);
|
||||
return;
|
||||
}
|
||||
/* Bit 1 of size field encodes min vs max, and actual size is always
|
||||
* 32 bits: adjust the size variable so following code can rely on it
|
||||
/* Bit 1 of size field encodes min vs max and the actual size
|
||||
* depends on the encoding of the U bit. If not set (and FP16
|
||||
* enabled) then we do half-precision float instead of single
|
||||
* precision.
|
||||
*/
|
||||
is_min = extract32(size, 1, 1);
|
||||
is_fp = true;
|
||||
size = 2;
|
||||
if (!is_u && arm_dc_feature(s, ARM_FEATURE_V8_FP16)) {
|
||||
size = 1;
|
||||
} else if (!is_u || !is_q || extract32(size, 0, 1)) {
|
||||
unallocated_encoding(s);
|
||||
return;
|
||||
} else {
|
||||
size = 2;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
unallocated_encoding(s);
|
||||
|
@ -5978,38 +6033,18 @@ static void disas_simd_across_lanes(DisasContext *s, uint32_t insn)
|
|||
|
||||
}
|
||||
} else {
|
||||
/* Floating point ops which work on 32 bit (single) intermediates.
|
||||
/* Floating point vector reduction ops which work across 32
|
||||
* bit (single) or 16 bit (half-precision) intermediates.
|
||||
* Note that correct NaN propagation requires that we do these
|
||||
* operations in exactly the order specified by the pseudocode.
|
||||
*/
|
||||
TCGv_i32 tcg_elt1 = tcg_temp_new_i32(tcg_ctx);
|
||||
TCGv_i32 tcg_elt2 = tcg_temp_new_i32(tcg_ctx);
|
||||
TCGv_i32 tcg_elt3 = tcg_temp_new_i32(tcg_ctx);
|
||||
TCGv_ptr fpst = get_fpstatus_ptr(tcg_ctx, false);
|
||||
|
||||
assert(esize == 32);
|
||||
assert(elements == 4);
|
||||
|
||||
read_vec_element(s, tcg_elt, rn, 0, MO_32);
|
||||
tcg_gen_extrl_i64_i32(tcg_ctx, tcg_elt1, tcg_elt);
|
||||
read_vec_element(s, tcg_elt, rn, 1, MO_32);
|
||||
tcg_gen_extrl_i64_i32(tcg_ctx, tcg_elt2, tcg_elt);
|
||||
|
||||
do_minmaxop(s, tcg_elt1, tcg_elt2, opcode, is_min, fpst);
|
||||
|
||||
read_vec_element(s, tcg_elt, rn, 2, MO_32);
|
||||
tcg_gen_extrl_i64_i32(tcg_ctx, tcg_elt2, tcg_elt);
|
||||
read_vec_element(s, tcg_elt, rn, 3, MO_32);
|
||||
tcg_gen_extrl_i64_i32(tcg_ctx, tcg_elt3, tcg_elt);
|
||||
|
||||
do_minmaxop(s, tcg_elt2, tcg_elt3, opcode, is_min, fpst);
|
||||
|
||||
do_minmaxop(s, tcg_elt1, tcg_elt2, opcode, is_min, fpst);
|
||||
|
||||
tcg_gen_extu_i32_i64(tcg_ctx, tcg_res, tcg_elt1);
|
||||
tcg_temp_free_i32(tcg_ctx, tcg_elt1);
|
||||
tcg_temp_free_i32(tcg_ctx, tcg_elt2);
|
||||
tcg_temp_free_i32(tcg_ctx, tcg_elt3);
|
||||
TCGv_ptr fpst = get_fpstatus_ptr(tcg_ctx, size == MO_16);
|
||||
int fpopcode = opcode | is_min << 4 | is_u << 5;
|
||||
int vmap = (1 << elements) - 1;
|
||||
TCGv_i32 tcg_res32 = do_reduction_op(s, fpopcode, rn, esize,
|
||||
(is_q ? 128 : 64), vmap, fpst);
|
||||
tcg_gen_extu_i32_i64(tcg_ctx, tcg_res, tcg_res32);
|
||||
tcg_temp_free_i32(tcg_ctx, tcg_res32);
|
||||
tcg_temp_free_ptr(tcg_ctx, fpst);
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue