1
0
Fork 0
mirror of https://github.com/yuzu-emu/unicorn.git synced 2025-04-01 23:07:03 +00:00

fpu/softfloat: re-factor muladd

We can now add float16_muladd and use the common decompose and
canonicalize functions to have a single implementation for
float16/32/64 muladd functions.

Backports commit d446830a3aac33e7221e361dad3ab1e1892646cb from qemu
This commit is contained in:
Alex Bennée 2018-03-08 10:55:02 -05:00 committed by Lioncash
parent 5ea008e178
commit d92d5c6910
No known key found for this signature in database
GPG key ID: 4E3C3CC1031BA9C7
17 changed files with 287 additions and 573 deletions

View file

@ -494,6 +494,7 @@
#define float16_is_signaling_nan float16_is_signaling_nan_aarch64
#define float16_maybe_silence_nan float16_maybe_silence_nan_aarch64
#define float16_mul float16_mul_aarch64
#define float16_muladd float16_muladd_aarch64
#define float16_squash_input_denormal float16_squash_input_denormal_aarch64
#define float16_sub float16_sub_aarch64
#define float16_to_float32 float16_to_float32_aarch64

View file

@ -494,6 +494,7 @@
#define float16_is_signaling_nan float16_is_signaling_nan_aarch64eb
#define float16_maybe_silence_nan float16_maybe_silence_nan_aarch64eb
#define float16_mul float16_mul_aarch64eb
#define float16_muladd float16_muladd_aarch64eb
#define float16_squash_input_denormal float16_squash_input_denormal_aarch64eb
#define float16_sub float16_sub_aarch64eb
#define float16_to_float32 float16_to_float32_aarch64eb

View file

@ -494,6 +494,7 @@
#define float16_is_signaling_nan float16_is_signaling_nan_arm
#define float16_maybe_silence_nan float16_maybe_silence_nan_arm
#define float16_mul float16_mul_arm
#define float16_muladd float16_muladd_arm
#define float16_squash_input_denormal float16_squash_input_denormal_arm
#define float16_sub float16_sub_arm
#define float16_to_float32 float16_to_float32_arm

View file

@ -494,6 +494,7 @@
#define float16_is_signaling_nan float16_is_signaling_nan_armeb
#define float16_maybe_silence_nan float16_maybe_silence_nan_armeb
#define float16_mul float16_mul_armeb
#define float16_muladd float16_muladd_armeb
#define float16_squash_input_denormal float16_squash_input_denormal_armeb
#define float16_sub float16_sub_armeb
#define float16_to_float32 float16_to_float32_armeb

View file

@ -726,57 +726,6 @@ static float32 propagateFloat32NaN(float32 a, float32 b, float_status *status)
}
}
/*----------------------------------------------------------------------------
| Takes three single-precision floating-point values `a', `b' and `c', one
| of which is a NaN, and returns the NaN to propagate.  Raises the invalid
| exception if any of the three is a signaling NaN.  `infzero' reports
| whether a*b was 0*inf or inf*0 (in which case c is certainly a NaN and
| which NaN to return is implementation defined, so the decision is left
| to the target-specific pickNaNMulAdd()).
*----------------------------------------------------------------------------*/
static float32 propagateFloat32MulAddNaN(float32 a, float32 b,
                                         float32 c, flag infzero,
                                         float_status *status)
{
    flag a_is_qnan = float32_is_quiet_nan(a, status);
    flag a_is_snan = float32_is_signaling_nan(a, status);
    flag b_is_qnan = float32_is_quiet_nan(b, status);
    flag b_is_snan = float32_is_signaling_nan(b, status);
    flag c_is_qnan = float32_is_quiet_nan(c, status);
    flag c_is_snan = float32_is_signaling_nan(c, status);
    int choice;

    if (a_is_snan || b_is_snan || c_is_snan) {
        float_raise(float_flag_invalid, status);
    }

    /* Consult the target hook even in default-NaN mode: it must get the
     * opportunity to set the Invalid flag for the infzero case.
     */
    choice = pickNaNMulAdd(a_is_qnan, a_is_snan,
                           b_is_qnan, b_is_snan,
                           c_is_qnan, c_is_snan, infzero, status);

    if (status->default_nan_mode) {
        return float32_default_nan(status);
    }

    if (choice == 0) {
        return float32_maybe_silence_nan(a, status);
    }
    if (choice == 1) {
        return float32_maybe_silence_nan(b, status);
    }
    if (choice == 2) {
        return float32_maybe_silence_nan(c, status);
    }
    /* choice == 3 (or anything unexpected): produce the default NaN */
    return float32_default_nan(status);
}
#ifdef NO_SIGNALING_NANS
int float64_is_quiet_nan(float64 a_, float_status *status)
{
@ -932,57 +881,6 @@ static float64 propagateFloat64NaN(float64 a, float64 b, float_status *status)
}
}
/*----------------------------------------------------------------------------
| Takes three double-precision floating-point values `a', `b' and `c', one
| of which is a NaN, and returns the NaN to propagate.  Raises the invalid
| exception if any of the three is a signaling NaN.  `infzero' reports
| whether a*b was 0*inf or inf*0 (in which case c is certainly a NaN and
| which NaN to return is implementation defined, so the decision is left
| to the target-specific pickNaNMulAdd()).
*----------------------------------------------------------------------------*/
static float64 propagateFloat64MulAddNaN(float64 a, float64 b,
                                         float64 c, flag infzero,
                                         float_status *status)
{
    flag a_is_qnan = float64_is_quiet_nan(a, status);
    flag a_is_snan = float64_is_signaling_nan(a, status);
    flag b_is_qnan = float64_is_quiet_nan(b, status);
    flag b_is_snan = float64_is_signaling_nan(b, status);
    flag c_is_qnan = float64_is_quiet_nan(c, status);
    flag c_is_snan = float64_is_signaling_nan(c, status);
    int choice;

    if (a_is_snan || b_is_snan || c_is_snan) {
        float_raise(float_flag_invalid, status);
    }

    /* Consult the target hook even in default-NaN mode: it must get the
     * opportunity to set the Invalid flag for the infzero case.
     */
    choice = pickNaNMulAdd(a_is_qnan, a_is_snan,
                           b_is_qnan, b_is_snan,
                           c_is_qnan, c_is_snan, infzero, status);

    if (status->default_nan_mode) {
        return float64_default_nan(status);
    }

    if (choice == 0) {
        return float64_maybe_silence_nan(a, status);
    }
    if (choice == 1) {
        return float64_maybe_silence_nan(b, status);
    }
    if (choice == 2) {
        return float64_maybe_silence_nan(c, status);
    }
    /* choice == 3 (or anything unexpected): produce the default NaN */
    return float64_default_nan(status);
}
#ifdef NO_SIGNALING_NANS
int floatx80_is_quiet_nan(floatx80 a_, float_status *status)
{

View file

@ -669,6 +669,40 @@ static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
g_assert_not_reached();
}
/* Given that at least one of a, b and c is a NaN, decide which value a
 * fused multiply-add should propagate.  Any signaling NaN raises the
 * invalid exception.  Outside default-NaN mode the actual selection is
 * delegated to the target-specific pickNaNMulAdd(); the chosen operand
 * is marked float_class_msnan so a later step can quieten it.
 */
static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
                                  bool inf_zero, float_status *s)
{
    int which;

    if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
        s->float_exception_flags |= float_flag_invalid;
    }

    if (s->default_nan_mode) {
        a.cls = float_class_dnan;
        return a;
    }

    which = pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls),
                          is_qnan(b.cls), is_snan(b.cls),
                          is_qnan(c.cls), is_snan(c.cls),
                          inf_zero, s);

    if (which == 3) {
        /* No operand is propagated: produce the default NaN. */
        a.cls = float_class_dnan;
        return a;
    }

    if (which == 1) {
        a = b;
    } else if (which == 2) {
        a = c;
    } else if (which != 0) {
        g_assert_not_reached();
    }

    /* Whichever NaN was chosen, mark it for silencing. */
    a.cls = float_class_msnan;
    return a;
}
/*
* Returns the result of adding or subtracting the floating-point
* values `a' and `b'. The operation is performed according to the
@ -816,6 +850,244 @@ float64 QEMU_FLATTEN float64_mul(float64 a, float64 b,
return float64_round_pack_canonical(pr, status);
}
/*
* Returns the result of multiplying the floating-point values `a' and
* `b' then adding 'c', with no intermediate rounding step after the
* multiplication. The operation is performed according to the
* IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
* The flags argument allows the caller to select negation of the
* addend, the intermediate product, or the final result. (The
* difference between this and having the caller do a separate
* negation is that negating externally will flip the sign bit on
* NaNs.)
*/
/* Core of the fused multiply-add: computes a * b + c on decomposed
 * FloatParts with a single rounding (performed later by the caller's
 * round-and-pack step).  `flags' is a mask of float_muladd_* options;
 * see the block comment above for their semantics. */
static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
int flags, float_status *s)
{
/* a*b was inf*0 or 0*inf iff the pair of classes {a.cls, b.cls} is
 * exactly {inf, zero}; test that by comparing bitmasks over the
 * FloatClass enum. */
bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
((1 << float_class_inf) | (1 << float_class_zero));
bool p_sign;            /* sign of the intermediate product */
bool sign_flip = flags & float_muladd_negate_result;
FloatClass p_class;     /* class of the intermediate product */
uint64_t hi, lo;        /* 128-bit product significand */
int p_exp;              /* exponent of the intermediate product */
/* It is implementation-defined whether the cases of (0,inf,qnan)
 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
 * they return if they do), so we have to hand this information
 * off to the target-specific pick-a-NaN routine.
 */
if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
return pick_nan_muladd(a, b, c, inf_zero, s);
}
/* inf*0 with no NaN operand: invalid operation, default NaN result */
if (inf_zero) {
s->float_exception_flags |= float_flag_invalid;
a.cls = float_class_dnan;
return a;
}
if (flags & float_muladd_negate_c) {
c.sign ^= 1;
}
/* Work out the sign and class of the product */
p_sign = a.sign ^ b.sign;
if (flags & float_muladd_negate_product) {
p_sign ^= 1;
}
if (a.cls == float_class_inf || b.cls == float_class_inf) {
p_class = float_class_inf;
} else if (a.cls == float_class_zero || b.cls == float_class_zero) {
p_class = float_class_zero;
} else {
p_class = float_class_normal;
}
/* Infinite addend dominates, unless we are adding opposite-signed
 * infinities, which is invalid. */
if (c.cls == float_class_inf) {
if (p_class == float_class_inf && p_sign != c.sign) {
s->float_exception_flags |= float_flag_invalid;
a.cls = float_class_dnan;
} else {
a.cls = float_class_inf;
a.sign = c.sign ^ sign_flip;
}
return a;
}
if (p_class == float_class_inf) {
a.cls = float_class_inf;
a.sign = p_sign ^ sign_flip;
return a;
}
/* Zero product: the result is just c (possibly halved/negated) */
if (p_class == float_class_zero) {
if (c.cls == float_class_zero) {
if (p_sign != c.sign) {
/* exact zero from opposite signs: round-down mode yields -0 */
p_sign = s->float_rounding_mode == float_round_down;
}
c.sign = p_sign;
} else if (flags & float_muladd_halve_result) {
c.exp -= 1;
}
c.sign ^= sign_flip;
return c;
}
/* a & b should be normals now... */
assert(a.cls == float_class_normal &&
b.cls == float_class_normal);
p_exp = a.exp + b.exp;
/* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
 * result.
 */
mul64To128(a.frac, b.frac, &hi, &lo);
/* binary point now at bit 124 */
/* check for overflow */
if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
shift128RightJamming(hi, lo, 1, &hi, &lo);
p_exp += 1;
}
/* + add/sub */
if (c.cls == float_class_zero) {
/* move binary point back to 62 */
shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
} else {
int exp_diff = p_exp - c.exp;
if (p_sign == c.sign) {
/* Addition */
if (exp_diff <= 0) {
/* c's exponent is at least as big: shift the product down to
 * c's scale (restoring the binary point to 62 at the same
 * time) and add c's fraction directly. */
shift128RightJamming(hi, lo,
DECOMPOSED_BINARY_POINT - exp_diff,
&hi, &lo);
lo += c.frac;
p_exp = c.exp;
} else {
uint64_t c_hi, c_lo;
/* shift c to the same binary point as the product (124) */
c_hi = c.frac >> 2;
c_lo = 0;
shift128RightJamming(c_hi, c_lo,
exp_diff,
&c_hi, &c_lo);
add128(hi, lo, c_hi, c_lo, &hi, &lo);
/* move binary point back to 62 */
shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
}
/* carry out of the addition: renormalize by one bit */
if (lo & DECOMPOSED_OVERFLOW_BIT) {
shift64RightJamming(lo, 1, &lo);
p_exp += 1;
}
} else {
/* Subtraction */
uint64_t c_hi, c_lo;
/* make C binary point match product at bit 124 */
c_hi = c.frac >> 2;
c_lo = 0;
if (exp_diff <= 0) {
shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
/* subtract the smaller magnitude from the larger; when c wins
 * the result's sign flips and c's exponent is kept */
if (exp_diff == 0
&&
(hi > c_hi || (hi == c_hi && lo >= c_lo))) {
sub128(hi, lo, c_hi, c_lo, &hi, &lo);
} else {
sub128(c_hi, c_lo, hi, lo, &hi, &lo);
p_sign ^= 1;
p_exp = c.exp;
}
} else {
shift128RightJamming(c_hi, c_lo,
exp_diff,
&c_hi, &c_lo);
sub128(hi, lo, c_hi, c_lo, &hi, &lo);
}
if (hi == 0 && lo == 0) {
/* exact cancellation: signed zero per the rounding mode */
a.cls = float_class_zero;
a.sign = s->float_rounding_mode == float_round_down;
a.sign ^= sign_flip;
return a;
} else {
int shift;
if (hi != 0) {
shift = clz64(hi);
} else {
shift = clz64(lo) + 64;
}
/* Normalizing to a binary point of 124 is the
correct adjust for the exponent. However since we're
shifting, we might as well put the binary point back
at 62 where we really want it. Therefore shift as
if we're leaving 1 bit at the top of the word, but
adjust the exponent as if we're leaving 3 bits. */
shift -= 1;
if (shift >= 64) {
lo = lo << (shift - 64);
} else {
hi = (hi << shift) | (lo >> (64 - shift));
/* fold any remaining low bits in as a sticky bit */
lo = hi | ((lo << shift) != 0);
}
p_exp -= shift - 2;
}
}
}
if (flags & float_muladd_halve_result) {
p_exp -= 1;
}
/* finally prepare our result */
a.cls = float_class_normal;
a.sign = p_sign ^ sign_flip;
a.exp = p_exp;
a.frac = lo;
return a;
}
/*
 * IEEE-754-2008 fused multiply-add for half precision: computes
 * a * b + c with a single rounding at the end.  `flags' selects
 * optional negation of the addend, product or result
 * (float_muladd_* bits).
 */
float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
int flags, float_status *status)
{
    FloatParts parts_a = float16_unpack_canonical(a, status);
    FloatParts parts_b = float16_unpack_canonical(b, status);
    FloatParts parts_c = float16_unpack_canonical(c, status);
    FloatParts result = muladd_floats(parts_a, parts_b, parts_c,
                                      flags, status);

    return float16_round_pack_canonical(result, status);
}
/*
 * IEEE-754-2008 fused multiply-add for single precision: computes
 * a * b + c with a single rounding at the end.  `flags' selects
 * optional negation of the addend, product or result
 * (float_muladd_* bits).
 */
float32 QEMU_FLATTEN float32_muladd(float32 a, float32 b, float32 c,
int flags, float_status *status)
{
    FloatParts parts_a = float32_unpack_canonical(a, status);
    FloatParts parts_b = float32_unpack_canonical(b, status);
    FloatParts parts_c = float32_unpack_canonical(c, status);
    FloatParts result = muladd_floats(parts_a, parts_b, parts_c,
                                      flags, status);

    return float32_round_pack_canonical(result, status);
}
/*
 * IEEE-754-2008 fused multiply-add for double precision: computes
 * a * b + c with a single rounding at the end.  `flags' selects
 * optional negation of the addend, product or result
 * (float_muladd_* bits).
 */
float64 QEMU_FLATTEN float64_muladd(float64 a, float64 b, float64 c,
int flags, float_status *status)
{
    FloatParts parts_a = float64_unpack_canonical(a, status);
    FloatParts parts_b = float64_unpack_canonical(b, status);
    FloatParts parts_c = float64_unpack_canonical(c, status);
    FloatParts result = muladd_floats(parts_a, parts_b, parts_c,
                                      flags, status);

    return float64_round_pack_canonical(result, status);
}
/*
* Returns the result of dividing the floating-point value `a' by the
* corresponding value `b'. The operation is performed according to
@ -2787,231 +3059,6 @@ float32 float32_rem(float32 a, float32 b, float_status *status)
}
/*----------------------------------------------------------------------------
| Returns the result of multiplying the single-precision floating-point values
| `a' and `b' then adding 'c', with no intermediate rounding step after the
| multiplication. The operation is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic 754-2008.
| The flags argument allows the caller to select negation of the
| addend, the intermediate product, or the final result. (The difference
| between this and having the caller do a separate negation is that negating
| externally will flip the sign bit on NaNs.)
*----------------------------------------------------------------------------*/
float32 float32_muladd(float32 a, float32 b, float32 c, int flags, float_status *status)
{
flag aSign, bSign, cSign, zSign;
int aExp, bExp, cExp, pExp, zExp, expDiff;
uint32_t aSig, bSig, cSig;
flag pInf, pZero, pSign;    /* classification/sign of the product a*b */
uint64_t pSig64, cSig64, zSig64;
uint32_t pSig;
int shiftcount;
flag signflip, infzero;
a = float32_squash_input_denormal(a, status);
b = float32_squash_input_denormal(b, status);
c = float32_squash_input_denormal(c, status);
aSig = extractFloat32Frac(a);
aExp = extractFloat32Exp(a);
aSign = extractFloat32Sign(a);
bSig = extractFloat32Frac(b);
bExp = extractFloat32Exp(b);
bSign = extractFloat32Sign(b);
cSig = extractFloat32Frac(c);
cExp = extractFloat32Exp(c);
cSign = extractFloat32Sign(c);
/* a*b is 0*inf or inf*0 (exp 0xff with zero frac == infinity) */
infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
(aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
/* It is implementation-defined whether the cases of (0,inf,qnan)
 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
 * they return if they do), so we have to hand this information
 * off to the target-specific pick-a-NaN routine.
 */
if (((aExp == 0xff) && aSig) ||
((bExp == 0xff) && bSig) ||
((cExp == 0xff) && cSig)) {
return propagateFloat32MulAddNaN(a, b, c, infzero, status);
}
/* inf*0 with no NaN operand: invalid operation */
if (infzero) {
float_raise(float_flag_invalid, status);
return float32_default_nan(status);
}
if (flags & float_muladd_negate_c) {
cSign ^= 1;
}
signflip = (flags & float_muladd_negate_result) ? 1 : 0;
/* Work out the sign and type of the product */
pSign = aSign ^ bSign;
if (flags & float_muladd_negate_product) {
pSign ^= 1;
}
pInf = (aExp == 0xff) || (bExp == 0xff);
pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
if (cExp == 0xff) {
if (pInf && (pSign ^ cSign)) {
/* addition of opposite-signed infinities => InvalidOperation */
float_raise(float_flag_invalid, status);
return float32_default_nan(status);
}
/* Otherwise generate an infinity of the same sign */
return packFloat32(cSign ^ signflip, 0xff, 0);
}
if (pInf) {
return packFloat32(pSign ^ signflip, 0xff, 0);
}
/* From here on the product is finite; handle a zero product specially */
if (pZero) {
if (cExp == 0) {
if (cSig == 0) {
/* Adding two exact zeroes */
if (pSign == cSign) {
zSign = pSign;
} else if (status->float_rounding_mode == float_round_down) {
/* opposite-signed zeroes: -0 only in round-down mode */
zSign = 1;
} else {
zSign = 0;
}
return packFloat32(zSign ^ signflip, 0, 0);
}
/* Exact zero plus a denorm */
if (status->flush_to_zero) {
float_raise(float_flag_output_denormal, status);
return packFloat32(cSign ^ signflip, 0, 0);
}
}
/* Zero plus something non-zero : just return the something */
if (flags & float_muladd_halve_result) {
if (cExp == 0) {
normalizeFloat32Subnormal(cSig, &cExp, &cSig);
}
/* Subtract one to halve, and one again because roundAndPackFloat32
 * wants one less than the true exponent.
 */
cExp -= 2;
cSig = (cSig | 0x00800000) << 7;
return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status);
}
return packFloat32(cSign ^ signflip, cExp, cSig);
}
if (aExp == 0) {
normalizeFloat32Subnormal(aSig, &aExp, &aSig);
}
if (bExp == 0) {
normalizeFloat32Subnormal(bSig, &bExp, &bSig);
}
/* Calculate the actual result a * b + c */
/* Multiply first; this is easy. */
/* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
 * because we want the true exponent, not the "one-less-than"
 * flavour that roundAndPackFloat32() takes.
 */
pExp = aExp + bExp - 0x7e;
aSig = (aSig | 0x00800000) << 7;
bSig = (bSig | 0x00800000) << 8;
pSig64 = (uint64_t)aSig * bSig;
/* normalize so the explicit bit lands in position 62 */
if ((int64_t)(pSig64 << 1) >= 0) {
pSig64 <<= 1;
pExp--;
}
zSign = pSign ^ signflip;
/* Now pSig64 is the significand of the multiply, with the explicit bit in
 * position 62.
 */
if (cExp == 0) {
if (!cSig) {
/* Throw out the special case of c being an exact zero now */
shift64RightJamming(pSig64, 32, &pSig64);
pSig = (uint32_t)pSig64;
if (flags & float_muladd_halve_result) {
pExp--;
}
return roundAndPackFloat32(zSign, pExp - 1,
pSig, status);
}
normalizeFloat32Subnormal(cSig, &cExp, &cSig);
}
/* Align c's significand to the product's: explicit bit at position 62 */
cSig64 = (uint64_t)cSig << (62 - 23);
cSig64 |= LIT64(0x4000000000000000);
expDiff = pExp - cExp;
if (pSign == cSign) {
/* Addition */
if (expDiff > 0) {
/* scale c to match p */
shift64RightJamming(cSig64, expDiff, &cSig64);
zExp = pExp;
} else if (expDiff < 0) {
/* scale p to match c */
shift64RightJamming(pSig64, -expDiff, &pSig64);
zExp = cExp;
} else {
/* no scaling needed */
zExp = cExp;
}
/* Add significands and make sure explicit bit ends up in posn 62 */
zSig64 = pSig64 + cSig64;
if ((int64_t)zSig64 < 0) {
shift64RightJamming(zSig64, 1, &zSig64);
} else {
zExp--;
}
} else {
/* Subtraction */
if (expDiff > 0) {
shift64RightJamming(cSig64, expDiff, &cSig64);
zSig64 = pSig64 - cSig64;
zExp = pExp;
} else if (expDiff < 0) {
shift64RightJamming(pSig64, -expDiff, &pSig64);
zSig64 = cSig64 - pSig64;
zExp = cExp;
zSign ^= 1;
} else {
/* equal exponents: subtract the smaller magnitude from the larger */
zExp = pExp;
if (cSig64 < pSig64) {
zSig64 = pSig64 - cSig64;
} else if (pSig64 < cSig64) {
zSig64 = cSig64 - pSig64;
zSign ^= 1;
} else {
/* Exact zero */
zSign = signflip;
if (status->float_rounding_mode == float_round_down) {
zSign ^= 1;
}
return packFloat32(zSign, 0, 0);
}
}
--zExp;
/* Normalize to put the explicit bit back into bit 62. */
shiftcount = countLeadingZeros64(zSig64) - 1;
zSig64 <<= shiftcount;
zExp -= shiftcount;
}
if (flags & float_muladd_halve_result) {
zExp--;
}
/* drop to 32 bits (with jamming) for single-precision rounding */
shift64RightJamming(zSig64, 32, &zSig64);
return roundAndPackFloat32(zSign, zExp, (uint32_t)zSig64, status);
}
/*----------------------------------------------------------------------------
| Returns the square root of the single-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
@ -4232,252 +4279,6 @@ float64 float64_rem(float64 a, float64 b, float_status *status)
}
/*----------------------------------------------------------------------------
| Returns the result of multiplying the double-precision floating-point values
| `a' and `b' then adding 'c', with no intermediate rounding step after the
| multiplication. The operation is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic 754-2008.
| The flags argument allows the caller to select negation of the
| addend, the intermediate product, or the final result. (The difference
| between this and having the caller do a separate negation is that negating
| externally will flip the sign bit on NaNs.)
*----------------------------------------------------------------------------*/
float64 float64_muladd(float64 a, float64 b, float64 c, int flags, float_status *status)
{
flag aSign, bSign, cSign, zSign;
int aExp, bExp, cExp, pExp, zExp, expDiff;
uint64_t aSig, bSig, cSig;
flag pInf, pZero, pSign;    /* classification/sign of the product a*b */
uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
int shiftcount;
flag signflip, infzero;
a = float64_squash_input_denormal(a, status);
b = float64_squash_input_denormal(b, status);
c = float64_squash_input_denormal(c, status);
aSig = extractFloat64Frac(a);
aExp = extractFloat64Exp(a);
aSign = extractFloat64Sign(a);
bSig = extractFloat64Frac(b);
bExp = extractFloat64Exp(b);
bSign = extractFloat64Sign(b);
cSig = extractFloat64Frac(c);
cExp = extractFloat64Exp(c);
cSign = extractFloat64Sign(c);
/* a*b is 0*inf or inf*0 (exp 0x7ff with zero frac == infinity) */
infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
(aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
/* It is implementation-defined whether the cases of (0,inf,qnan)
 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
 * they return if they do), so we have to hand this information
 * off to the target-specific pick-a-NaN routine.
 */
if (((aExp == 0x7ff) && aSig) ||
((bExp == 0x7ff) && bSig) ||
((cExp == 0x7ff) && cSig)) {
return propagateFloat64MulAddNaN(a, b, c, infzero, status);
}
/* inf*0 with no NaN operand: invalid operation */
if (infzero) {
float_raise(float_flag_invalid, status);
return float64_default_nan(status);
}
if (flags & float_muladd_negate_c) {
cSign ^= 1;
}
signflip = (flags & float_muladd_negate_result) ? 1 : 0;
/* Work out the sign and type of the product */
pSign = aSign ^ bSign;
if (flags & float_muladd_negate_product) {
pSign ^= 1;
}
pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
if (cExp == 0x7ff) {
if (pInf && (pSign ^ cSign)) {
/* addition of opposite-signed infinities => InvalidOperation */
float_raise(float_flag_invalid, status);
return float64_default_nan(status);
}
/* Otherwise generate an infinity of the same sign */
return packFloat64(cSign ^ signflip, 0x7ff, 0);
}
if (pInf) {
return packFloat64(pSign ^ signflip, 0x7ff, 0);
}
/* From here on the product is finite; handle a zero product specially */
if (pZero) {
if (cExp == 0) {
if (cSig == 0) {
/* Adding two exact zeroes */
if (pSign == cSign) {
zSign = pSign;
} else if (status->float_rounding_mode == float_round_down) {
/* opposite-signed zeroes: -0 only in round-down mode */
zSign = 1;
} else {
zSign = 0;
}
return packFloat64(zSign ^ signflip, 0, 0);
}
/* Exact zero plus a denorm */
if (status->flush_to_zero) {
float_raise(float_flag_output_denormal, status);
return packFloat64(cSign ^ signflip, 0, 0);
}
}
/* Zero plus something non-zero : just return the something */
if (flags & float_muladd_halve_result) {
if (cExp == 0) {
normalizeFloat64Subnormal(cSig, &cExp, &cSig);
}
/* Subtract one to halve, and one again because roundAndPackFloat64
 * wants one less than the true exponent.
 */
cExp -= 2;
cSig = (cSig | 0x0010000000000000ULL) << 10;
return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status);
}
return packFloat64(cSign ^ signflip, cExp, cSig);
}
if (aExp == 0) {
normalizeFloat64Subnormal(aSig, &aExp, &aSig);
}
if (bExp == 0) {
normalizeFloat64Subnormal(bSig, &bExp, &bSig);
}
/* Calculate the actual result a * b + c */
/* Multiply first; this is easy. */
/* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
 * because we want the true exponent, not the "one-less-than"
 * flavour that roundAndPackFloat64() takes.
 */
pExp = aExp + bExp - 0x3fe;
aSig = (aSig | LIT64(0x0010000000000000))<<10;
bSig = (bSig | LIT64(0x0010000000000000))<<11;
mul64To128(aSig, bSig, &pSig0, &pSig1);
/* normalize so the explicit bit lands in position 126 */
if ((int64_t)(pSig0 << 1) >= 0) {
shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
pExp--;
}
zSign = pSign ^ signflip;
/* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
 * bit in position 126.
 */
if (cExp == 0) {
if (!cSig) {
/* Throw out the special case of c being an exact zero now */
shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
if (flags & float_muladd_halve_result) {
pExp--;
}
return roundAndPackFloat64(zSign, pExp - 1,
pSig1, status);
}
normalizeFloat64Subnormal(cSig, &cExp, &cSig);
}
/* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
 * significand of the addend, with the explicit bit in position 126.
 */
cSig0 = cSig << (126 - 64 - 52);
cSig1 = 0;
cSig0 |= LIT64(0x4000000000000000);
expDiff = pExp - cExp;
if (pSign == cSign) {
/* Addition */
if (expDiff > 0) {
/* scale c to match p */
shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
zExp = pExp;
} else if (expDiff < 0) {
/* scale p to match c */
shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
zExp = cExp;
} else {
/* no scaling needed */
zExp = cExp;
}
/* Add significands and make sure explicit bit ends up in posn 126 */
add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
if ((int64_t)zSig0 < 0) {
shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
} else {
zExp--;
}
/* drop to 64 bits (with jamming) for double-precision rounding */
shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
if (flags & float_muladd_halve_result) {
zExp--;
}
return roundAndPackFloat64(zSign, zExp, zSig1, status);
} else {
/* Subtraction */
if (expDiff > 0) {
shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
zExp = pExp;
} else if (expDiff < 0) {
shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
zExp = cExp;
zSign ^= 1;
} else {
/* equal exponents: subtract the smaller magnitude from the larger */
zExp = pExp;
if (lt128(cSig0, cSig1, pSig0, pSig1)) {
sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
} else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
zSign ^= 1;
} else {
/* Exact zero */
zSign = signflip;
if (status->float_rounding_mode == float_round_down) {
zSign ^= 1;
}
return packFloat64(zSign, 0, 0);
}
}
--zExp;
/* Do the equivalent of normalizeRoundAndPackFloat64() but
 * starting with the significand in a pair of uint64_t.
 */
if (zSig0) {
shiftcount = countLeadingZeros64(zSig0) - 1;
shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
/* keep a sticky bit for anything still set in the low word */
if (zSig1) {
zSig0 |= 1;
}
zExp -= shiftcount;
} else {
shiftcount = countLeadingZeros64(zSig1);
if (shiftcount == 0) {
zSig0 = (zSig1 >> 1) | (zSig1 & 1);
zExp -= 63;
} else {
shiftcount--;
zSig0 = zSig1 << shiftcount;
zExp -= (shiftcount + 64);
}
}
if (flags & float_muladd_halve_result) {
zExp--;
}
return roundAndPackFloat64(zSign, zExp, zSig0, status);
}
}
/*----------------------------------------------------------------------------
| Returns the square root of the double-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary

View file

@ -500,6 +500,7 @@ symbols = (
'float16_is_signaling_nan',
'float16_maybe_silence_nan',
'float16_mul',
'float16_muladd',
'float16_squash_input_denormal',
'float16_sub',
'float16_to_float32',

View file

@ -247,6 +247,7 @@ float64 float16_to_float64(float16 a, flag ieee, float_status *status);
float16 float16_add(float16, float16, float_status *status);
float16 float16_sub(float16, float16, float_status *status);
float16 float16_mul(float16, float16, float_status *status);
float16 float16_muladd(float16, float16, float16, int, float_status *status);
float16 float16_div(float16, float16, float_status *status);
int float16_is_quiet_nan(float16, float_status *status);

View file

@ -494,6 +494,7 @@
#define float16_is_signaling_nan float16_is_signaling_nan_m68k
#define float16_maybe_silence_nan float16_maybe_silence_nan_m68k
#define float16_mul float16_mul_m68k
#define float16_muladd float16_muladd_m68k
#define float16_squash_input_denormal float16_squash_input_denormal_m68k
#define float16_sub float16_sub_m68k
#define float16_to_float32 float16_to_float32_m68k

View file

@ -494,6 +494,7 @@
#define float16_is_signaling_nan float16_is_signaling_nan_mips
#define float16_maybe_silence_nan float16_maybe_silence_nan_mips
#define float16_mul float16_mul_mips
#define float16_muladd float16_muladd_mips
#define float16_squash_input_denormal float16_squash_input_denormal_mips
#define float16_sub float16_sub_mips
#define float16_to_float32 float16_to_float32_mips

View file

@ -494,6 +494,7 @@
#define float16_is_signaling_nan float16_is_signaling_nan_mips64
#define float16_maybe_silence_nan float16_maybe_silence_nan_mips64
#define float16_mul float16_mul_mips64
#define float16_muladd float16_muladd_mips64
#define float16_squash_input_denormal float16_squash_input_denormal_mips64
#define float16_sub float16_sub_mips64
#define float16_to_float32 float16_to_float32_mips64

View file

@ -494,6 +494,7 @@
#define float16_is_signaling_nan float16_is_signaling_nan_mips64el
#define float16_maybe_silence_nan float16_maybe_silence_nan_mips64el
#define float16_mul float16_mul_mips64el
#define float16_muladd float16_muladd_mips64el
#define float16_squash_input_denormal float16_squash_input_denormal_mips64el
#define float16_sub float16_sub_mips64el
#define float16_to_float32 float16_to_float32_mips64el

View file

@ -494,6 +494,7 @@
#define float16_is_signaling_nan float16_is_signaling_nan_mipsel
#define float16_maybe_silence_nan float16_maybe_silence_nan_mipsel
#define float16_mul float16_mul_mipsel
#define float16_muladd float16_muladd_mipsel
#define float16_squash_input_denormal float16_squash_input_denormal_mipsel
#define float16_sub float16_sub_mipsel
#define float16_to_float32 float16_to_float32_mipsel

View file

@ -494,6 +494,7 @@
#define float16_is_signaling_nan float16_is_signaling_nan_powerpc
#define float16_maybe_silence_nan float16_maybe_silence_nan_powerpc
#define float16_mul float16_mul_powerpc
#define float16_muladd float16_muladd_powerpc
#define float16_squash_input_denormal float16_squash_input_denormal_powerpc
#define float16_sub float16_sub_powerpc
#define float16_to_float32 float16_to_float32_powerpc

View file

@ -494,6 +494,7 @@
#define float16_is_signaling_nan float16_is_signaling_nan_sparc
#define float16_maybe_silence_nan float16_maybe_silence_nan_sparc
#define float16_mul float16_mul_sparc
#define float16_muladd float16_muladd_sparc
#define float16_squash_input_denormal float16_squash_input_denormal_sparc
#define float16_sub float16_sub_sparc
#define float16_to_float32 float16_to_float32_sparc

View file

@ -494,6 +494,7 @@
#define float16_is_signaling_nan float16_is_signaling_nan_sparc64
#define float16_maybe_silence_nan float16_maybe_silence_nan_sparc64
#define float16_mul float16_mul_sparc64
#define float16_muladd float16_muladd_sparc64
#define float16_squash_input_denormal float16_squash_input_denormal_sparc64
#define float16_sub float16_sub_sparc64
#define float16_to_float32 float16_to_float32_sparc64

View file

@ -494,6 +494,7 @@
#define float16_is_signaling_nan float16_is_signaling_nan_x86_64
#define float16_maybe_silence_nan float16_maybe_silence_nan_x86_64
#define float16_mul float16_mul_x86_64
#define float16_muladd float16_muladd_x86_64
#define float16_squash_input_denormal float16_squash_input_denormal_x86_64
#define float16_sub float16_sub_x86_64
#define float16_to_float32 float16_to_float32_x86_64