From 283abedc684d94f3625b4b1f8d429fc592dca7c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Thu, 8 Mar 2018 12:23:13 -0500 Subject: [PATCH] fpu/softfloat: re-factor sqrt This is a little bit of a departure from softfloat's original approach as we skip the estimate step in favour of a straight iteration. There is a minor optimisation to avoid calculating more bits of precision than we need however this still brings a performance drop, especially for float64 operations. Backports commit c13bb2da9eedfbc5886c8048df1bc1114b285fb0 from qemu --- qemu/aarch64.h | 1 + qemu/aarch64eb.h | 1 + qemu/arm.h | 1 + qemu/armeb.h | 1 + qemu/fpu/softfloat.c | 206 ++++++++++++++++------------------- qemu/header_gen.py | 1 + qemu/include/fpu/softfloat.h | 1 + qemu/m68k.h | 1 + qemu/mips.h | 1 + qemu/mips64.h | 1 + qemu/mips64el.h | 1 + qemu/mipsel.h | 1 + qemu/powerpc.h | 1 + qemu/sparc.h | 1 + qemu/sparc64.h | 1 + qemu/x86_64.h | 1 + 16 files changed, 110 insertions(+), 111 deletions(-) diff --git a/qemu/aarch64.h b/qemu/aarch64.h index 4533d4fb..4863e159 100644 --- a/qemu/aarch64.h +++ b/qemu/aarch64.h @@ -505,6 +505,7 @@ #define float16_muladd float16_muladd_aarch64 #define float16_round_to_int float16_round_to_int_aarch64 #define float16_scalbn float16_scalbn_aarch64 +#define float16_sqrt float16_sqrt_aarch64 #define float16_squash_input_denormal float16_squash_input_denormal_aarch64 #define float16_sub float16_sub_aarch64 #define float16_to_int16 float16_to_int16_aarch64 diff --git a/qemu/aarch64eb.h b/qemu/aarch64eb.h index 935a4e9e..10e06be3 100644 --- a/qemu/aarch64eb.h +++ b/qemu/aarch64eb.h @@ -505,6 +505,7 @@ #define float16_muladd float16_muladd_aarch64eb #define float16_round_to_int float16_round_to_int_aarch64eb #define float16_scalbn float16_scalbn_aarch64eb +#define float16_sqrt float16_sqrt_aarch64eb #define float16_squash_input_denormal float16_squash_input_denormal_aarch64eb #define float16_sub float16_sub_aarch64eb #define float16_to_int16 float16_to_int16_aarch64eb diff --git a/qemu/arm.h b/qemu/arm.h index 6dd8359a..1ed3d935 100644 --- a/qemu/arm.h +++ b/qemu/arm.h @@ -505,6 +505,7 @@ #define float16_muladd float16_muladd_arm #define float16_round_to_int float16_round_to_int_arm #define float16_scalbn float16_scalbn_arm +#define float16_sqrt float16_sqrt_arm #define float16_squash_input_denormal float16_squash_input_denormal_arm #define float16_sub float16_sub_arm #define float16_to_int16 float16_to_int16_arm diff --git a/qemu/armeb.h b/qemu/armeb.h index 8ade8111..2196f0bf 100644 --- a/qemu/armeb.h +++ b/qemu/armeb.h @@ -505,6 +505,7 @@ #define float16_muladd float16_muladd_armeb #define float16_round_to_int float16_round_to_int_armeb #define float16_scalbn float16_scalbn_armeb +#define float16_sqrt float16_sqrt_armeb #define float16_squash_input_denormal float16_squash_input_denormal_armeb #define float16_sub float16_sub_armeb #define float16_to_int16 float16_to_int16_armeb diff --git a/qemu/fpu/softfloat.c b/qemu/fpu/softfloat.c index 29b0b1ab..be81b3ed 100644 --- a/qemu/fpu/softfloat.c +++ b/qemu/fpu/softfloat.c @@ -1897,6 +1897,101 @@ MINMAX(64, maxnummag, false, true, true) #undef MINMAX +/* + * Square Root + * + * The old softfloat code did an approximation step before zeroing in + * on the final result. However for simpleness we just compute the + * square root by iterating down from the implicit bit to enough extra + * bits to ensure we get a correctly rounded result. + * + * This does mean however the calculation is slower than before, + * especially for 64 bit floats. + */ + +static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p) +{ + uint64_t a_frac, r_frac, s_frac; + int bit, last_bit; + + if (is_nan(a.cls)) { + return return_nan(a, s); + } + if (a.cls == float_class_zero) { + return a; /* sqrt(+-0) = +-0 */ + } + if (a.sign) { + s->float_exception_flags |= float_flag_invalid; + a.cls = float_class_dnan; + return a; + } + if (a.cls == float_class_inf) { + return a; /* sqrt(+inf) = +inf */ + } + + assert(a.cls == float_class_normal); + + /* We need two overflow bits at the top. Adding room for that is a + * right shift. If the exponent is odd, we can discard the low bit + * by multiplying the fraction by 2; that's a left shift. Combine + * those and we shift right if the exponent is even. + */ + a_frac = a.frac; + if (!(a.exp & 1)) { + a_frac >>= 1; + } + a.exp >>= 1; + + /* Bit-by-bit computation of sqrt. */ + r_frac = 0; + s_frac = 0; + + /* Iterate from implicit bit down to the 3 extra bits to compute a + * properly rounded result. Remember we've inserted one more bit + * at the top, so these positions are one less. + */ + bit = DECOMPOSED_BINARY_POINT - 1; + last_bit = MAX(p->frac_shift - 4, 0); + do { + uint64_t q = 1ULL << bit; + uint64_t t_frac = s_frac + q; + if (t_frac <= a_frac) { + s_frac = t_frac + q; + a_frac -= t_frac; + r_frac += q; + } + a_frac <<= 1; + } while (--bit >= last_bit); + + /* Undo the right shift done above. If there is any remaining + * fraction, the result is inexact. Set the sticky bit. + */ + a.frac = (r_frac << 1) + (a_frac != 0); + + return a; +} + +float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status) +{ + FloatParts pa = float16_unpack_canonical(a, status); + FloatParts pr = sqrt_float(pa, status, &float16_params); + return float16_round_pack_canonical(pr, status); +} + +float32 QEMU_FLATTEN float32_sqrt(float32 a, float_status *status) +{ + FloatParts pa = float32_unpack_canonical(a, status); + FloatParts pr = sqrt_float(pa, status, &float32_params); + return float32_round_pack_canonical(pr, status); +} + +float64 QEMU_FLATTEN float64_sqrt(float64 a, float_status *status) +{ + FloatParts pa = float64_unpack_canonical(a, status); + FloatParts pr = sqrt_float(pa, status, &float64_params); + return float64_round_pack_canonical(pr, status); +} + /*---------------------------------------------------------------------------- | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 | and 7, and returns the properly rounded 32-bit integer corresponding to the @@ -3272,63 +3367,6 @@ float32 float32_rem(float32 a, float32 b, float_status *status) } -/*---------------------------------------------------------------------------- -| Returns the square root of the single-precision floating-point value `a'. -| The operation is performed according to the IEC/IEEE Standard for Binary -| Floating-Point Arithmetic. -*----------------------------------------------------------------------------*/ - -float32 float32_sqrt(float32 a, float_status *status) -{ - flag aSign; - int aExp, zExp; - uint32_t aSig, zSig; - uint64_t rem, term; - a = float32_squash_input_denormal(a, status); - - aSig = extractFloat32Frac( a ); - aExp = extractFloat32Exp( a ); - aSign = extractFloat32Sign( a ); - if ( aExp == 0xFF ) { - if (aSig) { - return propagateFloat32NaN(a, float32_zero, status); - } - if ( ! aSign ) return a; - float_raise(float_flag_invalid, status); - return float32_default_nan(status); - } - if ( aSign ) { - if ( ( aExp | aSig ) == 0 ) return a; - float_raise(float_flag_invalid, status); - return float32_default_nan(status); - } - if ( aExp == 0 ) { - if ( aSig == 0 ) return float32_zero; - normalizeFloat32Subnormal( aSig, &aExp, &aSig ); - } - zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; - aSig = ( aSig | 0x00800000 )<<8; - zSig = estimateSqrt32( aExp, aSig ) + 2; - if ( ( zSig & 0x7F ) <= 5 ) { - if ( zSig < 2 ) { - zSig = 0x7FFFFFFF; - goto roundAndPack; - } - aSig >>= aExp & 1; - term = ( (uint64_t) zSig ) * zSig; - rem = ( ( (uint64_t) aSig )<<32 ) - term; - while ( (int64_t) rem < 0 ) { - --zSig; - rem += ( ( (uint64_t) zSig )<<1 ) | 1; - } - zSig |= ( rem != 0 ); - } - shift32RightJamming( zSig, 1, &zSig ); - roundAndPack: - return roundAndPackFloat32( 0, zExp, zSig, status ); - -} - /*---------------------------------------------------------------------------- | Returns the binary exponential of the single-precision floating-point value | `a'. The operation is performed according to the IEC/IEEE Standard for @@ -4168,60 +4206,6 @@ float64 float64_rem(float64 a, float64 b, float_status *status) } -/*---------------------------------------------------------------------------- -| Returns the square root of the double-precision floating-point value `a'. -| The operation is performed according to the IEC/IEEE Standard for Binary -| Floating-Point Arithmetic. -*----------------------------------------------------------------------------*/ - -float64 float64_sqrt(float64 a, float_status *status) -{ - flag aSign; - int aExp, zExp; - uint64_t aSig, zSig, doubleZSig; - uint64_t rem0, rem1, term0, term1; - a = float64_squash_input_denormal(a, status); - - aSig = extractFloat64Frac( a ); - aExp = extractFloat64Exp( a ); - aSign = extractFloat64Sign( a ); - if ( aExp == 0x7FF ) { - if (aSig) { - return propagateFloat64NaN(a, a, status); - } - if ( ! aSign ) return a; - float_raise(float_flag_invalid, status); - return float64_default_nan(status); - } - if ( aSign ) { - if ( ( aExp | aSig ) == 0 ) return a; - float_raise(float_flag_invalid, status); - return float64_default_nan(status); - } - if ( aExp == 0 ) { - if ( aSig == 0 ) return float64_zero; - normalizeFloat64Subnormal( aSig, &aExp, &aSig ); - } - zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; - aSig |= LIT64( 0x0010000000000000 ); - zSig = estimateSqrt32( aExp, (uint32_t)(aSig>>21) ); - aSig <<= 9 - ( aExp & 1 ); - zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); - if ( ( zSig & 0x1FF ) <= 5 ) { - doubleZSig = zSig<<1; - mul64To128( zSig, zSig, &term0, &term1 ); - sub128( aSig, 0, term0, term1, &rem0, &rem1 ); - while ( (int64_t) rem0 < 0 ) { - --zSig; - doubleZSig -= 2; - add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); - } - zSig |= ( ( rem0 | rem1 ) != 0 ); - } - return roundAndPackFloat64( 0, zExp, zSig, status ); - -} - /*---------------------------------------------------------------------------- | Returns the binary log of the double-precision floating-point value `a'. | The operation is performed according to the IEC/IEEE Standard for Binary diff --git a/qemu/header_gen.py b/qemu/header_gen.py index c5cac451..cd452a73 100644 --- a/qemu/header_gen.py +++ b/qemu/header_gen.py @@ -511,6 +511,7 @@ symbols = ( 'float16_muladd', 'float16_round_to_int', 'float16_scalbn', + 'float16_sqrt', 'float16_squash_input_denormal', 'float16_sub', 'float16_to_int16', diff --git a/qemu/include/fpu/softfloat.h b/qemu/include/fpu/softfloat.h index f210bc6f..73b3dcf3 100644 --- a/qemu/include/fpu/softfloat.h +++ b/qemu/include/fpu/softfloat.h @@ -258,6 +258,7 @@ float16 float16_minnum(float16, float16, float_status *status); float16 float16_maxnum(float16, float16, float_status *status); float16 float16_minnummag(float16, float16, float_status *status); float16 float16_maxnummag(float16, float16, float_status *status); +float16 float16_sqrt(float16, float_status *status); int float16_compare(float16, float16, float_status *status); int float16_compare_quiet(float16, float16, float_status *status); diff --git a/qemu/m68k.h b/qemu/m68k.h index fb8a5327..189e567e 100644 --- a/qemu/m68k.h +++ b/qemu/m68k.h @@ -505,6 +505,7 @@ #define float16_muladd float16_muladd_m68k #define float16_round_to_int float16_round_to_int_m68k #define float16_scalbn float16_scalbn_m68k +#define float16_sqrt float16_sqrt_m68k #define float16_squash_input_denormal float16_squash_input_denormal_m68k #define float16_sub float16_sub_m68k #define float16_to_int16 float16_to_int16_m68k diff --git a/qemu/mips.h b/qemu/mips.h index e38d932e..e6e1b4e4 100644 --- a/qemu/mips.h +++ b/qemu/mips.h @@ -505,6 +505,7 @@ #define float16_muladd float16_muladd_mips #define float16_round_to_int float16_round_to_int_mips #define float16_scalbn float16_scalbn_mips +#define float16_sqrt float16_sqrt_mips #define float16_squash_input_denormal float16_squash_input_denormal_mips #define float16_sub float16_sub_mips #define float16_to_int16 float16_to_int16_mips diff --git a/qemu/mips64.h b/qemu/mips64.h index 2e4cf338..ea32b389 100644 --- a/qemu/mips64.h +++ b/qemu/mips64.h @@ -505,6 +505,7 @@ #define float16_muladd float16_muladd_mips64 #define float16_round_to_int float16_round_to_int_mips64 #define float16_scalbn float16_scalbn_mips64 +#define float16_sqrt float16_sqrt_mips64 #define float16_squash_input_denormal float16_squash_input_denormal_mips64 #define float16_sub float16_sub_mips64 #define float16_to_int16 float16_to_int16_mips64 diff --git a/qemu/mips64el.h b/qemu/mips64el.h index 620e93cd..a6c1aa7b 100644 --- a/qemu/mips64el.h +++ b/qemu/mips64el.h @@ -505,6 +505,7 @@ #define float16_muladd float16_muladd_mips64el #define float16_round_to_int float16_round_to_int_mips64el #define float16_scalbn float16_scalbn_mips64el +#define float16_sqrt float16_sqrt_mips64el #define float16_squash_input_denormal float16_squash_input_denormal_mips64el #define float16_sub float16_sub_mips64el #define float16_to_int16 float16_to_int16_mips64el diff --git a/qemu/mipsel.h b/qemu/mipsel.h index 98ef126e..a6456169 100644 --- a/qemu/mipsel.h +++ b/qemu/mipsel.h @@ -505,6 +505,7 @@ #define float16_muladd float16_muladd_mipsel #define float16_round_to_int float16_round_to_int_mipsel #define float16_scalbn float16_scalbn_mipsel +#define float16_sqrt float16_sqrt_mipsel #define float16_squash_input_denormal float16_squash_input_denormal_mipsel #define float16_sub float16_sub_mipsel #define float16_to_int16 float16_to_int16_mipsel diff --git a/qemu/powerpc.h b/qemu/powerpc.h index 4d514c5d..7e0d0118 100644 --- a/qemu/powerpc.h +++ b/qemu/powerpc.h @@ -505,6 +505,7 @@ #define float16_muladd float16_muladd_powerpc #define float16_round_to_int float16_round_to_int_powerpc #define float16_scalbn float16_scalbn_powerpc +#define float16_sqrt float16_sqrt_powerpc #define float16_squash_input_denormal float16_squash_input_denormal_powerpc #define float16_sub float16_sub_powerpc #define float16_to_int16 float16_to_int16_powerpc diff --git a/qemu/sparc.h b/qemu/sparc.h index f260903f..a09942f6 100644 --- a/qemu/sparc.h +++ b/qemu/sparc.h @@ -505,6 +505,7 @@ #define float16_muladd float16_muladd_sparc #define float16_round_to_int float16_round_to_int_sparc #define float16_scalbn float16_scalbn_sparc +#define float16_sqrt float16_sqrt_sparc #define float16_squash_input_denormal float16_squash_input_denormal_sparc #define float16_sub float16_sub_sparc #define float16_to_int16 float16_to_int16_sparc diff --git a/qemu/sparc64.h b/qemu/sparc64.h index f3ceb164..355ba1ec 100644 --- a/qemu/sparc64.h +++ b/qemu/sparc64.h @@ -505,6 +505,7 @@ #define float16_muladd float16_muladd_sparc64 #define float16_round_to_int float16_round_to_int_sparc64 #define float16_scalbn float16_scalbn_sparc64 +#define float16_sqrt float16_sqrt_sparc64 #define float16_squash_input_denormal float16_squash_input_denormal_sparc64 #define float16_sub float16_sub_sparc64 #define float16_to_int16 float16_to_int16_sparc64 diff --git a/qemu/x86_64.h b/qemu/x86_64.h index 5298e8ab..92c1355f 100644 --- a/qemu/x86_64.h +++ b/qemu/x86_64.h @@ -505,6 +505,7 @@ #define float16_muladd float16_muladd_x86_64 #define float16_round_to_int float16_round_to_int_x86_64 #define float16_scalbn float16_scalbn_x86_64 +#define float16_sqrt float16_sqrt_x86_64 #define float16_squash_input_denormal float16_squash_input_denormal_x86_64 #define float16_sub float16_sub_x86_64 #define float16_to_int16 float16_to_int16_x86_64