From ddb2f1d4ddc39defd0fe92ec7dc621b11753b33c Mon Sep 17 00:00:00 2001 From: Joseph Myers Date: Thu, 25 Feb 2021 13:46:27 -0500 Subject: [PATCH] target/i386: reimplement fyl2x using floatx80 operations The x87 fyl2x emulation is currently based around conversion to double. This is inherently unsuitable for a good emulation of any floatx80 operation. Reimplement using the soft-float operations, building on top of the reimplementation of fyl2xp1 and factoring out code to be shared between the two instructions. The included test assumes that the result in round-to-nearest mode should always be one of the two closest floating-point numbers to the mathematically exact result (including that it should be exact, in the exact cases which cover more cases than for fyl2xp1). Backports commit 1f18a1e6ab8368a4eab2d22894d3b2ae75250cd3 from qemu --- qemu/target/i386/fpu_helper.c | 372 ++++++++++++++++++++++++---------- 1 file changed, 262 insertions(+), 110 deletions(-) diff --git a/qemu/target/i386/fpu_helper.c b/qemu/target/i386/fpu_helper.c index 776e93d6..7bf2db7c 100644 --- a/qemu/target/i386/fpu_helper.c +++ b/qemu/target/i386/fpu_helper.c @@ -1191,21 +1191,6 @@ void helper_f2xm1(CPUX86State *env) merge_exception_flags(env, old_flags); } -void helper_fyl2x(CPUX86State *env) -{ - double fptemp = floatx80_to_double(env, ST0); - - if (fptemp > 0.0) { - fptemp = log(fptemp) / log(2.0); /* log2(ST) */ - fptemp *= floatx80_to_double(env, ST1); - ST1 = double_to_floatx80(env, fptemp); - fpop(env); - } else { - env->fpus &= ~0x4700; - env->fpus |= 0x400; - } -} - void helper_fptan(CPUX86State *env) { double fptemp = floatx80_to_double(env, ST0); @@ -1364,6 +1349,118 @@ void helper_fprem(CPUX86State *env) #define fyl2x_coeff_8 make_floatx80(0x3ffc, 0xac5cf50cc57d6372ULL) #define fyl2x_coeff_9 make_floatx80(0x3ffc, 0xb1ed0066d971a103ULL) +/* + * Compute an approximation of log2(1+arg), where 1+arg is in the + * interval [sqrt(2)/2, sqrt(2)]. It is assumed that when this + * function is called, rounding precision is set to 80 and the + * round-to-nearest mode is in effect. arg must not be exactly zero, + * and must not be so close to zero that underflow might occur. + */ +static void helper_fyl2x_common(CPUX86State *env, floatx80 arg, int32_t *exp, + uint64_t *sig0, uint64_t *sig1) +{ + uint64_t arg0_sig = extractFloatx80Frac(arg); + int32_t arg0_exp = extractFloatx80Exp(arg); + bool arg0_sign = extractFloatx80Sign(arg); + bool asign; + int32_t dexp, texp, aexp; + uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2; + uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3; + uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1; + floatx80 t2, accum; + + /* + * Compute an approximation of arg/(2+arg), with extra precision, + * as the argument to a polynomial approximation. The extra + * precision is only needed for the first term of the + * approximation, with subsequent terms being significantly + * smaller; the approximation only uses odd exponents, and the + * square of arg/(2+arg) is at most 17-12*sqrt(2) = 0.029.... + */ + if (arg0_sign) { + dexp = 0x3fff; + shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1); + sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1); + } else { + dexp = 0x4000; + shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1); + dsig0 |= 0x8000000000000000ULL; + } + texp = arg0_exp - dexp + 0x3ffe; + rsig0 = arg0_sig; + rsig1 = 0; + rsig2 = 0; + if (dsig0 <= rsig0) { + shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1); + ++texp; + } + tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0); + mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2); + sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2, + &rsig0, &rsig1, &rsig2); + while ((int64_t) rsig0 < 0) { + --tsig0; + add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1, + &rsig0, &rsig1, &rsig2); + } + tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0); + /* + * No need to correct any estimation error in tsig1; even with + * such error, it is accurate enough. Now compute the square of + * that approximation. + */ + mul128To256(tsig0, tsig1, tsig0, tsig1, + &t2sig0, &t2sig1, &t2sig2, &t2sig3); + t2 = normalizeRoundAndPackFloatx80(80, false, texp + texp - 0x3ffe, + t2sig0, t2sig1, &env->fp_status); + + /* Compute the lower parts of the polynomial expansion. */ + accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status); + accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status); + accum = floatx80_mul(accum, t2, &env->fp_status); + accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status); + accum = floatx80_mul(accum, t2, &env->fp_status); + accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status); + accum = floatx80_mul(accum, t2, &env->fp_status); + accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status); + accum = floatx80_mul(accum, t2, &env->fp_status); + accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status); + accum = floatx80_mul(accum, t2, &env->fp_status); + accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status); + accum = floatx80_mul(accum, t2, &env->fp_status); + accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status); + accum = floatx80_mul(accum, t2, &env->fp_status); + accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status); + accum = floatx80_mul(accum, t2, &env->fp_status); + accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status); + + /* + * The full polynomial expansion is fyl2x_coeff_0 + accum (where + * accum has much lower magnitude, and so, in particular, carry + * out of the addition is not possible), multiplied by t. (This + * expansion is only accurate to about 70 bits, not 128 bits.) + */ + aexp = extractFloatx80Exp(fyl2x_coeff_0); + asign = extractFloatx80Sign(fyl2x_coeff_0); + shift128RightJamming(extractFloatx80Frac(accum), 0, + aexp - extractFloatx80Exp(accum), + &asig0, &asig1); + bsig0 = extractFloatx80Frac(fyl2x_coeff_0); + bsig1 = 0; + if (asign == extractFloatx80Sign(accum)) { + add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1); + } else { + sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1); + } + /* Multiply by t to compute the required result. */ + mul128To256(asig0, asig1, tsig0, tsig1, + &asig0, &asig1, &asig2, &asig3); + aexp += texp - 0x3ffe; + *exp = aexp; + *sig0 = asig0; + *sig1 = asig1; +} + void helper_fyl2xp1(CPUX86State *env) { uint8_t old_flags = save_exception_flags(env); @@ -1431,109 +1528,19 @@ void helper_fyl2xp1(CPUX86State *env) ST1 = normalizeRoundAndPackFloatx80(80, arg0_sign ^ arg1_sign, exp, sig0, sig1, &env->fp_status); } else { - bool asign; - uint32_t dexp, texp, aexp; - uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2; - uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3; - uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1; - floatx80 t2, accum; + int32_t aexp; + uint64_t asig0, asig1, asig2; FloatRoundMode save_mode = env->fp_status.float_rounding_mode; signed char save_prec = env->fp_status.floatx80_rounding_precision; env->fp_status.float_rounding_mode = float_round_nearest_even; env->fp_status.floatx80_rounding_precision = 80; - /* - * Compute an approximation of ST0/(2+ST0), with extra - * precision, as the argument to a polynomial approximation. - * The extra precision is only needed for the first term of - * the approximation, with subsequent terms being - * significantly smaller; the approximation only uses odd - * exponents, and the square of ST0/(2+ST0) is at most - * 17-12*sqrt(2) = 0.029.... - */ - if (arg0_sign) { - dexp = 0x3fff; - shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1); - sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1); - } else { - dexp = 0x4000; - shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1); - dsig0 |= 0x8000000000000000ULL; - } - texp = arg0_exp - dexp + 0x3ffe; - rsig0 = arg0_sig; - rsig1 = 0; - rsig2 = 0; - if (dsig0 <= rsig0) { - shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1); - ++texp; - } - tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0); - mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2); - sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2, - &rsig0, &rsig1, &rsig2); - while ((int64_t) rsig0 < 0) { - --tsig0; - add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1, - &rsig0, &rsig1, &rsig2); - } - tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0); - /* - * No need to correct any estimation error in tsig1; even with - * such error, it is accurate enough. Now compute the square - * of that approximation. - */ - mul128To256(tsig0, tsig1, tsig0, tsig1, - &t2sig0, &t2sig1, &t2sig2, &t2sig3); - t2 = normalizeRoundAndPackFloatx80(80, false, texp + texp - 0x3ffe, - t2sig0, t2sig1, &env->fp_status); - - /* Compute the lower parts of the polynomial expansion. */ - accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status); - accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status); - accum = floatx80_mul(accum, t2, &env->fp_status); - accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status); - accum = floatx80_mul(accum, t2, &env->fp_status); - accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status); - accum = floatx80_mul(accum, t2, &env->fp_status); - accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status); - accum = floatx80_mul(accum, t2, &env->fp_status); - accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status); - accum = floatx80_mul(accum, t2, &env->fp_status); - accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status); - accum = floatx80_mul(accum, t2, &env->fp_status); - accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status); - accum = floatx80_mul(accum, t2, &env->fp_status); - accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status); - accum = floatx80_mul(accum, t2, &env->fp_status); - accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status); + helper_fyl2x_common(env, ST0, &aexp, &asig0, &asig1); /* - * The full polynomial expansion is fyl2x_coeff_0 + accum - * (where accum has much lower magnitude, and so, in - * particular, carry out of the addition is not possible), - * multiplied by t. (This expansion is only accurate to about - * 70 bits, not 128 bits.) + * Multiply by the second argument to compute the required + * result. */ - aexp = extractFloatx80Exp(fyl2x_coeff_0); - asign = extractFloatx80Sign(fyl2x_coeff_0); - shift128RightJamming(extractFloatx80Frac(accum), 0, - aexp - extractFloatx80Exp(accum), - &asig0, &asig1); - bsig0 = extractFloatx80Frac(fyl2x_coeff_0); - bsig1 = 0; - if (asign == extractFloatx80Sign(accum)) { - add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1); - } else { - sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1); - } - /* - * Multiply by t and by the second argument to compute the - * required result. - */ - mul128To256(asig0, asig1, tsig0, tsig1, - &asig0, &asig1, &asig2, &asig3); - aexp += texp - 0x3ffe; if (arg1_exp == 0) { normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig); } @@ -1550,6 +1557,151 @@ void helper_fyl2xp1(CPUX86State *env) merge_exception_flags(env, old_flags); } +void helper_fyl2x(CPUX86State *env) +{ + uint8_t old_flags = save_exception_flags(env); + uint64_t arg0_sig = extractFloatx80Frac(ST0); + int32_t arg0_exp = extractFloatx80Exp(ST0); + bool arg0_sign = extractFloatx80Sign(ST0); + uint64_t arg1_sig = extractFloatx80Frac(ST1); + int32_t arg1_exp = extractFloatx80Exp(ST1); + bool arg1_sign = extractFloatx80Sign(ST1); + + if (floatx80_is_signaling_nan(ST0, &env->fp_status)) { + float_raise(float_flag_invalid, &env->fp_status); + ST1 = floatx80_silence_nan(ST0, &env->fp_status); + } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) { + float_raise(float_flag_invalid, &env->fp_status); + ST1 = floatx80_silence_nan(ST1, &env->fp_status); + } else if (floatx80_invalid_encoding(ST0) || + floatx80_invalid_encoding(ST1)) { + float_raise(float_flag_invalid, &env->fp_status); + ST1 = floatx80_default_nan(&env->fp_status); + } else if (floatx80_is_any_nan(ST0)) { + ST1 = ST0; + } else if (floatx80_is_any_nan(ST1)) { + /* Pass this NaN through. */ + } else if (arg0_sign && !floatx80_is_zero(ST0)) { + float_raise(float_flag_invalid, &env->fp_status); + ST1 = floatx80_default_nan(&env->fp_status); + } else if (floatx80_is_infinity(ST1)) { + FloatRelation cmp = floatx80_compare(ST0, floatx80_one, + &env->fp_status); + switch (cmp) { + case float_relation_less: + ST1 = floatx80_chs(ST1); + break; + case float_relation_greater: + /* Result is infinity of the same sign as ST1. */ + break; + default: + float_raise(float_flag_invalid, &env->fp_status); + ST1 = floatx80_default_nan(&env->fp_status); + break; + } + } else if (floatx80_is_infinity(ST0)) { + if (floatx80_is_zero(ST1)) { + float_raise(float_flag_invalid, &env->fp_status); + ST1 = floatx80_default_nan(&env->fp_status); + } else if (arg1_sign) { + ST1 = floatx80_chs(ST0); + } else { + ST1 = ST0; + } + } else if (floatx80_is_zero(ST0)) { + if (floatx80_is_zero(ST1)) { + float_raise(float_flag_invalid, &env->fp_status); + ST1 = floatx80_default_nan(&env->fp_status); + } else { + /* Result is infinity with opposite sign to ST1. */ + float_raise(float_flag_divbyzero, &env->fp_status); + ST1 = make_floatx80(arg1_sign ? 0x7fff : 0xffff, + 0x8000000000000000ULL); + } + } else if (floatx80_is_zero(ST1)) { + if (floatx80_lt(ST0, floatx80_one, &env->fp_status)) { + ST1 = floatx80_chs(ST1); + } + /* Otherwise, ST1 is already the correct result. */ + } else if (floatx80_eq(ST0, floatx80_one, &env->fp_status)) { + if (arg1_sign) { + ST1 = floatx80_chs(floatx80_zero); + } else { + ST1 = floatx80_zero; + } + } else { + int32_t int_exp; + floatx80 arg0_m1; + FloatRoundMode save_mode = env->fp_status.float_rounding_mode; + signed char save_prec = env->fp_status.floatx80_rounding_precision; + env->fp_status.float_rounding_mode = float_round_nearest_even; + env->fp_status.floatx80_rounding_precision = 80; + + if (arg0_exp == 0) { + normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig); + } + if (arg1_exp == 0) { + normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig); + } + int_exp = arg0_exp - 0x3fff; + if (arg0_sig > 0xb504f333f9de6484ULL) { + ++int_exp; + } + arg0_m1 = floatx80_sub(floatx80_scalbn(ST0, -int_exp, + &env->fp_status), + floatx80_one, &env->fp_status); + if (floatx80_is_zero(arg0_m1)) { + /* Exact power of 2; multiply by ST1. */ + env->fp_status.float_rounding_mode = save_mode; + ST1 = floatx80_mul(int32_to_floatx80(int_exp, &env->fp_status), + ST1, &env->fp_status); + } else { + bool asign = extractFloatx80Sign(arg0_m1); + int32_t aexp; + uint64_t asig0, asig1, asig2; + helper_fyl2x_common(env, arg0_m1, &aexp, &asig0, &asig1); + if (int_exp != 0) { + bool isign = (int_exp < 0); + int32_t iexp; + uint64_t isig; + int shift; + int_exp = isign ? -int_exp : int_exp; + shift = clz32(int_exp) + 32; + isig = int_exp; + isig <<= shift; + iexp = 0x403e - shift; + shift128RightJamming(asig0, asig1, iexp - aexp, + &asig0, &asig1); + if (asign == isign) { + add128(isig, 0, asig0, asig1, &asig0, &asig1); + } else { + sub128(isig, 0, asig0, asig1, &asig0, &asig1); + } + aexp = iexp; + asign = isign; + } + /* + * Multiply by the second argument to compute the required + * result. + */ + if (arg1_exp == 0) { + normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig); + } + mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2); + aexp += arg1_exp - 0x3ffe; + /* This result is inexact. */ + asig1 |= 1; + env->fp_status.float_rounding_mode = save_mode; + ST1 = normalizeRoundAndPackFloatx80(80, asign ^ arg1_sign, aexp, + asig0, asig1, &env->fp_status); + } + + env->fp_status.floatx80_rounding_precision = save_prec; + } + fpop(env); + merge_exception_flags(env, old_flags); +} + void helper_fsqrt(CPUX86State *env) { uint8_t old_flags = save_exception_flags(env);