sewardj | cc3d219 | 2013-03-27 11:37:33 +0000 | [diff] [blame] | 1 | |
| 2 | /*---------------------------------------------------------------*/ |
| 3 | /*--- begin host_generic_maddf.c ---*/ |
| 4 | /*---------------------------------------------------------------*/ |
| 5 | |
| 6 | /* |
| 7 | Compute x * y + z as ternary operation. |
| 8 | Copyright (C) 2010-2013 Free Software Foundation, Inc. |
| 9 | This file is part of the GNU C Library. |
| 10 | Contributed by Jakub Jelinek <jakub@redhat.com>, 2010. |
| 11 | |
| 12 | The GNU C Library is free software; you can redistribute it and/or |
| 13 | modify it under the terms of the GNU Lesser General Public |
| 14 | License as published by the Free Software Foundation; either |
| 15 | version 2.1 of the License, or (at your option) any later version. |
| 16 | |
| 17 | The GNU C Library is distributed in the hope that it will be useful, |
| 18 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 20 | Lesser General Public License for more details. |
| 21 | |
| 22 | You should have received a copy of the GNU Lesser General Public |
| 23 | License along with the GNU C Library; if not, see |
| 24 | <http://www.gnu.org/licenses/>. |
| 25 | */ |
| 26 | |
| 27 | /* Generic helper functions for doing FMA, i.e. compute x * y + z |
| 28 | as ternary operation. |
| 29 | These are purely back-end entities and cannot be seen/referenced |
| 30 | from IR. */ |
| 31 | |
| 32 | #include "libvex_basictypes.h" |
| 33 | #include "host_generic_maddf.h" |
| 34 | #include "main_util.h" |
| 35 | |
| 36 | /* This implementation relies on Double being more than twice as |
| 37 | precise as Float and uses rounding to odd in order to avoid problems |
| 38 | with double rounding. |
| 39 | See a paper by Boldo and Melquiond: |
| 40 | http://www.lri.fr/~melquion/doc/08-tc.pdf */ |
| 41 | |
/* Force X to be stored to memory now, so the compiler cannot sink the
   computation of X below a later read of the FP status flags (the "m"
   constraint makes the asm a consumer of X's memory image). */
#define FORCE_EVAL(X) __asm __volatile__ ("" : : "m" (X))

#if defined(__x86_64__) && defined(__SSE2_MATH__)
/* The saved FP environment is simply a copy of the MXCSR register. */
# define ENV_TYPE unsigned int
/* Save current rounding mode into ENV, hold exceptions, set rounding
   mode to rounding toward zero.
   MXCSR layout used here: bits 0..5 are the exception flags (inexact
   is bit 5), bits 7..12 the exception mask bits, bits 13..14 the
   rounding control.  ORing 0x7f80 sets all six mask bits (exceptions
   held, i.e. no traps) and forces RC = 11b = round-toward-zero;
   ANDing ~0x3f clears the six sticky exception flags. */
# define ROUNDTOZERO(env) \
   do { \
      unsigned int mxcsr; \
      __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr)); \
      (env) = mxcsr; \
      mxcsr = (mxcsr | 0x7f80) & ~0x3f; \
      __asm __volatile__ ("ldmxcsr %0" : : "m" (mxcsr));\
   } while (0)
/* Restore exceptions from ENV, return if inexact exception has been raised
   since ROUNDTOZERO.
   Evaluates to 1 iff MXCSR bit 5 (inexact) is set.  The written-back
   MXCSR is ENV's control bits ORed with the flags raised meanwhile
   (mask 0x3d keeps flag bits 0,2,3,4,5; NOTE(review): bit 1, the
   denormal flag, is deliberately(?) dropped — matches the glibc
   original, confirm before changing). */
# define RESET_TESTINEXACT(env) \
   ({ \
      unsigned int mxcsr, ret; \
      __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr)); \
      ret = (mxcsr >> 5) & 1; \
      mxcsr = (mxcsr & 0x3d) | (env); \
      __asm __volatile__ ("ldmxcsr %0" : : "m" (mxcsr));\
      ret; \
   })
/* Return if inexact exception has been raised since ROUNDTOZERO.
   Non-destructive: reads MXCSR bit 5 without altering the FP state. */
# define TESTINEXACT() \
   ({ \
      unsigned int mxcsr; \
      __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr)); \
      (mxcsr >> 5) & 1; \
   })
#endif /* defined(__x86_64__) && defined(__SSE2_MATH__) */
| 75 | |
/* Number of mantissa digits of a Double, counting the implicit
   leading 1 bit. */
#define DBL_MANT_DIG 53
/* Exponent bias of the IEEE754 double format (1023). */
#define IEEE754_DOUBLE_BIAS 0x3ff

/* Overlay giving direct read/write access to the sign, biased
   exponent and mantissa fields of a Double.  exponent == 0x7ff
   means Inf/NaN; exponent == 0 means zero/subnormal. */
union vg_ieee754_double {
   Double d;

   /* This is the IEEE 754 double-precision format. */
   struct {
#ifdef VKI_BIG_ENDIAN
      unsigned int negative:1;
      unsigned int exponent:11;
      unsigned int mantissa0:20;   /* high 20 mantissa bits */
      unsigned int mantissa1:32;   /* low 32 mantissa bits */
#else
      unsigned int mantissa1:32;   /* low 32 mantissa bits */
      unsigned int mantissa0:20;   /* high 20 mantissa bits */
      unsigned int exponent:11;
      unsigned int negative:1;
#endif
   } ieee;
};
| 97 | |
/* Compute *res = (*argX * *argY) + *argZ for Floats with a single
   final rounding, i.e. a correctly-rounded fused multiply-add.
   Works because Double has more than twice Float's precision: the
   product of two Floats is exact in a Double, and the addition is
   done with round-to-odd so the concluding Double->Float conversion
   cannot suffer double rounding (Boldo/Melquiond construction). */
void VEX_REGPARM(3)
h_generic_calc_MAddF32 ( /*OUT*/Float* res,
                         Float* argX, Float* argY, Float* argZ )
{
#ifndef ENV_TYPE
   /* Lame fallback implementation: two roundings, so not a true FMA. */
   *res = *argX * *argY + *argZ;
#else
   ENV_TYPE env;
   /* Multiplication is always exact (24+24 significand bits fit in 53). */
   Double temp = (Double) *argX * (Double) *argY;
   union vg_ieee754_double u;

   ROUNDTOZERO (env);

   /* Perform addition with round to odd: truncate, then set the low
      mantissa bit below if the truncation was inexact. */
   u.d = temp + (Double) *argZ;
   /* Ensure the addition is not scheduled after fetestexcept call. */
   FORCE_EVAL (u.d);

   /* Reset rounding mode and test for inexact simultaneously. */
   int j = RESET_TESTINEXACT (env);

   /* Round-to-odd step: force the LSB to 1 when the truncated sum was
      inexact, except for Inf/NaN (exponent 0x7ff must stay intact). */
   if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
      u.ieee.mantissa1 |= j;

   /* And finally truncation with round to nearest. */
   *res = (Float) u.d;
#endif
}
| 128 | |
| 129 | |
/* Compute *res = (*argX * *argY) + *argZ for Doubles with a single
   rounding (true fused multiply-add).  With no wider type available,
   the product is split exactly via Dekker's algorithm and the sum via
   Knuth's two-sum; the low-order parts are combined with round-to-odd
   to avoid double rounding.  Operands whose exponents are near
   overflow/underflow are first rescaled by 2^-53 or 2^106 and the
   result compensated afterwards; 'adjust' records which compensation
   applies (+1: scale result up by 2^53, -1: scale down by 2^106). */
void VEX_REGPARM(3)
h_generic_calc_MAddF64 ( /*OUT*/Double* res,
                         Double* argX, Double* argY, Double* argZ )
{
#ifndef ENV_TYPE
   /* Lame fallback implementation: two roundings, so not a true FMA. */
   *res = *argX * *argY + *argZ;
#else
   Double x = *argX, y = *argY, z = *argZ;
   union vg_ieee754_double u, v, w;
   int adjust = 0;
   u.d = x;
   v.d = y;
   w.d = z;
   /* Slow path: some operand is Inf/NaN, or exponents are so extreme
      that the main algorithm could overflow or lose low bits. */
   if (UNLIKELY (u.ieee.exponent + v.ieee.exponent
                 >= 0x7ff + IEEE754_DOUBLE_BIAS - DBL_MANT_DIG)
       || UNLIKELY (u.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
       || UNLIKELY (v.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
       || UNLIKELY (w.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
       || UNLIKELY (u.ieee.exponent + v.ieee.exponent
                    <= IEEE754_DOUBLE_BIAS + DBL_MANT_DIG)) {
      /* If z is Inf, but x and y are finite, the result should be
         z rather than NaN. */
      if (w.ieee.exponent == 0x7ff
          && u.ieee.exponent != 0x7ff
          && v.ieee.exponent != 0x7ff) {
         *res = (z + x) + y;
         return;
      }
      /* If x or y or z is Inf/NaN, or if fma will certainly overflow,
         or if x * y is less than half of DBL_DENORM_MIN,
         compute as x * y + z. */
      if (u.ieee.exponent == 0x7ff
          || v.ieee.exponent == 0x7ff
          || w.ieee.exponent == 0x7ff
          || u.ieee.exponent + v.ieee.exponent > 0x7ff + IEEE754_DOUBLE_BIAS
          || u.ieee.exponent + v.ieee.exponent
             < IEEE754_DOUBLE_BIAS - DBL_MANT_DIG - 2) {
         *res = x * y + z;
         return;
      }
      if (u.ieee.exponent + v.ieee.exponent
          >= 0x7ff + IEEE754_DOUBLE_BIAS - DBL_MANT_DIG) {
         /* Compute 1p-53 times smaller result and multiply
            at the end. */
         if (u.ieee.exponent > v.ieee.exponent)
            u.ieee.exponent -= DBL_MANT_DIG;
         else
            v.ieee.exponent -= DBL_MANT_DIG;
         /* If x + y exponent is very large and z exponent is very small,
            it doesn't matter if we don't adjust it. */
         if (w.ieee.exponent > DBL_MANT_DIG)
            w.ieee.exponent -= DBL_MANT_DIG;
         adjust = 1;
      } else if (w.ieee.exponent >= 0x7ff - DBL_MANT_DIG) {
         /* Similarly.
            If z exponent is very large and x and y exponents are
            very small, it doesn't matter if we don't adjust it. */
         if (u.ieee.exponent > v.ieee.exponent) {
            if (u.ieee.exponent > DBL_MANT_DIG)
               u.ieee.exponent -= DBL_MANT_DIG;
         } else if (v.ieee.exponent > DBL_MANT_DIG)
            v.ieee.exponent -= DBL_MANT_DIG;
         w.ieee.exponent -= DBL_MANT_DIG;
         adjust = 1;
      } else if (u.ieee.exponent >= 0x7ff - DBL_MANT_DIG) {
         /* x near overflow: move a factor of 2^53 from x into y.
            If y's exponent field is 0 (zero/subnormal) it cannot be
            bumped directly, so scale with a multiply instead. */
         u.ieee.exponent -= DBL_MANT_DIG;
         if (v.ieee.exponent)
            v.ieee.exponent += DBL_MANT_DIG;
         else
            v.d *= 0x1p53;
      } else if (v.ieee.exponent >= 0x7ff - DBL_MANT_DIG) {
         /* Mirror image: y near overflow, shift the factor into x. */
         v.ieee.exponent -= DBL_MANT_DIG;
         if (u.ieee.exponent)
            u.ieee.exponent += DBL_MANT_DIG;
         else
            u.d *= 0x1p53;
      } else /* if (u.ieee.exponent + v.ieee.exponent
                   <= IEEE754_DOUBLE_BIAS + DBL_MANT_DIG) */ {
         /* x * y is tiny: scale the product up by 2^106 (and z too,
            when z is small enough to matter) to keep the intermediate
            sums out of the subnormal range. */
         if (u.ieee.exponent > v.ieee.exponent)
            u.ieee.exponent += 2 * DBL_MANT_DIG;
         else
            v.ieee.exponent += 2 * DBL_MANT_DIG;
         if (w.ieee.exponent <= 4 * DBL_MANT_DIG + 4) {
            if (w.ieee.exponent)
               w.ieee.exponent += 2 * DBL_MANT_DIG;
            else
               w.d *= 0x1p106;
            adjust = -1;
         }
         /* Otherwise x * y should just affect inexact
            and nothing else. */
      }
      x = u.d;
      y = v.d;
      z = w.d;
   }
   /* Multiplication m1 + m2 = x * y using Dekker's algorithm:
      split x and y into high/low halves, then m1 + m2 == x * y
      exactly. */
# define C ((1 << (DBL_MANT_DIG + 1) / 2) + 1)
   Double x1 = x * C;
   Double y1 = y * C;
   Double m1 = x * y;
   x1 = (x - x1) + x1;   /* high half of x */
   y1 = (y - y1) + y1;   /* high half of y */
   Double x2 = x - x1;   /* low half: x == x1 + x2 exactly */
   Double y2 = y - y1;   /* low half: y == y1 + y2 exactly */
   Double m2 = (((x1 * y1 - m1) + x1 * y2) + x2 * y1) + x2 * y2;
# undef C

   /* Addition a1 + a2 = z + m1 using Knuth's algorithm
      (two-sum: a1 + a2 == z + m1 exactly). */
   Double a1 = z + m1;
   Double t1 = a1 - z;
   Double t2 = a1 - t1;
   t1 = m1 - t1;
   t2 = z - t2;
   Double a2 = t1 + t2;

   ENV_TYPE env;
   ROUNDTOZERO (env);

   /* Perform m2 + a2 addition with round to odd (LSB forced below if
      the truncated sum turns out inexact). */
   u.d = a2 + m2;

   if (UNLIKELY (adjust < 0)) {
      if ((u.ieee.mantissa1 & 1) == 0)
         u.ieee.mantissa1 |= TESTINEXACT ();
      /* Pre-compute the 2^106-scaled result while still rounding
         toward zero; used in the adjust < 0 branch below. */
      v.d = a1 + u.d;
      /* Ensure the addition is not scheduled after fetestexcept call. */
      FORCE_EVAL (v.d);
   }

   /* Reset rounding mode and test for inexact simultaneously. */
   int j = RESET_TESTINEXACT (env) != 0;

   if (LIKELY (adjust == 0)) {
      /* Round-to-odd: set LSB if inexact, except on Inf/NaN. */
      if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
         u.ieee.mantissa1 |= j;
      /* Result is a1 + u.d. */
      *res = a1 + u.d;
   } else if (LIKELY (adjust > 0)) {
      if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
         u.ieee.mantissa1 |= j;
      /* Result is a1 + u.d, scaled up. */
      *res = (a1 + u.d) * 0x1p53;
   } else {
      /* adjust < 0: result must be scaled DOWN by 2^106; the scaling
         itself can round into the subnormal range, so double rounding
         has to be headed off case by case. */
      /* If a1 + u.d is exact, the only rounding happens during
         scaling down. */
      if (j == 0) {
         *res = v.d * 0x1p-106;
         return;
      }
      /* If result rounded to zero is not subnormal, no double
         rounding will occur. */
      if (v.ieee.exponent > 106) {
         *res = (a1 + u.d) * 0x1p-106;
         return;
      }
      /* If v.d * 0x1p-106 with round to zero is a subnormal above
         or equal to DBL_MIN / 2, then v.d * 0x1p-106 shifts mantissa
         down just by 1 bit, which means v.ieee.mantissa1 |= j would
         change the round bit, not sticky or guard bit.
         v.d * 0x1p-106 never normalizes by shifting up,
         so round bit plus sticky bit should be already enough
         for proper rounding. */
      if (v.ieee.exponent == 106) {
         /* v.ieee.mantissa1 & 2 is LSB bit of the result before rounding,
            v.ieee.mantissa1 & 1 is the round bit and j is our sticky
            bit. In round-to-nearest 001 rounds down like 00,
            011 rounds up, even though 01 rounds down (thus we need
            to adjust), 101 rounds down like 10 and 111 rounds up
            like 11. */
         if ((v.ieee.mantissa1 & 3) == 1) {
            v.d *= 0x1p-106;
            /* Nudge by one ulp of the subnormal range (2^-1074)
               toward the rounded-up value. */
            if (v.ieee.negative)
               *res = v.d - 0x1p-1074;
            else
               *res = v.d + 0x1p-1074;
         } else
            *res = v.d * 0x1p-106;
         return;
      }
      /* Deep subnormal: fold the sticky bit in before scaling. */
      v.ieee.mantissa1 |= j;
      *res = v.d * 0x1p-106;
      return;
   }
#endif
}
| 317 | |
| 318 | /*---------------------------------------------------------------*/ |
| 319 | /*--- end host_generic_maddf.c --*/ |
| 320 | /*---------------------------------------------------------------*/ |