Tom Stellard | c0ab2f8 | 2014-07-23 15:16:18 +0000 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (c) 2014 Advanced Micro Devices, Inc. |
| 3 | * |
| 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 5 | * of this software and associated documentation files (the "Software"), to deal |
| 6 | * in the Software without restriction, including without limitation the rights |
| 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 8 | * copies of the Software, and to permit persons to whom the Software is |
| 9 | * furnished to do so, subject to the following conditions: |
| 10 | * |
| 11 | * The above copyright notice and this permission notice shall be included in |
| 12 | * all copies or substantial portions of the Software. |
| 13 | * |
| 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
| 20 | * THE SOFTWARE. |
| 21 | */ |
| 22 | |
| 23 | #include <clc/clc.h> |
| 24 | |
| 25 | #include "math.h" |
Tom Stellard | 2e6ff0c | 2015-05-12 17:18:46 +0000 | [diff] [blame] | 26 | #include "tables.h" |
Tom Stellard | c0ab2f8 | 2014-07-23 15:16:18 +0000 | [diff] [blame] | 27 | #include "sincos_helpers.h" |
| 28 | |
Tom Stellard | 8d3a4e3 | 2015-03-23 16:20:24 +0000 | [diff] [blame] | 29 | #define bitalign(hi, lo, shift) \ |
| 30 | ((hi) << (32 - (shift))) | ((lo) >> (shift)); |
Tom Stellard | c0ab2f8 | 2014-07-23 15:16:18 +0000 | [diff] [blame] | 31 | |
Tom Stellard | 2e6ff0c | 2015-05-12 17:18:46 +0000 | [diff] [blame] | 32 | #define bytealign(src0, src1, src2) \ |
| 33 | ((uint) (((((long)(src0)) << 32) | (long)(src1)) >> (((src2) & 3)*8))) |
| 34 | |
Tom Stellard | 8d3a4e3 | 2015-03-23 16:20:24 +0000 | [diff] [blame] | 35 | _CLC_DEF float __clc_sinf_piby4(float x, float y) { |
Tom Stellard | c0ab2f8 | 2014-07-23 15:16:18 +0000 | [diff] [blame] | 36 | // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ... |
| 37 | // = x * (1 - x^2/3! + x^4/5! - x^6/7! ... |
| 38 | // = x * f(w) |
| 39 | // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ... |
| 40 | // We use a minimax approximation of (f(w) - 1) / w |
| 41 | // because this produces an expansion in even powers of x. |
| 42 | |
| 43 | const float c1 = -0.1666666666e0f; |
| 44 | const float c2 = 0.8333331876e-2f; |
| 45 | const float c3 = -0.198400874e-3f; |
| 46 | const float c4 = 0.272500015e-5f; |
| 47 | const float c5 = -2.5050759689e-08f; // 0xb2d72f34 |
| 48 | const float c6 = 1.5896910177e-10f; // 0x2f2ec9d3 |
| 49 | |
| 50 | float z = x * x; |
| 51 | float v = z * x; |
| 52 | float r = mad(z, mad(z, mad(z, mad(z, c6, c5), c4), c3), c2); |
| 53 | float ret = x - mad(v, -c1, mad(z, mad(y, 0.5f, -v*r), -y)); |
| 54 | |
| 55 | return ret; |
| 56 | } |
| 57 | |
Tom Stellard | 8d3a4e3 | 2015-03-23 16:20:24 +0000 | [diff] [blame] | 58 | _CLC_DEF float __clc_cosf_piby4(float x, float y) { |
Tom Stellard | c0ab2f8 | 2014-07-23 15:16:18 +0000 | [diff] [blame] | 59 | // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ... |
| 60 | // = f(w) |
| 61 | // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ... |
| 62 | // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w) |
| 63 | // because this produces an expansion in even powers of x. |
| 64 | |
| 65 | const float c1 = 0.416666666e-1f; |
| 66 | const float c2 = -0.138888876e-2f; |
| 67 | const float c3 = 0.248006008e-4f; |
| 68 | const float c4 = -0.2730101334e-6f; |
| 69 | const float c5 = 2.0875723372e-09f; // 0x310f74f6 |
| 70 | const float c6 = -1.1359647598e-11f; // 0xad47d74e |
| 71 | |
| 72 | float z = x * x; |
| 73 | float r = z * mad(z, mad(z, mad(z, mad(z, mad(z, c6, c5), c4), c3), c2), c1); |
| 74 | |
| 75 | // if |x| < 0.3 |
| 76 | float qx = 0.0f; |
| 77 | |
| 78 | int ix = as_int(x) & EXSIGNBIT_SP32; |
| 79 | |
| 80 | // 0.78125 > |x| >= 0.3 |
| 81 | float xby4 = as_float(ix - 0x01000000); |
| 82 | qx = (ix >= 0x3e99999a) & (ix <= 0x3f480000) ? xby4 : qx; |
| 83 | |
| 84 | // x > 0.78125 |
| 85 | qx = ix > 0x3f480000 ? 0.28125f : qx; |
| 86 | |
| 87 | float hz = mad(z, 0.5f, -qx); |
| 88 | float a = 1.0f - qx; |
| 89 | float ret = a - (hz - mad(z, r, -x*y)); |
| 90 | return ret; |
| 91 | } |
| 92 | |
Tom Stellard | 8d3a4e3 | 2015-03-23 16:20:24 +0000 | [diff] [blame] | 93 | _CLC_DEF void __clc_fullMulS(float *hi, float *lo, float a, float b, float bh, float bt) |
Tom Stellard | c0ab2f8 | 2014-07-23 15:16:18 +0000 | [diff] [blame] | 94 | { |
| 95 | if (HAVE_HW_FMA32()) { |
| 96 | float ph = a * b; |
| 97 | *hi = ph; |
| 98 | *lo = fma(a, b, -ph); |
| 99 | } else { |
| 100 | float ah = as_float(as_uint(a) & 0xfffff000U); |
| 101 | float at = a - ah; |
| 102 | float ph = a * b; |
| 103 | float pt = mad(at, bt, mad(at, bh, mad(ah, bt, mad(ah, bh, -ph)))); |
| 104 | *hi = ph; |
| 105 | *lo = pt; |
| 106 | } |
| 107 | } |
| 108 | |
Tom Stellard | 8d3a4e3 | 2015-03-23 16:20:24 +0000 | [diff] [blame] | 109 | _CLC_DEF float __clc_removePi2S(float *hi, float *lo, float x) |
Tom Stellard | c0ab2f8 | 2014-07-23 15:16:18 +0000 | [diff] [blame] | 110 | { |
| 111 | // 72 bits of pi/2 |
| 112 | const float fpiby2_1 = (float) 0xC90FDA / 0x1.0p+23f; |
| 113 | const float fpiby2_1_h = (float) 0xC90 / 0x1.0p+11f; |
| 114 | const float fpiby2_1_t = (float) 0xFDA / 0x1.0p+23f; |
| 115 | |
| 116 | const float fpiby2_2 = (float) 0xA22168 / 0x1.0p+47f; |
| 117 | const float fpiby2_2_h = (float) 0xA22 / 0x1.0p+35f; |
| 118 | const float fpiby2_2_t = (float) 0x168 / 0x1.0p+47f; |
| 119 | |
| 120 | const float fpiby2_3 = (float) 0xC234C4 / 0x1.0p+71f; |
| 121 | const float fpiby2_3_h = (float) 0xC23 / 0x1.0p+59f; |
| 122 | const float fpiby2_3_t = (float) 0x4C4 / 0x1.0p+71f; |
| 123 | |
| 124 | const float twobypi = 0x1.45f306p-1f; |
| 125 | |
| 126 | float fnpi2 = trunc(mad(x, twobypi, 0.5f)); |
| 127 | |
| 128 | // subtract n * pi/2 from x |
| 129 | float rhead, rtail; |
Tom Stellard | 8d3a4e3 | 2015-03-23 16:20:24 +0000 | [diff] [blame] | 130 | __clc_fullMulS(&rhead, &rtail, fnpi2, fpiby2_1, fpiby2_1_h, fpiby2_1_t); |
Tom Stellard | c0ab2f8 | 2014-07-23 15:16:18 +0000 | [diff] [blame] | 131 | float v = x - rhead; |
| 132 | float rem = v + (((x - v) - rhead) - rtail); |
| 133 | |
| 134 | float rhead2, rtail2; |
Tom Stellard | 8d3a4e3 | 2015-03-23 16:20:24 +0000 | [diff] [blame] | 135 | __clc_fullMulS(&rhead2, &rtail2, fnpi2, fpiby2_2, fpiby2_2_h, fpiby2_2_t); |
Tom Stellard | c0ab2f8 | 2014-07-23 15:16:18 +0000 | [diff] [blame] | 136 | v = rem - rhead2; |
| 137 | rem = v + (((rem - v) - rhead2) - rtail2); |
| 138 | |
| 139 | float rhead3, rtail3; |
Tom Stellard | 8d3a4e3 | 2015-03-23 16:20:24 +0000 | [diff] [blame] | 140 | __clc_fullMulS(&rhead3, &rtail3, fnpi2, fpiby2_3, fpiby2_3_h, fpiby2_3_t); |
Tom Stellard | c0ab2f8 | 2014-07-23 15:16:18 +0000 | [diff] [blame] | 141 | v = rem - rhead3; |
| 142 | |
| 143 | *hi = v + ((rem - v) - rhead3); |
| 144 | *lo = -rtail3; |
| 145 | return fnpi2; |
| 146 | } |
| 147 | |
Tom Stellard | 8d3a4e3 | 2015-03-23 16:20:24 +0000 | [diff] [blame] | 148 | _CLC_DEF int __clc_argReductionSmallS(float *r, float *rr, float x) |
Tom Stellard | c0ab2f8 | 2014-07-23 15:16:18 +0000 | [diff] [blame] | 149 | { |
Tom Stellard | 8d3a4e3 | 2015-03-23 16:20:24 +0000 | [diff] [blame] | 150 | float fnpi2 = __clc_removePi2S(r, rr, x); |
Tom Stellard | c0ab2f8 | 2014-07-23 15:16:18 +0000 | [diff] [blame] | 151 | return (int)fnpi2 & 0x3; |
| 152 | } |
| 153 | |
| 154 | #define FULL_MUL(A, B, HI, LO) \ |
| 155 | LO = A * B; \ |
| 156 | HI = mul_hi(A, B) |
| 157 | |
| 158 | #define FULL_MAD(A, B, C, HI, LO) \ |
| 159 | LO = ((A) * (B) + (C)); \ |
| 160 | HI = mul_hi(A, B); \ |
| 161 | HI += LO < C |
| 162 | |
Tom Stellard | 8d3a4e3 | 2015-03-23 16:20:24 +0000 | [diff] [blame] | 163 | _CLC_DEF int __clc_argReductionLargeS(float *r, float *rr, float x) |
Tom Stellard | c0ab2f8 | 2014-07-23 15:16:18 +0000 | [diff] [blame] | 164 | { |
| 165 | int xe = (int)(as_uint(x) >> 23) - 127; |
| 166 | uint xm = 0x00800000U | (as_uint(x) & 0x7fffffU); |
| 167 | |
| 168 | // 224 bits of 2/PI: . A2F9836E 4E441529 FC2757D1 F534DDC0 DB629599 3C439041 FE5163AB |
| 169 | const uint b6 = 0xA2F9836EU; |
| 170 | const uint b5 = 0x4E441529U; |
| 171 | const uint b4 = 0xFC2757D1U; |
| 172 | const uint b3 = 0xF534DDC0U; |
| 173 | const uint b2 = 0xDB629599U; |
| 174 | const uint b1 = 0x3C439041U; |
| 175 | const uint b0 = 0xFE5163ABU; |
| 176 | |
| 177 | uint p0, p1, p2, p3, p4, p5, p6, p7, c0, c1; |
| 178 | |
| 179 | FULL_MUL(xm, b0, c0, p0); |
| 180 | FULL_MAD(xm, b1, c0, c1, p1); |
| 181 | FULL_MAD(xm, b2, c1, c0, p2); |
| 182 | FULL_MAD(xm, b3, c0, c1, p3); |
| 183 | FULL_MAD(xm, b4, c1, c0, p4); |
| 184 | FULL_MAD(xm, b5, c0, c1, p5); |
| 185 | FULL_MAD(xm, b6, c1, p7, p6); |
| 186 | |
| 187 | uint fbits = 224 + 23 - xe; |
| 188 | |
| 189 | // shift amount to get 2 lsb of integer part at top 2 bits |
| 190 | // min: 25 (xe=18) max: 134 (xe=127) |
| 191 | uint shift = 256U - 2 - fbits; |
| 192 | |
| 193 | // Shift by up to 134/32 = 4 words |
| 194 | int c = shift > 31; |
| 195 | p7 = c ? p6 : p7; |
| 196 | p6 = c ? p5 : p6; |
| 197 | p5 = c ? p4 : p5; |
| 198 | p4 = c ? p3 : p4; |
| 199 | p3 = c ? p2 : p3; |
| 200 | p2 = c ? p1 : p2; |
| 201 | p1 = c ? p0 : p1; |
| 202 | shift -= (-c) & 32; |
| 203 | |
| 204 | c = shift > 31; |
| 205 | p7 = c ? p6 : p7; |
| 206 | p6 = c ? p5 : p6; |
| 207 | p5 = c ? p4 : p5; |
| 208 | p4 = c ? p3 : p4; |
| 209 | p3 = c ? p2 : p3; |
| 210 | p2 = c ? p1 : p2; |
| 211 | shift -= (-c) & 32; |
| 212 | |
| 213 | c = shift > 31; |
| 214 | p7 = c ? p6 : p7; |
| 215 | p6 = c ? p5 : p6; |
| 216 | p5 = c ? p4 : p5; |
| 217 | p4 = c ? p3 : p4; |
| 218 | p3 = c ? p2 : p3; |
| 219 | shift -= (-c) & 32; |
| 220 | |
| 221 | c = shift > 31; |
| 222 | p7 = c ? p6 : p7; |
| 223 | p6 = c ? p5 : p6; |
| 224 | p5 = c ? p4 : p5; |
| 225 | p4 = c ? p3 : p4; |
| 226 | shift -= (-c) & 32; |
| 227 | |
| 228 | // bitalign cannot handle a shift of 32 |
| 229 | c = shift > 0; |
| 230 | shift = 32 - shift; |
| 231 | uint t7 = bitalign(p7, p6, shift); |
| 232 | uint t6 = bitalign(p6, p5, shift); |
| 233 | uint t5 = bitalign(p5, p4, shift); |
| 234 | p7 = c ? t7 : p7; |
| 235 | p6 = c ? t6 : p6; |
| 236 | p5 = c ? t5 : p5; |
| 237 | |
| 238 | // Get 2 lsb of int part and msb of fraction |
| 239 | int i = p7 >> 29; |
| 240 | |
| 241 | // Scoot up 2 more bits so only fraction remains |
| 242 | p7 = bitalign(p7, p6, 30); |
| 243 | p6 = bitalign(p6, p5, 30); |
| 244 | p5 = bitalign(p5, p4, 30); |
| 245 | |
| 246 | // Subtract 1 if msb of fraction is 1, i.e. fraction >= 0.5 |
| 247 | uint flip = i & 1 ? 0xffffffffU : 0U; |
| 248 | uint sign = i & 1 ? 0x80000000U : 0U; |
| 249 | p7 = p7 ^ flip; |
| 250 | p6 = p6 ^ flip; |
| 251 | p5 = p5 ^ flip; |
| 252 | |
| 253 | // Find exponent and shift away leading zeroes and hidden bit |
| 254 | xe = clz(p7) + 1; |
| 255 | shift = 32 - xe; |
| 256 | p7 = bitalign(p7, p6, shift); |
| 257 | p6 = bitalign(p6, p5, shift); |
| 258 | |
| 259 | // Most significant part of fraction |
| 260 | float q1 = as_float(sign | ((127 - xe) << 23) | (p7 >> 9)); |
| 261 | |
| 262 | // Shift out bits we captured on q1 |
| 263 | p7 = bitalign(p7, p6, 32-23); |
| 264 | |
| 265 | // Get 24 more bits of fraction in another float, there are not long strings of zeroes here |
| 266 | int xxe = clz(p7) + 1; |
| 267 | p7 = bitalign(p7, p6, 32-xxe); |
| 268 | float q0 = as_float(sign | ((127 - (xe + 23 + xxe)) << 23) | (p7 >> 9)); |
| 269 | |
| 270 | // At this point, the fraction q1 + q0 is correct to at least 48 bits |
| 271 | // Now we need to multiply the fraction by pi/2 |
| 272 | // This loses us about 4 bits |
| 273 | // pi/2 = C90 FDA A22 168 C23 4C4 |
| 274 | |
| 275 | const float pio2h = (float)0xc90fda / 0x1.0p+23f; |
| 276 | const float pio2hh = (float)0xc90 / 0x1.0p+11f; |
| 277 | const float pio2ht = (float)0xfda / 0x1.0p+23f; |
| 278 | const float pio2t = (float)0xa22168 / 0x1.0p+47f; |
| 279 | |
| 280 | float rh, rt; |
| 281 | |
| 282 | if (HAVE_HW_FMA32()) { |
| 283 | rh = q1 * pio2h; |
| 284 | rt = fma(q0, pio2h, fma(q1, pio2t, fma(q1, pio2h, -rh))); |
| 285 | } else { |
| 286 | float q1h = as_float(as_uint(q1) & 0xfffff000); |
| 287 | float q1t = q1 - q1h; |
| 288 | rh = q1 * pio2h; |
| 289 | rt = mad(q1t, pio2ht, mad(q1t, pio2hh, mad(q1h, pio2ht, mad(q1h, pio2hh, -rh)))); |
| 290 | rt = mad(q0, pio2h, mad(q1, pio2t, rt)); |
| 291 | } |
| 292 | |
| 293 | float t = rh + rt; |
| 294 | rt = rt - (t - rh); |
| 295 | |
| 296 | *r = t; |
| 297 | *rr = rt; |
| 298 | return ((i >> 1) + (i & 1)) & 0x3; |
| 299 | } |
| 300 | |
Tom Stellard | 8d3a4e3 | 2015-03-23 16:20:24 +0000 | [diff] [blame] | 301 | _CLC_DEF int __clc_argReductionS(float *r, float *rr, float x) |
Tom Stellard | c0ab2f8 | 2014-07-23 15:16:18 +0000 | [diff] [blame] | 302 | { |
| 303 | if (x < 0x1.0p+23f) |
Tom Stellard | 8d3a4e3 | 2015-03-23 16:20:24 +0000 | [diff] [blame] | 304 | return __clc_argReductionSmallS(r, rr, x); |
Tom Stellard | c0ab2f8 | 2014-07-23 15:16:18 +0000 | [diff] [blame] | 305 | else |
Tom Stellard | 8d3a4e3 | 2015-03-23 16:20:24 +0000 | [diff] [blame] | 306 | return __clc_argReductionLargeS(r, rr, x); |
Tom Stellard | c0ab2f8 | 2014-07-23 15:16:18 +0000 | [diff] [blame] | 307 | } |
| 308 | |
Tom Stellard | 2e6ff0c | 2015-05-12 17:18:46 +0000 | [diff] [blame] | 309 | #ifdef cl_khr_fp64 |
| 310 | |
| 311 | #pragma OPENCL EXTENSION cl_khr_fp64 : enable |
| 312 | |
| 313 | // Reduction for medium sized arguments |
| 314 | _CLC_DEF void __clc_remainder_piby2_medium(double x, double *r, double *rr, int *regn) { |
| 315 | // How many pi/2 is x a multiple of? |
| 316 | const double two_by_pi = 0x1.45f306dc9c883p-1; |
| 317 | double dnpi2 = trunc(fma(x, two_by_pi, 0.5)); |
| 318 | |
| 319 | const double piby2_h = -7074237752028440.0 / 0x1.0p+52; |
| 320 | const double piby2_m = -2483878800010755.0 / 0x1.0p+105; |
| 321 | const double piby2_t = -3956492004828932.0 / 0x1.0p+158; |
| 322 | |
| 323 | // Compute product of npi2 with 159 bits of 2/pi |
| 324 | double p_hh = piby2_h * dnpi2; |
| 325 | double p_ht = fma(piby2_h, dnpi2, -p_hh); |
| 326 | double p_mh = piby2_m * dnpi2; |
| 327 | double p_mt = fma(piby2_m, dnpi2, -p_mh); |
| 328 | double p_th = piby2_t * dnpi2; |
| 329 | double p_tt = fma(piby2_t, dnpi2, -p_th); |
| 330 | |
| 331 | // Reduce to 159 bits |
| 332 | double ph = p_hh; |
| 333 | double pm = p_ht + p_mh; |
| 334 | double t = p_mh - (pm - p_ht); |
| 335 | double pt = p_th + t + p_mt + p_tt; |
| 336 | t = ph + pm; pm = pm - (t - ph); ph = t; |
| 337 | t = pm + pt; pt = pt - (t - pm); pm = t; |
| 338 | |
| 339 | // Subtract from x |
| 340 | t = x + ph; |
| 341 | double qh = t + pm; |
| 342 | double qt = pm - (qh - t) + pt; |
| 343 | |
| 344 | *r = qh; |
| 345 | *rr = qt; |
| 346 | *regn = (int)(long)dnpi2 & 0x3; |
| 347 | } |
| 348 | |
| 349 | // Given positive argument x, reduce it to the range [-pi/4,pi/4] using |
| 350 | // extra precision, and return the result in r, rr. |
| 351 | // Return value "regn" tells how many lots of pi/2 were subtracted |
| 352 | // from x to put it in the range [-pi/4,pi/4], mod 4. |
| 353 | |
| 354 | _CLC_DEF void __clc_remainder_piby2_large(double x, double *r, double *rr, int *regn) { |
| 355 | |
| 356 | long ux = as_long(x); |
| 357 | int e = (int)(ux >> 52) - 1023; |
| 358 | int i = max(23, (e >> 3) + 17); |
| 359 | int j = 150 - i; |
| 360 | int j16 = j & ~0xf; |
| 361 | double fract_temp; |
| 362 | |
| 363 | // The following extracts 192 consecutive bits of 2/pi aligned on an arbitrary byte boundary |
| 364 | uint4 q0 = USE_TABLE(pibits_tbl, j16); |
| 365 | uint4 q1 = USE_TABLE(pibits_tbl, (j16 + 16)); |
| 366 | uint4 q2 = USE_TABLE(pibits_tbl, (j16 + 32)); |
| 367 | |
| 368 | int k = (j >> 2) & 0x3; |
| 369 | int4 c = (int4)k == (int4)(0, 1, 2, 3); |
| 370 | |
| 371 | uint u0, u1, u2, u3, u4, u5, u6; |
| 372 | |
| 373 | u0 = c.s1 ? q0.s1 : q0.s0; |
| 374 | u0 = c.s2 ? q0.s2 : u0; |
| 375 | u0 = c.s3 ? q0.s3 : u0; |
| 376 | |
| 377 | u1 = c.s1 ? q0.s2 : q0.s1; |
| 378 | u1 = c.s2 ? q0.s3 : u1; |
| 379 | u1 = c.s3 ? q1.s0 : u1; |
| 380 | |
| 381 | u2 = c.s1 ? q0.s3 : q0.s2; |
| 382 | u2 = c.s2 ? q1.s0 : u2; |
| 383 | u2 = c.s3 ? q1.s1 : u2; |
| 384 | |
| 385 | u3 = c.s1 ? q1.s0 : q0.s3; |
| 386 | u3 = c.s2 ? q1.s1 : u3; |
| 387 | u3 = c.s3 ? q1.s2 : u3; |
| 388 | |
| 389 | u4 = c.s1 ? q1.s1 : q1.s0; |
| 390 | u4 = c.s2 ? q1.s2 : u4; |
| 391 | u4 = c.s3 ? q1.s3 : u4; |
| 392 | |
| 393 | u5 = c.s1 ? q1.s2 : q1.s1; |
| 394 | u5 = c.s2 ? q1.s3 : u5; |
| 395 | u5 = c.s3 ? q2.s0 : u5; |
| 396 | |
| 397 | u6 = c.s1 ? q1.s3 : q1.s2; |
| 398 | u6 = c.s2 ? q2.s0 : u6; |
| 399 | u6 = c.s3 ? q2.s1 : u6; |
| 400 | |
| 401 | uint v0 = bytealign(u1, u0, j); |
| 402 | uint v1 = bytealign(u2, u1, j); |
| 403 | uint v2 = bytealign(u3, u2, j); |
| 404 | uint v3 = bytealign(u4, u3, j); |
| 405 | uint v4 = bytealign(u5, u4, j); |
| 406 | uint v5 = bytealign(u6, u5, j); |
| 407 | |
| 408 | // Place those 192 bits in 4 48-bit doubles along with correct exponent |
| 409 | // If i > 1018 we would get subnormals so we scale p up and x down to get the same product |
| 410 | i = 2 + 8*i; |
| 411 | x *= i > 1018 ? 0x1.0p-136 : 1.0; |
| 412 | i -= i > 1018 ? 136 : 0; |
| 413 | |
| 414 | uint ua = (uint)(1023 + 52 - i) << 20; |
| 415 | double a = as_double((uint2)(0, ua)); |
| 416 | double p0 = as_double((uint2)(v0, ua | (v1 & 0xffffU))) - a; |
| 417 | ua += 0x03000000U; |
| 418 | a = as_double((uint2)(0, ua)); |
| 419 | double p1 = as_double((uint2)((v2 << 16) | (v1 >> 16), ua | (v2 >> 16))) - a; |
| 420 | ua += 0x03000000U; |
| 421 | a = as_double((uint2)(0, ua)); |
| 422 | double p2 = as_double((uint2)(v3, ua | (v4 & 0xffffU))) - a; |
| 423 | ua += 0x03000000U; |
| 424 | a = as_double((uint2)(0, ua)); |
| 425 | double p3 = as_double((uint2)((v5 << 16) | (v4 >> 16), ua | (v5 >> 16))) - a; |
| 426 | |
| 427 | // Exact multiply |
| 428 | double f0h = p0 * x; |
| 429 | double f0l = fma(p0, x, -f0h); |
| 430 | double f1h = p1 * x; |
| 431 | double f1l = fma(p1, x, -f1h); |
| 432 | double f2h = p2 * x; |
| 433 | double f2l = fma(p2, x, -f2h); |
| 434 | double f3h = p3 * x; |
| 435 | double f3l = fma(p3, x, -f3h); |
| 436 | |
| 437 | // Accumulate product into 4 doubles |
| 438 | double s, t; |
| 439 | |
| 440 | double f3 = f3h + f2h; |
| 441 | t = f2h - (f3 - f3h); |
| 442 | s = f3l + t; |
| 443 | t = t - (s - f3l); |
| 444 | |
| 445 | double f2 = s + f1h; |
| 446 | t = f1h - (f2 - s) + t; |
| 447 | s = f2l + t; |
| 448 | t = t - (s - f2l); |
| 449 | |
| 450 | double f1 = s + f0h; |
| 451 | t = f0h - (f1 - s) + t; |
| 452 | s = f1l + t; |
| 453 | |
| 454 | double f0 = s + f0l; |
| 455 | |
| 456 | // Strip off unwanted large integer bits |
| 457 | f3 = 0x1.0p+10 * fract(f3 * 0x1.0p-10, &fract_temp); |
| 458 | f3 += f3 + f2 < 0.0 ? 0x1.0p+10 : 0.0; |
| 459 | |
| 460 | // Compute least significant integer bits |
| 461 | t = f3 + f2; |
| 462 | double di = t - fract(t, &fract_temp); |
| 463 | i = (float)di; |
| 464 | |
| 465 | // Shift out remaining integer part |
| 466 | f3 -= di; |
| 467 | s = f3 + f2; t = f2 - (s - f3); f3 = s; f2 = t; |
| 468 | s = f2 + f1; t = f1 - (s - f2); f2 = s; f1 = t; |
| 469 | f1 += f0; |
| 470 | |
| 471 | // Subtract 1 if fraction is >= 0.5, and update regn |
| 472 | int g = f3 >= 0.5; |
| 473 | i += g; |
| 474 | f3 -= (float)g; |
| 475 | |
| 476 | // Shift up bits |
| 477 | s = f3 + f2; t = f2 -(s - f3); f3 = s; f2 = t + f1; |
| 478 | |
| 479 | // Multiply precise fraction by pi/2 to get radians |
| 480 | const double p2h = 7074237752028440.0 / 0x1.0p+52; |
| 481 | const double p2t = 4967757600021510.0 / 0x1.0p+106; |
| 482 | |
| 483 | double rhi = f3 * p2h; |
| 484 | double rlo = fma(f2, p2h, fma(f3, p2t, fma(f3, p2h, -rhi))); |
| 485 | |
| 486 | *r = rhi + rlo; |
| 487 | *rr = rlo - (*r - rhi); |
| 488 | *regn = i & 0x3; |
| 489 | } |
| 490 | |
| 491 | |
| 492 | _CLC_DEF double2 __clc_sincos_piby4(double x, double xx) { |
| 493 | // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ... |
| 494 | // = x * (1 - x^2/3! + x^4/5! - x^6/7! ... |
| 495 | // = x * f(w) |
| 496 | // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ... |
| 497 | // We use a minimax approximation of (f(w) - 1) / w |
| 498 | // because this produces an expansion in even powers of x. |
| 499 | // If xx (the tail of x) is non-zero, we add a correction |
| 500 | // term g(x,xx) = (1-x*x/2)*xx to the result, where g(x,xx) |
| 501 | // is an approximation to cos(x)*sin(xx) valid because |
| 502 | // xx is tiny relative to x. |
| 503 | |
| 504 | // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ... |
| 505 | // = f(w) |
| 506 | // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ... |
| 507 | // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w) |
| 508 | // because this produces an expansion in even powers of x. |
| 509 | // If xx (the tail of x) is non-zero, we subtract a correction |
| 510 | // term g(x,xx) = x*xx to the result, where g(x,xx) |
| 511 | // is an approximation to sin(x)*sin(xx) valid because |
| 512 | // xx is tiny relative to x. |
| 513 | |
| 514 | const double sc1 = -0.166666666666666646259241729; |
| 515 | const double sc2 = 0.833333333333095043065222816e-2; |
| 516 | const double sc3 = -0.19841269836761125688538679e-3; |
| 517 | const double sc4 = 0.275573161037288022676895908448e-5; |
| 518 | const double sc5 = -0.25051132068021699772257377197e-7; |
| 519 | const double sc6 = 0.159181443044859136852668200e-9; |
| 520 | |
| 521 | const double cc1 = 0.41666666666666665390037e-1; |
| 522 | const double cc2 = -0.13888888888887398280412e-2; |
| 523 | const double cc3 = 0.248015872987670414957399e-4; |
| 524 | const double cc4 = -0.275573172723441909470836e-6; |
| 525 | const double cc5 = 0.208761463822329611076335e-8; |
| 526 | const double cc6 = -0.113826398067944859590880e-10; |
| 527 | |
| 528 | double x2 = x * x; |
| 529 | double x3 = x2 * x; |
| 530 | double r = 0.5 * x2; |
| 531 | double t = 1.0 - r; |
| 532 | |
| 533 | double sp = fma(fma(fma(fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2); |
| 534 | |
| 535 | double cp = t + fma(fma(fma(fma(fma(fma(cc6, x2, cc5), x2, cc4), x2, cc3), x2, cc2), x2, cc1), |
| 536 | x2*x2, fma(x, xx, (1.0 - t) - r)); |
| 537 | |
| 538 | double2 ret; |
| 539 | ret.lo = x - fma(-x3, sc1, fma(fma(-x3, sp, 0.5*xx), x2, -xx)); |
| 540 | ret.hi = cp; |
| 541 | |
| 542 | return ret; |
| 543 | } |
| 544 | |
| 545 | #endif |