blob: 251b7f94efdc081cc749bf9a2d160a958c415b1d [file] [log] [blame]
Tom Stellardc0ab2f82014-07-23 15:16:18 +00001/*
2 * Copyright (c) 2014 Advanced Micro Devices, Inc.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a copy
5 * of this software and associated documentation files (the "Software"), to deal
6 * in the Software without restriction, including without limitation the rights
7 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 * copies of the Software, and to permit persons to whom the Software is
9 * furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 * THE SOFTWARE.
21 */
22
23#include <clc/clc.h>
24
25#include "math.h"
Tom Stellard2e6ff0c2015-05-12 17:18:46 +000026#include "tables.h"
Tom Stellardc0ab2f82014-07-23 15:16:18 +000027#include "sincos_helpers.h"
28
Tom Stellard8d3a4e32015-03-23 16:20:24 +000029#define bitalign(hi, lo, shift) \
30 ((hi) << (32 - (shift))) | ((lo) >> (shift));
Tom Stellardc0ab2f82014-07-23 15:16:18 +000031
Tom Stellard2e6ff0c2015-05-12 17:18:46 +000032#define bytealign(src0, src1, src2) \
33 ((uint) (((((long)(src0)) << 32) | (long)(src1)) >> (((src2) & 3)*8)))
34
Tom Stellard8d3a4e32015-03-23 16:20:24 +000035_CLC_DEF float __clc_sinf_piby4(float x, float y) {
Tom Stellardc0ab2f82014-07-23 15:16:18 +000036 // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
37 // = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
38 // = x * f(w)
39 // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
40 // We use a minimax approximation of (f(w) - 1) / w
41 // because this produces an expansion in even powers of x.
42
43 const float c1 = -0.1666666666e0f;
44 const float c2 = 0.8333331876e-2f;
45 const float c3 = -0.198400874e-3f;
46 const float c4 = 0.272500015e-5f;
47 const float c5 = -2.5050759689e-08f; // 0xb2d72f34
48 const float c6 = 1.5896910177e-10f; // 0x2f2ec9d3
49
50 float z = x * x;
51 float v = z * x;
52 float r = mad(z, mad(z, mad(z, mad(z, c6, c5), c4), c3), c2);
53 float ret = x - mad(v, -c1, mad(z, mad(y, 0.5f, -v*r), -y));
54
55 return ret;
56}
57
Tom Stellard8d3a4e32015-03-23 16:20:24 +000058_CLC_DEF float __clc_cosf_piby4(float x, float y) {
Tom Stellardc0ab2f82014-07-23 15:16:18 +000059 // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
60 // = f(w)
61 // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
62 // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
63 // because this produces an expansion in even powers of x.
64
65 const float c1 = 0.416666666e-1f;
66 const float c2 = -0.138888876e-2f;
67 const float c3 = 0.248006008e-4f;
68 const float c4 = -0.2730101334e-6f;
69 const float c5 = 2.0875723372e-09f; // 0x310f74f6
70 const float c6 = -1.1359647598e-11f; // 0xad47d74e
71
72 float z = x * x;
73 float r = z * mad(z, mad(z, mad(z, mad(z, mad(z, c6, c5), c4), c3), c2), c1);
74
75 // if |x| < 0.3
76 float qx = 0.0f;
77
78 int ix = as_int(x) & EXSIGNBIT_SP32;
79
80 // 0.78125 > |x| >= 0.3
81 float xby4 = as_float(ix - 0x01000000);
82 qx = (ix >= 0x3e99999a) & (ix <= 0x3f480000) ? xby4 : qx;
83
84 // x > 0.78125
85 qx = ix > 0x3f480000 ? 0.28125f : qx;
86
87 float hz = mad(z, 0.5f, -qx);
88 float a = 1.0f - qx;
89 float ret = a - (hz - mad(z, r, -x*y));
90 return ret;
91}
92
Tom Stellard8d3a4e32015-03-23 16:20:24 +000093_CLC_DEF void __clc_fullMulS(float *hi, float *lo, float a, float b, float bh, float bt)
Tom Stellardc0ab2f82014-07-23 15:16:18 +000094{
95 if (HAVE_HW_FMA32()) {
96 float ph = a * b;
97 *hi = ph;
98 *lo = fma(a, b, -ph);
99 } else {
100 float ah = as_float(as_uint(a) & 0xfffff000U);
101 float at = a - ah;
102 float ph = a * b;
103 float pt = mad(at, bt, mad(at, bh, mad(ah, bt, mad(ah, bh, -ph))));
104 *hi = ph;
105 *lo = pt;
106 }
107}
108
Tom Stellard8d3a4e32015-03-23 16:20:24 +0000109_CLC_DEF float __clc_removePi2S(float *hi, float *lo, float x)
Tom Stellardc0ab2f82014-07-23 15:16:18 +0000110{
111 // 72 bits of pi/2
112 const float fpiby2_1 = (float) 0xC90FDA / 0x1.0p+23f;
113 const float fpiby2_1_h = (float) 0xC90 / 0x1.0p+11f;
114 const float fpiby2_1_t = (float) 0xFDA / 0x1.0p+23f;
115
116 const float fpiby2_2 = (float) 0xA22168 / 0x1.0p+47f;
117 const float fpiby2_2_h = (float) 0xA22 / 0x1.0p+35f;
118 const float fpiby2_2_t = (float) 0x168 / 0x1.0p+47f;
119
120 const float fpiby2_3 = (float) 0xC234C4 / 0x1.0p+71f;
121 const float fpiby2_3_h = (float) 0xC23 / 0x1.0p+59f;
122 const float fpiby2_3_t = (float) 0x4C4 / 0x1.0p+71f;
123
124 const float twobypi = 0x1.45f306p-1f;
125
126 float fnpi2 = trunc(mad(x, twobypi, 0.5f));
127
128 // subtract n * pi/2 from x
129 float rhead, rtail;
Tom Stellard8d3a4e32015-03-23 16:20:24 +0000130 __clc_fullMulS(&rhead, &rtail, fnpi2, fpiby2_1, fpiby2_1_h, fpiby2_1_t);
Tom Stellardc0ab2f82014-07-23 15:16:18 +0000131 float v = x - rhead;
132 float rem = v + (((x - v) - rhead) - rtail);
133
134 float rhead2, rtail2;
Tom Stellard8d3a4e32015-03-23 16:20:24 +0000135 __clc_fullMulS(&rhead2, &rtail2, fnpi2, fpiby2_2, fpiby2_2_h, fpiby2_2_t);
Tom Stellardc0ab2f82014-07-23 15:16:18 +0000136 v = rem - rhead2;
137 rem = v + (((rem - v) - rhead2) - rtail2);
138
139 float rhead3, rtail3;
Tom Stellard8d3a4e32015-03-23 16:20:24 +0000140 __clc_fullMulS(&rhead3, &rtail3, fnpi2, fpiby2_3, fpiby2_3_h, fpiby2_3_t);
Tom Stellardc0ab2f82014-07-23 15:16:18 +0000141 v = rem - rhead3;
142
143 *hi = v + ((rem - v) - rhead3);
144 *lo = -rtail3;
145 return fnpi2;
146}
147
Tom Stellard8d3a4e32015-03-23 16:20:24 +0000148_CLC_DEF int __clc_argReductionSmallS(float *r, float *rr, float x)
Tom Stellardc0ab2f82014-07-23 15:16:18 +0000149{
Tom Stellard8d3a4e32015-03-23 16:20:24 +0000150 float fnpi2 = __clc_removePi2S(r, rr, x);
Tom Stellardc0ab2f82014-07-23 15:16:18 +0000151 return (int)fnpi2 & 0x3;
152}
153
154#define FULL_MUL(A, B, HI, LO) \
155 LO = A * B; \
156 HI = mul_hi(A, B)
157
158#define FULL_MAD(A, B, C, HI, LO) \
159 LO = ((A) * (B) + (C)); \
160 HI = mul_hi(A, B); \
161 HI += LO < C
162
Tom Stellard8d3a4e32015-03-23 16:20:24 +0000163_CLC_DEF int __clc_argReductionLargeS(float *r, float *rr, float x)
Tom Stellardc0ab2f82014-07-23 15:16:18 +0000164{
165 int xe = (int)(as_uint(x) >> 23) - 127;
166 uint xm = 0x00800000U | (as_uint(x) & 0x7fffffU);
167
168 // 224 bits of 2/PI: . A2F9836E 4E441529 FC2757D1 F534DDC0 DB629599 3C439041 FE5163AB
169 const uint b6 = 0xA2F9836EU;
170 const uint b5 = 0x4E441529U;
171 const uint b4 = 0xFC2757D1U;
172 const uint b3 = 0xF534DDC0U;
173 const uint b2 = 0xDB629599U;
174 const uint b1 = 0x3C439041U;
175 const uint b0 = 0xFE5163ABU;
176
177 uint p0, p1, p2, p3, p4, p5, p6, p7, c0, c1;
178
179 FULL_MUL(xm, b0, c0, p0);
180 FULL_MAD(xm, b1, c0, c1, p1);
181 FULL_MAD(xm, b2, c1, c0, p2);
182 FULL_MAD(xm, b3, c0, c1, p3);
183 FULL_MAD(xm, b4, c1, c0, p4);
184 FULL_MAD(xm, b5, c0, c1, p5);
185 FULL_MAD(xm, b6, c1, p7, p6);
186
187 uint fbits = 224 + 23 - xe;
188
189 // shift amount to get 2 lsb of integer part at top 2 bits
190 // min: 25 (xe=18) max: 134 (xe=127)
191 uint shift = 256U - 2 - fbits;
192
193 // Shift by up to 134/32 = 4 words
194 int c = shift > 31;
195 p7 = c ? p6 : p7;
196 p6 = c ? p5 : p6;
197 p5 = c ? p4 : p5;
198 p4 = c ? p3 : p4;
199 p3 = c ? p2 : p3;
200 p2 = c ? p1 : p2;
201 p1 = c ? p0 : p1;
202 shift -= (-c) & 32;
203
204 c = shift > 31;
205 p7 = c ? p6 : p7;
206 p6 = c ? p5 : p6;
207 p5 = c ? p4 : p5;
208 p4 = c ? p3 : p4;
209 p3 = c ? p2 : p3;
210 p2 = c ? p1 : p2;
211 shift -= (-c) & 32;
212
213 c = shift > 31;
214 p7 = c ? p6 : p7;
215 p6 = c ? p5 : p6;
216 p5 = c ? p4 : p5;
217 p4 = c ? p3 : p4;
218 p3 = c ? p2 : p3;
219 shift -= (-c) & 32;
220
221 c = shift > 31;
222 p7 = c ? p6 : p7;
223 p6 = c ? p5 : p6;
224 p5 = c ? p4 : p5;
225 p4 = c ? p3 : p4;
226 shift -= (-c) & 32;
227
228 // bitalign cannot handle a shift of 32
229 c = shift > 0;
230 shift = 32 - shift;
231 uint t7 = bitalign(p7, p6, shift);
232 uint t6 = bitalign(p6, p5, shift);
233 uint t5 = bitalign(p5, p4, shift);
234 p7 = c ? t7 : p7;
235 p6 = c ? t6 : p6;
236 p5 = c ? t5 : p5;
237
238 // Get 2 lsb of int part and msb of fraction
239 int i = p7 >> 29;
240
241 // Scoot up 2 more bits so only fraction remains
242 p7 = bitalign(p7, p6, 30);
243 p6 = bitalign(p6, p5, 30);
244 p5 = bitalign(p5, p4, 30);
245
246 // Subtract 1 if msb of fraction is 1, i.e. fraction >= 0.5
247 uint flip = i & 1 ? 0xffffffffU : 0U;
248 uint sign = i & 1 ? 0x80000000U : 0U;
249 p7 = p7 ^ flip;
250 p6 = p6 ^ flip;
251 p5 = p5 ^ flip;
252
253 // Find exponent and shift away leading zeroes and hidden bit
254 xe = clz(p7) + 1;
255 shift = 32 - xe;
256 p7 = bitalign(p7, p6, shift);
257 p6 = bitalign(p6, p5, shift);
258
259 // Most significant part of fraction
260 float q1 = as_float(sign | ((127 - xe) << 23) | (p7 >> 9));
261
262 // Shift out bits we captured on q1
263 p7 = bitalign(p7, p6, 32-23);
264
265 // Get 24 more bits of fraction in another float, there are not long strings of zeroes here
266 int xxe = clz(p7) + 1;
267 p7 = bitalign(p7, p6, 32-xxe);
268 float q0 = as_float(sign | ((127 - (xe + 23 + xxe)) << 23) | (p7 >> 9));
269
270 // At this point, the fraction q1 + q0 is correct to at least 48 bits
271 // Now we need to multiply the fraction by pi/2
272 // This loses us about 4 bits
273 // pi/2 = C90 FDA A22 168 C23 4C4
274
275 const float pio2h = (float)0xc90fda / 0x1.0p+23f;
276 const float pio2hh = (float)0xc90 / 0x1.0p+11f;
277 const float pio2ht = (float)0xfda / 0x1.0p+23f;
278 const float pio2t = (float)0xa22168 / 0x1.0p+47f;
279
280 float rh, rt;
281
282 if (HAVE_HW_FMA32()) {
283 rh = q1 * pio2h;
284 rt = fma(q0, pio2h, fma(q1, pio2t, fma(q1, pio2h, -rh)));
285 } else {
286 float q1h = as_float(as_uint(q1) & 0xfffff000);
287 float q1t = q1 - q1h;
288 rh = q1 * pio2h;
289 rt = mad(q1t, pio2ht, mad(q1t, pio2hh, mad(q1h, pio2ht, mad(q1h, pio2hh, -rh))));
290 rt = mad(q0, pio2h, mad(q1, pio2t, rt));
291 }
292
293 float t = rh + rt;
294 rt = rt - (t - rh);
295
296 *r = t;
297 *rr = rt;
298 return ((i >> 1) + (i & 1)) & 0x3;
299}
300
Tom Stellard8d3a4e32015-03-23 16:20:24 +0000301_CLC_DEF int __clc_argReductionS(float *r, float *rr, float x)
Tom Stellardc0ab2f82014-07-23 15:16:18 +0000302{
303 if (x < 0x1.0p+23f)
Tom Stellard8d3a4e32015-03-23 16:20:24 +0000304 return __clc_argReductionSmallS(r, rr, x);
Tom Stellardc0ab2f82014-07-23 15:16:18 +0000305 else
Tom Stellard8d3a4e32015-03-23 16:20:24 +0000306 return __clc_argReductionLargeS(r, rr, x);
Tom Stellardc0ab2f82014-07-23 15:16:18 +0000307}
308
Tom Stellard2e6ff0c2015-05-12 17:18:46 +0000309#ifdef cl_khr_fp64
310
311#pragma OPENCL EXTENSION cl_khr_fp64 : enable
312
313// Reduction for medium sized arguments
314_CLC_DEF void __clc_remainder_piby2_medium(double x, double *r, double *rr, int *regn) {
315 // How many pi/2 is x a multiple of?
316 const double two_by_pi = 0x1.45f306dc9c883p-1;
317 double dnpi2 = trunc(fma(x, two_by_pi, 0.5));
318
319 const double piby2_h = -7074237752028440.0 / 0x1.0p+52;
320 const double piby2_m = -2483878800010755.0 / 0x1.0p+105;
321 const double piby2_t = -3956492004828932.0 / 0x1.0p+158;
322
323 // Compute product of npi2 with 159 bits of 2/pi
324 double p_hh = piby2_h * dnpi2;
325 double p_ht = fma(piby2_h, dnpi2, -p_hh);
326 double p_mh = piby2_m * dnpi2;
327 double p_mt = fma(piby2_m, dnpi2, -p_mh);
328 double p_th = piby2_t * dnpi2;
329 double p_tt = fma(piby2_t, dnpi2, -p_th);
330
331 // Reduce to 159 bits
332 double ph = p_hh;
333 double pm = p_ht + p_mh;
334 double t = p_mh - (pm - p_ht);
335 double pt = p_th + t + p_mt + p_tt;
336 t = ph + pm; pm = pm - (t - ph); ph = t;
337 t = pm + pt; pt = pt - (t - pm); pm = t;
338
339 // Subtract from x
340 t = x + ph;
341 double qh = t + pm;
342 double qt = pm - (qh - t) + pt;
343
344 *r = qh;
345 *rr = qt;
346 *regn = (int)(long)dnpi2 & 0x3;
347}
348
349// Given positive argument x, reduce it to the range [-pi/4,pi/4] using
350// extra precision, and return the result in r, rr.
351// Return value "regn" tells how many lots of pi/2 were subtracted
352// from x to put it in the range [-pi/4,pi/4], mod 4.
353
354_CLC_DEF void __clc_remainder_piby2_large(double x, double *r, double *rr, int *regn) {
355
356 long ux = as_long(x);
357 int e = (int)(ux >> 52) - 1023;
358 int i = max(23, (e >> 3) + 17);
359 int j = 150 - i;
360 int j16 = j & ~0xf;
361 double fract_temp;
362
363 // The following extracts 192 consecutive bits of 2/pi aligned on an arbitrary byte boundary
364 uint4 q0 = USE_TABLE(pibits_tbl, j16);
365 uint4 q1 = USE_TABLE(pibits_tbl, (j16 + 16));
366 uint4 q2 = USE_TABLE(pibits_tbl, (j16 + 32));
367
368 int k = (j >> 2) & 0x3;
369 int4 c = (int4)k == (int4)(0, 1, 2, 3);
370
371 uint u0, u1, u2, u3, u4, u5, u6;
372
373 u0 = c.s1 ? q0.s1 : q0.s0;
374 u0 = c.s2 ? q0.s2 : u0;
375 u0 = c.s3 ? q0.s3 : u0;
376
377 u1 = c.s1 ? q0.s2 : q0.s1;
378 u1 = c.s2 ? q0.s3 : u1;
379 u1 = c.s3 ? q1.s0 : u1;
380
381 u2 = c.s1 ? q0.s3 : q0.s2;
382 u2 = c.s2 ? q1.s0 : u2;
383 u2 = c.s3 ? q1.s1 : u2;
384
385 u3 = c.s1 ? q1.s0 : q0.s3;
386 u3 = c.s2 ? q1.s1 : u3;
387 u3 = c.s3 ? q1.s2 : u3;
388
389 u4 = c.s1 ? q1.s1 : q1.s0;
390 u4 = c.s2 ? q1.s2 : u4;
391 u4 = c.s3 ? q1.s3 : u4;
392
393 u5 = c.s1 ? q1.s2 : q1.s1;
394 u5 = c.s2 ? q1.s3 : u5;
395 u5 = c.s3 ? q2.s0 : u5;
396
397 u6 = c.s1 ? q1.s3 : q1.s2;
398 u6 = c.s2 ? q2.s0 : u6;
399 u6 = c.s3 ? q2.s1 : u6;
400
401 uint v0 = bytealign(u1, u0, j);
402 uint v1 = bytealign(u2, u1, j);
403 uint v2 = bytealign(u3, u2, j);
404 uint v3 = bytealign(u4, u3, j);
405 uint v4 = bytealign(u5, u4, j);
406 uint v5 = bytealign(u6, u5, j);
407
408 // Place those 192 bits in 4 48-bit doubles along with correct exponent
409 // If i > 1018 we would get subnormals so we scale p up and x down to get the same product
410 i = 2 + 8*i;
411 x *= i > 1018 ? 0x1.0p-136 : 1.0;
412 i -= i > 1018 ? 136 : 0;
413
414 uint ua = (uint)(1023 + 52 - i) << 20;
415 double a = as_double((uint2)(0, ua));
416 double p0 = as_double((uint2)(v0, ua | (v1 & 0xffffU))) - a;
417 ua += 0x03000000U;
418 a = as_double((uint2)(0, ua));
419 double p1 = as_double((uint2)((v2 << 16) | (v1 >> 16), ua | (v2 >> 16))) - a;
420 ua += 0x03000000U;
421 a = as_double((uint2)(0, ua));
422 double p2 = as_double((uint2)(v3, ua | (v4 & 0xffffU))) - a;
423 ua += 0x03000000U;
424 a = as_double((uint2)(0, ua));
425 double p3 = as_double((uint2)((v5 << 16) | (v4 >> 16), ua | (v5 >> 16))) - a;
426
427 // Exact multiply
428 double f0h = p0 * x;
429 double f0l = fma(p0, x, -f0h);
430 double f1h = p1 * x;
431 double f1l = fma(p1, x, -f1h);
432 double f2h = p2 * x;
433 double f2l = fma(p2, x, -f2h);
434 double f3h = p3 * x;
435 double f3l = fma(p3, x, -f3h);
436
437 // Accumulate product into 4 doubles
438 double s, t;
439
440 double f3 = f3h + f2h;
441 t = f2h - (f3 - f3h);
442 s = f3l + t;
443 t = t - (s - f3l);
444
445 double f2 = s + f1h;
446 t = f1h - (f2 - s) + t;
447 s = f2l + t;
448 t = t - (s - f2l);
449
450 double f1 = s + f0h;
451 t = f0h - (f1 - s) + t;
452 s = f1l + t;
453
454 double f0 = s + f0l;
455
456 // Strip off unwanted large integer bits
457 f3 = 0x1.0p+10 * fract(f3 * 0x1.0p-10, &fract_temp);
458 f3 += f3 + f2 < 0.0 ? 0x1.0p+10 : 0.0;
459
460 // Compute least significant integer bits
461 t = f3 + f2;
462 double di = t - fract(t, &fract_temp);
463 i = (float)di;
464
465 // Shift out remaining integer part
466 f3 -= di;
467 s = f3 + f2; t = f2 - (s - f3); f3 = s; f2 = t;
468 s = f2 + f1; t = f1 - (s - f2); f2 = s; f1 = t;
469 f1 += f0;
470
471 // Subtract 1 if fraction is >= 0.5, and update regn
472 int g = f3 >= 0.5;
473 i += g;
474 f3 -= (float)g;
475
476 // Shift up bits
477 s = f3 + f2; t = f2 -(s - f3); f3 = s; f2 = t + f1;
478
479 // Multiply precise fraction by pi/2 to get radians
480 const double p2h = 7074237752028440.0 / 0x1.0p+52;
481 const double p2t = 4967757600021510.0 / 0x1.0p+106;
482
483 double rhi = f3 * p2h;
484 double rlo = fma(f2, p2h, fma(f3, p2t, fma(f3, p2h, -rhi)));
485
486 *r = rhi + rlo;
487 *rr = rlo - (*r - rhi);
488 *regn = i & 0x3;
489}
490
491
492_CLC_DEF double2 __clc_sincos_piby4(double x, double xx) {
493 // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
494 // = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
495 // = x * f(w)
496 // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
497 // We use a minimax approximation of (f(w) - 1) / w
498 // because this produces an expansion in even powers of x.
499 // If xx (the tail of x) is non-zero, we add a correction
500 // term g(x,xx) = (1-x*x/2)*xx to the result, where g(x,xx)
501 // is an approximation to cos(x)*sin(xx) valid because
502 // xx is tiny relative to x.
503
504 // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
505 // = f(w)
506 // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
507 // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
508 // because this produces an expansion in even powers of x.
509 // If xx (the tail of x) is non-zero, we subtract a correction
510 // term g(x,xx) = x*xx to the result, where g(x,xx)
511 // is an approximation to sin(x)*sin(xx) valid because
512 // xx is tiny relative to x.
513
514 const double sc1 = -0.166666666666666646259241729;
515 const double sc2 = 0.833333333333095043065222816e-2;
516 const double sc3 = -0.19841269836761125688538679e-3;
517 const double sc4 = 0.275573161037288022676895908448e-5;
518 const double sc5 = -0.25051132068021699772257377197e-7;
519 const double sc6 = 0.159181443044859136852668200e-9;
520
521 const double cc1 = 0.41666666666666665390037e-1;
522 const double cc2 = -0.13888888888887398280412e-2;
523 const double cc3 = 0.248015872987670414957399e-4;
524 const double cc4 = -0.275573172723441909470836e-6;
525 const double cc5 = 0.208761463822329611076335e-8;
526 const double cc6 = -0.113826398067944859590880e-10;
527
528 double x2 = x * x;
529 double x3 = x2 * x;
530 double r = 0.5 * x2;
531 double t = 1.0 - r;
532
533 double sp = fma(fma(fma(fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2);
534
535 double cp = t + fma(fma(fma(fma(fma(fma(cc6, x2, cc5), x2, cc4), x2, cc3), x2, cc2), x2, cc1),
536 x2*x2, fma(x, xx, (1.0 - t) - r));
537
538 double2 ret;
539 ret.lo = x - fma(-x3, sc1, fma(fma(-x3, sp, 0.5*xx), x2, -xx));
540 ret.hi = cp;
541
542 return ret;
543}
544
545#endif