Blame - libclc/generic/lib/math/sincos_helpers.cl - toolchain/llvm-project

blob: 251b7f94efdc081cc749bf9a2d160a958c415b1d [file] [log] [blame]

Tom Stellard	c0ab2f8	2014-07-23 15:16:18 +0000	[diff] [blame]	1	/*
				2	* Copyright (c) 2014 Advanced Micro Devices, Inc.
				3	*
				4	* Permission is hereby granted, free of charge, to any person obtaining a copy
				5	* of this software and associated documentation files (the "Software"), to deal
				6	* in the Software without restriction, including without limitation the rights
				7	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
				8	* copies of the Software, and to permit persons to whom the Software is
				9	* furnished to do so, subject to the following conditions:
				10	*
				11	* The above copyright notice and this permission notice shall be included in
				12	* all copies or substantial portions of the Software.
				13	*
				14	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
				15	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				16	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
				17	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
				18	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
				19	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
				20	* THE SOFTWARE.
				21	*/
				22
				23	#include <clc/clc.h>
				24
				25	#include "math.h"
Tom Stellard	2e6ff0c	2015-05-12 17:18:46 +0000	[diff] [blame]	26	#include "tables.h"
Tom Stellard	c0ab2f8	2014-07-23 15:16:18 +0000	[diff] [blame]	27	#include "sincos_helpers.h"
				28
Tom Stellard	8d3a4e3	2015-03-23 16:20:24 +0000	[diff] [blame]	29	#define bitalign(hi, lo, shift) \
				30	((hi) << (32 - (shift))) \| ((lo) >> (shift));
Tom Stellard	c0ab2f8	2014-07-23 15:16:18 +0000	[diff] [blame]	31
Tom Stellard	2e6ff0c	2015-05-12 17:18:46 +0000	[diff] [blame]	32	#define bytealign(src0, src1, src2) \
				33	((uint) (((((long)(src0)) << 32) \| (long)(src1)) >> (((src2) & 3)*8)))
				34
Tom Stellard	8d3a4e3	2015-03-23 16:20:24 +0000	[diff] [blame]	35	_CLC_DEF float __clc_sinf_piby4(float x, float y) {
Tom Stellard	c0ab2f8	2014-07-23 15:16:18 +0000	[diff] [blame]	36	// Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
				37	// = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
				38	// = x * f(w)
				39	// where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
				40	// We use a minimax approximation of (f(w) - 1) / w
				41	// because this produces an expansion in even powers of x.
				42
				43	const float c1 = -0.1666666666e0f;
				44	const float c2 = 0.8333331876e-2f;
				45	const float c3 = -0.198400874e-3f;
				46	const float c4 = 0.272500015e-5f;
				47	const float c5 = -2.5050759689e-08f; // 0xb2d72f34
				48	const float c6 = 1.5896910177e-10f; // 0x2f2ec9d3
				49
				50	float z = x * x;
				51	float v = z * x;
				52	float r = mad(z, mad(z, mad(z, mad(z, c6, c5), c4), c3), c2);
				53	float ret = x - mad(v, -c1, mad(z, mad(y, 0.5f, -v*r), -y));
				54
				55	return ret;
				56	}
				57
Tom Stellard	8d3a4e3	2015-03-23 16:20:24 +0000	[diff] [blame]	58	_CLC_DEF float __clc_cosf_piby4(float x, float y) {
Tom Stellard	c0ab2f8	2014-07-23 15:16:18 +0000	[diff] [blame]	59	// Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
				60	// = f(w)
				61	// where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
				62	// We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
				63	// because this produces an expansion in even powers of x.
				64
				65	const float c1 = 0.416666666e-1f;
				66	const float c2 = -0.138888876e-2f;
				67	const float c3 = 0.248006008e-4f;
				68	const float c4 = -0.2730101334e-6f;
				69	const float c5 = 2.0875723372e-09f; // 0x310f74f6
				70	const float c6 = -1.1359647598e-11f; // 0xad47d74e
				71
				72	float z = x * x;
				73	float r = z * mad(z, mad(z, mad(z, mad(z, mad(z, c6, c5), c4), c3), c2), c1);
				74
				75	// if \|x\| < 0.3
				76	float qx = 0.0f;
				77
				78	int ix = as_int(x) & EXSIGNBIT_SP32;
				79
				80	// 0.78125 > \|x\| >= 0.3
				81	float xby4 = as_float(ix - 0x01000000);
				82	qx = (ix >= 0x3e99999a) & (ix <= 0x3f480000) ? xby4 : qx;
				83
				84	// x > 0.78125
				85	qx = ix > 0x3f480000 ? 0.28125f : qx;
				86
				87	float hz = mad(z, 0.5f, -qx);
				88	float a = 1.0f - qx;
				89	float ret = a - (hz - mad(z, r, -x*y));
				90	return ret;
				91	}
				92
Tom Stellard	8d3a4e3	2015-03-23 16:20:24 +0000	[diff] [blame]	93	_CLC_DEF void __clc_fullMulS(float hi, float lo, float a, float b, float bh, float bt)
Tom Stellard	c0ab2f8	2014-07-23 15:16:18 +0000	[diff] [blame]	94	{
				95	if (HAVE_HW_FMA32()) {
				96	float ph = a * b;
				97	*hi = ph;
				98	*lo = fma(a, b, -ph);
				99	} else {
				100	float ah = as_float(as_uint(a) & 0xfffff000U);
				101	float at = a - ah;
				102	float ph = a * b;
				103	float pt = mad(at, bt, mad(at, bh, mad(ah, bt, mad(ah, bh, -ph))));
				104	*hi = ph;
				105	*lo = pt;
				106	}
				107	}
				108
Tom Stellard	8d3a4e3	2015-03-23 16:20:24 +0000	[diff] [blame]	109	_CLC_DEF float __clc_removePi2S(float hi, float lo, float x)
Tom Stellard	c0ab2f8	2014-07-23 15:16:18 +0000	[diff] [blame]	110	{
				111	// 72 bits of pi/2
				112	const float fpiby2_1 = (float) 0xC90FDA / 0x1.0p+23f;
				113	const float fpiby2_1_h = (float) 0xC90 / 0x1.0p+11f;
				114	const float fpiby2_1_t = (float) 0xFDA / 0x1.0p+23f;
				115
				116	const float fpiby2_2 = (float) 0xA22168 / 0x1.0p+47f;
				117	const float fpiby2_2_h = (float) 0xA22 / 0x1.0p+35f;
				118	const float fpiby2_2_t = (float) 0x168 / 0x1.0p+47f;
				119
				120	const float fpiby2_3 = (float) 0xC234C4 / 0x1.0p+71f;
				121	const float fpiby2_3_h = (float) 0xC23 / 0x1.0p+59f;
				122	const float fpiby2_3_t = (float) 0x4C4 / 0x1.0p+71f;
				123
				124	const float twobypi = 0x1.45f306p-1f;
				125
				126	float fnpi2 = trunc(mad(x, twobypi, 0.5f));
				127
				128	// subtract n * pi/2 from x
				129	float rhead, rtail;
Tom Stellard	8d3a4e3	2015-03-23 16:20:24 +0000	[diff] [blame]	130	__clc_fullMulS(&rhead, &rtail, fnpi2, fpiby2_1, fpiby2_1_h, fpiby2_1_t);
Tom Stellard	c0ab2f8	2014-07-23 15:16:18 +0000	[diff] [blame]	131	float v = x - rhead;
				132	float rem = v + (((x - v) - rhead) - rtail);
				133
				134	float rhead2, rtail2;
Tom Stellard	8d3a4e3	2015-03-23 16:20:24 +0000	[diff] [blame]	135	__clc_fullMulS(&rhead2, &rtail2, fnpi2, fpiby2_2, fpiby2_2_h, fpiby2_2_t);
Tom Stellard	c0ab2f8	2014-07-23 15:16:18 +0000	[diff] [blame]	136	v = rem - rhead2;
				137	rem = v + (((rem - v) - rhead2) - rtail2);
				138
				139	float rhead3, rtail3;
Tom Stellard	8d3a4e3	2015-03-23 16:20:24 +0000	[diff] [blame]	140	__clc_fullMulS(&rhead3, &rtail3, fnpi2, fpiby2_3, fpiby2_3_h, fpiby2_3_t);
Tom Stellard	c0ab2f8	2014-07-23 15:16:18 +0000	[diff] [blame]	141	v = rem - rhead3;
				142
				143	*hi = v + ((rem - v) - rhead3);
				144	*lo = -rtail3;
				145	return fnpi2;
				146	}
				147
Tom Stellard	8d3a4e3	2015-03-23 16:20:24 +0000	[diff] [blame]	148	_CLC_DEF int __clc_argReductionSmallS(float r, float rr, float x)
Tom Stellard	c0ab2f8	2014-07-23 15:16:18 +0000	[diff] [blame]	149	{
Tom Stellard	8d3a4e3	2015-03-23 16:20:24 +0000	[diff] [blame]	150	float fnpi2 = __clc_removePi2S(r, rr, x);
Tom Stellard	c0ab2f8	2014-07-23 15:16:18 +0000	[diff] [blame]	151	return (int)fnpi2 & 0x3;
				152	}
				153
				154	#define FULL_MUL(A, B, HI, LO) \
				155	LO = A * B; \
				156	HI = mul_hi(A, B)
				157
				158	#define FULL_MAD(A, B, C, HI, LO) \
				159	LO = ((A) * (B) + (C)); \
				160	HI = mul_hi(A, B); \
				161	HI += LO < C
				162
Tom Stellard	8d3a4e3	2015-03-23 16:20:24 +0000	[diff] [blame]	163	_CLC_DEF int __clc_argReductionLargeS(float r, float rr, float x)
Tom Stellard	c0ab2f8	2014-07-23 15:16:18 +0000	[diff] [blame]	164	{
				165	int xe = (int)(as_uint(x) >> 23) - 127;
				166	uint xm = 0x00800000U \| (as_uint(x) & 0x7fffffU);
				167
				168	// 224 bits of 2/PI: . A2F9836E 4E441529 FC2757D1 F534DDC0 DB629599 3C439041 FE5163AB
				169	const uint b6 = 0xA2F9836EU;
				170	const uint b5 = 0x4E441529U;
				171	const uint b4 = 0xFC2757D1U;
				172	const uint b3 = 0xF534DDC0U;
				173	const uint b2 = 0xDB629599U;
				174	const uint b1 = 0x3C439041U;
				175	const uint b0 = 0xFE5163ABU;
				176
				177	uint p0, p1, p2, p3, p4, p5, p6, p7, c0, c1;
				178
				179	FULL_MUL(xm, b0, c0, p0);
				180	FULL_MAD(xm, b1, c0, c1, p1);
				181	FULL_MAD(xm, b2, c1, c0, p2);
				182	FULL_MAD(xm, b3, c0, c1, p3);
				183	FULL_MAD(xm, b4, c1, c0, p4);
				184	FULL_MAD(xm, b5, c0, c1, p5);
				185	FULL_MAD(xm, b6, c1, p7, p6);
				186
				187	uint fbits = 224 + 23 - xe;
				188
				189	// shift amount to get 2 lsb of integer part at top 2 bits
				190	// min: 25 (xe=18) max: 134 (xe=127)
				191	uint shift = 256U - 2 - fbits;
				192
				193	// Shift by up to 134/32 = 4 words
				194	int c = shift > 31;
				195	p7 = c ? p6 : p7;
				196	p6 = c ? p5 : p6;
				197	p5 = c ? p4 : p5;
				198	p4 = c ? p3 : p4;
				199	p3 = c ? p2 : p3;
				200	p2 = c ? p1 : p2;
				201	p1 = c ? p0 : p1;
				202	shift -= (-c) & 32;
				203
				204	c = shift > 31;
				205	p7 = c ? p6 : p7;
				206	p6 = c ? p5 : p6;
				207	p5 = c ? p4 : p5;
				208	p4 = c ? p3 : p4;
				209	p3 = c ? p2 : p3;
				210	p2 = c ? p1 : p2;
				211	shift -= (-c) & 32;
				212
				213	c = shift > 31;
				214	p7 = c ? p6 : p7;
				215	p6 = c ? p5 : p6;
				216	p5 = c ? p4 : p5;
				217	p4 = c ? p3 : p4;
				218	p3 = c ? p2 : p3;
				219	shift -= (-c) & 32;
				220
				221	c = shift > 31;
				222	p7 = c ? p6 : p7;
				223	p6 = c ? p5 : p6;
				224	p5 = c ? p4 : p5;
				225	p4 = c ? p3 : p4;
				226	shift -= (-c) & 32;
				227
				228	// bitalign cannot handle a shift of 32
				229	c = shift > 0;
				230	shift = 32 - shift;
				231	uint t7 = bitalign(p7, p6, shift);
				232	uint t6 = bitalign(p6, p5, shift);
				233	uint t5 = bitalign(p5, p4, shift);
				234	p7 = c ? t7 : p7;
				235	p6 = c ? t6 : p6;
				236	p5 = c ? t5 : p5;
				237
				238	// Get 2 lsb of int part and msb of fraction
				239	int i = p7 >> 29;
				240
				241	// Scoot up 2 more bits so only fraction remains
				242	p7 = bitalign(p7, p6, 30);
				243	p6 = bitalign(p6, p5, 30);
				244	p5 = bitalign(p5, p4, 30);
				245
				246	// Subtract 1 if msb of fraction is 1, i.e. fraction >= 0.5
				247	uint flip = i & 1 ? 0xffffffffU : 0U;
				248	uint sign = i & 1 ? 0x80000000U : 0U;
				249	p7 = p7 ^ flip;
				250	p6 = p6 ^ flip;
				251	p5 = p5 ^ flip;
				252
				253	// Find exponent and shift away leading zeroes and hidden bit
				254	xe = clz(p7) + 1;
				255	shift = 32 - xe;
				256	p7 = bitalign(p7, p6, shift);
				257	p6 = bitalign(p6, p5, shift);
				258
				259	// Most significant part of fraction
				260	float q1 = as_float(sign \| ((127 - xe) << 23) \| (p7 >> 9));
				261
				262	// Shift out bits we captured on q1
				263	p7 = bitalign(p7, p6, 32-23);
				264
				265	// Get 24 more bits of fraction in another float, there are not long strings of zeroes here
				266	int xxe = clz(p7) + 1;
				267	p7 = bitalign(p7, p6, 32-xxe);
				268	float q0 = as_float(sign \| ((127 - (xe + 23 + xxe)) << 23) \| (p7 >> 9));
				269
				270	// At this point, the fraction q1 + q0 is correct to at least 48 bits
				271	// Now we need to multiply the fraction by pi/2
				272	// This loses us about 4 bits
				273	// pi/2 = C90 FDA A22 168 C23 4C4
				274
				275	const float pio2h = (float)0xc90fda / 0x1.0p+23f;
				276	const float pio2hh = (float)0xc90 / 0x1.0p+11f;
				277	const float pio2ht = (float)0xfda / 0x1.0p+23f;
				278	const float pio2t = (float)0xa22168 / 0x1.0p+47f;
				279
				280	float rh, rt;
				281
				282	if (HAVE_HW_FMA32()) {
				283	rh = q1 * pio2h;
				284	rt = fma(q0, pio2h, fma(q1, pio2t, fma(q1, pio2h, -rh)));
				285	} else {
				286	float q1h = as_float(as_uint(q1) & 0xfffff000);
				287	float q1t = q1 - q1h;
				288	rh = q1 * pio2h;
				289	rt = mad(q1t, pio2ht, mad(q1t, pio2hh, mad(q1h, pio2ht, mad(q1h, pio2hh, -rh))));
				290	rt = mad(q0, pio2h, mad(q1, pio2t, rt));
				291	}
				292
				293	float t = rh + rt;
				294	rt = rt - (t - rh);
				295
				296	*r = t;
				297	*rr = rt;
				298	return ((i >> 1) + (i & 1)) & 0x3;
				299	}
				300
Tom Stellard	8d3a4e3	2015-03-23 16:20:24 +0000	[diff] [blame]	301	_CLC_DEF int __clc_argReductionS(float r, float rr, float x)
Tom Stellard	c0ab2f8	2014-07-23 15:16:18 +0000	[diff] [blame]	302	{
				303	if (x < 0x1.0p+23f)
Tom Stellard	8d3a4e3	2015-03-23 16:20:24 +0000	[diff] [blame]	304	return __clc_argReductionSmallS(r, rr, x);
Tom Stellard	c0ab2f8	2014-07-23 15:16:18 +0000	[diff] [blame]	305	else
Tom Stellard	8d3a4e3	2015-03-23 16:20:24 +0000	[diff] [blame]	306	return __clc_argReductionLargeS(r, rr, x);
Tom Stellard	c0ab2f8	2014-07-23 15:16:18 +0000	[diff] [blame]	307	}
				308
Tom Stellard	2e6ff0c	2015-05-12 17:18:46 +0000	[diff] [blame]	309	#ifdef cl_khr_fp64
				310
				311	#pragma OPENCL EXTENSION cl_khr_fp64 : enable
				312
				313	// Reduction for medium sized arguments
				314	_CLC_DEF void __clc_remainder_piby2_medium(double x, double r, double rr, int *regn) {
				315	// How many pi/2 is x a multiple of?
				316	const double two_by_pi = 0x1.45f306dc9c883p-1;
				317	double dnpi2 = trunc(fma(x, two_by_pi, 0.5));
				318
				319	const double piby2_h = -7074237752028440.0 / 0x1.0p+52;
				320	const double piby2_m = -2483878800010755.0 / 0x1.0p+105;
				321	const double piby2_t = -3956492004828932.0 / 0x1.0p+158;
				322
				323	// Compute product of npi2 with 159 bits of 2/pi
				324	double p_hh = piby2_h * dnpi2;
				325	double p_ht = fma(piby2_h, dnpi2, -p_hh);
				326	double p_mh = piby2_m * dnpi2;
				327	double p_mt = fma(piby2_m, dnpi2, -p_mh);
				328	double p_th = piby2_t * dnpi2;
				329	double p_tt = fma(piby2_t, dnpi2, -p_th);
				330
				331	// Reduce to 159 bits
				332	double ph = p_hh;
				333	double pm = p_ht + p_mh;
				334	double t = p_mh - (pm - p_ht);
				335	double pt = p_th + t + p_mt + p_tt;
				336	t = ph + pm; pm = pm - (t - ph); ph = t;
				337	t = pm + pt; pt = pt - (t - pm); pm = t;
				338
				339	// Subtract from x
				340	t = x + ph;
				341	double qh = t + pm;
				342	double qt = pm - (qh - t) + pt;
				343
				344	*r = qh;
				345	*rr = qt;
				346	*regn = (int)(long)dnpi2 & 0x3;
				347	}
				348
				349	// Given positive argument x, reduce it to the range [-pi/4,pi/4] using
				350	// extra precision, and return the result in r, rr.
				351	// Return value "regn" tells how many lots of pi/2 were subtracted
				352	// from x to put it in the range [-pi/4,pi/4], mod 4.
				353
				354	_CLC_DEF void __clc_remainder_piby2_large(double x, double r, double rr, int *regn) {
				355
				356	long ux = as_long(x);
				357	int e = (int)(ux >> 52) - 1023;
				358	int i = max(23, (e >> 3) + 17);
				359	int j = 150 - i;
				360	int j16 = j & ~0xf;
				361	double fract_temp;
				362
				363	// The following extracts 192 consecutive bits of 2/pi aligned on an arbitrary byte boundary
				364	uint4 q0 = USE_TABLE(pibits_tbl, j16);
				365	uint4 q1 = USE_TABLE(pibits_tbl, (j16 + 16));
				366	uint4 q2 = USE_TABLE(pibits_tbl, (j16 + 32));
				367
				368	int k = (j >> 2) & 0x3;
				369	int4 c = (int4)k == (int4)(0, 1, 2, 3);
				370
				371	uint u0, u1, u2, u3, u4, u5, u6;
				372
				373	u0 = c.s1 ? q0.s1 : q0.s0;
				374	u0 = c.s2 ? q0.s2 : u0;
				375	u0 = c.s3 ? q0.s3 : u0;
				376
				377	u1 = c.s1 ? q0.s2 : q0.s1;
				378	u1 = c.s2 ? q0.s3 : u1;
				379	u1 = c.s3 ? q1.s0 : u1;
				380
				381	u2 = c.s1 ? q0.s3 : q0.s2;
				382	u2 = c.s2 ? q1.s0 : u2;
				383	u2 = c.s3 ? q1.s1 : u2;
				384
				385	u3 = c.s1 ? q1.s0 : q0.s3;
				386	u3 = c.s2 ? q1.s1 : u3;
				387	u3 = c.s3 ? q1.s2 : u3;
				388
				389	u4 = c.s1 ? q1.s1 : q1.s0;
				390	u4 = c.s2 ? q1.s2 : u4;
				391	u4 = c.s3 ? q1.s3 : u4;
				392
				393	u5 = c.s1 ? q1.s2 : q1.s1;
				394	u5 = c.s2 ? q1.s3 : u5;
				395	u5 = c.s3 ? q2.s0 : u5;
				396
				397	u6 = c.s1 ? q1.s3 : q1.s2;
				398	u6 = c.s2 ? q2.s0 : u6;
				399	u6 = c.s3 ? q2.s1 : u6;
				400
				401	uint v0 = bytealign(u1, u0, j);
				402	uint v1 = bytealign(u2, u1, j);
				403	uint v2 = bytealign(u3, u2, j);
				404	uint v3 = bytealign(u4, u3, j);
				405	uint v4 = bytealign(u5, u4, j);
				406	uint v5 = bytealign(u6, u5, j);
				407
				408	// Place those 192 bits in 4 48-bit doubles along with correct exponent
				409	// If i > 1018 we would get subnormals so we scale p up and x down to get the same product
				410	i = 2 + 8*i;
				411	x *= i > 1018 ? 0x1.0p-136 : 1.0;
				412	i -= i > 1018 ? 136 : 0;
				413
				414	uint ua = (uint)(1023 + 52 - i) << 20;
				415	double a = as_double((uint2)(0, ua));
				416	double p0 = as_double((uint2)(v0, ua \| (v1 & 0xffffU))) - a;
				417	ua += 0x03000000U;
				418	a = as_double((uint2)(0, ua));
				419	double p1 = as_double((uint2)((v2 << 16) \| (v1 >> 16), ua \| (v2 >> 16))) - a;
				420	ua += 0x03000000U;
				421	a = as_double((uint2)(0, ua));
				422	double p2 = as_double((uint2)(v3, ua \| (v4 & 0xffffU))) - a;
				423	ua += 0x03000000U;
				424	a = as_double((uint2)(0, ua));
				425	double p3 = as_double((uint2)((v5 << 16) \| (v4 >> 16), ua \| (v5 >> 16))) - a;
				426
				427	// Exact multiply
				428	double f0h = p0 * x;
				429	double f0l = fma(p0, x, -f0h);
				430	double f1h = p1 * x;
				431	double f1l = fma(p1, x, -f1h);
				432	double f2h = p2 * x;
				433	double f2l = fma(p2, x, -f2h);
				434	double f3h = p3 * x;
				435	double f3l = fma(p3, x, -f3h);
				436
				437	// Accumulate product into 4 doubles
				438	double s, t;
				439
				440	double f3 = f3h + f2h;
				441	t = f2h - (f3 - f3h);
				442	s = f3l + t;
				443	t = t - (s - f3l);
				444
				445	double f2 = s + f1h;
				446	t = f1h - (f2 - s) + t;
				447	s = f2l + t;
				448	t = t - (s - f2l);
				449
				450	double f1 = s + f0h;
				451	t = f0h - (f1 - s) + t;
				452	s = f1l + t;
				453
				454	double f0 = s + f0l;
				455
				456	// Strip off unwanted large integer bits
				457	f3 = 0x1.0p+10 * fract(f3 * 0x1.0p-10, &fract_temp);
				458	f3 += f3 + f2 < 0.0 ? 0x1.0p+10 : 0.0;
				459
				460	// Compute least significant integer bits
				461	t = f3 + f2;
				462	double di = t - fract(t, &fract_temp);
				463	i = (float)di;
				464
				465	// Shift out remaining integer part
				466	f3 -= di;
				467	s = f3 + f2; t = f2 - (s - f3); f3 = s; f2 = t;
				468	s = f2 + f1; t = f1 - (s - f2); f2 = s; f1 = t;
				469	f1 += f0;
				470
				471	// Subtract 1 if fraction is >= 0.5, and update regn
				472	int g = f3 >= 0.5;
				473	i += g;
				474	f3 -= (float)g;
				475
				476	// Shift up bits
				477	s = f3 + f2; t = f2 -(s - f3); f3 = s; f2 = t + f1;
				478
				479	// Multiply precise fraction by pi/2 to get radians
				480	const double p2h = 7074237752028440.0 / 0x1.0p+52;
				481	const double p2t = 4967757600021510.0 / 0x1.0p+106;
				482
				483	double rhi = f3 * p2h;
				484	double rlo = fma(f2, p2h, fma(f3, p2t, fma(f3, p2h, -rhi)));
				485
				486	*r = rhi + rlo;
				487	rr = rlo - (r - rhi);
				488	*regn = i & 0x3;
				489	}
				490
				491
				492	_CLC_DEF double2 __clc_sincos_piby4(double x, double xx) {
				493	// Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
				494	// = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
				495	// = x * f(w)
				496	// where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
				497	// We use a minimax approximation of (f(w) - 1) / w
				498	// because this produces an expansion in even powers of x.
				499	// If xx (the tail of x) is non-zero, we add a correction
				500	// term g(x,xx) = (1-xx/2)xx to the result, where g(x,xx)
				501	// is an approximation to cos(x)*sin(xx) valid because
				502	// xx is tiny relative to x.
				503
				504	// Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
				505	// = f(w)
				506	// where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
				507	// We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
				508	// because this produces an expansion in even powers of x.
				509	// If xx (the tail of x) is non-zero, we subtract a correction
				510	// term g(x,xx) = x*xx to the result, where g(x,xx)
				511	// is an approximation to sin(x)*sin(xx) valid because
				512	// xx is tiny relative to x.
				513
				514	const double sc1 = -0.166666666666666646259241729;
				515	const double sc2 = 0.833333333333095043065222816e-2;
				516	const double sc3 = -0.19841269836761125688538679e-3;
				517	const double sc4 = 0.275573161037288022676895908448e-5;
				518	const double sc5 = -0.25051132068021699772257377197e-7;
				519	const double sc6 = 0.159181443044859136852668200e-9;
				520
				521	const double cc1 = 0.41666666666666665390037e-1;
				522	const double cc2 = -0.13888888888887398280412e-2;
				523	const double cc3 = 0.248015872987670414957399e-4;
				524	const double cc4 = -0.275573172723441909470836e-6;
				525	const double cc5 = 0.208761463822329611076335e-8;
				526	const double cc6 = -0.113826398067944859590880e-10;
				527
				528	double x2 = x * x;
				529	double x3 = x2 * x;
				530	double r = 0.5 * x2;
				531	double t = 1.0 - r;
				532
				533	double sp = fma(fma(fma(fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2);
				534
				535	double cp = t + fma(fma(fma(fma(fma(fma(cc6, x2, cc5), x2, cc4), x2, cc3), x2, cc2), x2, cc1),
				536	x2*x2, fma(x, xx, (1.0 - t) - r));
				537
				538	double2 ret;
				539	ret.lo = x - fma(-x3, sc1, fma(fma(-x3, sp, 0.5*xx), x2, -xx));
				540	ret.hi = cp;
				541
				542	return ret;
				543	}
				544
				545	#endif