Blame - libm/arm64/e_pow64.S - fp2-dev/platform/bionic

blob: 2a30c4aafbd633c946cf5b73c3b89d7086ac6b4e [file] [log] [blame]

Brent DeGraaf	9aa974c	2014-09-19 09:27:24 -0400	[diff] [blame^]	1	/* Copyright (c) 2009-2014 The Linux Foundation. All rights reserved.
				2	*
				3	* Redistribution and use in source and binary forms, with or without
				4	* modification, are permitted provided that the following conditions are met:
				5	* * Redistributions of source code must retain the above copyright
				6	* notice, this list of conditions and the following disclaimer.
				7	* * Redistributions in binary form must reproduce the above copyright
				8	* notice, this list of conditions and the following disclaimer in the
				9	* documentation and/or other materials provided with the distribution.
				10	* * Neither the name of The Linux Foundation nor the names of its contributors may
				11	* be used to endorse or promote products derived from this software
				12	* without specific prior written permission.
				13	*
				14	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
				15	* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
				16	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
				17	* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
				18	* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
				19	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
				20	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
				21	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
				22	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
				23	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
				24	* POSSIBILITY OF SUCH DAMAGE.
				25	*/
				26
				27	#include <private/bionic_asm.h>
				28	//#define NO_FUSED_MULTIPLY
				29
				30	#define KRAIT_NO_AAPCS_VFP_MODE
				31
				32	ENTRY(pow)
				33	#if defined(KRAIT_NO_AAPCS_VFP_MODE)
				34	// ARM ABI has inputs coming in via d registers, lets copy to x registers
				35	fmov x0, d0
				36	fmov x1, d1
				37	#endif
				38	mov w12, #0x40100000 // high word of 64-bit 4.0
				39
				40	// pre-staged bp values
				41	ldr d5, .LbpA
				42	ldr d3, .LbpB
				43
				44	// load two fifths into constant term in case we need it due to offsets
				45	ldr d6, .Ltwofifths
				46
				47	// bp is initially 1.0, may adjust later based on x value
				48	fmov d4, #1.0
				49
				50	// twoto1o5 = 2^(1/5) (input bracketing)
				51	ldr x4, .Ltwoto1o5
				52
				53	// twoto3o5 = 2^(3/5) (input bracketing)
				54	ldr x5, .Ltwoto3o5
				55
				56	// extract xmantissa
				57	bic x6, x0, #0xFFF0000000000000
				58
				59	// begin preparing a mask for normalization (high 32-bit mask)
				60	movi d31, #0xFFFFFFFF00000000
				61
				62	// double_1 = (double) 1.0
				63	fmov d28, #1.0
				64
				65	cmp x6, x4
				66
				67	shl d30, d31, #20 // d30 can mask just sign/exp bits
				68	ushr d29, d31, #63 // mask has only bit 1 set
				69
				70	adr x10, .LliteralTable // x10->k4 in literal table (below)
				71
				72	bhi .Lxgt2to1over5
				73	// zero out lg2 constant term if don't offset our input
				74	fsub d6, d6, d6
				75	b .Lxle2to1over5
				76
				77	.Lxgt2to1over5:
				78	// if normalized x > 2^(1/5), bp = 1 + (2^(2/5)-1) = 2^(2/5)
				79	fadd d4, d4, d5
				80
				81	.Lxle2to1over5:
				82	ldr d5, .Lln2 // d5 = ln2 = 0.69314718056
				83
				84	cmp x6, x5 // non-normalized compare
				85
				86	//@@@ X Value Normalization @@@@
				87
				88	// ss = abs(x) 2^(-1024)
				89	bic v16.8B, v0.8B, v30.8B // mantissa of x into v16.8B (aka d16)
				90
				91	// N = (floor(log2(x)) + 0x3ff) * 2^52
				92	and v2.8B, v0.8B, v30.8B // exponent of x (d0) into v2.8B aka d2
				93
				94	bls .Lxle2to3over5 // branch not taken if (x6 > x5)
				95	// if normalized x > 2^(3/5), bp = 2^(2/5) + (2^(4/5) - 2^(2/5)) = 2^(4/5)
				96	fadd d4, d4, d3 // d4 = 2^(2/5) + (2^(4/5) - 2^(2/5)) = 2^(4/5)
				97	fadd d6, d6, d6 // d6 = 2/5 + 2/5 = 4/5 or 0 (see logic above)
				98
				99	.Lxle2to3over5:
				100
				101	lsr x2, x0, #32 // Need just high word of x...x2 can take it
				102	cmp w2, w12 // Compare x to 4.0 (high word only)
				103	lsr x3, x1, #32 // Need just high word of y...x3 can take it.
				104	ccmp w3, w12, #2, ls // If x < 4.0, compare y to 4.0 (high word)
				105	bic w12, w12, #0xFFFF0000 // Change w12 for compare to 0.0000325
				106	orr w12, w12, #0x3fe00000
				107	ccmp w12, w2, #2, ls // If y < 4, compare 0.5 to x
				108
				109	// load log2 polynomial series constants
				110	ldp d24, d25, [x10, #0]
				111	ldp d26, d27, [x10, #16]
				112
				113	// s = abs(x) 2^(-floor(log2(x))) (normalize abs(x) to around 1)
				114	orr v16.8B, v16.8B, v28.8B
				115
				116	//@@@ 3/2 (Log(bp(1+s)/(1-s))) input computation (s = (x-bp)/(x+bp)) @@@@
				117
				118	fsub d19, d16, d4 // take normalized x and subtract 2^(4/5) from it
				119	fadd d20, d16, d4
				120	bhi .LuseFullImpl // \|x\| < 0.5 or x > 4 or y > 4
				121
				122	// s = (x-1)/(x+1)
				123	fdiv d16, d19, d20
				124
				125	// load 2/(3log2) into lg2coeff
				126	ldr d21, .Ltwooverthreeln2
				127
				128	// N = floor(log2(x)) * 2^52
				129	sub d2, d2, d28
				130
				131	//@@@ 3/2 (Log(bp(1+s)/(1-s))) polynomial series @@@@
				132
				133	// ss2 = ((x-bp)/(x+bp))^2
				134	fmul d17, d16, d16
				135
				136	// ylg2x = 3.0
				137	fmov d0, #3.0
				138	fmul d18, d17, d17
				139
				140	// todo: useful later for two-way clamp
				141	fmul d21, d21, d1
				142
				143	// N = floor(log2(x))
				144	sshr d2, d2, #52
				145	// k3 = ss^2 * L4 + L3
				146	#ifdef NO_FUSED_MULTIPLY
				147	fmul d3, d17, v24.2D[0]
				148	fadd d25, d25, d3
				149
				150	// k1 = ss^2 * L2 + L1
				151	fmul d3, d17, v26.2D[0]
				152	fadd d27, d27, d3
				153	#else
				154	fmla d25, d17, v24.2D[0]
				155
				156	// k1 = ss^2 * L2 + L1
				157	fmla d27, d17, v26.2D[0]
				158	#endif
				159
				160	// scale ss by 2/(3 ln 2)
				161	fmul d21, d16, d21
				162
				163	// ylg2x = 3.0 + s^2
				164	fadd d0, d0, d17
				165
				166	fmov x2, d2
				167	scvtf d3, w2 // Low-order 32-bit integer half of d2 to fp64
				168
				169	// k1 = s^4 (s^2 L4 + L3) + s^2 L2 + L1
				170	#ifdef NO_FUSED_MULTIPLY
				171	fmul d31, d18, v25.2D[0]
				172	fadd d27, d27, d31
				173	#else
				174	fmla d27, d18, v25.2D[0]
				175	#endif
				176	// add in constant term
				177	fadd d3, d3, d6
				178
				179	// ylg2x = 3.0 + s^2 + s^4 (s^4 (s^2 L4 + L3) + s^2 L2 + L1)
				180	#ifdef NO_FUSED_MULTIPLY
				181	fmul d31, d18, v27.2D[0]
				182	fadd d0, d0, d31
				183	#else
				184	fmla d0, d18, v27.2D[0]
				185	#endif
				186	// ylg2x = y 2 s / (3 ln(2)) (3.0 + s^2 + s^4 (s^4(s^2 L4 + L3) + s^2 L2 + L1)
				187	fmul d0, d21, d0
				188
				189	//@@@ Compute input to Exp(s) (s = y(n + log2(x)) - (floor(8 yn + 1)/8 + floor(8 ylog2(x) + 1)/8) @@@@@
				190
				191	// mask to extract bit 1 (2^-2 from our fixed-point representation)
				192	shl d4, d29, #1
				193
				194	// double_n = y * n
				195	fmul d3, d3, d1
				196
				197	// Load 2^(1/4) for later computations
				198	ldr d6, .Ltwoto1o4
				199
				200	// either add or subtract one based on the sign of double_n and ylg2x
				201	sshr d16, d0, #62
				202	sshr d19, d3, #62
				203
				204	// move unmodified y*lg2x into temp space
				205	fmov d17, d0
				206
				207	// compute floor(8 y * n + 1)/8
				208	// and floor(8 y (log2(x)) + 1)/8
				209	fcvtzs w2, d0, #3 // no instruction exists to use s0 as a direct target
				210	fmov s0, w2 // run our conversion into w2, then mov it to compensate
				211	// move unmodified y*n into temp space
				212	fmov d18, d3
				213	fcvtzs w2, d3, #3
				214	fmov s3, w2
				215
				216	// load exp polynomial series constants
				217	ldp d20, d21, [x10, #32]
				218	ldp d22, d23, [x10, #48]
				219	ldp d24, d25, [x10, #64]
				220	ldp d26, d27, [x10, #80]
				221
				222	// mask to extract bit 2 (2^-1 from our fixed-point representation)
				223	shl d1, d29, #2
				224
				225	// make rounding offsets either 1 or -1 instead of 0 or -2
				226	orr v16.8B, v16.8B, v29.8B
				227	orr v19.8B, v19.8B, v29.8B
				228
				229	// round up to the nearest 1/8th
				230	add d0, d0, d16
				231	add d3, d3, d19
				232
				233	// clear out round-up bit for y log2(x)
				234	bic v0.8B, v0.8B, v29.8B
				235	// clear out round-up bit for yn
				236	bic v3.8B, v3.8B, v29.8B
				237	// add together the (fixed precision) rounded parts
				238	add d31, d3, d0
				239	// turn int_n into a double with value 2^int_n
				240	shl d2, d31, #49
				241	// compute masks for 2^(1/4) and 2^(1/2) fixups for fractional part of fixed-precision rounded values:
				242	and v4.8B, v4.8B, v31.8B
				243	and v1.8B, v1.8B, v31.8B
				244
				245	// convert back into floating point, d3 now holds (double) floor(8 y * n + 1)/8
				246	// d0 now holds (double) floor(8 y * log2(x) + 1)/8
				247	fmov w2, s0
				248	scvtf d0, w2, #3
				249	fmov w2, s3
				250	scvtf d3, w2, #3
				251
				252	// put the 2 bit (0.5) through the roof of twoto1o2mask (make it 0x0 or 0xffffffffffffffff)
				253	uqshl d1, d1, #62
				254
				255	// put the 1 bit (0.25) through the roof of twoto1o4mask (make it 0x0 or 0xffffffffffffffff)
				256	uqshl d4, d4, #63
				257
				258	// center ylog2(x) fractional part between -0.125 and 0.125 by subtracting (double) floor(8 y log2(x) + 1)/8
				259	fsub d17, d17, d0
				260	// center yn fractional part between -0.125 and 0.125 by subtracting (double) floor(8 y n + 1)/8
				261	fsub d18, d18, d3
				262
				263	// Add fractional parts of yn and y log2(x) together
				264	fadd d16, d17, d18
				265
				266	// Result = 1.0 (offset for exp(s) series)
				267	fmov d0, #1.0
				268
				269	// multiply fractional part of y * log2(x) by ln(2)
				270	fmul d16, d5, d16
				271
				272	//@@@ 10th order polynomial series for Exp(s) @@@@
				273
				274	// ss2 = (ss)^2
				275	fmul d17, d16, d16
				276
				277	// twoto1o2mask = twoto1o2mask & twoto1o4
				278	and v1.8B, v1.8B, v6.8B
				279	// twoto1o2mask = twoto1o2mask & twoto1o4
				280	and v4.8B, v4.8B, v6.8B
				281
				282	// Result = 1.0 + ss
				283	fadd d0, d0, d16
				284
				285	// k7 = ss k8 + k7
				286	#ifdef NO_FUSED_MULTIPLY
				287	fmul d31, d16, v20.2D[0]
				288	fadd d21, d21, d31
				289	#else
				290	fmla d21, d16, v20.2D[0]
				291	#endif
				292	// ss4 = (ssss) (ss*ss)
				293	fmul d18, d17, d17
				294
				295	// twoto1o2mask = twoto1o2mask \| (double) 1.0 - results in either 1.0 or 2^(1/4) in twoto1o2mask
				296	orr v1.8B, v1.8B, v28.8B
				297	// twoto1o2mask = twoto1o4mask \| (double) 1.0 - results in either 1.0 or 2^(1/4) in twoto1o4mask
				298	orr v4.8B, v4.8B, v28.8B
				299
				300	// sign could be set up here, but for now expadjustment = 1.0
				301	fmov d7, #1.0
				302
				303	// ss3 = (ssss) ss
				304	fmul d19, d17, d16
				305
				306	// k0 = 1/2 (first non-unity coefficient)
				307	fmov d28, #0.5
				308
				309	// Mask out non-exponent bits to make sure we have just 2^int_n
				310	and v2.8B, v2.8B, v30.8B
				311
				312	// square twoto1o2mask to get 1.0 or 2^(1/2)
				313	fmul d1, d1, d1
				314
				315	// multiply twoto2o4mask into the exponent output adjustment value
				316	fmul d7, d7, d4
				317
				318	#ifdef NO_FUSED_MULTIPLY
				319	// k5 = ss k6 + k5
				320	fmul d31, d16, v22.2D[0]
				321	fadd d23, d23, d31
				322
				323	// k3 = ss k4 + k3
				324	fmul d31, d16, v24.2D[0]
				325	fadd d25, d25, d31
				326
				327	// k1 = ss k2 + k1
				328	fmul d31, d16, v26.2D[0]
				329	fadd d27, d27, d31
				330	#else
				331	// k5 = ss k6 + k5
				332	fmla d23, d16, v22.2D[0]
				333
				334	// k3 = ss k4 + k3
				335	fmla d25, d16, v24.2D[0]
				336
				337	// k1 = ss k2 + k1
				338	fmla d27, d16, v26.2D[0]
				339	#endif
				340	// multiply twoto1o2mask into exponent output adjustment value
				341	fmul d7, d7, d1
				342	#ifdef NO_FUSED_MULTIPLY
				343	// k5 = ss^2 ( ss k8 + k7 ) + ss k6 + k5
				344	fmul d31, d17, v21.2D[0]
				345	fadd d23, d23, d31
				346
				347	// k1 = ss^2 ( ss k4 + k3 ) + ss k2 + k1
				348	fmul d31, d17, v25.2D[0]
				349	fadd d27, d27, d31
				350
				351	// Result = 1.0 + ss + 1/2 ss^2
				352	fmul d31, d17, v28.2D[0]
				353	fadd d0, d0, d31
				354	#else
				355	// k5 = ss^2 ( ss k8 + k7 ) + ss k6 + k5
				356	fmla d23, d17, v21.2D[0]
				357
				358	// k1 = ss^2 ( ss k4 + k3 ) + ss k2 + k1
				359	fmla d27, d17, v25.2D[0]
				360
				361	// Result = 1.0 + ss + 1/2 ss^2
				362	fmla d0, d17, v28.2D[0]
				363	#endif
				364	// Adjust int_n so that it's a double precision value that can be multiplied by Result
				365	add d7, d2, d7
				366	#ifdef NO_FUSED_MULTIPLY
				367	// k1 = ss^4 ( ss^2 ( ss k8 + k7 ) + ss k6 + k5 ) + ss^2 ( ss k4 + k3 ) + ss k2 + k1
				368	fmul d31, d18, v23.2D[0]
				369	fadd d27, d27, d31
				370
				371	// Result = 1.0 + ss + 1/2 ss^2 + ss^3 ( ss^4 ( ss^2 ( ss k8 + k7 ) + ss k6 + k5 ) + ss^2 ( ss k4 + k3 ) + ss k2 + k1 )
				372	fmul d31, d19, v27.2D[0]
				373	fadd d0, d0, d31
				374	#else
				375	// k1 = ss^4 ( ss^2 ( ss k8 + k7 ) + ss k6 + k5 ) + ss^2 ( ss k4 + k3 ) + ss k2 + k1
				376	fmla d27, d18, v23.2D[0]
				377
				378	// Result = 1.0 + ss + 1/2 ss^2 + ss^3 ( ss^4 ( ss^2 ( ss k8 + k7 ) + ss k6 + k5 ) + ss^2 ( ss k4 + k3 ) + ss k2 + k1 )
				379	fmla d0, d19, v27.2D[0]
				380	#endif
				381	// multiply by adjustment (sign(rounding ? sqrt(2) : 1) 2^int_n)
				382	fmul d0, d7, d0
				383
				384	.LleavePow:
				385	#if defined(KRAIT_NO_AAPCS_VFP_MODE)
				386	// return Result (FP)
				387	// fmov x0, d0
				388	#endif
				389	.LleavePowDirect:
				390	// leave directly returning whatever is in d0
				391	ret
				392	.LuseFullImpl:
				393	fmov d0, x0
				394	fmov d1, x1
				395	b __full_ieee754_pow
				396
				397	.align 6
				398	.LliteralTable:
				399	// Least-sqares tuned constants for 11th order (log2((1+s)/(1-s)):
				400	.LL4: // ~3/11
				401	.long 0x53a79915, 0x3fd1b108
				402	.LL3: // ~1/3
				403	.long 0x9ca0567a, 0x3fd554fa
				404	.LL2: // ~3/7
				405	.long 0x1408e660, 0x3fdb6db7
				406	.LL1: // ~3/5
				407	.long 0x332D4313, 0x3fe33333
				408
				409	// Least-squares tuned constants for 10th order exp(s):
				410	.LE10: // ~1/3628800
				411	.long 0x25c7ba0a, 0x3e92819b
				412	.LE9: // ~1/362880
				413	.long 0x9499b49c, 0x3ec72294
				414	.LE8: // ~1/40320
				415	.long 0xabb79d95, 0x3efa019f
				416	.LE7: // ~1/5040
				417	.long 0x8723aeaa, 0x3f2a019f
				418	.LE6: // ~1/720
				419	.long 0x16c76a94, 0x3f56c16c
				420	.LE5: // ~1/120
				421	.long 0x11185da8, 0x3f811111
				422	.LE4: // ~1/24
				423	.long 0x5555551c, 0x3fa55555
				424	.LE3: // ~1/6
				425	.long 0x555554db, 0x3fc55555
				426
				427	.LbpA: // (2^(2/5) - 1)
				428	.long 0x4ee54db1, 0x3fd472d1
				429
				430	.LbpB: // (2^(4/5) - 2^(2/5))
				431	.long 0x1c8a36cf, 0x3fdafb62
				432
				433	.Ltwofifths: // 2/5
				434	.long 0x9999999a, 0x3fd99999
				435
				436	.Ltwooverthreeln2:
				437	.long 0xDC3A03FD, 0x3FEEC709
				438
				439	.Ltwoto1o5: // 2^(1/5) exponent 3ff stripped for non-normalized compares
				440	.long 0x86BAE675, 0x00026111
				441
				442	.Ltwoto3o5: // 2^(3/5) exponent 3ff stripped for non-normalized compares
				443	.long 0x03B2AE5C, 0x00084060
				444
				445	.Lln2: // ln(2)
				446	.long 0xFEFA39EF, 0x3FE62E42
				447
				448	.Ltwoto1o4: // 2^1/4
				449	.long 0x0a31b715, 0x3ff306fe
				450	END(pow)