blob: 2a30c4aafbd633c946cf5b73c3b89d7086ac6b4e [file] [log] [blame]
Brent DeGraaf9aa974c2014-09-19 09:27:24 -04001/* Copyright (c) 2009-2014 The Linux Foundation. All rights reserved.
2 *
3 * Redistribution and use in source and binary forms, with or without
4 * modification, are permitted provided that the following conditions are met:
5 * * Redistributions of source code must retain the above copyright
6 * notice, this list of conditions and the following disclaimer.
7 * * Redistributions in binary form must reproduce the above copyright
8 * notice, this list of conditions and the following disclaimer in the
9 * documentation and/or other materials provided with the distribution.
10 * * Neither the name of The Linux Foundation nor the names of its contributors may
11 * be used to endorse or promote products derived from this software
12 * without specific prior written permission.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
18 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
19 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
20 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
21 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
22 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
23 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
24 * POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include <private/bionic_asm.h>
28//#define NO_FUSED_MULTIPLY
29
30#define KRAIT_NO_AAPCS_VFP_MODE
31
32ENTRY(pow)
33#if defined(KRAIT_NO_AAPCS_VFP_MODE)
34 // ARM ABI has inputs coming in via d registers, lets copy to x registers
35 fmov x0, d0
36 fmov x1, d1
37#endif
38 mov w12, #0x40100000 // high word of 64-bit 4.0
39
40 // pre-staged bp values
41 ldr d5, .LbpA
42 ldr d3, .LbpB
43
44 // load two fifths into constant term in case we need it due to offsets
45 ldr d6, .Ltwofifths
46
47 // bp is initially 1.0, may adjust later based on x value
48 fmov d4, #1.0
49
50 // twoto1o5 = 2^(1/5) (input bracketing)
51 ldr x4, .Ltwoto1o5
52
53 // twoto3o5 = 2^(3/5) (input bracketing)
54 ldr x5, .Ltwoto3o5
55
56 // extract xmantissa
57 bic x6, x0, #0xFFF0000000000000
58
59 // begin preparing a mask for normalization (high 32-bit mask)
60 movi d31, #0xFFFFFFFF00000000
61
62 // double_1 = (double) 1.0
63 fmov d28, #1.0
64
65 cmp x6, x4
66
67 shl d30, d31, #20 // d30 can mask just sign/exp bits
68 ushr d29, d31, #63 // mask has only bit 1 set
69
70 adr x10, .LliteralTable // x10->k4 in literal table (below)
71
72 bhi .Lxgt2to1over5
73 // zero out lg2 constant term if don't offset our input
74 fsub d6, d6, d6
75 b .Lxle2to1over5
76
77.Lxgt2to1over5:
78 // if normalized x > 2^(1/5), bp = 1 + (2^(2/5)-1) = 2^(2/5)
79 fadd d4, d4, d5
80
81.Lxle2to1over5:
82 ldr d5, .Lln2 // d5 = ln2 = 0.69314718056
83
84 cmp x6, x5 // non-normalized compare
85
86//@@@ X Value Normalization @@@@
87
88 // ss = abs(x) 2^(-1024)
89 bic v16.8B, v0.8B, v30.8B // mantissa of x into v16.8B (aka d16)
90
91 // N = (floor(log2(x)) + 0x3ff) * 2^52
92 and v2.8B, v0.8B, v30.8B // exponent of x (d0) into v2.8B aka d2
93
94 bls .Lxle2to3over5 // branch not taken if (x6 > x5)
95 // if normalized x > 2^(3/5), bp = 2^(2/5) + (2^(4/5) - 2^(2/5)) = 2^(4/5)
96 fadd d4, d4, d3 // d4 = 2^(2/5) + (2^(4/5) - 2^(2/5)) = 2^(4/5)
97 fadd d6, d6, d6 // d6 = 2/5 + 2/5 = 4/5 or 0 (see logic above)
98
99.Lxle2to3over5:
100
101 lsr x2, x0, #32 // Need just high word of x...x2 can take it
102 cmp w2, w12 // Compare x to 4.0 (high word only)
103 lsr x3, x1, #32 // Need just high word of y...x3 can take it.
104 ccmp w3, w12, #2, ls // If x < 4.0, compare y to 4.0 (high word)
105 bic w12, w12, #0xFFFF0000 // Change w12 for compare to 0.0000325
106 orr w12, w12, #0x3fe00000
107 ccmp w12, w2, #2, ls // If y < 4, compare 0.5 to x
108
109 // load log2 polynomial series constants
110 ldp d24, d25, [x10, #0]
111 ldp d26, d27, [x10, #16]
112
113 // s = abs(x) 2^(-floor(log2(x))) (normalize abs(x) to around 1)
114 orr v16.8B, v16.8B, v28.8B
115
116//@@@ 3/2 (Log(bp(1+s)/(1-s))) input computation (s = (x-bp)/(x+bp)) @@@@
117
118 fsub d19, d16, d4 // take normalized x and subtract 2^(4/5) from it
119 fadd d20, d16, d4
120 bhi .LuseFullImpl // |x| < 0.5 or x > 4 or y > 4
121
122 // s = (x-1)/(x+1)
123 fdiv d16, d19, d20
124
125 // load 2/(3log2) into lg2coeff
126 ldr d21, .Ltwooverthreeln2
127
128 // N = floor(log2(x)) * 2^52
129 sub d2, d2, d28
130
131//@@@ 3/2 (Log(bp(1+s)/(1-s))) polynomial series @@@@
132
133 // ss2 = ((x-bp)/(x+bp))^2
134 fmul d17, d16, d16
135
136 // ylg2x = 3.0
137 fmov d0, #3.0
138 fmul d18, d17, d17
139
140 // todo: useful later for two-way clamp
141 fmul d21, d21, d1
142
143 // N = floor(log2(x))
144 sshr d2, d2, #52
145 // k3 = ss^2 * L4 + L3
146#ifdef NO_FUSED_MULTIPLY
147 fmul d3, d17, v24.2D[0]
148 fadd d25, d25, d3
149
150 // k1 = ss^2 * L2 + L1
151 fmul d3, d17, v26.2D[0]
152 fadd d27, d27, d3
153#else
154 fmla d25, d17, v24.2D[0]
155
156 // k1 = ss^2 * L2 + L1
157 fmla d27, d17, v26.2D[0]
158#endif
159
160 // scale ss by 2/(3 ln 2)
161 fmul d21, d16, d21
162
163 // ylg2x = 3.0 + s^2
164 fadd d0, d0, d17
165
166 fmov x2, d2
167 scvtf d3, w2 // Low-order 32-bit integer half of d2 to fp64
168
169 // k1 = s^4 (s^2 L4 + L3) + s^2 L2 + L1
170#ifdef NO_FUSED_MULTIPLY
171 fmul d31, d18, v25.2D[0]
172 fadd d27, d27, d31
173#else
174 fmla d27, d18, v25.2D[0]
175#endif
176 // add in constant term
177 fadd d3, d3, d6
178
179 // ylg2x = 3.0 + s^2 + s^4 (s^4 (s^2 L4 + L3) + s^2 L2 + L1)
180#ifdef NO_FUSED_MULTIPLY
181 fmul d31, d18, v27.2D[0]
182 fadd d0, d0, d31
183#else
184 fmla d0, d18, v27.2D[0]
185#endif
186 // ylg2x = y 2 s / (3 ln(2)) (3.0 + s^2 + s^4 (s^4(s^2 L4 + L3) + s^2 L2 + L1)
187 fmul d0, d21, d0
188
189//@@@ Compute input to Exp(s) (s = y(n + log2(x)) - (floor(8 yn + 1)/8 + floor(8 ylog2(x) + 1)/8) @@@@@
190
191 // mask to extract bit 1 (2^-2 from our fixed-point representation)
192 shl d4, d29, #1
193
194 // double_n = y * n
195 fmul d3, d3, d1
196
197 // Load 2^(1/4) for later computations
198 ldr d6, .Ltwoto1o4
199
200 // either add or subtract one based on the sign of double_n and ylg2x
201 sshr d16, d0, #62
202 sshr d19, d3, #62
203
204 // move unmodified y*lg2x into temp space
205 fmov d17, d0
206
207 // compute floor(8 y * n + 1)/8
208 // and floor(8 y (log2(x)) + 1)/8
209 fcvtzs w2, d0, #3 // no instruction exists to use s0 as a direct target
210 fmov s0, w2 // run our conversion into w2, then mov it to compensate
211 // move unmodified y*n into temp space
212 fmov d18, d3
213 fcvtzs w2, d3, #3
214 fmov s3, w2
215
216 // load exp polynomial series constants
217 ldp d20, d21, [x10, #32]
218 ldp d22, d23, [x10, #48]
219 ldp d24, d25, [x10, #64]
220 ldp d26, d27, [x10, #80]
221
222 // mask to extract bit 2 (2^-1 from our fixed-point representation)
223 shl d1, d29, #2
224
225 // make rounding offsets either 1 or -1 instead of 0 or -2
226 orr v16.8B, v16.8B, v29.8B
227 orr v19.8B, v19.8B, v29.8B
228
229 // round up to the nearest 1/8th
230 add d0, d0, d16
231 add d3, d3, d19
232
233 // clear out round-up bit for y log2(x)
234 bic v0.8B, v0.8B, v29.8B
235 // clear out round-up bit for yn
236 bic v3.8B, v3.8B, v29.8B
237 // add together the (fixed precision) rounded parts
238 add d31, d3, d0
239 // turn int_n into a double with value 2^int_n
240 shl d2, d31, #49
241 // compute masks for 2^(1/4) and 2^(1/2) fixups for fractional part of fixed-precision rounded values:
242 and v4.8B, v4.8B, v31.8B
243 and v1.8B, v1.8B, v31.8B
244
245 // convert back into floating point, d3 now holds (double) floor(8 y * n + 1)/8
246 // d0 now holds (double) floor(8 y * log2(x) + 1)/8
247 fmov w2, s0
248 scvtf d0, w2, #3
249 fmov w2, s3
250 scvtf d3, w2, #3
251
252 // put the 2 bit (0.5) through the roof of twoto1o2mask (make it 0x0 or 0xffffffffffffffff)
253 uqshl d1, d1, #62
254
255 // put the 1 bit (0.25) through the roof of twoto1o4mask (make it 0x0 or 0xffffffffffffffff)
256 uqshl d4, d4, #63
257
258 // center y*log2(x) fractional part between -0.125 and 0.125 by subtracting (double) floor(8 y * log2(x) + 1)/8
259 fsub d17, d17, d0
260 // center y*n fractional part between -0.125 and 0.125 by subtracting (double) floor(8 y * n + 1)/8
261 fsub d18, d18, d3
262
263 // Add fractional parts of yn and y log2(x) together
264 fadd d16, d17, d18
265
266 // Result = 1.0 (offset for exp(s) series)
267 fmov d0, #1.0
268
269 // multiply fractional part of y * log2(x) by ln(2)
270 fmul d16, d5, d16
271
272//@@@ 10th order polynomial series for Exp(s) @@@@
273
274 // ss2 = (ss)^2
275 fmul d17, d16, d16
276
277 // twoto1o2mask = twoto1o2mask & twoto1o4
278 and v1.8B, v1.8B, v6.8B
279 // twoto1o2mask = twoto1o2mask & twoto1o4
280 and v4.8B, v4.8B, v6.8B
281
282 // Result = 1.0 + ss
283 fadd d0, d0, d16
284
285 // k7 = ss k8 + k7
286#ifdef NO_FUSED_MULTIPLY
287 fmul d31, d16, v20.2D[0]
288 fadd d21, d21, d31
289#else
290 fmla d21, d16, v20.2D[0]
291#endif
292 // ss4 = (ss*ss) * (ss*ss)
293 fmul d18, d17, d17
294
295 // twoto1o2mask = twoto1o2mask | (double) 1.0 - results in either 1.0 or 2^(1/4) in twoto1o2mask
296 orr v1.8B, v1.8B, v28.8B
297 // twoto1o2mask = twoto1o4mask | (double) 1.0 - results in either 1.0 or 2^(1/4) in twoto1o4mask
298 orr v4.8B, v4.8B, v28.8B
299
300 // sign could be set up here, but for now expadjustment = 1.0
301 fmov d7, #1.0
302
303 // ss3 = (ss*ss) * ss
304 fmul d19, d17, d16
305
306 // k0 = 1/2 (first non-unity coefficient)
307 fmov d28, #0.5
308
309 // Mask out non-exponent bits to make sure we have just 2^int_n
310 and v2.8B, v2.8B, v30.8B
311
312 // square twoto1o2mask to get 1.0 or 2^(1/2)
313 fmul d1, d1, d1
314
315 // multiply twoto2o4mask into the exponent output adjustment value
316 fmul d7, d7, d4
317
318#ifdef NO_FUSED_MULTIPLY
319 // k5 = ss k6 + k5
320 fmul d31, d16, v22.2D[0]
321 fadd d23, d23, d31
322
323 // k3 = ss k4 + k3
324 fmul d31, d16, v24.2D[0]
325 fadd d25, d25, d31
326
327 // k1 = ss k2 + k1
328 fmul d31, d16, v26.2D[0]
329 fadd d27, d27, d31
330#else
331 // k5 = ss k6 + k5
332 fmla d23, d16, v22.2D[0]
333
334 // k3 = ss k4 + k3
335 fmla d25, d16, v24.2D[0]
336
337 // k1 = ss k2 + k1
338 fmla d27, d16, v26.2D[0]
339#endif
340 // multiply twoto1o2mask into exponent output adjustment value
341 fmul d7, d7, d1
342#ifdef NO_FUSED_MULTIPLY
343 // k5 = ss^2 ( ss k8 + k7 ) + ss k6 + k5
344 fmul d31, d17, v21.2D[0]
345 fadd d23, d23, d31
346
347 // k1 = ss^2 ( ss k4 + k3 ) + ss k2 + k1
348 fmul d31, d17, v25.2D[0]
349 fadd d27, d27, d31
350
351 // Result = 1.0 + ss + 1/2 ss^2
352 fmul d31, d17, v28.2D[0]
353 fadd d0, d0, d31
354#else
355 // k5 = ss^2 ( ss k8 + k7 ) + ss k6 + k5
356 fmla d23, d17, v21.2D[0]
357
358 // k1 = ss^2 ( ss k4 + k3 ) + ss k2 + k1
359 fmla d27, d17, v25.2D[0]
360
361 // Result = 1.0 + ss + 1/2 ss^2
362 fmla d0, d17, v28.2D[0]
363#endif
364 // Adjust int_n so that it's a double precision value that can be multiplied by Result
365 add d7, d2, d7
366#ifdef NO_FUSED_MULTIPLY
367 // k1 = ss^4 ( ss^2 ( ss k8 + k7 ) + ss k6 + k5 ) + ss^2 ( ss k4 + k3 ) + ss k2 + k1
368 fmul d31, d18, v23.2D[0]
369 fadd d27, d27, d31
370
371 // Result = 1.0 + ss + 1/2 ss^2 + ss^3 ( ss^4 ( ss^2 ( ss k8 + k7 ) + ss k6 + k5 ) + ss^2 ( ss k4 + k3 ) + ss k2 + k1 )
372 fmul d31, d19, v27.2D[0]
373 fadd d0, d0, d31
374#else
375 // k1 = ss^4 ( ss^2 ( ss k8 + k7 ) + ss k6 + k5 ) + ss^2 ( ss k4 + k3 ) + ss k2 + k1
376 fmla d27, d18, v23.2D[0]
377
378 // Result = 1.0 + ss + 1/2 ss^2 + ss^3 ( ss^4 ( ss^2 ( ss k8 + k7 ) + ss k6 + k5 ) + ss^2 ( ss k4 + k3 ) + ss k2 + k1 )
379 fmla d0, d19, v27.2D[0]
380#endif
381 // multiply by adjustment (sign*(rounding ? sqrt(2) : 1) * 2^int_n)
382 fmul d0, d7, d0
383
384.LleavePow:
385#if defined(KRAIT_NO_AAPCS_VFP_MODE)
386 // return Result (FP)
387 // fmov x0, d0
388#endif
389.LleavePowDirect:
390 // leave directly returning whatever is in d0
391 ret
392.LuseFullImpl:
393 fmov d0, x0
394 fmov d1, x1
395 b __full_ieee754_pow
396
397.align 6
398.LliteralTable:
399// Least-sqares tuned constants for 11th order (log2((1+s)/(1-s)):
400.LL4: // ~3/11
401 .long 0x53a79915, 0x3fd1b108
402.LL3: // ~1/3
403 .long 0x9ca0567a, 0x3fd554fa
404.LL2: // ~3/7
405 .long 0x1408e660, 0x3fdb6db7
406.LL1: // ~3/5
407 .long 0x332D4313, 0x3fe33333
408
409// Least-squares tuned constants for 10th order exp(s):
410.LE10: // ~1/3628800
411 .long 0x25c7ba0a, 0x3e92819b
412.LE9: // ~1/362880
413 .long 0x9499b49c, 0x3ec72294
414.LE8: // ~1/40320
415 .long 0xabb79d95, 0x3efa019f
416.LE7: // ~1/5040
417 .long 0x8723aeaa, 0x3f2a019f
418.LE6: // ~1/720
419 .long 0x16c76a94, 0x3f56c16c
420.LE5: // ~1/120
421 .long 0x11185da8, 0x3f811111
422.LE4: // ~1/24
423 .long 0x5555551c, 0x3fa55555
424.LE3: // ~1/6
425 .long 0x555554db, 0x3fc55555
426
427.LbpA: // (2^(2/5) - 1)
428 .long 0x4ee54db1, 0x3fd472d1
429
430.LbpB: // (2^(4/5) - 2^(2/5))
431 .long 0x1c8a36cf, 0x3fdafb62
432
433.Ltwofifths: // 2/5
434 .long 0x9999999a, 0x3fd99999
435
436.Ltwooverthreeln2:
437 .long 0xDC3A03FD, 0x3FEEC709
438
439.Ltwoto1o5: // 2^(1/5) exponent 3ff stripped for non-normalized compares
440 .long 0x86BAE675, 0x00026111
441
442.Ltwoto3o5: // 2^(3/5) exponent 3ff stripped for non-normalized compares
443 .long 0x03B2AE5C, 0x00084060
444
445.Lln2: // ln(2)
446 .long 0xFEFA39EF, 0x3FE62E42
447
448.Ltwoto1o4: // 2^1/4
449 .long 0x0a31b715, 0x3ff306fe
450END(pow)