Brent DeGraaf | 9aa974c | 2014-09-19 09:27:24 -0400 | [diff] [blame^] | 1 | /* Copyright (c) 2009-2014 The Linux Foundation. All rights reserved. |
| 2 | * |
| 3 | * Redistribution and use in source and binary forms, with or without |
| 4 | * modification, are permitted provided that the following conditions are met: |
| 5 | * * Redistributions of source code must retain the above copyright |
| 6 | * notice, this list of conditions and the following disclaimer. |
| 7 | * * Redistributions in binary form must reproduce the above copyright |
| 8 | * notice, this list of conditions and the following disclaimer in the |
| 9 | * documentation and/or other materials provided with the distribution. |
| 10 | * * Neither the name of The Linux Foundation nor the names of its contributors may |
| 11 | * be used to endorse or promote products derived from this software |
| 12 | * without specific prior written permission. |
| 13 | * |
| 14 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| 15 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 16 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 17 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE |
| 18 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| 19 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| 20 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
| 21 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
| 22 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
| 23 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
| 24 | * POSSIBILITY OF SUCH DAMAGE. |
| 25 | */ |
| 26 | |
| 27 | #include <private/bionic_asm.h> |
| 28 | //#define NO_FUSED_MULTIPLY |
| 29 | |
| 30 | #define KRAIT_NO_AAPCS_VFP_MODE |
| 31 | |
| 32 | ENTRY(pow) |
| 33 | #if defined(KRAIT_NO_AAPCS_VFP_MODE) |
| 34 | // ARM ABI has inputs coming in via d registers, lets copy to x registers |
| 35 | fmov x0, d0 |
| 36 | fmov x1, d1 |
| 37 | #endif |
| 38 | mov w12, #0x40100000 // high word of 64-bit 4.0 |
| 39 | |
| 40 | // pre-staged bp values |
| 41 | ldr d5, .LbpA |
| 42 | ldr d3, .LbpB |
| 43 | |
| 44 | // load two fifths into constant term in case we need it due to offsets |
| 45 | ldr d6, .Ltwofifths |
| 46 | |
| 47 | // bp is initially 1.0, may adjust later based on x value |
| 48 | fmov d4, #1.0 |
| 49 | |
| 50 | // twoto1o5 = 2^(1/5) (input bracketing) |
| 51 | ldr x4, .Ltwoto1o5 |
| 52 | |
| 53 | // twoto3o5 = 2^(3/5) (input bracketing) |
| 54 | ldr x5, .Ltwoto3o5 |
| 55 | |
| 56 | // extract xmantissa |
| 57 | bic x6, x0, #0xFFF0000000000000 |
| 58 | |
| 59 | // begin preparing a mask for normalization (high 32-bit mask) |
| 60 | movi d31, #0xFFFFFFFF00000000 |
| 61 | |
| 62 | // double_1 = (double) 1.0 |
| 63 | fmov d28, #1.0 |
| 64 | |
| 65 | cmp x6, x4 |
| 66 | |
| 67 | shl d30, d31, #20 // d30 can mask just sign/exp bits |
| 68 | ushr d29, d31, #63 // mask has only bit 1 set |
| 69 | |
| 70 | adr x10, .LliteralTable // x10->k4 in literal table (below) |
| 71 | |
| 72 | bhi .Lxgt2to1over5 |
| 73 | // zero out lg2 constant term if don't offset our input |
| 74 | fsub d6, d6, d6 |
| 75 | b .Lxle2to1over5 |
| 76 | |
| 77 | .Lxgt2to1over5: |
| 78 | // if normalized x > 2^(1/5), bp = 1 + (2^(2/5)-1) = 2^(2/5) |
| 79 | fadd d4, d4, d5 |
| 80 | |
| 81 | .Lxle2to1over5: |
| 82 | ldr d5, .Lln2 // d5 = ln2 = 0.69314718056 |
| 83 | |
| 84 | cmp x6, x5 // non-normalized compare |
| 85 | |
| 86 | //@@@ X Value Normalization @@@@ |
| 87 | |
| 88 | // ss = abs(x) 2^(-1024) |
| 89 | bic v16.8B, v0.8B, v30.8B // mantissa of x into v16.8B (aka d16) |
| 90 | |
| 91 | // N = (floor(log2(x)) + 0x3ff) * 2^52 |
| 92 | and v2.8B, v0.8B, v30.8B // exponent of x (d0) into v2.8B aka d2 |
| 93 | |
| 94 | bls .Lxle2to3over5 // branch not taken if (x6 > x5) |
| 95 | // if normalized x > 2^(3/5), bp = 2^(2/5) + (2^(4/5) - 2^(2/5)) = 2^(4/5) |
| 96 | fadd d4, d4, d3 // d4 = 2^(2/5) + (2^(4/5) - 2^(2/5)) = 2^(4/5) |
| 97 | fadd d6, d6, d6 // d6 = 2/5 + 2/5 = 4/5 or 0 (see logic above) |
| 98 | |
| 99 | .Lxle2to3over5: |
| 100 | |
| 101 | lsr x2, x0, #32 // Need just high word of x...x2 can take it |
| 102 | cmp w2, w12 // Compare x to 4.0 (high word only) |
| 103 | lsr x3, x1, #32 // Need just high word of y...x3 can take it. |
| 104 | ccmp w3, w12, #2, ls // If x < 4.0, compare y to 4.0 (high word) |
| 105 | bic w12, w12, #0xFFFF0000 // Change w12 for compare to 0.0000325 |
| 106 | orr w12, w12, #0x3fe00000 |
| 107 | ccmp w12, w2, #2, ls // If y < 4, compare 0.5 to x |
| 108 | |
| 109 | // load log2 polynomial series constants |
| 110 | ldp d24, d25, [x10, #0] |
| 111 | ldp d26, d27, [x10, #16] |
| 112 | |
| 113 | // s = abs(x) 2^(-floor(log2(x))) (normalize abs(x) to around 1) |
| 114 | orr v16.8B, v16.8B, v28.8B |
| 115 | |
| 116 | //@@@ 3/2 (Log(bp(1+s)/(1-s))) input computation (s = (x-bp)/(x+bp)) @@@@ |
| 117 | |
| 118 | fsub d19, d16, d4 // take normalized x and subtract 2^(4/5) from it |
| 119 | fadd d20, d16, d4 |
| 120 | bhi .LuseFullImpl // |x| < 0.5 or x > 4 or y > 4 |
| 121 | |
| 122 | // s = (x-1)/(x+1) |
| 123 | fdiv d16, d19, d20 |
| 124 | |
| 125 | // load 2/(3log2) into lg2coeff |
| 126 | ldr d21, .Ltwooverthreeln2 |
| 127 | |
| 128 | // N = floor(log2(x)) * 2^52 |
| 129 | sub d2, d2, d28 |
| 130 | |
| 131 | //@@@ 3/2 (Log(bp(1+s)/(1-s))) polynomial series @@@@ |
| 132 | |
| 133 | // ss2 = ((x-bp)/(x+bp))^2 |
| 134 | fmul d17, d16, d16 |
| 135 | |
| 136 | // ylg2x = 3.0 |
| 137 | fmov d0, #3.0 |
| 138 | fmul d18, d17, d17 |
| 139 | |
| 140 | // todo: useful later for two-way clamp |
| 141 | fmul d21, d21, d1 |
| 142 | |
| 143 | // N = floor(log2(x)) |
| 144 | sshr d2, d2, #52 |
| 145 | // k3 = ss^2 * L4 + L3 |
| 146 | #ifdef NO_FUSED_MULTIPLY |
| 147 | fmul d3, d17, v24.2D[0] |
| 148 | fadd d25, d25, d3 |
| 149 | |
| 150 | // k1 = ss^2 * L2 + L1 |
| 151 | fmul d3, d17, v26.2D[0] |
| 152 | fadd d27, d27, d3 |
| 153 | #else |
| 154 | fmla d25, d17, v24.2D[0] |
| 155 | |
| 156 | // k1 = ss^2 * L2 + L1 |
| 157 | fmla d27, d17, v26.2D[0] |
| 158 | #endif |
| 159 | |
| 160 | // scale ss by 2/(3 ln 2) |
| 161 | fmul d21, d16, d21 |
| 162 | |
| 163 | // ylg2x = 3.0 + s^2 |
| 164 | fadd d0, d0, d17 |
| 165 | |
| 166 | fmov x2, d2 |
| 167 | scvtf d3, w2 // Low-order 32-bit integer half of d2 to fp64 |
| 168 | |
| 169 | // k1 = s^4 (s^2 L4 + L3) + s^2 L2 + L1 |
| 170 | #ifdef NO_FUSED_MULTIPLY |
| 171 | fmul d31, d18, v25.2D[0] |
| 172 | fadd d27, d27, d31 |
| 173 | #else |
| 174 | fmla d27, d18, v25.2D[0] |
| 175 | #endif |
| 176 | // add in constant term |
| 177 | fadd d3, d3, d6 |
| 178 | |
| 179 | // ylg2x = 3.0 + s^2 + s^4 (s^4 (s^2 L4 + L3) + s^2 L2 + L1) |
| 180 | #ifdef NO_FUSED_MULTIPLY |
| 181 | fmul d31, d18, v27.2D[0] |
| 182 | fadd d0, d0, d31 |
| 183 | #else |
| 184 | fmla d0, d18, v27.2D[0] |
| 185 | #endif |
| 186 | // ylg2x = y 2 s / (3 ln(2)) (3.0 + s^2 + s^4 (s^4(s^2 L4 + L3) + s^2 L2 + L1) |
| 187 | fmul d0, d21, d0 |
| 188 | |
| 189 | //@@@ Compute input to Exp(s) (s = y(n + log2(x)) - (floor(8 yn + 1)/8 + floor(8 ylog2(x) + 1)/8) @@@@@ |
| 190 | |
| 191 | // mask to extract bit 1 (2^-2 from our fixed-point representation) |
| 192 | shl d4, d29, #1 |
| 193 | |
| 194 | // double_n = y * n |
| 195 | fmul d3, d3, d1 |
| 196 | |
| 197 | // Load 2^(1/4) for later computations |
| 198 | ldr d6, .Ltwoto1o4 |
| 199 | |
| 200 | // either add or subtract one based on the sign of double_n and ylg2x |
| 201 | sshr d16, d0, #62 |
| 202 | sshr d19, d3, #62 |
| 203 | |
| 204 | // move unmodified y*lg2x into temp space |
| 205 | fmov d17, d0 |
| 206 | |
| 207 | // compute floor(8 y * n + 1)/8 |
| 208 | // and floor(8 y (log2(x)) + 1)/8 |
| 209 | fcvtzs w2, d0, #3 // no instruction exists to use s0 as a direct target |
| 210 | fmov s0, w2 // run our conversion into w2, then mov it to compensate |
| 211 | // move unmodified y*n into temp space |
| 212 | fmov d18, d3 |
| 213 | fcvtzs w2, d3, #3 |
| 214 | fmov s3, w2 |
| 215 | |
| 216 | // load exp polynomial series constants |
| 217 | ldp d20, d21, [x10, #32] |
| 218 | ldp d22, d23, [x10, #48] |
| 219 | ldp d24, d25, [x10, #64] |
| 220 | ldp d26, d27, [x10, #80] |
| 221 | |
| 222 | // mask to extract bit 2 (2^-1 from our fixed-point representation) |
| 223 | shl d1, d29, #2 |
| 224 | |
| 225 | // make rounding offsets either 1 or -1 instead of 0 or -2 |
| 226 | orr v16.8B, v16.8B, v29.8B |
| 227 | orr v19.8B, v19.8B, v29.8B |
| 228 | |
| 229 | // round up to the nearest 1/8th |
| 230 | add d0, d0, d16 |
| 231 | add d3, d3, d19 |
| 232 | |
| 233 | // clear out round-up bit for y log2(x) |
| 234 | bic v0.8B, v0.8B, v29.8B |
| 235 | // clear out round-up bit for yn |
| 236 | bic v3.8B, v3.8B, v29.8B |
| 237 | // add together the (fixed precision) rounded parts |
| 238 | add d31, d3, d0 |
| 239 | // turn int_n into a double with value 2^int_n |
| 240 | shl d2, d31, #49 |
| 241 | // compute masks for 2^(1/4) and 2^(1/2) fixups for fractional part of fixed-precision rounded values: |
| 242 | and v4.8B, v4.8B, v31.8B |
| 243 | and v1.8B, v1.8B, v31.8B |
| 244 | |
| 245 | // convert back into floating point, d3 now holds (double) floor(8 y * n + 1)/8 |
| 246 | // d0 now holds (double) floor(8 y * log2(x) + 1)/8 |
| 247 | fmov w2, s0 |
| 248 | scvtf d0, w2, #3 |
| 249 | fmov w2, s3 |
| 250 | scvtf d3, w2, #3 |
| 251 | |
| 252 | // put the 2 bit (0.5) through the roof of twoto1o2mask (make it 0x0 or 0xffffffffffffffff) |
| 253 | uqshl d1, d1, #62 |
| 254 | |
| 255 | // put the 1 bit (0.25) through the roof of twoto1o4mask (make it 0x0 or 0xffffffffffffffff) |
| 256 | uqshl d4, d4, #63 |
| 257 | |
| 258 | // center y*log2(x) fractional part between -0.125 and 0.125 by subtracting (double) floor(8 y * log2(x) + 1)/8 |
| 259 | fsub d17, d17, d0 |
| 260 | // center y*n fractional part between -0.125 and 0.125 by subtracting (double) floor(8 y * n + 1)/8 |
| 261 | fsub d18, d18, d3 |
| 262 | |
| 263 | // Add fractional parts of yn and y log2(x) together |
| 264 | fadd d16, d17, d18 |
| 265 | |
| 266 | // Result = 1.0 (offset for exp(s) series) |
| 267 | fmov d0, #1.0 |
| 268 | |
| 269 | // multiply fractional part of y * log2(x) by ln(2) |
| 270 | fmul d16, d5, d16 |
| 271 | |
| 272 | //@@@ 10th order polynomial series for Exp(s) @@@@ |
| 273 | |
| 274 | // ss2 = (ss)^2 |
| 275 | fmul d17, d16, d16 |
| 276 | |
| 277 | // twoto1o2mask = twoto1o2mask & twoto1o4 |
| 278 | and v1.8B, v1.8B, v6.8B |
| 279 | // twoto1o2mask = twoto1o2mask & twoto1o4 |
| 280 | and v4.8B, v4.8B, v6.8B |
| 281 | |
| 282 | // Result = 1.0 + ss |
| 283 | fadd d0, d0, d16 |
| 284 | |
| 285 | // k7 = ss k8 + k7 |
| 286 | #ifdef NO_FUSED_MULTIPLY |
| 287 | fmul d31, d16, v20.2D[0] |
| 288 | fadd d21, d21, d31 |
| 289 | #else |
| 290 | fmla d21, d16, v20.2D[0] |
| 291 | #endif |
| 292 | // ss4 = (ss*ss) * (ss*ss) |
| 293 | fmul d18, d17, d17 |
| 294 | |
| 295 | // twoto1o2mask = twoto1o2mask | (double) 1.0 - results in either 1.0 or 2^(1/4) in twoto1o2mask |
| 296 | orr v1.8B, v1.8B, v28.8B |
| 297 | // twoto1o2mask = twoto1o4mask | (double) 1.0 - results in either 1.0 or 2^(1/4) in twoto1o4mask |
| 298 | orr v4.8B, v4.8B, v28.8B |
| 299 | |
| 300 | // sign could be set up here, but for now expadjustment = 1.0 |
| 301 | fmov d7, #1.0 |
| 302 | |
| 303 | // ss3 = (ss*ss) * ss |
| 304 | fmul d19, d17, d16 |
| 305 | |
| 306 | // k0 = 1/2 (first non-unity coefficient) |
| 307 | fmov d28, #0.5 |
| 308 | |
| 309 | // Mask out non-exponent bits to make sure we have just 2^int_n |
| 310 | and v2.8B, v2.8B, v30.8B |
| 311 | |
| 312 | // square twoto1o2mask to get 1.0 or 2^(1/2) |
| 313 | fmul d1, d1, d1 |
| 314 | |
| 315 | // multiply twoto2o4mask into the exponent output adjustment value |
| 316 | fmul d7, d7, d4 |
| 317 | |
| 318 | #ifdef NO_FUSED_MULTIPLY |
| 319 | // k5 = ss k6 + k5 |
| 320 | fmul d31, d16, v22.2D[0] |
| 321 | fadd d23, d23, d31 |
| 322 | |
| 323 | // k3 = ss k4 + k3 |
| 324 | fmul d31, d16, v24.2D[0] |
| 325 | fadd d25, d25, d31 |
| 326 | |
| 327 | // k1 = ss k2 + k1 |
| 328 | fmul d31, d16, v26.2D[0] |
| 329 | fadd d27, d27, d31 |
| 330 | #else |
| 331 | // k5 = ss k6 + k5 |
| 332 | fmla d23, d16, v22.2D[0] |
| 333 | |
| 334 | // k3 = ss k4 + k3 |
| 335 | fmla d25, d16, v24.2D[0] |
| 336 | |
| 337 | // k1 = ss k2 + k1 |
| 338 | fmla d27, d16, v26.2D[0] |
| 339 | #endif |
| 340 | // multiply twoto1o2mask into exponent output adjustment value |
| 341 | fmul d7, d7, d1 |
| 342 | #ifdef NO_FUSED_MULTIPLY |
| 343 | // k5 = ss^2 ( ss k8 + k7 ) + ss k6 + k5 |
| 344 | fmul d31, d17, v21.2D[0] |
| 345 | fadd d23, d23, d31 |
| 346 | |
| 347 | // k1 = ss^2 ( ss k4 + k3 ) + ss k2 + k1 |
| 348 | fmul d31, d17, v25.2D[0] |
| 349 | fadd d27, d27, d31 |
| 350 | |
| 351 | // Result = 1.0 + ss + 1/2 ss^2 |
| 352 | fmul d31, d17, v28.2D[0] |
| 353 | fadd d0, d0, d31 |
| 354 | #else |
| 355 | // k5 = ss^2 ( ss k8 + k7 ) + ss k6 + k5 |
| 356 | fmla d23, d17, v21.2D[0] |
| 357 | |
| 358 | // k1 = ss^2 ( ss k4 + k3 ) + ss k2 + k1 |
| 359 | fmla d27, d17, v25.2D[0] |
| 360 | |
| 361 | // Result = 1.0 + ss + 1/2 ss^2 |
| 362 | fmla d0, d17, v28.2D[0] |
| 363 | #endif |
| 364 | // Adjust int_n so that it's a double precision value that can be multiplied by Result |
| 365 | add d7, d2, d7 |
| 366 | #ifdef NO_FUSED_MULTIPLY |
| 367 | // k1 = ss^4 ( ss^2 ( ss k8 + k7 ) + ss k6 + k5 ) + ss^2 ( ss k4 + k3 ) + ss k2 + k1 |
| 368 | fmul d31, d18, v23.2D[0] |
| 369 | fadd d27, d27, d31 |
| 370 | |
| 371 | // Result = 1.0 + ss + 1/2 ss^2 + ss^3 ( ss^4 ( ss^2 ( ss k8 + k7 ) + ss k6 + k5 ) + ss^2 ( ss k4 + k3 ) + ss k2 + k1 ) |
| 372 | fmul d31, d19, v27.2D[0] |
| 373 | fadd d0, d0, d31 |
| 374 | #else |
| 375 | // k1 = ss^4 ( ss^2 ( ss k8 + k7 ) + ss k6 + k5 ) + ss^2 ( ss k4 + k3 ) + ss k2 + k1 |
| 376 | fmla d27, d18, v23.2D[0] |
| 377 | |
| 378 | // Result = 1.0 + ss + 1/2 ss^2 + ss^3 ( ss^4 ( ss^2 ( ss k8 + k7 ) + ss k6 + k5 ) + ss^2 ( ss k4 + k3 ) + ss k2 + k1 ) |
| 379 | fmla d0, d19, v27.2D[0] |
| 380 | #endif |
| 381 | // multiply by adjustment (sign*(rounding ? sqrt(2) : 1) * 2^int_n) |
| 382 | fmul d0, d7, d0 |
| 383 | |
| 384 | .LleavePow: |
| 385 | #if defined(KRAIT_NO_AAPCS_VFP_MODE) |
| 386 | // return Result (FP) |
| 387 | // fmov x0, d0 |
| 388 | #endif |
| 389 | .LleavePowDirect: |
| 390 | // leave directly returning whatever is in d0 |
| 391 | ret |
| 392 | .LuseFullImpl: |
| 393 | fmov d0, x0 |
| 394 | fmov d1, x1 |
| 395 | b __full_ieee754_pow |
| 396 | |
| 397 | .align 6 |
| 398 | .LliteralTable: |
| 399 | // Least-sqares tuned constants for 11th order (log2((1+s)/(1-s)): |
| 400 | .LL4: // ~3/11 |
| 401 | .long 0x53a79915, 0x3fd1b108 |
| 402 | .LL3: // ~1/3 |
| 403 | .long 0x9ca0567a, 0x3fd554fa |
| 404 | .LL2: // ~3/7 |
| 405 | .long 0x1408e660, 0x3fdb6db7 |
| 406 | .LL1: // ~3/5 |
| 407 | .long 0x332D4313, 0x3fe33333 |
| 408 | |
| 409 | // Least-squares tuned constants for 10th order exp(s): |
| 410 | .LE10: // ~1/3628800 |
| 411 | .long 0x25c7ba0a, 0x3e92819b |
| 412 | .LE9: // ~1/362880 |
| 413 | .long 0x9499b49c, 0x3ec72294 |
| 414 | .LE8: // ~1/40320 |
| 415 | .long 0xabb79d95, 0x3efa019f |
| 416 | .LE7: // ~1/5040 |
| 417 | .long 0x8723aeaa, 0x3f2a019f |
| 418 | .LE6: // ~1/720 |
| 419 | .long 0x16c76a94, 0x3f56c16c |
| 420 | .LE5: // ~1/120 |
| 421 | .long 0x11185da8, 0x3f811111 |
| 422 | .LE4: // ~1/24 |
| 423 | .long 0x5555551c, 0x3fa55555 |
| 424 | .LE3: // ~1/6 |
| 425 | .long 0x555554db, 0x3fc55555 |
| 426 | |
| 427 | .LbpA: // (2^(2/5) - 1) |
| 428 | .long 0x4ee54db1, 0x3fd472d1 |
| 429 | |
| 430 | .LbpB: // (2^(4/5) - 2^(2/5)) |
| 431 | .long 0x1c8a36cf, 0x3fdafb62 |
| 432 | |
| 433 | .Ltwofifths: // 2/5 |
| 434 | .long 0x9999999a, 0x3fd99999 |
| 435 | |
| 436 | .Ltwooverthreeln2: |
| 437 | .long 0xDC3A03FD, 0x3FEEC709 |
| 438 | |
| 439 | .Ltwoto1o5: // 2^(1/5) exponent 3ff stripped for non-normalized compares |
| 440 | .long 0x86BAE675, 0x00026111 |
| 441 | |
| 442 | .Ltwoto3o5: // 2^(3/5) exponent 3ff stripped for non-normalized compares |
| 443 | .long 0x03B2AE5C, 0x00084060 |
| 444 | |
| 445 | .Lln2: // ln(2) |
| 446 | .long 0xFEFA39EF, 0x3FE62E42 |
| 447 | |
| 448 | .Ltwoto1o4: // 2^1/4 |
| 449 | .long 0x0a31b715, 0x3ff306fe |
| 450 | END(pow) |