blob: 1ebed3523abfd15e9b83d7d711c225608b1d236d [file] [log] [blame]
Shravya Rukmannagariad79a5a2016-04-06 10:29:26 -07001/*
2* Copyright (c) 2016, Intel Corporation.
3* Intel Math Library (LIBM) Source Code
4*
5* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6*
7* This code is free software; you can redistribute it and/or modify it
8* under the terms of the GNU General Public License version 2 only, as
9* published by the Free Software Foundation.
10*
11* This code is distributed in the hope that it will be useful, but WITHOUT
12* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14* version 2 for more details (a copy is included in the LICENSE file that
15* accompanied this code).
16*
17* You should have received a copy of the GNU General Public License version
18* 2 along with this work; if not, write to the Free Software Foundation,
19* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20*
21* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22* or visit www.oracle.com if you need additional information or have any
23* questions.
24*
25*/
26
27#include "precompiled.hpp"
28#include "asm/assembler.hpp"
29#include "asm/assembler.inline.hpp"
30#include "runtime/stubRoutines.hpp"
31#include "macroAssembler_x86.hpp"
32
33#ifdef _MSC_VER
34#define ALIGNED_(x) __declspec(align(x))
35#else
36#define ALIGNED_(x) __attribute__ ((aligned(x)))
37#endif
38
39/******************************************************************************/
40// ALGORITHM DESCRIPTION - COS()
41// ---------------------
42//
43// 1. RANGE REDUCTION
44//
45// We perform an initial range reduction from X to r with
46//
47// X =~= N * pi/32 + r
48//
49// so that |r| <= pi/64 + epsilon. We restrict inputs to those
50// where |N| <= 932560. Beyond this, the range reduction is
51// insufficiently accurate. For extremely small inputs,
52// denormalization can occur internally, impacting performance.
53// This means that the main path is actually only taken for
54// 2^-252 <= |X| < 90112.
55//
56// To avoid branches, we perform the range reduction to full
57// accuracy each time.
58//
59// X - N * (P_1 + P_2 + P_3)
60//
61// where P_1 and P_2 are 32-bit numbers (so multiplication by N
62// is exact) and P_3 is a 53-bit number. Together, these
63// approximate pi well enough for all cases in the restricted
64// range.
65//
66// The main reduction sequence is:
67//
68// y = 32/pi * x
69// N = integer(y)
70// (computed by adding and subtracting off SHIFTER)
71//
72// m_1 = N * P_1
73// m_2 = N * P_2
74// r_1 = x - m_1
75// r = r_1 - m_2
76// (this r can be used for most of the calculation)
77//
78// c_1 = r_1 - r
79// m_3 = N * P_3
80// c_2 = c_1 - m_2
81// c = c_2 - m_3
82//
83// 2. MAIN ALGORITHM
84//
85// The algorithm uses a table lookup based on B = M * pi / 32
86// where M = N mod 64. The stored values are:
87// sigma       = closest power of 2 to cos(B)
88// C_hl        = 53-bit value of cos(B) - sigma
89// S_hi + S_lo = sin(B), held as two 53-bit parts (high and low)
90//
91// The computation is organized as follows:
92//
93// sin(B + r + c) = [sin(B) + sigma * r] +
94// r * (cos(B) - sigma) +
95// sin(B) * [cos(r + c) - 1] +
96// cos(B) * [sin(r + c) - r]
97//
98// which is approximately:
99//
100// [S_hi + sigma * r] +
101// C_hl * r +
102// S_lo + S_hi * [(cos(r) - 1) - r * c] +
103// (C_hl + sigma) * [(sin(r) - r) + c]
104//
105// and this is what is actually computed. We separate this sum
106// into four parts:
107//
108// hi + med + pols + corr
109//
110// where
111//
112// hi = S_hi + sigma r
113// med = C_hl * r
114// pols = S_hi * (cos(r) - 1) + (C_hl + sigma) * (sin(r) - r)
115// corr = S_lo + c * ((C_hl + sigma) - S_hi * r)
116//
117// 3. POLYNOMIAL
118//
119// The polynomial S_hi * (cos(r) - 1) + (C_hl + sigma) *
120// (sin(r) - r) can be rearranged freely, since it is quite
121// small, so we exploit parallelism to the fullest.
122//
123// psc4 = SC_4 * r_1
124// msc4 = psc4 * r
125// r2 = r * r
126// msc2 = SC_2 * r2
127// r4 = r2 * r2
128// psc3 = SC_3 + msc4
129// psc1 = SC_1 + msc2
130// msc3 = r4 * psc3
131// sincospols = psc1 + msc3
132// pols = sincospols *
133// <S_hi * r^2 | (C_hl + sigma) * r^3>
134//
135// 4. CORRECTION TERM
136//
137// This is where the "c" component of the range reduction is
138// taken into account; recall that just "r" is used for most of
139// the calculation.
140//
141// -c = m_3 - c_2
142// -d = S_hi * r - (C_hl + sigma)
143// corr = -c * -d + S_lo
144//
145// 5. COMPENSATED SUMMATIONS
146//
147// The two successive compensated summations add up the high
148// and medium parts, leaving just the low parts to add up at
149// the end.
150//
151// rs = sigma * r
152// res_int = S_hi + rs
153// k_0 = S_hi - res_int
154// k_2 = k_0 + rs
155// med = C_hl * r
156// res_hi = res_int + med
157// k_1 = res_int - res_hi
158// k_3 = k_1 + med
159//
160// 6. FINAL SUMMATION
161//
162// We now add up all the small parts:
163//
164// res_lo = pols(hi) + pols(lo) + corr + k_2 + k_3
165//
166// Now the overall result is just:
167//
168// res_hi + res_lo
169//
170// 7. SMALL ARGUMENTS
171//
172// Inputs with |X| < 2^-252 are treated specially as
173// 1 - |x|.
174//
175// Special cases:
176// cos(NaN) = quiet NaN, and raise invalid exception
177// cos(INF) = NaN and raise invalid exception
178// cos(0) = 1
179//
180/******************************************************************************/
181
182#ifdef _LP64
183// The 64 bit code is at most SSE2 compliant
// Bit pattern of the IEEE-754 double 1.0, stored as {low 32 bits, high 32 bits}.
// The 64-bit fast_cos loads it on the tiny-argument path to form 1 - |x|.
184ALIGNED_(8) juint _ONE[] =
185{
186 0x00000000UL, 0x3ff00000UL
187};
// Emits the 64-bit (SSE2) instruction sequence that computes cos(x).
//
// Input : the argument is taken from xmm0 (it is spilled to the stack on entry).
// Output: every path leaves the result in xmm0 before reaching B1_4.
//
// Parameters:
//   xmm0..xmm7          - XMM registers used for the argument, result and temporaries.
//   eax, ecx, edx       - general purpose scratch registers.
//   r8, r9, r10, r11    - additional scratch registers, used only on the
//                         large-argument reduction path.
//
// NOTE(review): the body also names rax/rcx/rdx/rbx/rsi/rdi/rsp directly and mixes
// them with the eax/ecx/edx parameters (e.g. cvttsd2sil(edx, ...) followed by
// addq(rdx, ...)), so it presumably requires eax == rax, ecx == rcx and edx == rdx —
// confirm against the caller.
// rbx is saved and restored here; rsi and rdi are overwritten on the large-argument
// path without being saved in this function, so a caller that needs them preserved
// has to save them itself.
// No return instruction is emitted; the sequence simply falls out after B1_4.
188void MacroAssembler::fast_cos(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register eax, Register ecx, Register edx, Register r8, Register r9, Register r10, Register r11) {
189
  // B1_3 and B1_5 are declared but never bound or referenced in this function;
  // start, B1_2 and L_2TAG_PACKET_13_0_1 are bound but never used as jump targets.
190 Label L_2TAG_PACKET_0_0_1, L_2TAG_PACKET_1_0_1, L_2TAG_PACKET_2_0_1, L_2TAG_PACKET_3_0_1;
191 Label L_2TAG_PACKET_4_0_1, L_2TAG_PACKET_5_0_1, L_2TAG_PACKET_6_0_1, L_2TAG_PACKET_7_0_1;
192 Label L_2TAG_PACKET_8_0_1, L_2TAG_PACKET_9_0_1, L_2TAG_PACKET_10_0_1, L_2TAG_PACKET_11_0_1;
193 Label L_2TAG_PACKET_12_0_1, L_2TAG_PACKET_13_0_1, B1_2, B1_3, B1_4, B1_5, start;
194
195 assert_different_registers(r8, r9, r10, r11, eax, ecx, edx);
196
  // Addresses of the constant tables. All but ONE come from StubRoutines::x86;
  // ONE is the file-local _ONE array.
197 address ONEHALF = StubRoutines::x86::_ONEHALF_addr();
198 address P_2 = StubRoutines::x86::_P_2_addr();
199 address SC_4 = StubRoutines::x86::_SC_4_addr();
200 address Ctable = StubRoutines::x86::_Ctable_addr();
201 address SC_2 = StubRoutines::x86::_SC_2_addr();
202 address SC_3 = StubRoutines::x86::_SC_3_addr();
203 address SC_1 = StubRoutines::x86::_SC_1_addr();
204 address PI_INV_TABLE = StubRoutines::x86::_PI_INV_TABLE_addr();
205 address PI_4 = (address)StubRoutines::x86::_PI_4_addr();
206 address PI32INV = (address)StubRoutines::x86::_PI32INV_addr();
207 address SIGN_MASK = (address)StubRoutines::x86::_SIGN_MASK_addr();
208 address P_1 = (address)StubRoutines::x86::_P_1_addr();
209 address P_3 = (address)StubRoutines::x86::_P_3_addr();
210 address ONE = (address)_ONE;
211 address NEG_ZERO = (address)StubRoutines::x86::_NEG_ZERO_addr();
212
  // Prologue: save rbx, reserve 16 bytes of stack and spill x to [rsp + 8].
213 bind(start);
214 push(rbx);
215 subq(rsp, 16);
216 movsd(Address(rsp, 8), xmm0);
217
  // Range check on the high 32 bits of x ([rsp + 12]): keep the exponent and the top
  // four mantissa bits (mask 0x7FFF0000), rebase by 0x30300000 and do one unsigned
  // compare against 0x10C50000. Anything outside the main range (see section 1 of the
  // algorithm description) leaves via L_2TAG_PACKET_0_0_1 with the flags still set.
218 bind(B1_2);
219 movl(eax, Address(rsp, 12));
220 movq(xmm1, ExternalAddress(PI32INV)); //0x6dc9c883UL, 0x40245f30UL
221 andl(eax, 2147418112);
222 subl(eax, 808452096);
223 cmpl(eax, 281346048);
224 jcc(Assembler::above, L_2TAG_PACKET_0_0_1);
  // Main path. y = PI32INV * x; add 0.5 carrying the sign of x and truncate to get the
  // integer N in edx, then convert N back to double in xmm1.
225 mulsd(xmm1, xmm0);
226 movdqu(xmm5, ExternalAddress(ONEHALF)); //0x00000000UL, 0x3fe00000UL, 0x00000000UL, 0x3fe00000UL
227 movq(xmm4, ExternalAddress(SIGN_MASK)); //0x00000000UL, 0x80000000UL
228 pand(xmm4, xmm0);
229 por(xmm5, xmm4);
230 addpd(xmm1, xmm5);
231 cvttsd2sil(edx, xmm1);
232 cvtsi2sdl(xmm1, edx);
233 movdqu(xmm2, ExternalAddress(P_2)); //0x1a600000UL, 0x3d90b461UL, 0x1a600000UL, 0x3d90b461UL
234 movq(xmm3, ExternalAddress(P_1)); //0x54400000UL, 0x3fb921fbUL
235 mulsd(xmm3, xmm1);
236 unpcklpd(xmm1, xmm1);
  // Table index: (N + 1865232) & 63, scaled by 32 bytes per Ctable entry (shift by 5).
  // 1865232 is 16 modulo 64.
  // NOTE(review): presumably that +16 is the quarter-period shift that lets the
  // sine-style formula in the algorithm description produce a cosine — confirm.
237 addq(rdx, 1865232);
238 movdqu(xmm4, xmm0);
239 andq(rdx, 63);
240 movdqu(xmm5, ExternalAddress(SC_4)); //0xa556c734UL, 0x3ec71de3UL, 0x1a01a01aUL, 0x3efa01a0UL
241 lea(rax, ExternalAddress(Ctable));
242 shlq(rdx, 5);
243 addq(rax, rdx);
  // Range reduction r = x - N*P_1 - N*P_2 with correction term from N*P_3, interleaved
  // with the polynomial evaluation and the table-driven reconstruction
  // (sections 1-4 of the algorithm description). rax points at the Ctable entry.
244 mulpd(xmm2, xmm1);
245 subsd(xmm0, xmm3);
246 mulsd(xmm1, ExternalAddress(P_3)); //0x2e037073UL, 0x3b63198aUL
247 subsd(xmm4, xmm3);
248 movq(xmm7, Address(rax, 8));
249 unpcklpd(xmm0, xmm0);
250 movdqu(xmm3, xmm4);
251 subsd(xmm4, xmm2);
252 mulpd(xmm5, xmm0);
253 subpd(xmm0, xmm2);
254 movdqu(xmm6, ExternalAddress(SC_2)); //0x11111111UL, 0x3f811111UL, 0x55555555UL, 0x3fa55555UL
255 mulsd(xmm7, xmm4);
256 subsd(xmm3, xmm4);
257 mulpd(xmm5, xmm0);
258 mulpd(xmm0, xmm0);
259 subsd(xmm3, xmm2);
260 movdqu(xmm2, Address(rax, 0));
261 subsd(xmm1, xmm3);
262 movq(xmm3, Address(rax, 24));
263 addsd(xmm2, xmm3);
264 subsd(xmm7, xmm2);
265 mulsd(xmm2, xmm4);
266 mulpd(xmm6, xmm0);
267 mulsd(xmm3, xmm4);
268 mulpd(xmm2, xmm0);
269 mulpd(xmm0, xmm0);
270 addpd(xmm5, ExternalAddress(SC_3)); //0x1a01a01aUL, 0xbf2a01a0UL, 0x16c16c17UL, 0xbf56c16cUL
271 mulsd(xmm4, Address(rax, 0));
272 addpd(xmm6, ExternalAddress(SC_1)); //0x55555555UL, 0xbfc55555UL, 0x00000000UL, 0xbfe00000UL
273 mulpd(xmm5, xmm0);
  // Compensated summations (section 5): xmm3 becomes res_int and then k_3,
  // xmm4 becomes res_hi, xmm0 becomes k_2.
274 movdqu(xmm0, xmm3);
275 addsd(xmm3, Address(rax, 8));
276 mulpd(xmm1, xmm7);
277 movdqu(xmm7, xmm4);
278 addsd(xmm4, xmm3);
279 addpd(xmm6, xmm5);
280 movq(xmm5, Address(rax, 8));
281 subsd(xmm5, xmm3);
282 subsd(xmm3, xmm4);
283 addsd(xmm1, Address(rax, 16));
284 mulpd(xmm6, xmm2);
285 addsd(xmm0, xmm5);
286 addsd(xmm3, xmm7);
  // Final summation: accumulate the small terms in xmm0 (k_2, corr, k_3, both lanes of
  // the polynomial) and add res_hi (xmm4) last.
287 addsd(xmm0, xmm1);
288 addsd(xmm0, xmm3);
289 addsd(xmm0, xmm6);
290 unpckhpd(xmm6, xmm6);
291 addsd(xmm0, xmm6);
292 addsd(xmm0, xmm4);
293 jmp(B1_4);
294
  // Out-of-range dispatch. The flags are still those of the cmpl above: signed
  // "greater" means the argument is too large; otherwise it is too small.
  // Tiny argument: clear the sign bit of x and return 1 - |x|.
295 bind(L_2TAG_PACKET_0_0_1);
296 jcc(Assembler::greater, L_2TAG_PACKET_1_0_1);
297 pextrw(eax, xmm0, 3);
298 andl(eax, 32767);
299 pinsrw(xmm0, eax, 3);
300 movq(xmm1, ExternalAddress(ONE)); //0x00000000UL, 0x3ff00000UL
301 subsd(xmm1, xmm0);
302 movdqu(xmm0, xmm1);
303 jmp(B1_4);
304
  // Large argument. An all-ones exponent field (0x7FF0 in the top 16 bits) means
  // Inf or NaN and is handled at L_2TAG_PACKET_2_0_1.
305 bind(L_2TAG_PACKET_1_0_1);
306 pextrw(eax, xmm0, 3);
307 andl(eax, 32752);
308 cmpl(eax, 32752);
309 jcc(Assembler::equal, L_2TAG_PACKET_2_0_1);
  // Derive a byte offset (a multiple of 4) into PI_INV_TABLE from the exponent field;
  // rcx then points at the first of the seven 32-bit table words read below
  // (offsets 0..24).
310 pextrw(ecx, xmm0, 3);
311 andl(ecx, 32752);
312 subl(ecx, 16224);
313 shrl(ecx, 7);
314 andl(ecx, 65532);
315 lea(r11, ExternalAddress(PI_INV_TABLE));
316 addq(rcx, r11);
  // Split the significand of x: edx = low 32 mantissa bits, eax = implicit leading 1
  // followed by the upper 20 mantissa bits.
317 movdq(rax, xmm0);
318 movl(r10, Address(rcx, 20));
319 movl(r8, Address(rcx, 24));
320 movl(edx, eax);
321 shrq(rax, 21);
322 orl(eax, INT_MIN);
323 shrl(eax, 11);
  // Multi-word multiply of the significand (eax:edx) by the table words, working from
  // the word at offset 24 down to offset 0 and propagating 32-bit carries. The product
  // is collected in r9 (most significant), r10 and r8 (least significant).
  // NOTE(review): this looks like a Payne-Hanek style reduction against a table of
  // bits of 1/pi — confirm against the PI_INV_TABLE definition.
324 movl(r9, r10);
325 imulq(r10, rdx);
326 imulq(r9, rax);
327 imulq(r8, rax);
328 movl(rsi, Address(rcx, 16));
329 movl(rdi, Address(rcx, 12));
330 movl(r11, r10);
331 shrq(r10, 32);
332 addq(r9, r10);
333 addq(r11, r8);
334 movl(r8, r11);
335 shrq(r11, 32);
336 addq(r9, r11);
337 movl(r10, rsi);
338 imulq(rsi, rdx);
339 imulq(r10, rax);
340 movl(r11, rdi);
341 imulq(rdi, rdx);
342 movl(rbx, rsi);
343 shrq(rsi, 32);
344 addq(r9, rbx);
345 movl(rbx, r9);
346 shrq(r9, 32);
347 addq(r10, rsi);
348 addq(r10, r9);
349 shlq(rbx, 32);
350 orq(r8, rbx);
351 imulq(r11, rax);
352 movl(r9, Address(rcx, 8));
353 movl(rsi, Address(rcx, 4));
354 movl(rbx, rdi);
355 shrq(rdi, 32);
356 addq(r10, rbx);
357 movl(rbx, r10);
358 shrq(r10, 32);
359 addq(r11, rdi);
360 addq(r11, r10);
361 movq(rdi, r9);
362 imulq(r9, rdx);
363 imulq(rdi, rax);
364 movl(r10, r9);
365 shrq(r9, 32);
366 addq(r11, r10);
367 movl(r10, r11);
368 shrq(r11, 32);
369 addq(rdi, r9);
370 addq(rdi, r11);
371 movq(r9, rsi);
372 imulq(rsi, rdx);
373 imulq(r9, rax);
374 shlq(r10, 32);
375 orq(r10, rbx);
376 movl(eax, Address(rcx, 0));
377 movl(r11, rsi);
378 shrq(rsi, 32);
379 addq(rdi, r11);
380 movl(r11, rdi);
381 shrq(rdi, 32);
382 addq(r9, rsi);
383 addq(r9, rdi);
384 imulq(rdx, rax);
  // Recover the table offset (rcx - table base), scale it by 8 and combine it with the
  // unbiased exponent of x to get a shift amount in ecx. rsi keeps the sign bit of x
  // (bit 15 of the top 16 bits).
385 pextrw(rbx, xmm0, 3);
386 lea(rdi, ExternalAddress(PI_INV_TABLE));
387 subq(rcx, rdi);
388 addl(ecx, ecx);
389 addl(ecx, ecx);
390 addl(ecx, ecx);
391 addl(ecx, 19);
392 movl(rsi, 32768);
393 andl(rsi, rbx);
394 shrl(rbx, 4);
395 andl(rbx, 2047);
396 subl(rbx, 1023);
397 subl(ecx, rbx);
398 addq(r9, rdx);
399 movl(edx, ecx);
400 addl(edx, 32);
401 cmpl(ecx, 1);
402 jcc(Assembler::less, L_2TAG_PACKET_3_0_1);
  // NOTE(review): the shift forms without an immediate (shll(r9), shrl(r9), shlq(r9), ...)
  // presumably take their count from CL, i.e. from ecx as computed just before them —
  // confirm against the Assembler definitions.
  // rdi keeps a copy of the shifted high word; its top three bits are extracted later
  // (shrl(rdi, 29)). Bit 28 of r9 selects the path at L_2TAG_PACKET_4_0_1, which negates
  // the fraction.
403 negl(ecx);
404 addl(ecx, 29);
405 shll(r9);
406 movl(rdi, r9);
407 andl(r9, 536870911);
408 testl(r9, 268435456);
409 jcc(Assembler::notEqual, L_2TAG_PACKET_4_0_1);
410 shrl(r9);
411 movl(rbx, 0);
412 shlq(r9, 32);
413 orq(r9, r11);
414
415 bind(L_2TAG_PACKET_5_0_1);
416
  // If the top word of the fraction is zero, shift whole words up first
  // (L_2TAG_PACKET_7_0_1); bsrq below needs a non-zero operand.
417 bind(L_2TAG_PACKET_6_0_1);
418 cmpq(r9, 0);
419 jcc(Assembler::equal, L_2TAG_PACKET_7_0_1);
420
  // Normalize: find the highest set bit of r9 and shift r9:r10:r8 so that it lands on
  // bit 29; edx tracks the accumulated shift. Right shifts (and the no-shift case) are
  // handled at L_2TAG_PACKET_9_0_1 using the flags of the subl.
421 bind(L_2TAG_PACKET_8_0_1);
422 bsrq(r11, r9);
423 movl(ecx, 29);
424 subl(ecx, r11);
425 jcc(Assembler::lessEqual, L_2TAG_PACKET_9_0_1);
426 shlq(r9);
427 movq(rax, r10);
428 shlq(r10);
429 addl(edx, ecx);
430 negl(ecx);
431 addl(ecx, 64);
432 shrq(rax);
433 shrq(r8);
434 orq(r9, rax);
435 orq(r10, r8);
436
  // Convert the normalized fraction (r9 high part, r10 >> 1 low part) to doubles and
  // scale each by a power of two whose exponent bits are assembled in edx and inserted
  // with pinsrw; the sign of x (rsi) and the negation flag (rbx) are folded into that
  // exponent word. Multiplying by the two-part PI_4 constant yields the reduced
  // argument in xmm0 with a low-order correction in xmm6.
437 bind(L_2TAG_PACKET_10_0_1);
438 cvtsi2sdq(xmm0, r9);
439 shrq(r10, 1);
440 cvtsi2sdq(xmm3, r10);
441 xorpd(xmm4, xmm4);
442 shll(edx, 4);
443 negl(edx);
444 addl(edx, 16368);
445 orl(edx, rsi);
446 xorl(edx, rbx);
447 pinsrw(xmm4, edx, 3);
448 movq(xmm2, ExternalAddress(PI_4)); //0x40000000UL, 0x3fe921fbUL, 0x18469899UL, 0x3e64442dUL
449 movq(xmm6, ExternalAddress(8 + PI_4)); //0x18469899UL, 0x3e64442dUL
450 xorpd(xmm5, xmm5);
451 subl(edx, 1008);
452 pinsrw(xmm5, edx, 3);
453 mulsd(xmm0, xmm4);
454 shll(rsi, 16);
455 sarl(rsi, 31);
456 mulsd(xmm3, xmm5);
457 movdqu(xmm1, xmm0);
458 mulsd(xmm0, xmm2);
459 shrl(rdi, 29);
460 addsd(xmm1, xmm3);
461 mulsd(xmm3, xmm2);
  // rsi is now 0 or -1 (sign of x); (rdi + rsi) ^ rsi conditionally negates the
  // three-bit value taken from rdi, which is kept in eax for the table index below.
462 addl(rdi, rsi);
463 xorl(rdi, rsi);
464 mulsd(xmm6, xmm1);
465 movl(eax, rdi);
466 addsd(xmm6, xmm3);
467 movdqu(xmm2, xmm0);
468 addsd(xmm0, xmm6);
469 subsd(xmm2, xmm0);
470 addsd(xmm6, xmm2);
471
  // Second evaluation pass on the reduced argument. Same sequence as the main path,
  // with two additions: eax << 3 is added to the table index, and the low-order part of
  // the reduced argument (xmm6) is subtracted from the correction term (xmm1).
472 bind(L_2TAG_PACKET_11_0_1);
473 movq(xmm1, ExternalAddress(PI32INV)); //0x6dc9c883UL, 0x40245f30UL
474 mulsd(xmm1, xmm0);
475 movq(xmm5, ExternalAddress(ONEHALF)); //0x00000000UL, 0x3fe00000UL, 0x00000000UL, 0x3fe00000UL
476 movq(xmm4, ExternalAddress(SIGN_MASK)); //0x00000000UL, 0x80000000UL
477 pand(xmm4, xmm0);
478 por(xmm5, xmm4);
479 addpd(xmm1, xmm5);
480 cvttsd2siq(rdx, xmm1);
481 cvtsi2sdq(xmm1, rdx);
482 movq(xmm3, ExternalAddress(P_1)); //0x54400000UL, 0x3fb921fbUL
483 movdqu(xmm2, ExternalAddress(P_2)); //0x1a600000UL, 0x3d90b461UL, 0x1a600000UL, 0x3d90b461UL
484 mulsd(xmm3, xmm1);
485 unpcklpd(xmm1, xmm1);
486 shll(eax, 3);
487 addl(edx, 1865232);
488 movdqu(xmm4, xmm0);
489 addl(edx, eax);
490 andl(edx, 63);
491 movdqu(xmm5, ExternalAddress(SC_4)); //0xa556c734UL, 0x3ec71de3UL, 0x1a01a01aUL, 0x3efa01a0UL
492 lea(rax, ExternalAddress(Ctable));
493 shll(edx, 5);
494 addq(rax, rdx);
495 mulpd(xmm2, xmm1);
496 subsd(xmm0, xmm3);
497 mulsd(xmm1, ExternalAddress(P_3)); //0x2e037073UL, 0x3b63198aUL
498 subsd(xmm4, xmm3);
499 movq(xmm7, Address(rax, 8));
500 unpcklpd(xmm0, xmm0);
501 movdqu(xmm3, xmm4);
502 subsd(xmm4, xmm2);
503 mulpd(xmm5, xmm0);
504 subpd(xmm0, xmm2);
505 mulsd(xmm7, xmm4);
506 subsd(xmm3, xmm4);
507 mulpd(xmm5, xmm0);
508 mulpd(xmm0, xmm0);
509 subsd(xmm3, xmm2);
510 movdqu(xmm2, Address(rax, 0));
511 subsd(xmm1, xmm3);
512 movq(xmm3, Address(rax, 24));
513 addsd(xmm2, xmm3);
514 subsd(xmm7, xmm2);
515 subsd(xmm1, xmm6);
516 movdqu(xmm6, ExternalAddress(SC_2)); //0x11111111UL, 0x3f811111UL, 0x55555555UL, 0x3fa55555UL
517 mulsd(xmm2, xmm4);
518 mulpd(xmm6, xmm0);
519 mulsd(xmm3, xmm4);
520 mulpd(xmm2, xmm0);
521 mulpd(xmm0, xmm0);
522 addpd(xmm5, ExternalAddress(SC_3)); //0x1a01a01aUL, 0xbf2a01a0UL, 0x16c16c17UL, 0xbf56c16cUL
523 mulsd(xmm4, Address(rax, 0));
524 addpd(xmm6, ExternalAddress(SC_1)); //0x55555555UL, 0xbfc55555UL, 0x00000000UL, 0xbfe00000UL
525 mulpd(xmm5, xmm0);
526 movdqu(xmm0, xmm3);
527 addsd(xmm3, Address(rax, 8));
528 mulpd(xmm1, xmm7);
529 movdqu(xmm7, xmm4);
530 addsd(xmm4, xmm3);
531 addpd(xmm6, xmm5);
532 movq(xmm5, Address(rax, 8));
533 subsd(xmm5, xmm3);
534 subsd(xmm3, xmm4);
535 addsd(xmm1, Address(rax, 16));
536 mulpd(xmm6, xmm2);
537 addsd(xmm5, xmm0);
538 addsd(xmm3, xmm7);
  // Here the small terms are accumulated in xmm1 and added to res_hi (copied from
  // xmm4 into xmm0) as the last step.
539 addsd(xmm1, xmm5);
540 addsd(xmm1, xmm3);
541 addsd(xmm1, xmm6);
542 unpckhpd(xmm6, xmm6);
543 movdqu(xmm0, xmm4);
544 addsd(xmm1, xmm6);
545 addsd(xmm0, xmm1);
546 jmp(B1_4);
547
  // Top word of the fraction is zero: move the words up by 64 bits (r9 <- r10,
  // r10 <- r8, r8 <- 0) and add 64 to the shift count in edx, at most twice. If the
  // whole fraction is zero, continue with a reduced argument of exactly zero.
548 bind(L_2TAG_PACKET_7_0_1);
549 addl(edx, 64);
550 movq(r9, r10);
551 movq(r10, r8);
552 movl(r8, 0);
553 cmpq(r9, 0);
554 jcc(Assembler::notEqual, L_2TAG_PACKET_8_0_1);
555 addl(edx, 64);
556 movq(r9, r10);
557 movq(r10, r8);
558 cmpq(r9, 0);
559 jcc(Assembler::notEqual, L_2TAG_PACKET_8_0_1);
560 xorpd(xmm0, xmm0);
561 xorpd(xmm6, xmm6);
562 jmp(L_2TAG_PACKET_11_0_1);
563
  // Normalization when the highest set bit of r9 is at or above bit 29. The flags are
  // still those of the subl in L_2TAG_PACKET_8_0_1: "equal" means no shift is needed,
  // otherwise r9:r10 is shifted right by the (negated) count.
564 bind(L_2TAG_PACKET_9_0_1);
565 jcc(Assembler::equal, L_2TAG_PACKET_10_0_1);
566 negl(ecx);
567 shrq(r10);
568 movq(rax, r9);
569 shrq(r9);
570 subl(edx, ecx);
571 negl(ecx);
572 addl(ecx, 64);
573 shlq(rax);
574 orq(r10, rax);
575 jmp(L_2TAG_PACKET_10_0_1);
  // Reached when the shift amount computed above is below 1: merge r9 with r11 into one
  // 64-bit word before shifting, then test bit 31 to decide whether the fraction has to
  // be negated (L_2TAG_PACKET_12_0_1).
576 bind(L_2TAG_PACKET_3_0_1);
577 negl(ecx);
578 shlq(r9, 32);
579 orq(r9, r11);
580 shlq(r9);
581 movq(rdi, r9);
582 testl(r9, INT_MIN);
583 jcc(Assembler::notEqual, L_2TAG_PACKET_12_0_1);
584 shrl(r9);
585 movl(rbx, 0);
586 shrq(rdi, 3);
587 jmp(L_2TAG_PACKET_6_0_1);
588
  // Fraction at or above one half: replace r9:r10:r8 by (constant - r9:r10:r8) using a
  // sub/sbb borrow chain, bump the value kept in rdi, and set rbx = 0x8000 so that the
  // sign bit of the scale factor is flipped later (xorl(edx, rbx)).
589 bind(L_2TAG_PACKET_4_0_1);
590 shrl(r9);
591 movl(rbx, 536870912);
592 shrl(rbx);
593 shlq(r9, 32);
594 orq(r9, r11);
595 shlq(rbx, 32);
596 addl(rdi, 536870912);
597 movl(rcx, 0);
598 movl(r11, 0);
599 subq(rcx, r8);
600 sbbq(r11, r10);
601 sbbq(rbx, r9);
602 movq(r8, rcx);
603 movq(r10, r11);
604 movq(r9, rbx);
605 movl(rbx, 32768);
606 jmp(L_2TAG_PACKET_5_0_1);
607
  // Same negation as L_2TAG_PACKET_4_0_1, for the path coming from L_2TAG_PACKET_3_0_1.
608 bind(L_2TAG_PACKET_12_0_1);
609 shrl(r9);
610 mov64(rbx, 0x100000000);
611 shrq(rbx);
612 movl(rcx, 0);
613 movl(r11, 0);
614 subq(rcx, r8);
615 sbbq(r11, r10);
616 sbbq(rbx, r9);
617 movq(r8, rcx);
618 movq(r10, r11);
619 movq(r9, rbx);
620 movl(rbx, 32768);
621 shrq(rdi, 3);
622 addl(rdi, 536870912);
623 jmp(L_2TAG_PACKET_6_0_1);
624
  // Inf or NaN: reload x and multiply by -0.0. Inf * 0 yields NaN and a NaN input
  // propagates, matching the special cases listed in the algorithm description.
  // The result stays in xmm0; a copy is also stored to [rsp + 0].
625 bind(L_2TAG_PACKET_2_0_1);
626 movsd(xmm0, Address(rsp, 8));
627 mulsd(xmm0, ExternalAddress(NEG_ZERO)); //0x00000000UL, 0x80000000UL
628 movq(Address(rsp, 0), xmm0);
629
630 bind(L_2TAG_PACKET_13_0_1);
631
  // Common exit: release the 16-byte scratch area and restore rbx.
632 bind(B1_4);
633 addq(rsp, 16);
634 pop(rbx);
635}
636#else
637// The 32 bit code is at most SSE2 compliant
638
// Constant pool for the 32-bit fast_cos, addressed by byte offset from the table base.
// Each pair of 32-bit words is one double, low word first. Layout, as read by the
// 32-bit fast_cos below:
//
//   0    .. 2047  64 table entries of 32 bytes each, selected by (N + 1865232) & 63.
//                 NOTE(review): judging by how the entries are consumed and by the
//                 algorithm description, each entry appears to hold
//                 {C_hl, S_hi, S_lo, sigma} at offsets 0/8/16/24 — confirm.
//   2048          polynomial coefficient pair (SC_1 in the 64-bit code)
//   2064          polynomial coefficient pair (SC_2 in the 64-bit code)
//   2080          polynomial coefficient pair (SC_3 in the 64-bit code)
//   2096          polynomial coefficient pair (SC_4 in the 64-bit code)
//   2112          P_2, duplicated in both halves
//   2128          P_1
//   2144          P_3
//   2160          32/pi (PI32INV in the 64-bit code)
//   2176          0x43380000:00000000; not read by the code in this file
//                 (presumably the SHIFTER mentioned in the algorithm description)
//   2192          1.0
//   2208          -0.0, multiplied with Inf/NaN inputs
//   2224          sign-bit mask
//   2240          0.5, duplicated in both halves
638
639ALIGNED_(16) juint _static_const_table_cos[] =
640{
641 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL,
642 0x00000000UL, 0x00000000UL, 0x3ff00000UL, 0x176d6d31UL, 0xbf73b92eUL,
643 0xbc29b42cUL, 0x3fb917a6UL, 0xe0000000UL, 0xbc3e2718UL, 0x00000000UL,
644 0x3ff00000UL, 0x011469fbUL, 0xbf93ad06UL, 0x3c69a60bUL, 0x3fc8f8b8UL,
645 0xc0000000UL, 0xbc626d19UL, 0x00000000UL, 0x3ff00000UL, 0x939d225aUL,
646 0xbfa60beaUL, 0x2ed59f06UL, 0x3fd29406UL, 0xa0000000UL, 0xbc75d28dUL,
647 0x00000000UL, 0x3ff00000UL, 0x866b95cfUL, 0xbfb37ca1UL, 0xa6aea963UL,
648 0x3fd87de2UL, 0xe0000000UL, 0xbc672cedUL, 0x00000000UL, 0x3ff00000UL,
649 0x73fa1279UL, 0xbfbe3a68UL, 0x3806f63bUL, 0x3fde2b5dUL, 0x20000000UL,
650 0x3c5e0d89UL, 0x00000000UL, 0x3ff00000UL, 0x5bc57974UL, 0xbfc59267UL,
651 0x39ae68c8UL, 0x3fe1c73bUL, 0x20000000UL, 0x3c8b25ddUL, 0x00000000UL,
652 0x3ff00000UL, 0x53aba2fdUL, 0xbfcd0dfeUL, 0x25091dd6UL, 0x3fe44cf3UL,
653 0x20000000UL, 0x3c68076aUL, 0x00000000UL, 0x3ff00000UL, 0x99fcef32UL,
654 0x3fca8279UL, 0x667f3bcdUL, 0x3fe6a09eUL, 0x20000000UL, 0xbc8bdd34UL,
655 0x00000000UL, 0x3fe00000UL, 0x94247758UL, 0x3fc133ccUL, 0x6b151741UL,
656 0x3fe8bc80UL, 0x20000000UL, 0xbc82c5e1UL, 0x00000000UL, 0x3fe00000UL,
657 0x9ae68c87UL, 0x3fac73b3UL, 0x290ea1a3UL, 0x3fea9b66UL, 0xe0000000UL,
658 0x3c39f630UL, 0x00000000UL, 0x3fe00000UL, 0x7f909c4eUL, 0xbf9d4a2cUL,
659 0xf180bdb1UL, 0x3fec38b2UL, 0x80000000UL, 0xbc76e0b1UL, 0x00000000UL,
660 0x3fe00000UL, 0x65455a75UL, 0xbfbe0875UL, 0xcf328d46UL, 0x3fed906bUL,
661 0x20000000UL, 0x3c7457e6UL, 0x00000000UL, 0x3fe00000UL, 0x76acf82dUL,
662 0x3fa4a031UL, 0x56c62ddaUL, 0x3fee9f41UL, 0xe0000000UL, 0x3c8760b1UL,
663 0x00000000UL, 0x3fd00000UL, 0x0e5967d5UL, 0xbfac1d1fUL, 0xcff75cb0UL,
664 0x3fef6297UL, 0x20000000UL, 0x3c756217UL, 0x00000000UL, 0x3fd00000UL,
665 0x0f592f50UL, 0xbf9ba165UL, 0xa3d12526UL, 0x3fefd88dUL, 0x40000000UL,
666 0xbc887df6UL, 0x00000000UL, 0x3fc00000UL, 0x00000000UL, 0x00000000UL,
667 0x00000000UL, 0x3ff00000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL,
668 0x00000000UL, 0x0f592f50UL, 0x3f9ba165UL, 0xa3d12526UL, 0x3fefd88dUL,
669 0x40000000UL, 0xbc887df6UL, 0x00000000UL, 0xbfc00000UL, 0x0e5967d5UL,
670 0x3fac1d1fUL, 0xcff75cb0UL, 0x3fef6297UL, 0x20000000UL, 0x3c756217UL,
671 0x00000000UL, 0xbfd00000UL, 0x76acf82dUL, 0xbfa4a031UL, 0x56c62ddaUL,
672 0x3fee9f41UL, 0xe0000000UL, 0x3c8760b1UL, 0x00000000UL, 0xbfd00000UL,
673 0x65455a75UL, 0x3fbe0875UL, 0xcf328d46UL, 0x3fed906bUL, 0x20000000UL,
674 0x3c7457e6UL, 0x00000000UL, 0xbfe00000UL, 0x7f909c4eUL, 0x3f9d4a2cUL,
675 0xf180bdb1UL, 0x3fec38b2UL, 0x80000000UL, 0xbc76e0b1UL, 0x00000000UL,
676 0xbfe00000UL, 0x9ae68c87UL, 0xbfac73b3UL, 0x290ea1a3UL, 0x3fea9b66UL,
677 0xe0000000UL, 0x3c39f630UL, 0x00000000UL, 0xbfe00000UL, 0x94247758UL,
678 0xbfc133ccUL, 0x6b151741UL, 0x3fe8bc80UL, 0x20000000UL, 0xbc82c5e1UL,
679 0x00000000UL, 0xbfe00000UL, 0x99fcef32UL, 0xbfca8279UL, 0x667f3bcdUL,
680 0x3fe6a09eUL, 0x20000000UL, 0xbc8bdd34UL, 0x00000000UL, 0xbfe00000UL,
681 0x53aba2fdUL, 0x3fcd0dfeUL, 0x25091dd6UL, 0x3fe44cf3UL, 0x20000000UL,
682 0x3c68076aUL, 0x00000000UL, 0xbff00000UL, 0x5bc57974UL, 0x3fc59267UL,
683 0x39ae68c8UL, 0x3fe1c73bUL, 0x20000000UL, 0x3c8b25ddUL, 0x00000000UL,
684 0xbff00000UL, 0x73fa1279UL, 0x3fbe3a68UL, 0x3806f63bUL, 0x3fde2b5dUL,
685 0x20000000UL, 0x3c5e0d89UL, 0x00000000UL, 0xbff00000UL, 0x866b95cfUL,
686 0x3fb37ca1UL, 0xa6aea963UL, 0x3fd87de2UL, 0xe0000000UL, 0xbc672cedUL,
687 0x00000000UL, 0xbff00000UL, 0x939d225aUL, 0x3fa60beaUL, 0x2ed59f06UL,
688 0x3fd29406UL, 0xa0000000UL, 0xbc75d28dUL, 0x00000000UL, 0xbff00000UL,
689 0x011469fbUL, 0x3f93ad06UL, 0x3c69a60bUL, 0x3fc8f8b8UL, 0xc0000000UL,
690 0xbc626d19UL, 0x00000000UL, 0xbff00000UL, 0x176d6d31UL, 0x3f73b92eUL,
691 0xbc29b42cUL, 0x3fb917a6UL, 0xe0000000UL, 0xbc3e2718UL, 0x00000000UL,
692 0xbff00000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL,
693 0x00000000UL, 0x00000000UL, 0x00000000UL, 0xbff00000UL, 0x176d6d31UL,
694 0x3f73b92eUL, 0xbc29b42cUL, 0xbfb917a6UL, 0xe0000000UL, 0x3c3e2718UL,
695 0x00000000UL, 0xbff00000UL, 0x011469fbUL, 0x3f93ad06UL, 0x3c69a60bUL,
696 0xbfc8f8b8UL, 0xc0000000UL, 0x3c626d19UL, 0x00000000UL, 0xbff00000UL,
697 0x939d225aUL, 0x3fa60beaUL, 0x2ed59f06UL, 0xbfd29406UL, 0xa0000000UL,
698 0x3c75d28dUL, 0x00000000UL, 0xbff00000UL, 0x866b95cfUL, 0x3fb37ca1UL,
699 0xa6aea963UL, 0xbfd87de2UL, 0xe0000000UL, 0x3c672cedUL, 0x00000000UL,
700 0xbff00000UL, 0x73fa1279UL, 0x3fbe3a68UL, 0x3806f63bUL, 0xbfde2b5dUL,
701 0x20000000UL, 0xbc5e0d89UL, 0x00000000UL, 0xbff00000UL, 0x5bc57974UL,
702 0x3fc59267UL, 0x39ae68c8UL, 0xbfe1c73bUL, 0x20000000UL, 0xbc8b25ddUL,
703 0x00000000UL, 0xbff00000UL, 0x53aba2fdUL, 0x3fcd0dfeUL, 0x25091dd6UL,
704 0xbfe44cf3UL, 0x20000000UL, 0xbc68076aUL, 0x00000000UL, 0xbff00000UL,
705 0x99fcef32UL, 0xbfca8279UL, 0x667f3bcdUL, 0xbfe6a09eUL, 0x20000000UL,
706 0x3c8bdd34UL, 0x00000000UL, 0xbfe00000UL, 0x94247758UL, 0xbfc133ccUL,
707 0x6b151741UL, 0xbfe8bc80UL, 0x20000000UL, 0x3c82c5e1UL, 0x00000000UL,
708 0xbfe00000UL, 0x9ae68c87UL, 0xbfac73b3UL, 0x290ea1a3UL, 0xbfea9b66UL,
709 0xe0000000UL, 0xbc39f630UL, 0x00000000UL, 0xbfe00000UL, 0x7f909c4eUL,
710 0x3f9d4a2cUL, 0xf180bdb1UL, 0xbfec38b2UL, 0x80000000UL, 0x3c76e0b1UL,
711 0x00000000UL, 0xbfe00000UL, 0x65455a75UL, 0x3fbe0875UL, 0xcf328d46UL,
712 0xbfed906bUL, 0x20000000UL, 0xbc7457e6UL, 0x00000000UL, 0xbfe00000UL,
713 0x76acf82dUL, 0xbfa4a031UL, 0x56c62ddaUL, 0xbfee9f41UL, 0xe0000000UL,
714 0xbc8760b1UL, 0x00000000UL, 0xbfd00000UL, 0x0e5967d5UL, 0x3fac1d1fUL,
715 0xcff75cb0UL, 0xbfef6297UL, 0x20000000UL, 0xbc756217UL, 0x00000000UL,
716 0xbfd00000UL, 0x0f592f50UL, 0x3f9ba165UL, 0xa3d12526UL, 0xbfefd88dUL,
717 0x40000000UL, 0x3c887df6UL, 0x00000000UL, 0xbfc00000UL, 0x00000000UL,
718 0x00000000UL, 0x00000000UL, 0xbff00000UL, 0x00000000UL, 0x00000000UL,
719 0x00000000UL, 0x00000000UL, 0x0f592f50UL, 0xbf9ba165UL, 0xa3d12526UL,
720 0xbfefd88dUL, 0x40000000UL, 0x3c887df6UL, 0x00000000UL, 0x3fc00000UL,
721 0x0e5967d5UL, 0xbfac1d1fUL, 0xcff75cb0UL, 0xbfef6297UL, 0x20000000UL,
722 0xbc756217UL, 0x00000000UL, 0x3fd00000UL, 0x76acf82dUL, 0x3fa4a031UL,
723 0x56c62ddaUL, 0xbfee9f41UL, 0xe0000000UL, 0xbc8760b1UL, 0x00000000UL,
724 0x3fd00000UL, 0x65455a75UL, 0xbfbe0875UL, 0xcf328d46UL, 0xbfed906bUL,
725 0x20000000UL, 0xbc7457e6UL, 0x00000000UL, 0x3fe00000UL, 0x7f909c4eUL,
726 0xbf9d4a2cUL, 0xf180bdb1UL, 0xbfec38b2UL, 0x80000000UL, 0x3c76e0b1UL,
727 0x00000000UL, 0x3fe00000UL, 0x9ae68c87UL, 0x3fac73b3UL, 0x290ea1a3UL,
728 0xbfea9b66UL, 0xe0000000UL, 0xbc39f630UL, 0x00000000UL, 0x3fe00000UL,
729 0x94247758UL, 0x3fc133ccUL, 0x6b151741UL, 0xbfe8bc80UL, 0x20000000UL,
730 0x3c82c5e1UL, 0x00000000UL, 0x3fe00000UL, 0x99fcef32UL, 0x3fca8279UL,
731 0x667f3bcdUL, 0xbfe6a09eUL, 0x20000000UL, 0x3c8bdd34UL, 0x00000000UL,
732 0x3fe00000UL, 0x53aba2fdUL, 0xbfcd0dfeUL, 0x25091dd6UL, 0xbfe44cf3UL,
733 0x20000000UL, 0xbc68076aUL, 0x00000000UL, 0x3ff00000UL, 0x5bc57974UL,
734 0xbfc59267UL, 0x39ae68c8UL, 0xbfe1c73bUL, 0x20000000UL, 0xbc8b25ddUL,
735 0x00000000UL, 0x3ff00000UL, 0x73fa1279UL, 0xbfbe3a68UL, 0x3806f63bUL,
736 0xbfde2b5dUL, 0x20000000UL, 0xbc5e0d89UL, 0x00000000UL, 0x3ff00000UL,
737 0x866b95cfUL, 0xbfb37ca1UL, 0xa6aea963UL, 0xbfd87de2UL, 0xe0000000UL,
738 0x3c672cedUL, 0x00000000UL, 0x3ff00000UL, 0x939d225aUL, 0xbfa60beaUL,
739 0x2ed59f06UL, 0xbfd29406UL, 0xa0000000UL, 0x3c75d28dUL, 0x00000000UL,
740 0x3ff00000UL, 0x011469fbUL, 0xbf93ad06UL, 0x3c69a60bUL, 0xbfc8f8b8UL,
741 0xc0000000UL, 0x3c626d19UL, 0x00000000UL, 0x3ff00000UL, 0x176d6d31UL,
742 0xbf73b92eUL, 0xbc29b42cUL, 0xbfb917a6UL, 0xe0000000UL, 0x3c3e2718UL,
743 0x00000000UL, 0x3ff00000UL, 0x55555555UL, 0xbfc55555UL, 0x00000000UL,
744 0xbfe00000UL, 0x11111111UL, 0x3f811111UL, 0x55555555UL, 0x3fa55555UL,
745 0x1a01a01aUL, 0xbf2a01a0UL, 0x16c16c17UL, 0xbf56c16cUL, 0xa556c734UL,
746 0x3ec71de3UL, 0x1a01a01aUL, 0x3efa01a0UL, 0x1a600000UL, 0x3d90b461UL,
747 0x1a600000UL, 0x3d90b461UL, 0x54400000UL, 0x3fb921fbUL, 0x00000000UL,
748 0x00000000UL, 0x2e037073UL, 0x3b63198aUL, 0x00000000UL, 0x00000000UL,
749 0x6dc9c883UL, 0x40245f30UL, 0x00000000UL, 0x00000000UL, 0x00000000UL,
750 0x43380000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x3ff00000UL,
751 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x80000000UL, 0x00000000UL,
752 0x00000000UL, 0x00000000UL, 0x80000000UL, 0x00000000UL, 0x00000000UL,
753 0x00000000UL, 0x3fe00000UL, 0x00000000UL, 0x3fe00000UL
754};
755//registers,
756// input: (rbp + 8)
757// scratch: xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
758// rax, rdx, rcx, rbx (tmp)
759
760// Code generated by Intel C compiler for LIBM library
761
// Emits the 32-bit (SSE2 + x87) instruction sequence that computes cos(x).
//
// Input : the double argument is read from the stack at [rsp + 128] after this
//         function has lowered rsp by 120 bytes.
// Output: every path ends by loading the result onto the x87 stack with fld_d.
//
// Parameters:
//   xmm0..xmm7     - XMM registers used for the argument and temporaries.
//   eax, ecx, edx  - general purpose scratch registers (ecx is only named in the
//                    register-distinctness assert).
//   tmp            - holds the base address of _static_const_table_cos while the
//                    sequence runs; its previous value is saved at [rsp + 56] and
//                    reloaded at the end.
//
// NOTE(review): the 120 bytes subtracted from rsp at entry are not added back in this
// function; presumably the caller's epilogue restores rsp — confirm.
// No return instruction is emitted here.
762void MacroAssembler::fast_cos(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register eax, Register ecx, Register edx, Register tmp) {
  // Most of these labels (4 .. 13, B1_3, B1_5) are declared but never bound or used.
763 Label L_2TAG_PACKET_0_0_2, L_2TAG_PACKET_1_0_2, L_2TAG_PACKET_2_0_2, L_2TAG_PACKET_3_0_2;
764 Label L_2TAG_PACKET_4_0_2, L_2TAG_PACKET_5_0_2, L_2TAG_PACKET_6_0_2, L_2TAG_PACKET_7_0_2;
765 Label L_2TAG_PACKET_8_0_2, L_2TAG_PACKET_9_0_2, L_2TAG_PACKET_10_0_2, L_2TAG_PACKET_11_0_2;
766 Label L_2TAG_PACKET_12_0_2, L_2TAG_PACKET_13_0_2, B1_3, B1_5, start;
767
768 assert_different_registers(tmp, eax, ecx, edx);
769
770 address static_const_table_cos = (address)_static_const_table_cos;
771
  // Prologue: reserve 120 bytes, save tmp, point tmp at the constant table, load x.
772 bind(start);
773 subl(rsp, 120);
774 movl(Address(rsp, 56), tmp);
775 lea(tmp, ExternalAddress(static_const_table_cos));
776 movsd(xmm0, Address(rsp, 128));
  // Range check on the top 16 bits of x with the sign cleared: rebase by 0x3030 and do
  // one unsigned compare against 0x10C5. Out-of-range values leave via
  // L_2TAG_PACKET_0_0_2 with the flags still set.
777 pextrw(eax, xmm0, 3);
778 andl(eax, 32767);
779 subl(eax, 12336);
780 cmpl(eax, 4293);
781 jcc(Assembler::above, L_2TAG_PACKET_0_0_2);
  // Main path. y = (table + 2160) * x; add 0.5 carrying the sign of x and truncate to
  // get the integer N in edx, then convert N back to double in xmm1.
782 movsd(xmm1, Address(tmp, 2160));
783 mulsd(xmm1, xmm0);
784 movdqu(xmm5, Address(tmp, 2240));
785 movsd(xmm4, Address(tmp, 2224));
786 pand(xmm4, xmm0);
787 por(xmm5, xmm4);
788 movsd(xmm3, Address(tmp, 2128));
789 movdqu(xmm2, Address(tmp, 2112));
790 addpd(xmm1, xmm5);
791 cvttsd2sil(edx, xmm1);
792 cvtsi2sdl(xmm1, edx);
793 mulsd(xmm3, xmm1);
794 unpcklpd(xmm1, xmm1);
  // Table index: (N + 1865232) & 63, scaled by 32 bytes per entry; eax then points at
  // the selected entry at the start of the constant table. 1865232 is 16 modulo 64.
795 addl(edx, 1865232);
796 movdqu(xmm4, xmm0);
797 andl(edx, 63);
798 movdqu(xmm5, Address(tmp, 2096));
799 lea(eax, Address(tmp, 0));
800 shll(edx, 5);
801 addl(eax, edx);
  // Range reduction, polynomial evaluation and reconstruction, following sections 1-4
  // of the algorithm description at the top of the file.
802 mulpd(xmm2, xmm1);
803 subsd(xmm0, xmm3);
804 mulsd(xmm1, Address(tmp, 2144));
805 subsd(xmm4, xmm3);
806 movsd(xmm7, Address(eax, 8));
807 unpcklpd(xmm0, xmm0);
808 movapd(xmm3, xmm4);
809 subsd(xmm4, xmm2);
810 mulpd(xmm5, xmm0);
811 subpd(xmm0, xmm2);
812 movdqu(xmm6, Address(tmp, 2064));
813 mulsd(xmm7, xmm4);
814 subsd(xmm3, xmm4);
815 mulpd(xmm5, xmm0);
816 mulpd(xmm0, xmm0);
817 subsd(xmm3, xmm2);
818 movdqu(xmm2, Address(eax, 0));
819 subsd(xmm1, xmm3);
820 movsd(xmm3, Address(eax, 24));
821 addsd(xmm2, xmm3);
822 subsd(xmm7, xmm2);
823 mulsd(xmm2, xmm4);
824 mulpd(xmm6, xmm0);
825 mulsd(xmm3, xmm4);
826 mulpd(xmm2, xmm0);
827 mulpd(xmm0, xmm0);
828 addpd(xmm5, Address(tmp, 2080));
829 mulsd(xmm4, Address(eax, 0));
830 addpd(xmm6, Address(tmp, 2048));
831 mulpd(xmm5, xmm0);
  // Compensated summations (section 5) followed by the final summation: the small
  // terms are accumulated in xmm1 and added to the high part in xmm4.
832 movapd(xmm0, xmm3);
833 addsd(xmm3, Address(eax, 8));
834 mulpd(xmm1, xmm7);
835 movapd(xmm7, xmm4);
836 addsd(xmm4, xmm3);
837 addpd(xmm6, xmm5);
838 movsd(xmm5, Address(eax, 8));
839 subsd(xmm5, xmm3);
840 subsd(xmm3, xmm4);
841 addsd(xmm1, Address(eax, 16));
842 mulpd(xmm6, xmm2);
843 addsd(xmm5, xmm0);
844 addsd(xmm3, xmm7);
845 addsd(xmm1, xmm5);
846 addsd(xmm1, xmm3);
847 addsd(xmm1, xmm6);
848 unpckhpd(xmm6, xmm6);
849 addsd(xmm1, xmm6);
850 addsd(xmm4, xmm1);
  // Move the SSE result to the x87 stack through memory.
851 movsd(Address(rsp, 0), xmm4);
852 fld_d(Address(rsp, 0));
853 jmp(L_2TAG_PACKET_1_0_2);
854
  // Out-of-range dispatch. The flags are still those of the cmpl above: signed
  // "greater" means the argument is too large; otherwise it is too small.
  // Tiny argument: clear the sign bit and return 1 - |x| (1.0 is at table + 2192).
855 bind(L_2TAG_PACKET_0_0_2);
856 jcc(Assembler::greater, L_2TAG_PACKET_2_0_2);
857 pextrw(eax, xmm0, 3);
858 andl(eax, 32767);
859 pinsrw(xmm0, eax, 3);
860 movsd(xmm1, Address(tmp, 2192));
861 subsd(xmm1, xmm0);
862 movsd(Address(rsp, 0), xmm1);
863 fld_d(Address(rsp, 0));
864 jmp(L_2TAG_PACKET_1_0_2);
865
  // Large argument. An all-ones exponent field in the high word of x ([rsp + 132])
  // means Inf or NaN and is handled at L_2TAG_PACKET_3_0_2.
866 bind(L_2TAG_PACKET_2_0_2);
867 movl(eax, Address(rsp, 132));
868 andl(eax, 2146435072);
869 cmpl(eax, 2146435072);
870 jcc(Assembler::equal, L_2TAG_PACKET_3_0_2);
  // Otherwise call the out-of-line helper StubRoutines::dlibm_sin_cos_huge with a
  // 32-byte argument area: x at [rsp + 0], a pointer to an 8-byte result slot at
  // [rsp + 8] (the slot is [rsp + 8] of the outer frame), and the integer 1 at
  // [rsp + 12]. The result is loaded from that slot after the call.
  // NOTE(review): the 1 presumably selects cosine rather than sine — confirm against
  // the helper.
871 subl(rsp, 32);
872 movsd(Address(rsp, 0), xmm0);
873 lea(eax, Address(rsp, 40));
874 movl(Address(rsp, 8), eax);
875 movl(eax, 1);
876 movl(Address(rsp, 12), eax);
877 call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::dlibm_sin_cos_huge())));
878 addl(rsp, 32);
879 fld_d(Address(rsp, 8));
880 jmp(L_2TAG_PACKET_1_0_2);
881
  // Inf or NaN: multiply x by -0.0 (table + 2208) on the x87 stack. Inf * 0 yields NaN
  // and a NaN input propagates.
882 bind(L_2TAG_PACKET_3_0_2);
883 fld_d(Address(rsp, 128));
884 fmul_d(Address(tmp, 2208));
885
  // Common exit: restore the caller's value of tmp.
886 bind(L_2TAG_PACKET_1_0_2);
887 movl(tmp, Address(rsp, 56));
888}
888}
889#endif