// z_Linux_asm.s:  - microtasking routines specifically
//                   written for Intel platforms running Linux* OS
// $Revision: 42810 $
// $Date: 2013-11-07 12:06:33 -0600 (Thu, 07 Nov 2013) $

//
////===----------------------------------------------------------------------===//
////
////                     The LLVM Compiler Infrastructure
////
//// This file is dual licensed under the MIT and the University of Illinois Open
//// Source Licenses.  See LICENSE.txt for details.
////
////===----------------------------------------------------------------------===//
//

// -----------------------------------------------------------------------
// macros
// -----------------------------------------------------------------------

#if KMP_ARCH_X86 || KMP_ARCH_X86_64

# if __MIC__ || __MIC2__
//
// On MIC, 'delay r16/r32/r64' should be used instead of 'pause'.
// The delay operation has the effect of removing the current thread from
// the round-robin HT mechanism, and therefore speeds up the issue rate of
// the other threads on the same core.
//
// A value of 0 works fine for <= 2 threads per core, but causes the EPCC
// barrier time to increase greatly for 3 or more threads per core.
//
// A value of 100 works pretty well for up to 4 threads per core, but isn't
// quite as fast as 0 for 2 threads per core.
//
// We need to check what happens for oversubscription / > 4 threads per core.
// It is possible that we need to pass the delay value in as a parameter
// that the caller determines based on the total # threads / # cores.
//
//.macro pause_op
//        mov    $100, %rax
//        delay  %rax
//.endm
# else
#  define pause_op   .byte 0xf3,0x90
# endif // __MIC__ || __MIC2__
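
// For reference, a hedged C-level equivalent of pause_op (illustrative only,
// not part of this file): the 0xf3,0x90 byte pair is the 'pause' instruction,
// which compilers expose as an intrinsic. "kmp_cpu_pause" is a hypothetical
// helper name.
//
//   #include <immintrin.h>
//
//   static inline void kmp_cpu_pause( void )
//   {
//       _mm_pause();    // emits f3 90, the same bytes as pause_op above
//   }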

# if defined __APPLE__ && defined __MACH__
#  define KMP_PREFIX_UNDERSCORE(x) _##x  // extra underscore for OS X* symbols
.macro ALIGN
        .align $0
.endmacro
.macro DEBUG_INFO
/* Not sure what .size does in icc, not sure if we need to do something
   similar for OS X*.
*/
.endmacro
.macro PROC
        ALIGN 4
        .globl KMP_PREFIX_UNDERSCORE($0)
KMP_PREFIX_UNDERSCORE($0):
.endmacro
# else // defined __APPLE__ && defined __MACH__
#  define KMP_PREFIX_UNDERSCORE(x) x  // no extra underscore for Linux* OS symbols
.macro ALIGN size
        .align 1<<(\size)
.endm
.macro DEBUG_INFO proc
// Not sure why we need .type and .size for the functions
        .align 16
        .type  \proc,@function
        .size  \proc,.-\proc
.endm
.macro PROC proc
        ALIGN 4
        .globl KMP_PREFIX_UNDERSCORE(\proc)
KMP_PREFIX_UNDERSCORE(\proc):
.endm
# endif // defined __APPLE__ && defined __MACH__
#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64


// -----------------------------------------------------------------------
// data
// -----------------------------------------------------------------------

#ifdef KMP_GOMP_COMPAT

//
// Support for unnamed common blocks.
//
// Because the symbol ".gomp_critical_user_" contains a ".", we have to
// put this stuff in assembly.
//

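// Roughly, the directives below are the assembly spelling of the following C,
// which cannot be written directly because of the "." in the symbol name
// (sketch only; "gomp_critical_user" stands in for ".gomp_critical_user_"):
//
//   static char gomp_critical_user[32];                        // the lock block
//   void *__kmp_unnamed_critical_addr = &gomp_critical_user;   // its address
//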
# if KMP_ARCH_X86
#  if defined __APPLE__ && defined __MACH__
        .data
        .comm .gomp_critical_user_,32
        .data
        .globl ___kmp_unnamed_critical_addr
___kmp_unnamed_critical_addr:
        .long .gomp_critical_user_
#  else /* Linux* OS */
        .data
        .comm .gomp_critical_user_,32,8
        .data
        ALIGN 4
        .global __kmp_unnamed_critical_addr
__kmp_unnamed_critical_addr:
        .4byte .gomp_critical_user_
        .type __kmp_unnamed_critical_addr,@object
        .size __kmp_unnamed_critical_addr,4
#  endif /* defined __APPLE__ && defined __MACH__ */
# endif /* KMP_ARCH_X86 */

# if KMP_ARCH_X86_64
#  if defined __APPLE__ && defined __MACH__
        .data
        .comm .gomp_critical_user_,32
        .data
        .globl ___kmp_unnamed_critical_addr
___kmp_unnamed_critical_addr:
        .quad .gomp_critical_user_
#  else /* Linux* OS */
        .data
        .comm .gomp_critical_user_,32,8
        .data
        ALIGN 8
        .global __kmp_unnamed_critical_addr
__kmp_unnamed_critical_addr:
        .8byte .gomp_critical_user_
        .type __kmp_unnamed_critical_addr,@object
        .size __kmp_unnamed_critical_addr,8
#  endif /* defined __APPLE__ && defined __MACH__ */
# endif /* KMP_ARCH_X86_64 */

#endif /* KMP_GOMP_COMPAT */


#if KMP_ARCH_X86 && !KMP_ARCH_PPC64

// -----------------------------------------------------------------------
// microtasking routines specifically written for IA-32 architecture
// running Linux* OS
// -----------------------------------------------------------------------
//

        .ident "Intel Corporation"
        .data
        ALIGN 4
// void
// __kmp_x86_pause( void );
//

        .text
        PROC  __kmp_x86_pause

        pause_op
        ret

        DEBUG_INFO __kmp_x86_pause

//
// void
// __kmp_x86_cpuid( int mode, int mode2, void *cpuid_buffer );
//
        PROC  __kmp_x86_cpuid

        pushl %ebp
        movl  %esp,%ebp
        pushl %edi
        pushl %ebx
        pushl %ecx
        pushl %edx

        movl  8(%ebp), %eax
        movl  12(%ebp), %ecx
        cpuid                 // Query the CPUID for the current processor

        movl  16(%ebp), %edi
        movl  %eax, 0(%edi)
        movl  %ebx, 4(%edi)
        movl  %ecx, 8(%edi)
        movl  %edx, 12(%edi)

        popl  %edx
        popl  %ecx
        popl  %ebx
        popl  %edi
        movl  %ebp, %esp
        popl  %ebp
        ret

        DEBUG_INFO __kmp_x86_cpuid

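// A hedged usage sketch from C (illustrative; the struct layout mirrors the
// four stores above, and the field names are assumptions, not a public API):
//
//   struct cpuid_buf { unsigned eax, ebx, ecx, edx; };
//   extern void __kmp_x86_cpuid( int mode, int mode2, void *cpuid_buffer );
//
//   struct cpuid_buf b;
//   __kmp_x86_cpuid( 0, 0, &b );   // leaf 0: max leaf in b.eax, vendor string in ebx/edx/ecx
//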

# if !KMP_ASM_INTRINS

//------------------------------------------------------------------------
//
// kmp_int32
// __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
//

        PROC  __kmp_test_then_add32

        movl  4(%esp), %ecx
        movl  8(%esp), %eax
        lock
        xaddl %eax,(%ecx)
        ret

        DEBUG_INFO __kmp_test_then_add32

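// A hedged C equivalent of the lock/xaddl sequence (sketch only; GCC-style
// __sync builtins assumed, they are not used by this file itself):
//
//   kmp_int32
//   __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d )
//   {
//       return __sync_fetch_and_add( p, d );  // returns the value *p held before the add
//   }
//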
//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_fixed8
//
// kmp_int32
// __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
//
// parameters:
//    p: 4(%esp)
//    d: 8(%esp)
//
// return: %al

        PROC  __kmp_xchg_fixed8

        movl  4(%esp), %ecx    // "p"
        movb  8(%esp), %al     // "d"

        lock
        xchgb %al,(%ecx)
        ret

        DEBUG_INFO __kmp_xchg_fixed8


//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_fixed16
//
// kmp_int16
// __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
//
// parameters:
//    p: 4(%esp)
//    d: 8(%esp)
// return: %ax

        PROC  __kmp_xchg_fixed16

        movl  4(%esp), %ecx    // "p"
        movw  8(%esp), %ax     // "d"

        lock
        xchgw %ax,(%ecx)
        ret

        DEBUG_INFO __kmp_xchg_fixed16


//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_fixed32
//
// kmp_int32
// __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
//
// parameters:
//    p: 4(%esp)
//    d: 8(%esp)
//
// return: %eax

        PROC  __kmp_xchg_fixed32

        movl  4(%esp), %ecx    // "p"
        movl  8(%esp), %eax    // "d"

        lock
        xchgl %eax,(%ecx)
        ret

        DEBUG_INFO __kmp_xchg_fixed32

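// A hedged C equivalent of the xchg family (sketch; __sync_lock_test_and_set
// is the GCC builtin that maps to a plain xchg on x86, shown for 32 bits):
//
//   kmp_int32
//   __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d )
//   {
//       return __sync_lock_test_and_set( p, d );  // atomically store d, return old *p
//   }
//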

//
// kmp_int8
// __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
//

        PROC  __kmp_compare_and_store8

        movl  4(%esp), %ecx
        movb  8(%esp), %al
        movb  12(%esp), %dl
        lock
        cmpxchgb %dl,(%ecx)
        sete  %al           // if %al == (%ecx) set %al = 1 else set %al = 0
        and   $1, %eax      // zero extend the result for the return value
        ret

        DEBUG_INFO __kmp_compare_and_store8

//
// kmp_int16
// __kmp_compare_and_store16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
//

        PROC  __kmp_compare_and_store16

        movl  4(%esp), %ecx
        movw  8(%esp), %ax
        movw  12(%esp), %dx
        lock
        cmpxchgw %dx,(%ecx)
        sete  %al           // if %ax == (%ecx) set %al = 1 else set %al = 0
        and   $1, %eax      // zero extend the result for the return value
        ret

        DEBUG_INFO __kmp_compare_and_store16

//
// kmp_int32
// __kmp_compare_and_store32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
//

        PROC  __kmp_compare_and_store32

        movl  4(%esp), %ecx
        movl  8(%esp), %eax
        movl  12(%esp), %edx
        lock
        cmpxchgl %edx,(%ecx)
        sete  %al           // if %eax == (%ecx) set %al = 1 else set %al = 0
        and   $1, %eax      // zero extend the result for the return value
        ret

        DEBUG_INFO __kmp_compare_and_store32

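// A hedged C equivalent of the cmpxchg/sete pattern (sketch; GCC __sync
// builtin assumed, shown for the 32-bit case):
//
//   kmp_int32
//   __kmp_compare_and_store32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv )
//   {
//       // store sv into *p only if *p == cv; return 1 on success, 0 otherwise
//       return __sync_bool_compare_and_swap( p, cv, sv );
//   }
//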
//
// kmp_int32
// __kmp_compare_and_store64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
//
        PROC  __kmp_compare_and_store64

        pushl %ebp
        movl  %esp, %ebp
        pushl %ebx
        pushl %edi
        movl  8(%ebp), %edi
        movl  12(%ebp), %eax   // "cv" low order word
        movl  16(%ebp), %edx   // "cv" high order word
        movl  20(%ebp), %ebx   // "sv" low order word
        movl  24(%ebp), %ecx   // "sv" high order word
        lock
        cmpxchg8b (%edi)
        sete  %al           // if %edx:%eax == (%edi) set %al = 1 else set %al = 0
        and   $1, %eax      // zero extend the result for the return value
        popl  %edi
        popl  %ebx
        movl  %ebp, %esp
        popl  %ebp
        ret

        DEBUG_INFO __kmp_compare_and_store64

//
// kmp_int8
// __kmp_compare_and_store_ret8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
//

        PROC  __kmp_compare_and_store_ret8

        movl  4(%esp), %ecx
        movb  8(%esp), %al
        movb  12(%esp), %dl
        lock
        cmpxchgb %dl,(%ecx)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret8

//
// kmp_int16
// __kmp_compare_and_store_ret16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
//

        PROC  __kmp_compare_and_store_ret16

        movl  4(%esp), %ecx
        movw  8(%esp), %ax
        movw  12(%esp), %dx
        lock
        cmpxchgw %dx,(%ecx)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret16

//
// kmp_int32
// __kmp_compare_and_store_ret32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
//

        PROC  __kmp_compare_and_store_ret32

        movl  4(%esp), %ecx
        movl  8(%esp), %eax
        movl  12(%esp), %edx
        lock
        cmpxchgl %edx,(%ecx)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret32

//
// kmp_int64
// __kmp_compare_and_store_ret64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
//
        PROC  __kmp_compare_and_store_ret64

        pushl %ebp
        movl  %esp, %ebp
        pushl %ebx
        pushl %edi
        movl  8(%ebp), %edi
        movl  12(%ebp), %eax   // "cv" low order word
        movl  16(%ebp), %edx   // "cv" high order word
        movl  20(%ebp), %ebx   // "sv" low order word
        movl  24(%ebp), %ecx   // "sv" high order word
        lock
        cmpxchg8b (%edi)
        popl  %edi
        popl  %ebx
        movl  %ebp, %esp
        popl  %ebp
        ret

        DEBUG_INFO __kmp_compare_and_store_ret64

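// A hedged C equivalent of the _ret variants (sketch; these return the old
// value of *p rather than a success flag):
//
//   kmp_int32
//   __kmp_compare_and_store_ret32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv )
//   {
//       return __sync_val_compare_and_swap( p, cv, sv );  // old *p; swapped only if it equaled cv
//   }
//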

//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_real32
//
// kmp_real32
// __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data );
//
// parameters:
//    addr: 4(%esp)
//    data: 8(%esp)
//
// return: %eax


        PROC  __kmp_xchg_real32

        pushl %ebp
        movl  %esp, %ebp
        subl  $4, %esp
        pushl %esi

        movl  8(%ebp), %esi
        flds  (%esi)
                        // load <addr>
        fsts  -4(%ebp)
                        // store old value

        movl  12(%ebp), %eax

        lock
        xchgl %eax, (%esi)

        flds  -4(%ebp)
                        // return old value

        popl  %esi
        movl  %ebp, %esp
        popl  %ebp
        ret

        DEBUG_INFO __kmp_xchg_real32

# endif /* !KMP_ASM_INTRINS */


//------------------------------------------------------------------------
//
// FUNCTION __kmp_test_then_add_real32
//
// kmp_real32
// __kmp_test_then_add_real32( volatile kmp_real32 *addr, kmp_real32 data );
//

        PROC  __kmp_test_then_add_real32

_addr      =  8
_data      = 12
_old_value = -4
_new_value = -8

        pushl %ebp
        movl  %esp, %ebp
        subl  $8, %esp
        pushl %esi
        pushl %ebx
        movl  _addr(%ebp), %esi
L22:
        flds  (%esi)
                        // load <addr>
        fsts  _old_value(%ebp)
                        // store into old_value
        fadds _data(%ebp)
        fstps _new_value(%ebp)
                        // new_value = old_value + data

        movl  _old_value(%ebp), %eax
                        // load old_value
        movl  _new_value(%ebp), %ebx
                        // load new_value

        lock
        cmpxchgl %ebx,(%esi)
                        // Compare %EAX with <addr>.  If equal set
                        // ZF and load %EBX into <addr>.  Else, clear
                        // ZF and load <addr> into %EAX.
        jnz   L22


        flds  _old_value(%ebp)
                        // return old_value
        popl  %ebx
        popl  %esi
        movl  %ebp, %esp
        popl  %ebp
        ret

        DEBUG_INFO __kmp_test_then_add_real32

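// The loop above is the classic CAS retry pattern for a floating-point
// atomic add. A hedged C sketch of the same idea (illustrative only):
//
//   kmp_real32
//   __kmp_test_then_add_real32( volatile kmp_real32 *addr, kmp_real32 data )
//   {
//       kmp_real32 old_value, new_value;
//       do {
//           old_value = *addr;              // snapshot current value
//           new_value = old_value + data;   // compute the update
//           // retry until no other thread changed *addr in between;
//           // the bit patterns are compared via a 32-bit integer CAS
//       } while ( !__sync_bool_compare_and_swap(
//                     (volatile kmp_int32 *)addr,
//                     *(kmp_int32 *)&old_value,
//                     *(kmp_int32 *)&new_value ) );
//       return old_value;
//   }
//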
//------------------------------------------------------------------------
//
// FUNCTION __kmp_test_then_add_real64
//
// kmp_real64
// __kmp_test_then_add_real64( volatile kmp_real64 *addr, kmp_real64 data );
//
        PROC  __kmp_test_then_add_real64

_addr      =   8
_data      =  12
_old_value =  -8
_new_value = -16

        pushl %ebp
        movl  %esp, %ebp
        subl  $16, %esp
        pushl %esi
        pushl %ebx
        pushl %ecx
        pushl %edx
        movl  _addr(%ebp), %esi
L44:
        fldl  (%esi)
                        // load <addr>
        fstl  _old_value(%ebp)
                        // store into old_value
        faddl _data(%ebp)
        fstpl _new_value(%ebp)
                        // new_value = old_value + data

        movl  _old_value+4(%ebp), %edx
        movl  _old_value(%ebp), %eax
                        // load old_value
        movl  _new_value+4(%ebp), %ecx
        movl  _new_value(%ebp), %ebx
                        // load new_value

        lock
        cmpxchg8b (%esi)
                        // Compare %EDX:%EAX with <addr>.  If equal set
                        // ZF and load %ECX:%EBX into <addr>.  Else, clear
                        // ZF and load <addr> into %EDX:%EAX.
        jnz   L44


        fldl  _old_value(%ebp)
                        // return old_value
        popl  %edx
        popl  %ecx
        popl  %ebx
        popl  %esi
        movl  %ebp, %esp
        popl  %ebp
        ret

        DEBUG_INFO __kmp_test_then_add_real64


//------------------------------------------------------------------------
//
// FUNCTION __kmp_load_x87_fpu_control_word
//
// void
// __kmp_load_x87_fpu_control_word( kmp_int16 *p );
//
// parameters:
//    p: 4(%esp)
//

        PROC  __kmp_load_x87_fpu_control_word

        movl  4(%esp), %eax
        fldcw (%eax)
        ret

        DEBUG_INFO __kmp_load_x87_fpu_control_word


//------------------------------------------------------------------------
//
// FUNCTION __kmp_store_x87_fpu_control_word
//
// void
// __kmp_store_x87_fpu_control_word( kmp_int16 *p );
//
// parameters:
//    p: 4(%esp)
//

        PROC  __kmp_store_x87_fpu_control_word

        movl  4(%esp), %eax
        fstcw (%eax)
        ret

        DEBUG_INFO __kmp_store_x87_fpu_control_word

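// Hedged usage sketch: the runtime can save, modify, and later restore the
// x87 control word around code that changes rounding/precision (illustrative):
//
//   kmp_int16 saved_cw;
//   __kmp_store_x87_fpu_control_word( &saved_cw );  // fstcw: read current control word
//   /* ... run code that may alter FP control state ... */
//   __kmp_load_x87_fpu_control_word( &saved_cw );   // fldcw: put it back
//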

//------------------------------------------------------------------------
//
// FUNCTION __kmp_clear_x87_fpu_status_word
//
// void
// __kmp_clear_x87_fpu_status_word();
//
//

        PROC  __kmp_clear_x87_fpu_status_word

        fnclex
        ret

        DEBUG_INFO __kmp_clear_x87_fpu_status_word


//------------------------------------------------------------------------
//
// typedef void (*microtask_t)( int *gtid, int *tid, ... );
//
// int
// __kmp_invoke_microtask( microtask_t pkfn, int gtid, int tid,
//                         int argc, void *p_argv[] ) {
//     (*pkfn)( & gtid, & tid, argv[0], ... );
//     return 1;
// }

// -- Begin __kmp_invoke_microtask
// mark_begin;
        PROC  __kmp_invoke_microtask

        pushl %ebp
        movl  %esp,%ebp        // establish the base pointer for this routine.
        subl  $8,%esp          // allocate space for two local variables.
                               // These variables are:
                               //    argv: -4(%ebp)
                               //    temp: -8(%ebp)
                               //
        pushl %ebx             // save %ebx to use during this routine
                               //
        movl  20(%ebp),%ebx    // Stack alignment - # args
        addl  $2,%ebx          // #args + 2: always pass at least 2 args (gtid and tid)
        shll  $2,%ebx          // Number of bytes used on stack: (#args+2)*4
        movl  %esp,%eax        //
        subl  %ebx,%eax        // %esp - ((#args+2)*4) -> %eax -- without mods, stack ptr would be this
        movl  %eax,%ebx        // Save to %ebx
        andl  $0xFFFFFF80,%eax // mask off the lower 7 bits
        subl  %eax,%ebx        // Amount to subtract from %esp
        subl  %ebx,%esp        // Prepare the stack ptr --
                               //    now it will be aligned on a 128-byte boundary at the call

        movl  24(%ebp),%eax    // copy from p_argv[]
        movl  %eax,-4(%ebp)    // into the local variable *argv.

        movl  20(%ebp),%ebx    // argc is 20(%ebp)
        shll  $2,%ebx

.invoke_2:
        cmpl  $0,%ebx
        jg    .invoke_4
        jmp   .invoke_3
        ALIGN 2
.invoke_4:
        movl  -4(%ebp),%eax
        subl  $4,%ebx          // decrement argc.
        addl  %ebx,%eax        // index into argv.
        movl  (%eax),%edx
        pushl %edx

        jmp   .invoke_2
        ALIGN 2
.invoke_3:
        leal  16(%ebp),%eax    // push &tid
        pushl %eax

        leal  12(%ebp),%eax    // push &gtid
        pushl %eax

        movl  8(%ebp),%ebx
        call  *%ebx            // call (*pkfn)();

        movl  $1,%eax          // return 1;

        movl  -12(%ebp),%ebx   // restore %ebx
        leave
        ret

        DEBUG_INFO __kmp_invoke_microtask
// -- End __kmp_invoke_microtask
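
// A hedged C sketch of what the assembly above does (illustrative only; the
// varargs forwarding cannot be written portably in C, which is why it is asm):
//
//   int
//   __kmp_invoke_microtask( microtask_t pkfn, int gtid, int tid,
//                           int argc, void *p_argv[] )
//   {
//       switch ( argc ) {      // pass argc args beyond &gtid/&tid
//       case 0:  (*pkfn)( &gtid, &tid );                        break;
//       case 1:  (*pkfn)( &gtid, &tid, p_argv[0] );             break;
//       case 2:  (*pkfn)( &gtid, &tid, p_argv[0], p_argv[1] );  break;
//       /* ... one case per supported argument count ... */
//       }
//       return 1;
//   }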


// kmp_uint64
// __kmp_hardware_timestamp(void)
        PROC  __kmp_hardware_timestamp
        rdtsc
        ret

        DEBUG_INFO __kmp_hardware_timestamp
// -- End __kmp_hardware_timestamp

// -----------------------------------------------------------------------
#endif /* KMP_ARCH_X86 && !KMP_ARCH_PPC64 */


#if KMP_ARCH_X86_64

// -----------------------------------------------------------------------
// microtasking routines specifically written for IA-32 architecture and
// Intel(R) 64 running Linux* OS
// -----------------------------------------------------------------------

// -- Machine type P
// mark_description "Intel Corporation";
        .ident "Intel Corporation"
// --   .file "z_Linux_asm.s"
        .data
        ALIGN 4

// AC: The following #if hid the .text directive, moving the rest of the code
//     into the .data section on MIC. To prevent this, .text has been added to
//     every routine definition for x86_64.
# if __MIC__ || __MIC2__

# else

//------------------------------------------------------------------------
//
// FUNCTION __kmp_x86_pause
//
// void
// __kmp_x86_pause( void );
//

        .text
        PROC  __kmp_x86_pause

        pause_op
        ret

        DEBUG_INFO __kmp_x86_pause

# endif // __MIC__ || __MIC2__

//------------------------------------------------------------------------
//
// FUNCTION __kmp_x86_cpuid
//
// void
// __kmp_x86_cpuid( int mode, int mode2, void *cpuid_buffer );
//
// parameters:
//    mode:         %edi
//    mode2:        %esi
//    cpuid_buffer: %rdx

        .text
        PROC  __kmp_x86_cpuid

        pushq %rbp
        movq  %rsp,%rbp
        pushq %rbx            // callee-save register

        movl  %esi, %ecx      // "mode2"
        movl  %edi, %eax      // "mode"
        movq  %rdx, %rsi      // cpuid_buffer
        cpuid                 // Query the CPUID for the current processor

        movl  %eax, 0(%rsi)   // store results into buffer
        movl  %ebx, 4(%rsi)
        movl  %ecx, 8(%rsi)
        movl  %edx, 12(%rsi)

        popq  %rbx            // callee-save register
        movq  %rbp, %rsp
        popq  %rbp
        ret

        DEBUG_INFO __kmp_x86_cpuid



# if !KMP_ASM_INTRINS

//------------------------------------------------------------------------
//
// FUNCTION __kmp_test_then_add32
//
// kmp_int32
// __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
//
// parameters:
//    p: %rdi
//    d: %esi
//
// return: %eax

        .text
        PROC  __kmp_test_then_add32

        movl  %esi, %eax      // "d"
        lock
        xaddl %eax,(%rdi)
        ret

        DEBUG_INFO __kmp_test_then_add32


//------------------------------------------------------------------------
//
// FUNCTION __kmp_test_then_add64
//
// kmp_int64
// __kmp_test_then_add64( volatile kmp_int64 *p, kmp_int64 d );
//
// parameters:
//    p: %rdi
//    d: %rsi
// return: %rax

        .text
        PROC  __kmp_test_then_add64

        movq  %rsi, %rax      // "d"
        lock
        xaddq %rax,(%rdi)
        ret

        DEBUG_INFO __kmp_test_then_add64


//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_fixed8
//
// kmp_int32
// __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
//
// parameters:
//    p: %rdi
//    d: %sil
//
// return: %al

        .text
        PROC  __kmp_xchg_fixed8

        movb  %sil, %al       // "d"

        lock
        xchgb %al,(%rdi)
        ret

        DEBUG_INFO __kmp_xchg_fixed8


//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_fixed16
//
// kmp_int16
// __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
//
// parameters:
//    p: %rdi
//    d: %si
// return: %ax

        .text
        PROC  __kmp_xchg_fixed16

        movw  %si, %ax        // "d"

        lock
        xchgw %ax,(%rdi)
        ret

        DEBUG_INFO __kmp_xchg_fixed16


//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_fixed32
//
// kmp_int32
// __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
//
// parameters:
//    p: %rdi
//    d: %esi
//
// return: %eax

        .text
        PROC  __kmp_xchg_fixed32

        movl  %esi, %eax      // "d"

        lock
        xchgl %eax,(%rdi)
        ret

        DEBUG_INFO __kmp_xchg_fixed32


//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_fixed64
//
// kmp_int64
// __kmp_xchg_fixed64( volatile kmp_int64 *p, kmp_int64 d );
//
// parameters:
//    p: %rdi
//    d: %rsi
// return: %rax

        .text
        PROC  __kmp_xchg_fixed64

        movq  %rsi, %rax      // "d"

        lock
        xchgq %rax,(%rdi)
        ret

        DEBUG_INFO __kmp_xchg_fixed64


//------------------------------------------------------------------------
//
// FUNCTION __kmp_compare_and_store8
//
// kmp_int8
// __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
//
// parameters:
//    p:  %rdi
//    cv: %esi
//    sv: %edx
//
// return: %eax

        .text
        PROC  __kmp_compare_and_store8

        movb  %sil, %al       // "cv"
        lock
        cmpxchgb %dl,(%rdi)
        sete  %al             // if %al == (%rdi) set %al = 1 else set %al = 0
        andq  $1, %rax        // zero extend the result for the return value
        ret

        DEBUG_INFO __kmp_compare_and_store8


//------------------------------------------------------------------------
//
// FUNCTION __kmp_compare_and_store16
//
// kmp_int16
// __kmp_compare_and_store16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
//
// parameters:
//    p:  %rdi
//    cv: %si
//    sv: %dx
//
// return: %eax

        .text
        PROC  __kmp_compare_and_store16

        movw  %si, %ax        // "cv"
        lock
        cmpxchgw %dx,(%rdi)
        sete  %al             // if %ax == (%rdi) set %al = 1 else set %al = 0
        andq  $1, %rax        // zero extend the result for the return value
        ret

        DEBUG_INFO __kmp_compare_and_store16


//------------------------------------------------------------------------
//
// FUNCTION __kmp_compare_and_store32
//
// kmp_int32
// __kmp_compare_and_store32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
//
// parameters:
//    p:  %rdi
//    cv: %esi
//    sv: %edx
//
// return: %eax

        .text
        PROC  __kmp_compare_and_store32

        movl  %esi, %eax      // "cv"
        lock
        cmpxchgl %edx,(%rdi)
        sete  %al             // if %eax == (%rdi) set %al = 1 else set %al = 0
        andq  $1, %rax        // zero extend the result for the return value
        ret

        DEBUG_INFO __kmp_compare_and_store32


//------------------------------------------------------------------------
//
// FUNCTION __kmp_compare_and_store64
//
// kmp_int32
// __kmp_compare_and_store64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
//
// parameters:
//    p:  %rdi
//    cv: %rsi
//    sv: %rdx
// return: %eax

        .text
        PROC  __kmp_compare_and_store64

        movq  %rsi, %rax      // "cv"
        lock
        cmpxchgq %rdx,(%rdi)
        sete  %al             // if %rax == (%rdi) set %al = 1 else set %al = 0
        andq  $1, %rax        // zero extend the result for the return value
        ret

        DEBUG_INFO __kmp_compare_and_store64

//------------------------------------------------------------------------
//
// FUNCTION __kmp_compare_and_store_ret8
//
// kmp_int8
// __kmp_compare_and_store_ret8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
//
// parameters:
//    p:  %rdi
//    cv: %esi
//    sv: %edx
//
// return: %eax

        .text
        PROC  __kmp_compare_and_store_ret8

        movb  %sil, %al       // "cv"
        lock
        cmpxchgb %dl,(%rdi)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret8


//------------------------------------------------------------------------
//
// FUNCTION __kmp_compare_and_store_ret16
//
// kmp_int16
// __kmp_compare_and_store_ret16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
//
// parameters:
//    p:  %rdi
//    cv: %si
//    sv: %dx
//
// return: %eax

        .text
        PROC  __kmp_compare_and_store_ret16

        movw  %si, %ax        // "cv"
        lock
        cmpxchgw %dx,(%rdi)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret16


//------------------------------------------------------------------------
//
// FUNCTION __kmp_compare_and_store_ret32
//
// kmp_int32
// __kmp_compare_and_store_ret32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
//
// parameters:
//    p:  %rdi
//    cv: %esi
//    sv: %edx
//
// return: %eax

        .text
        PROC  __kmp_compare_and_store_ret32

        movl  %esi, %eax      // "cv"
        lock
        cmpxchgl %edx,(%rdi)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret32


//------------------------------------------------------------------------
//
// FUNCTION __kmp_compare_and_store_ret64
//
// kmp_int64
// __kmp_compare_and_store_ret64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
//
// parameters:
//    p:  %rdi
//    cv: %rsi
//    sv: %rdx
// return: %eax

        .text
        PROC  __kmp_compare_and_store_ret64

        movq  %rsi, %rax      // "cv"
        lock
        cmpxchgq %rdx,(%rdi)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret64

# endif /* !KMP_ASM_INTRINS */


# if ! (__MIC__ || __MIC2__)

//------------------------------------------------------------------------
//
// FUNCTION __kmp_test_then_add_real32
//
// kmp_real32
// __kmp_test_then_add_real32( volatile kmp_real32 *addr, kmp_real32 data );
//
// parameters:
//    addr: %rdi
//    data: %xmm0 (lower 4 bytes)
//
// return: %xmm0 (lower 4 bytes)

        .text
        PROC  __kmp_test_then_add_real32
1:
        movss (%rdi), %xmm1   // load value of <addr>
        movd  %xmm1, %eax     // save old value of <addr>

        addss %xmm0, %xmm1    // new value = old value + <data>
        movd  %xmm1, %ecx     // move new value to GP reg.

        lock
        cmpxchgl %ecx, (%rdi) // Compare %EAX with <addr>.  If equal set
                              // ZF and exchange %ECX with <addr>.  Else,
                              // clear ZF and load <addr> into %EAX.
        jz    2f
        pause_op
        jmp   1b
2:
        movd  %eax, %xmm0     // load old value into return register
        ret

        DEBUG_INFO __kmp_test_then_add_real32


//------------------------------------------------------------------------
//
// FUNCTION __kmp_test_then_add_real64
//
// kmp_real64
// __kmp_test_then_add_real64( volatile kmp_real64 *addr, kmp_real64 data );
//
// parameters:
//    addr: %rdi
//    data: %xmm0 (lower 8 bytes)
// return: %xmm0 (lower 8 bytes)
//

        .text
        PROC  __kmp_test_then_add_real64
1:
        movlpd (%rdi), %xmm1  // load value of <addr>
        movd  %xmm1, %rax     // save old value of <addr>

        addsd %xmm0, %xmm1    // new value = old value + <data>
        movd  %xmm1, %rcx     // move new value to GP reg.

        lock
        cmpxchgq %rcx, (%rdi) // Compare %RAX with <addr>.  If equal set
                              // ZF and exchange %RCX with <addr>.  Else,
                              // clear ZF and load <addr> into %RAX.
        jz    2f
        pause_op
        jmp   1b

2:
        movd  %rax, %xmm0     // load old value into return register
        ret

        DEBUG_INFO __kmp_test_then_add_real64


# if !KMP_ASM_INTRINS

//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_real32
//
// kmp_real32
// __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data );
//
// parameters:
//    addr: %rdi
//    data: %xmm0 (lower 4 bytes)
//
// return: %xmm0 (lower 4 bytes)

        .text
        PROC  __kmp_xchg_real32

        movd  %xmm0, %eax     // load "data" to eax

        lock
        xchgl %eax, (%rdi)

        movd  %eax, %xmm0     // load old value into return register

        ret

        DEBUG_INFO __kmp_xchg_real32


//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_real64
//
// kmp_real64
// __kmp_xchg_real64( volatile kmp_real64 *addr, kmp_real64 data );
//
// parameters:
//    addr: %rdi
//    data: %xmm0 (lower 8 bytes)
// return: %xmm0 (lower 8 bytes)
//

        .text
        PROC  __kmp_xchg_real64

        movd  %xmm0, %rax     // load "data" to rax

        lock
        xchgq %rax, (%rdi)

        movd  %rax, %xmm0     // load old value into return register
        ret

        DEBUG_INFO __kmp_xchg_real64


# endif /* !KMP_ASM_INTRINS */

# endif /* !(__MIC__ || __MIC2__) */


//------------------------------------------------------------------------
//
// FUNCTION __kmp_load_x87_fpu_control_word
//
// void
// __kmp_load_x87_fpu_control_word( kmp_int16 *p );
//
// parameters:
//    p: %rdi
//

        .text
        PROC  __kmp_load_x87_fpu_control_word

        fldcw (%rdi)
        ret

        DEBUG_INFO __kmp_load_x87_fpu_control_word


//------------------------------------------------------------------------
//
// FUNCTION __kmp_store_x87_fpu_control_word
//
// void
// __kmp_store_x87_fpu_control_word( kmp_int16 *p );
//
// parameters:
//    p: %rdi
//

        .text
        PROC  __kmp_store_x87_fpu_control_word

        fstcw (%rdi)
        ret

        DEBUG_INFO __kmp_store_x87_fpu_control_word


//------------------------------------------------------------------------
//
// FUNCTION __kmp_clear_x87_fpu_status_word
//
// void
// __kmp_clear_x87_fpu_status_word();
//
//

        .text
        PROC  __kmp_clear_x87_fpu_status_word

#if __MIC__ || __MIC2__
// TODO: remove the workaround for the problem with the fnclex instruction (no CQ known)
        fstenv  -32(%rsp)              // store FP env
        andw    $~0x80ff, 4-32(%rsp)   // clear 0-7,15 bits of FP SW
        fldenv  -32(%rsp)              // load FP env back
        ret
#else
        fnclex
        ret
#endif

        DEBUG_INFO __kmp_clear_x87_fpu_status_word


//------------------------------------------------------------------------
//
// typedef void (*microtask_t)( int *gtid, int *tid, ... );
//
// int
// __kmp_invoke_microtask( void (*pkfn) (int *gtid, int *tid, ...),
//                         int gtid, int tid,
//                         int argc, void *p_argv[] ) {
//     (*pkfn)( & gtid, & tid, argv[0], ... );
//     return 1;
// }
//
// note:
//    at the call to pkfn, %rsp must be 128-byte aligned for the compiler
//
// parameters:
//    %rdi: pkfn
//    %esi: gtid
//    %edx: tid
//    %ecx: argc
//    %r8:  p_argv
//
// locals:
//    __gtid: gtid parm pushed on stack so we can pass &gtid to pkfn
//    __tid:  tid parm pushed on stack so we can pass &tid to pkfn
//
// reg temps:
//    %rax: used all over the place
//    %rdx: used in stack pointer alignment calculation
//    %r11: used to traverse p_argv array
//    %rsi: used as temporary for stack parameters
//          used as temporary for number of pkfn parms to push
//    %rbx: used to hold pkfn address, and zero constant; callee-save
//
// return: %eax (always 1/TRUE)
//

__gtid = -16
__tid  = -24

// -- Begin __kmp_invoke_microtask
// mark_begin;
        .text
        PROC  __kmp_invoke_microtask

        pushq %rbp            // save base pointer
        movq  %rsp,%rbp       // establish the base pointer for this routine.
        pushq %rbx            // %rbx is a callee-saved register

        pushq %rsi            // Put gtid on stack so we can pass &gtid to pkfn
        pushq %rdx            // Put tid on stack so we can pass &tid to pkfn

        movq  %rcx, %rax      // Stack alignment calculation begins; argc -> %rax
        movq  $0, %rbx        // constant for cmovs later
        subq  $4, %rax        // subtract four args passed in registers to pkfn
#if __MIC__ || __MIC2__
        js    L_kmp_0         // jump to movq
        jmp   L_kmp_0_exit    // jump ahead
L_kmp_0:
        movq  %rbx, %rax      // zero negative value in %rax <- max(0, argc-4)
L_kmp_0_exit:
#else
        cmovsq %rbx, %rax     // zero negative value in %rax <- max(0, argc-4)
#endif // __MIC__ || __MIC2__

        movq  %rax, %rsi      // save max(0, argc-4) -> %rsi for later
        shlq  $3, %rax        // Number of bytes used on stack: max(0, argc-4)*8

        movq  %rsp, %rdx      //
        subq  %rax, %rdx      // %rsp - (max(0,argc-4)*8) -> %rdx --
                              // without align, stack ptr would be this
        movq  %rdx, %rax      // Save to %rax

        andq  $0xFFFFFFFFFFFFFF80, %rax  // mask off the lower 7 bits (128-byte align)
        subq  %rax, %rdx      // Amount to subtract from %rsp
        subq  %rdx, %rsp      // Prepare the stack ptr --
                              //    now %rsp will align to a 128-byte boundary at the call site

        // setup pkfn parameter reg and stack
        movq  %rcx, %rax      // argc -> %rax
        cmpq  $0, %rsi
        je    L_kmp_invoke_pass_parms  // jump ahead if no parms to push
        shlq  $3, %rcx        // argc*8 -> %rcx
        movq  %r8, %rdx       // p_argv -> %rdx
        addq  %rcx, %rdx      // &p_argv[argc] -> %rdx

        movq  %rsi, %rcx      // max(0, argc-4) -> %rcx

L_kmp_invoke_push_parms:      // push nth - 7th parms to pkfn on stack
        subq  $8, %rdx        // decrement p_argv pointer to previous parm
        movq  (%rdx), %rsi    // p_argv[%rcx-1] -> %rsi
        pushq %rsi            // push p_argv[%rcx-1] onto stack (reverse order)
        subl  $1, %ecx

// C69570: "X86_64_RELOC_BRANCH not supported" error at linking on mac_32e
// if the name of the label that is an operand of this jecxz starts with a dot (".");
// Apple's linker does not support 1-byte length relocation;
// Resolution: replace all .labelX entries with L_labelX.

        jecxz L_kmp_invoke_pass_parms  // stop when four p_argv[] parms left
        jmp   L_kmp_invoke_push_parms

        ALIGN 3
L_kmp_invoke_pass_parms:      // put 1st - 6th parms to pkfn in registers.
                              // order here is important to avoid trashing
                              // registers used for both input and output parms!
        movq  %rdi, %rbx      // pkfn -> %rbx
        leaq  __gtid(%rbp), %rdi  // &gtid -> %rdi (store 1st parm to pkfn)
        leaq  __tid(%rbp), %rsi   // &tid -> %rsi (store 2nd parm to pkfn)

        movq  %r8, %r11       // p_argv -> %r11

#if __MIC__ || __MIC2__
        cmpq  $4, %rax        // argc >= 4?
        jns   L_kmp_4         // jump to movq
        jmp   L_kmp_4_exit    // jump ahead
L_kmp_4:
        movq  24(%r11), %r9   // p_argv[3] -> %r9 (store 6th parm to pkfn)
L_kmp_4_exit:

        cmpq  $3, %rax        // argc >= 3?
        jns   L_kmp_3         // jump to movq
        jmp   L_kmp_3_exit    // jump ahead
L_kmp_3:
        movq  16(%r11), %r8   // p_argv[2] -> %r8 (store 5th parm to pkfn)
L_kmp_3_exit:

        cmpq  $2, %rax        // argc >= 2?
        jns   L_kmp_2         // jump to movq
        jmp   L_kmp_2_exit    // jump ahead
L_kmp_2:
        movq  8(%r11), %rcx   // p_argv[1] -> %rcx (store 4th parm to pkfn)
L_kmp_2_exit:

        cmpq  $1, %rax        // argc >= 1?
        jns   L_kmp_1         // jump to movq
        jmp   L_kmp_1_exit    // jump ahead
L_kmp_1:
        movq  (%r11), %rdx    // p_argv[0] -> %rdx (store 3rd parm to pkfn)
L_kmp_1_exit:
#else
        cmpq  $4, %rax        // argc >= 4?
        cmovnsq 24(%r11), %r9 // p_argv[3] -> %r9 (store 6th parm to pkfn)

        cmpq  $3, %rax        // argc >= 3?
        cmovnsq 16(%r11), %r8 // p_argv[2] -> %r8 (store 5th parm to pkfn)

        cmpq  $2, %rax        // argc >= 2?
        cmovnsq 8(%r11), %rcx // p_argv[1] -> %rcx (store 4th parm to pkfn)

        cmpq  $1, %rax        // argc >= 1?
        cmovnsq (%r11), %rdx  // p_argv[0] -> %rdx (store 3rd parm to pkfn)
#endif // __MIC__ || __MIC2__

        call  *%rbx           // call (*pkfn)();
        movq  $1, %rax        // move 1 into return register;

        movq  -8(%rbp), %rbx  // restore %rbx using %rbp since %rsp was modified
        movq  %rbp, %rsp      // restore stack pointer
        popq  %rbp            // restore frame pointer
        ret

        DEBUG_INFO __kmp_invoke_microtask
// -- End __kmp_invoke_microtask

// kmp_uint64
// __kmp_hardware_timestamp(void)
        .text
        PROC  __kmp_hardware_timestamp
        rdtsc
        shlq  $32, %rdx
        orq   %rdx, %rax
        ret

        DEBUG_INFO __kmp_hardware_timestamp
// -- End __kmp_hardware_timestamp
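
// A hedged C equivalent (sketch; __rdtsc from <x86intrin.h> performs the same
// rdtsc + shift/or combine that the assembly does by hand):
//
//   #include <x86intrin.h>
//
//   kmp_uint64
//   __kmp_hardware_timestamp( void )
//   {
//       return __rdtsc();    // full 64-bit time-stamp counter
//   }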

//------------------------------------------------------------------------
//
// FUNCTION __kmp_bsr32
//
// int
// __kmp_bsr32( int );
//

        .text
        PROC  __kmp_bsr32

        bsr   %edi,%eax
        ret

        DEBUG_INFO __kmp_bsr32

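// A hedged C equivalent (sketch; bsr yields the bit index of the highest set
// bit, i.e. 31 minus the count of leading zeros, and like the bsr instruction
// it is undefined for an input of 0):
//
//   int
//   __kmp_bsr32( int v )
//   {
//       return 31 - __builtin_clz( (unsigned) v );
//   }
//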

// -----------------------------------------------------------------------
#endif /* KMP_ARCH_X86_64 */

#if KMP_ARCH_ARM
        .data
        .comm .gomp_critical_user_,32,8
        .data
        .align 4
        .global __kmp_unnamed_critical_addr
__kmp_unnamed_critical_addr:
        .4byte .gomp_critical_user_
        .size __kmp_unnamed_critical_addr,4
#endif /* KMP_ARCH_ARM */

#if KMP_ARCH_PPC64
        .data
        .comm .gomp_critical_user_,32,8
        .data
        .align 8
        .global __kmp_unnamed_critical_addr
__kmp_unnamed_critical_addr:
        .8byte .gomp_critical_user_
        .size __kmp_unnamed_critical_addr,8
#endif /* KMP_ARCH_PPC64 */

#if defined(__linux__)
.section .note.GNU-stack,"",@progbits
#endif