// z_Linux_asm.s:  - microtasking routines specifically
//                   written for Intel platforms running Linux* OS
// $Revision: 43473 $
// $Date: 2014-09-26 15:02:57 -0500 (Fri, 26 Sep 2014) $

//
//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//
//

// -----------------------------------------------------------------------
// macros
// -----------------------------------------------------------------------

#if KMP_ARCH_X86 || KMP_ARCH_X86_64

# if __MIC__ || __MIC2__
//
// On these targets, 'delay r16/r32/r64' should be used instead of 'pause'.
// The delay operation has the effect of removing the current thread from
// the round-robin HT mechanism, and therefore speeds up the issue rate of
// the other threads on the same core.
//
// A value of 0 works fine for <= 2 threads per core, but causes the EPCC
// barrier time to increase greatly for 3 or more threads per core.
//
// A value of 100 works pretty well for up to 4 threads per core, but isn't
// quite as fast as 0 for 2 threads per core.
//
// We need to check what happens for oversubscription / > 4 threads per core.
// It is possible that we need to pass the delay value in as a parameter
// that the caller determines based on the total # threads / # cores.
//
//.macro pause_op
//        mov    $100, %rax
//        delay  %rax
//.endm
# else
#  define pause_op   .byte 0xf3,0x90
# endif // __MIC__ || __MIC2__
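
// Note: the .byte sequence 0xf3,0x90 is the encoding of the 'pause'
// instruction (REP NOP), spelled out so that assemblers which do not know
// the mnemonic still accept it. A minimal C-level sketch of the same hint,
// assuming the SSE intrinsic header (illustration only, not part of this
// file's interface):
//
//   #include <xmmintrin.h>
//   static inline void cpu_pause( void )   // hypothetical helper name
//   {
//       _mm_pause();   // compiles to the same 0xf3,0x90 encoding
//   }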

# if defined __APPLE__ && defined __MACH__
# define KMP_PREFIX_UNDERSCORE(x) _##x  // extra underscore for OS X* symbols
.macro ALIGN
        .align $0
.endmacro
.macro DEBUG_INFO
/* Not sure what .size does in icc, not sure if we need to do something
   similar for OS X*.
*/
.endmacro
.macro PROC
        ALIGN 4
        .globl KMP_PREFIX_UNDERSCORE($0)
KMP_PREFIX_UNDERSCORE($0):
.endmacro
# else // defined __APPLE__ && defined __MACH__
# define KMP_PREFIX_UNDERSCORE(x) x  // no extra underscore for Linux* OS symbols
.macro ALIGN size
        .align 1<<(\size)
.endm
.macro DEBUG_INFO proc
// Not sure why we need .type and .size for the functions
        .align 16
        .type  \proc,@function
        .size  \proc,.-\proc
.endm
.macro PROC proc
        ALIGN 4
        .globl KMP_PREFIX_UNDERSCORE(\proc)
KMP_PREFIX_UNDERSCORE(\proc):
.endm
# endif // defined __APPLE__ && defined __MACH__
#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64


// -----------------------------------------------------------------------
// data
// -----------------------------------------------------------------------

#ifdef KMP_GOMP_COMPAT

//
// Support for unnamed common blocks.
//
// Because the symbol ".gomp_critical_user_" contains a ".", we have to
// put this stuff in assembly.
//

# if KMP_ARCH_X86
#  if defined __APPLE__ && defined __MACH__
        .data
        .comm .gomp_critical_user_,32
        .data
        .globl ___kmp_unnamed_critical_addr
___kmp_unnamed_critical_addr:
        .long .gomp_critical_user_
#  else /* Linux* OS */
        .data
        .comm .gomp_critical_user_,32,8
        .data
        ALIGN 4
        .global __kmp_unnamed_critical_addr
__kmp_unnamed_critical_addr:
        .4byte .gomp_critical_user_
        .type __kmp_unnamed_critical_addr,@object
        .size __kmp_unnamed_critical_addr,4
#  endif /* defined __APPLE__ && defined __MACH__ */
# endif /* KMP_ARCH_X86 */

# if KMP_ARCH_X86_64
#  if defined __APPLE__ && defined __MACH__
        .data
        .comm .gomp_critical_user_,32
        .data
        .globl ___kmp_unnamed_critical_addr
___kmp_unnamed_critical_addr:
        .quad .gomp_critical_user_
#  else /* Linux* OS */
        .data
        .comm .gomp_critical_user_,32,8
        .data
        ALIGN 8
        .global __kmp_unnamed_critical_addr
__kmp_unnamed_critical_addr:
        .8byte .gomp_critical_user_
        .type __kmp_unnamed_critical_addr,@object
        .size __kmp_unnamed_critical_addr,8
#  endif /* defined __APPLE__ && defined __MACH__ */
# endif /* KMP_ARCH_X86_64 */

#endif /* KMP_GOMP_COMPAT */


#if KMP_ARCH_X86 && !KMP_ARCH_PPC64

// -----------------------------------------------------------------------
// microtasking routines specifically written for IA-32 architecture
// running Linux* OS
// -----------------------------------------------------------------------
//

        .ident "Intel Corporation"
        .data
        ALIGN 4
// void
// __kmp_x86_pause( void );
//

        .text
        PROC  __kmp_x86_pause

        pause_op
        ret

        DEBUG_INFO __kmp_x86_pause

//
// void
// __kmp_x86_cpuid( int mode, int mode2, void *cpuid_buffer );
//
        PROC  __kmp_x86_cpuid

        pushl %ebp
        movl  %esp,%ebp
        pushl %edi
        pushl %ebx
        pushl %ecx
        pushl %edx

        movl  8(%ebp), %eax   // "mode"
        movl  12(%ebp), %ecx  // "mode2"
        cpuid                 // Query the CPUID for the current processor

        movl  16(%ebp), %edi  // "cpuid_buffer"
        movl  %eax, 0(%edi)
        movl  %ebx, 4(%edi)
        movl  %ecx, 8(%edi)
        movl  %edx, 12(%edi)

        popl  %edx
        popl  %ecx
        popl  %ebx
        popl  %edi
        movl  %ebp, %esp
        popl  %ebp
        ret

        DEBUG_INFO __kmp_x86_cpuid
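
// A hedged usage sketch from the C side (illustration only; the buffer is
// assumed to be four consecutive 32-bit fields receiving %eax, %ebx, %ecx,
// %edx in that order, matching the stores above):
//
//   struct cpuid_buf { kmp_uint32 eax, ebx, ecx, edx; };  // illustrative type
//   struct cpuid_buf buf;
//   __kmp_x86_cpuid( 1, 0, & buf );  // leaf 1: feature bits land in ecx/edx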


# if !KMP_ASM_INTRINS

//------------------------------------------------------------------------
//
// kmp_int32
// __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
//

        PROC  __kmp_test_then_add32

        movl  4(%esp), %ecx   // "p"
        movl  8(%esp), %eax   // "d"
        lock
        xaddl %eax,(%ecx)     // old value of *p is left in %eax
        ret

        DEBUG_INFO __kmp_test_then_add32
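
// The routine above is an atomic fetch-and-add: it returns the old value of
// *p and stores *p + d. A hedged C sketch of the same contract, using the
// GCC __sync builtin (an illustration, not the runtime's actual fallback
// path):
//
//   kmp_int32 test_then_add32( volatile kmp_int32 *p, kmp_int32 d )
//   {
//       return __sync_fetch_and_add( p, d );   // old value, like lock xadd
//   }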

//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_fixed8
//
// kmp_int32
// __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
//
// parameters:
//   p: 4(%esp)
//   d: 8(%esp)
//
// return: %al

        PROC  __kmp_xchg_fixed8

        movl  4(%esp), %ecx   // "p"
        movb  8(%esp), %al    // "d"

        lock
        xchgb %al,(%ecx)
        ret

        DEBUG_INFO __kmp_xchg_fixed8


//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_fixed16
//
// kmp_int16
// __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
//
// parameters:
//   p: 4(%esp)
//   d: 8(%esp)
// return: %ax

        PROC  __kmp_xchg_fixed16

        movl  4(%esp), %ecx   // "p"
        movw  8(%esp), %ax    // "d"

        lock
        xchgw %ax,(%ecx)
        ret

        DEBUG_INFO __kmp_xchg_fixed16


//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_fixed32
//
// kmp_int32
// __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
//
// parameters:
//   p: 4(%esp)
//   d: 8(%esp)
//
// return: %eax

        PROC  __kmp_xchg_fixed32

        movl  4(%esp), %ecx   // "p"
        movl  8(%esp), %eax   // "d"

        lock
        xchgl %eax,(%ecx)
        ret

        DEBUG_INFO __kmp_xchg_fixed32


//
// kmp_int8
// __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
//

        PROC  __kmp_compare_and_store8

        movl  4(%esp), %ecx   // "p"
        movb  8(%esp), %al    // "cv"
        movb  12(%esp), %dl   // "sv"
        lock
        cmpxchgb %dl,(%ecx)
        sete  %al             // if %al == (%ecx) set %al = 1 else set %al = 0
        and   $1, %eax        // clear the upper bits of %eax; sete only wrote %al
        ret

        DEBUG_INFO __kmp_compare_and_store8

//
// kmp_int16
// __kmp_compare_and_store16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
//

        PROC  __kmp_compare_and_store16

        movl  4(%esp), %ecx   // "p"
        movw  8(%esp), %ax    // "cv"
        movw  12(%esp), %dx   // "sv"
        lock
        cmpxchgw %dx,(%ecx)
        sete  %al             // if %ax == (%ecx) set %al = 1 else set %al = 0
        and   $1, %eax        // clear the upper bits of %eax; sete only wrote %al
        ret

        DEBUG_INFO __kmp_compare_and_store16

//
// kmp_int32
// __kmp_compare_and_store32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
//

        PROC  __kmp_compare_and_store32

        movl  4(%esp), %ecx   // "p"
        movl  8(%esp), %eax   // "cv"
        movl  12(%esp), %edx  // "sv"
        lock
        cmpxchgl %edx,(%ecx)
        sete  %al             // if %eax == (%ecx) set %al = 1 else set %al = 0
        and   $1, %eax        // clear the upper bits of %eax; sete only wrote %al
        ret

        DEBUG_INFO __kmp_compare_and_store32
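
// Each of the three routines above returns 1 if *p matched cv and was
// replaced by sv, and 0 otherwise (the 64-bit variant below follows the same
// contract). A hedged C sketch of that contract via a GCC builtin
// (illustration only):
//
//   kmp_int32 compare_and_store32( volatile kmp_int32 *p, kmp_int32 cv,
//                                  kmp_int32 sv )
//   {
//       return __sync_bool_compare_and_swap( p, cv, sv );  // lock cmpxchg + sete
//   }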

//
// kmp_int32
// __kmp_compare_and_store64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
//
        PROC  __kmp_compare_and_store64

        pushl %ebp
        movl  %esp, %ebp
        pushl %ebx
        pushl %edi
        movl  8(%ebp), %edi   // "p"
        movl  12(%ebp), %eax  // "cv" low order word
        movl  16(%ebp), %edx  // "cv" high order word
        movl  20(%ebp), %ebx  // "sv" low order word
        movl  24(%ebp), %ecx  // "sv" high order word
        lock
        cmpxchg8b (%edi)
        sete  %al             // if %edx:%eax == (%edi) set %al = 1 else set %al = 0
        and   $1, %eax        // clear the upper bits of %eax; sete only wrote %al
        popl  %edi
        popl  %ebx
        movl  %ebp, %esp
        popl  %ebp
        ret

        DEBUG_INFO __kmp_compare_and_store64

//
// kmp_int8
// __kmp_compare_and_store_ret8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
//

        PROC  __kmp_compare_and_store_ret8

        movl  4(%esp), %ecx
        movb  8(%esp), %al
        movb  12(%esp), %dl
        lock
        cmpxchgb %dl,(%ecx)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret8

//
// kmp_int16
// __kmp_compare_and_store_ret16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
//

        PROC  __kmp_compare_and_store_ret16

        movl  4(%esp), %ecx
        movw  8(%esp), %ax
        movw  12(%esp), %dx
        lock
        cmpxchgw %dx,(%ecx)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret16

//
// kmp_int32
// __kmp_compare_and_store_ret32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
//

        PROC  __kmp_compare_and_store_ret32

        movl  4(%esp), %ecx
        movl  8(%esp), %eax
        movl  12(%esp), %edx
        lock
        cmpxchgl %edx,(%ecx)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret32

//
// kmp_int64
// __kmp_compare_and_store_ret64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
//
        PROC  __kmp_compare_and_store_ret64

        pushl %ebp
        movl  %esp, %ebp
        pushl %ebx
        pushl %edi
        movl  8(%ebp), %edi
        movl  12(%ebp), %eax  // "cv" low order word
        movl  16(%ebp), %edx  // "cv" high order word
        movl  20(%ebp), %ebx  // "sv" low order word
        movl  24(%ebp), %ecx  // "sv" high order word
        lock
        cmpxchg8b (%edi)
        popl  %edi
        popl  %ebx
        movl  %ebp, %esp
        popl  %ebp
        ret

        DEBUG_INFO __kmp_compare_and_store_ret64
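
// Unlike the __kmp_compare_and_store* routines, the _ret variants above
// return the value that was in *p before the operation (cmpxchg leaves it in
// %al/%ax/%eax, or %edx:%eax for the 64-bit form), not a success flag. A
// hedged C sketch of that contract (illustration only):
//
//   kmp_int32 compare_and_store_ret32( volatile kmp_int32 *p, kmp_int32 cv,
//                                      kmp_int32 sv )
//   {
//       return __sync_val_compare_and_swap( p, cv, sv );   // old *p value
//   }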


//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_real32
//
// kmp_real32
// __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data );
//
// parameters:
//   addr: 4(%esp)
//   data: 8(%esp)
//
// return: old value of *addr, in %st(0)

        PROC  __kmp_xchg_real32

        pushl %ebp
        movl  %esp, %ebp
        subl  $4, %esp
        pushl %esi

        movl  8(%ebp), %esi   // "addr" (first arg is at 8(%ebp) once %ebp is pushed)
        movl  12(%ebp), %eax  // "data" as raw bits

        lock
        xchgl %eax, (%esi)    // atomically swap in the new bits; old bits -> %eax

        movl  %eax, -4(%ebp)  // spill the old bits so the x87 unit can load them
        flds  -4(%ebp)        // return the old value in %st(0)

        popl  %esi
        movl  %ebp, %esp
        popl  %ebp
        ret

        DEBUG_INFO __kmp_xchg_real32
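
// Design note: there is no atomic exchange on x87 registers, so the float
// travels through an integer register -- lock xchgl swaps the raw 32 bits,
// and the previous bits are spilled to a stack slot and reloaded with flds
// because IA-32 returns floating-point values in %st(0).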

# endif /* !KMP_ASM_INTRINS */


//------------------------------------------------------------------------
//
// FUNCTION __kmp_load_x87_fpu_control_word
//
// void
// __kmp_load_x87_fpu_control_word( kmp_int16 *p );
//
// parameters:
//   p: 4(%esp)
//

        PROC  __kmp_load_x87_fpu_control_word

        movl  4(%esp), %eax
        fldcw (%eax)
        ret

        DEBUG_INFO __kmp_load_x87_fpu_control_word


//------------------------------------------------------------------------
//
// FUNCTION __kmp_store_x87_fpu_control_word
//
// void
// __kmp_store_x87_fpu_control_word( kmp_int16 *p );
//
// parameters:
//   p: 4(%esp)
//

        PROC  __kmp_store_x87_fpu_control_word

        movl  4(%esp), %eax
        fstcw (%eax)
        ret

        DEBUG_INFO __kmp_store_x87_fpu_control_word


//------------------------------------------------------------------------
//
// FUNCTION __kmp_clear_x87_fpu_status_word
//
// void
// __kmp_clear_x87_fpu_status_word();
//
//

        PROC  __kmp_clear_x87_fpu_status_word

        fnclex
        ret

        DEBUG_INFO __kmp_clear_x87_fpu_status_word


//------------------------------------------------------------------------
//
// typedef void (*microtask_t)( int *gtid, int *tid, ... );
//
// int
// __kmp_invoke_microtask( microtask_t pkfn, int gtid, int tid,
//                         int argc, void *p_argv[] ) {
//     (*pkfn)( & gtid, & tid, argv[0], ... );
//     return 1;
// }

// -- Begin __kmp_invoke_microtask
// mark_begin;
        PROC  __kmp_invoke_microtask

        pushl %ebp
        movl  %esp,%ebp       // establish the base pointer for this routine.
        subl  $8,%esp         // allocate space for two local variables.
                              // These variables are:
                              //    argv: -4(%ebp)
                              //    temp: -8(%ebp)
                              //
        pushl %ebx            // save %ebx to use during this routine
                              //
        movl  20(%ebp),%ebx   // Stack alignment - # args
        addl  $2,%ebx         // #args + 2: always pass at least 2 args (gtid and tid)
        shll  $2,%ebx         // Number of bytes used on stack: (#args+2)*4
        movl  %esp,%eax       //
        subl  %ebx,%eax       // %esp - ((#args+2)*4) -> %eax -- without mods, stack ptr would be this
        movl  %eax,%ebx       // Save to %ebx
        andl  $0xFFFFFF80,%eax // mask off the lower 7 bits (128-byte alignment)
        subl  %eax,%ebx       // Amount to subtract from %esp
        subl  %ebx,%esp       // Prepare the stack ptr --
                              // now it will be aligned on a 128-byte boundary at the call
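
        // A worked example of the alignment arithmetic above (illustrative
        // values): suppose %esp = 0xbffff2d0 on entry and argc = 3. Then
        //   (#args+2)*4 = 20 = 0x14,  0xbffff2d0 - 0x14 = 0xbffff2bc
        //   0xbffff2bc & 0xffffff80                     = 0xbffff280
        //   adjustment = 0xbffff2bc - 0xbffff280        = 0x3c
        //   %esp       = 0xbffff2d0 - 0x3c              = 0xbffff294
        // After the 20 bytes of pushes below, %esp = 0xbffff280, which is
        // 128-byte aligned at the call to pkfn.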

        movl  24(%ebp),%eax   // copy from p_argv[]
        movl  %eax,-4(%ebp)   // into the local variable *argv.

        movl  20(%ebp),%ebx   // argc is 20(%ebp)
        shll  $2,%ebx

.invoke_2:
        cmpl  $0,%ebx
        jg    .invoke_4
        jmp   .invoke_3
        ALIGN 2
.invoke_4:
        movl  -4(%ebp),%eax
        subl  $4,%ebx         // decrement argc.
        addl  %ebx,%eax       // index into argv.
        movl  (%eax),%edx
        pushl %edx

        jmp   .invoke_2
        ALIGN 2
.invoke_3:
        leal  16(%ebp),%eax   // push & tid
        pushl %eax

        leal  12(%ebp),%eax   // push & gtid
        pushl %eax

        movl  8(%ebp),%ebx
        call  *%ebx           // call (*pkfn)();

        movl  $1,%eax         // return 1;

        movl  -12(%ebp),%ebx  // restore %ebx
        leave
        ret

        DEBUG_INFO __kmp_invoke_microtask
// -- End __kmp_invoke_microtask


// kmp_uint64
// __kmp_hardware_timestamp(void)
        PROC  __kmp_hardware_timestamp
        rdtsc
        ret

        DEBUG_INFO __kmp_hardware_timestamp
// -- End __kmp_hardware_timestamp
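
// Note: on IA-32, rdtsc already leaves the 64-bit timestamp in %edx:%eax,
// which is exactly where the ABI expects a 64-bit return value, so no extra
// shuffling is needed (contrast with the Intel(R) 64 version further below).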

// -----------------------------------------------------------------------
#endif /* KMP_ARCH_X86 && !KMP_ARCH_PPC64 */

#if KMP_ARCH_X86_64

// -----------------------------------------------------------------------
// microtasking routines specifically written for the Intel(R) 64
// architecture running Linux* OS
// -----------------------------------------------------------------------

// -- Machine type P
// mark_description "Intel Corporation";
        .ident "Intel Corporation"
// --   .file "z_Linux_asm.s"
        .data
        ALIGN 4

// To prevent our code from ending up in the .data section, .text is added
// before every routine definition for x86_64.
//------------------------------------------------------------------------
//
// FUNCTION __kmp_x86_cpuid
//
// void
// __kmp_x86_cpuid( int mode, int mode2, void *cpuid_buffer );
//
// parameters:
//   mode:         %edi
//   mode2:        %esi
//   cpuid_buffer: %rdx

        .text
        PROC  __kmp_x86_cpuid

        pushq %rbp
        movq  %rsp,%rbp
        pushq %rbx            // callee-save register

        movl  %esi, %ecx      // "mode2"
        movl  %edi, %eax      // "mode"
        movq  %rdx, %rsi      // cpuid_buffer
        cpuid                 // Query the CPUID for the current processor

        movl  %eax, 0(%rsi)   // store results into buffer
        movl  %ebx, 4(%rsi)
        movl  %ecx, 8(%rsi)
        movl  %edx, 12(%rsi)

        popq  %rbx            // callee-save register
        movq  %rbp, %rsp
        popq  %rbp
        ret

        DEBUG_INFO __kmp_x86_cpuid



# if !KMP_ASM_INTRINS

//------------------------------------------------------------------------
//
// FUNCTION __kmp_test_then_add32
//
// kmp_int32
// __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
//
// parameters:
//   p: %rdi
//   d: %esi
//
// return: %eax

        .text
        PROC  __kmp_test_then_add32

        movl  %esi, %eax      // "d"
        lock
        xaddl %eax,(%rdi)
        ret

        DEBUG_INFO __kmp_test_then_add32


//------------------------------------------------------------------------
//
// FUNCTION __kmp_test_then_add64
//
// kmp_int64
// __kmp_test_then_add64( volatile kmp_int64 *p, kmp_int64 d );
//
// parameters:
//   p: %rdi
//   d: %rsi
// return: %rax

        .text
        PROC  __kmp_test_then_add64

        movq  %rsi, %rax      // "d"
        lock
        xaddq %rax,(%rdi)
        ret

        DEBUG_INFO __kmp_test_then_add64


//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_fixed8
//
// kmp_int32
// __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
//
// parameters:
//   p: %rdi
//   d: %sil
//
// return: %al

        .text
        PROC  __kmp_xchg_fixed8

        movb  %sil, %al       // "d"

        lock
        xchgb %al,(%rdi)
        ret

        DEBUG_INFO __kmp_xchg_fixed8


//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_fixed16
//
// kmp_int16
// __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
//
// parameters:
//   p: %rdi
//   d: %si
// return: %ax

        .text
        PROC  __kmp_xchg_fixed16

        movw  %si, %ax        // "d"

        lock
        xchgw %ax,(%rdi)
        ret

        DEBUG_INFO __kmp_xchg_fixed16


//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_fixed32
//
// kmp_int32
// __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
//
// parameters:
//   p: %rdi
//   d: %esi
//
// return: %eax

        .text
        PROC  __kmp_xchg_fixed32

        movl  %esi, %eax      // "d"

        lock
        xchgl %eax,(%rdi)
        ret

        DEBUG_INFO __kmp_xchg_fixed32


//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_fixed64
//
// kmp_int64
// __kmp_xchg_fixed64( volatile kmp_int64 *p, kmp_int64 d );
//
// parameters:
//   p: %rdi
//   d: %rsi
// return: %rax

        .text
        PROC  __kmp_xchg_fixed64

        movq  %rsi, %rax      // "d"

        lock
        xchgq %rax,(%rdi)
        ret

        DEBUG_INFO __kmp_xchg_fixed64

//------------------------------------------------------------------------
//
// FUNCTION __kmp_compare_and_store8
//
// kmp_int8
// __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
//
// parameters:
//   p:  %rdi
//   cv: %esi
//   sv: %edx
//
// return: %eax

        .text
        PROC  __kmp_compare_and_store8

        movb  %sil, %al       // "cv"
        lock
        cmpxchgb %dl,(%rdi)
        sete  %al             // if %al == (%rdi) set %al = 1 else set %al = 0
        andq  $1, %rax        // clear the upper bits of %rax; sete only wrote %al
        ret

        DEBUG_INFO __kmp_compare_and_store8


//------------------------------------------------------------------------
//
// FUNCTION __kmp_compare_and_store16
//
// kmp_int16
// __kmp_compare_and_store16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
//
// parameters:
//   p:  %rdi
//   cv: %si
//   sv: %dx
//
// return: %eax

        .text
        PROC  __kmp_compare_and_store16

        movw  %si, %ax        // "cv"
        lock
        cmpxchgw %dx,(%rdi)
        sete  %al             // if %ax == (%rdi) set %al = 1 else set %al = 0
        andq  $1, %rax        // clear the upper bits of %rax; sete only wrote %al
        ret

        DEBUG_INFO __kmp_compare_and_store16


//------------------------------------------------------------------------
//
// FUNCTION __kmp_compare_and_store32
//
// kmp_int32
// __kmp_compare_and_store32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
//
// parameters:
//   p:  %rdi
//   cv: %esi
//   sv: %edx
//
// return: %eax

        .text
        PROC  __kmp_compare_and_store32

        movl  %esi, %eax      // "cv"
        lock
        cmpxchgl %edx,(%rdi)
        sete  %al             // if %eax == (%rdi) set %al = 1 else set %al = 0
        andq  $1, %rax        // clear the upper bits of %rax; sete only wrote %al
        ret

        DEBUG_INFO __kmp_compare_and_store32


//------------------------------------------------------------------------
//
// FUNCTION __kmp_compare_and_store64
//
// kmp_int32
// __kmp_compare_and_store64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
//
// parameters:
//   p:  %rdi
//   cv: %rsi
//   sv: %rdx
// return: %eax

        .text
        PROC  __kmp_compare_and_store64

        movq  %rsi, %rax      // "cv"
        lock
        cmpxchgq %rdx,(%rdi)
        sete  %al             // if %rax == (%rdi) set %al = 1 else set %al = 0
        andq  $1, %rax        // clear the upper bits of %rax; sete only wrote %al
        ret

        DEBUG_INFO __kmp_compare_and_store64

//------------------------------------------------------------------------
//
// FUNCTION __kmp_compare_and_store_ret8
//
// kmp_int8
// __kmp_compare_and_store_ret8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
//
// parameters:
//   p:  %rdi
//   cv: %esi
//   sv: %edx
//
// return: %eax

        .text
        PROC  __kmp_compare_and_store_ret8

        movb  %sil, %al       // "cv"
        lock
        cmpxchgb %dl,(%rdi)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret8


//------------------------------------------------------------------------
//
// FUNCTION __kmp_compare_and_store_ret16
//
// kmp_int16
// __kmp_compare_and_store_ret16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
//
// parameters:
//   p:  %rdi
//   cv: %si
//   sv: %dx
//
// return: %eax

        .text
        PROC  __kmp_compare_and_store_ret16

        movw  %si, %ax        // "cv"
        lock
        cmpxchgw %dx,(%rdi)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret16

//------------------------------------------------------------------------
//
// FUNCTION __kmp_compare_and_store_ret32
//
// kmp_int32
// __kmp_compare_and_store_ret32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
//
// parameters:
//   p:  %rdi
//   cv: %esi
//   sv: %edx
//
// return: %eax

        .text
        PROC  __kmp_compare_and_store_ret32

        movl  %esi, %eax      // "cv"
        lock
        cmpxchgl %edx,(%rdi)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret32


//------------------------------------------------------------------------
//
// FUNCTION __kmp_compare_and_store_ret64
//
// kmp_int64
// __kmp_compare_and_store_ret64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
//
// parameters:
//   p:  %rdi
//   cv: %rsi
//   sv: %rdx
// return: %rax

        .text
        PROC  __kmp_compare_and_store_ret64

        movq  %rsi, %rax      // "cv"
        lock
        cmpxchgq %rdx,(%rdi)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret64

# endif /* !KMP_ASM_INTRINS */


# if !(__MIC__ || __MIC2__)

#  if !KMP_ASM_INTRINS

//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_real32
//
// kmp_real32
// __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data );
//
// parameters:
//   addr: %rdi
//   data: %xmm0 (lower 4 bytes)
//
// return: %xmm0 (lower 4 bytes)

        .text
        PROC  __kmp_xchg_real32

        movd  %xmm0, %eax     // load "data" to eax

        lock
        xchgl %eax, (%rdi)

        movd  %eax, %xmm0     // load old value into return register

        ret

        DEBUG_INFO __kmp_xchg_real32


//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_real64
//
// kmp_real64
// __kmp_xchg_real64( volatile kmp_real64 *addr, kmp_real64 data );
//
// parameters:
//   addr: %rdi
//   data: %xmm0 (lower 8 bytes)
// return: %xmm0 (lower 8 bytes)
//

        .text
        PROC  __kmp_xchg_real64

        movd  %xmm0, %rax     // load "data" to rax

        lock
        xchgq %rax, (%rdi)

        movd  %rax, %xmm0     // load old value into return register
        ret

        DEBUG_INFO __kmp_xchg_real64


#  endif /* !KMP_ASM_INTRINS */

# endif /* !(__MIC__ || __MIC2__) */


//------------------------------------------------------------------------
//
// FUNCTION __kmp_load_x87_fpu_control_word
//
// void
// __kmp_load_x87_fpu_control_word( kmp_int16 *p );
//
// parameters:
//   p: %rdi
//

        .text
        PROC  __kmp_load_x87_fpu_control_word

        fldcw (%rdi)
        ret

        DEBUG_INFO __kmp_load_x87_fpu_control_word


//------------------------------------------------------------------------
//
// FUNCTION __kmp_store_x87_fpu_control_word
//
// void
// __kmp_store_x87_fpu_control_word( kmp_int16 *p );
//
// parameters:
//   p: %rdi
//

        .text
        PROC  __kmp_store_x87_fpu_control_word

        fstcw (%rdi)
        ret

        DEBUG_INFO __kmp_store_x87_fpu_control_word


//------------------------------------------------------------------------
//
// FUNCTION __kmp_clear_x87_fpu_status_word
//
// void
// __kmp_clear_x87_fpu_status_word();
//
//

        .text
        PROC  __kmp_clear_x87_fpu_status_word

#if __MIC__ || __MIC2__
// TODO: remove the workaround for problem with fnclex instruction (no CQ known)
        fstenv -32(%rsp)            // store FP env
        andw  $~0x80ff, 4-32(%rsp)  // clear 0-7,15 bits of FP SW
        fldenv -32(%rsp)            // load FP env back
        ret
#else
        fnclex
        ret
#endif

        DEBUG_INFO __kmp_clear_x87_fpu_status_word
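
// Note on the workaround above: fstenv stores the 28-byte protected-mode x87
// environment, whose status word sits at byte offset 4. The mask ~0x80ff
// clears bits 0-7 (exception flags, stack fault, and error summary) and
// bit 15 (FPU busy) -- the same bits fnclex would clear.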


//------------------------------------------------------------------------
//
// typedef void (*microtask_t)( int *gtid, int *tid, ... );
//
// int
// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
//                         int gtid, int tid,
//                         int argc, void *p_argv[] ) {
//     (*pkfn)( & gtid, & tid, argv[0], ... );
//     return 1;
// }
//
// note:
//   at the call to pkfn, %rsp must be 128-byte aligned for the compiler
//
// parameters:
//   %rdi: pkfn
//   %esi: gtid
//   %edx: tid
//   %ecx: argc
//   %r8:  p_argv
//
// locals:
//   __gtid: gtid parm pushed on stack so we can pass &gtid to pkfn
//   __tid:  tid parm pushed on stack so we can pass &tid to pkfn
//
// reg temps:
//   %rax: used all over the place
//   %rdx: used in stack pointer alignment calculation
//   %r11: used to traverse p_argv array
//   %rsi: used as temporary for stack parameters
//         used as temporary for number of pkfn parms to push
//   %rbx: used to hold pkfn address, and zero constant; callee-save
//
// return: %eax (always 1/TRUE)
//

__gtid = -16
__tid = -24

// -- Begin __kmp_invoke_microtask
// mark_begin;
        .text
        PROC  __kmp_invoke_microtask

        pushq %rbp            // save base pointer
        movq  %rsp,%rbp       // establish the base pointer for this routine.
        pushq %rbx            // %rbx is a callee-saved register

        pushq %rsi            // Put gtid on stack so we can pass &gtid to pkfn
        pushq %rdx            // Put tid on stack so we can pass &tid to pkfn

        movq  %rcx, %rax      // Stack alignment calculation begins; argc -> %rax
        movq  $0, %rbx        // constant for cmovs later
        subq  $4, %rax        // subtract four args passed in registers to pkfn
#if __MIC__ || __MIC2__
        js    L_kmp_0         // jump to movq
        jmp   L_kmp_0_exit    // jump ahead
L_kmp_0:
        movq  %rbx, %rax      // zero negative value in %rax <- max(0, argc-4)
L_kmp_0_exit:
#else
        cmovsq %rbx, %rax     // zero negative value in %rax <- max(0, argc-4)
#endif // __MIC__ || __MIC2__

        movq  %rax, %rsi      // save max(0, argc-4) -> %rsi for later
        shlq  $3, %rax        // Number of bytes used on stack: max(0, argc-4)*8

        movq  %rsp, %rdx      //
        subq  %rax, %rdx      // %rsp - (max(0,argc-4)*8) -> %rdx --
                              // without alignment, stack ptr would be this
        movq  %rdx, %rax      // Save to %rax

        andq  $0xFFFFFFFFFFFFFF80, %rax // mask off the lower 7 bits (128-byte alignment)
        subq  %rax, %rdx      // Amount to subtract from %rsp
        subq  %rdx, %rsp      // Prepare the stack ptr --
                              // now %rsp will align to a 128-byte boundary at the call site

        // set up pkfn parameter registers and stack
        movq  %rcx, %rax      // argc -> %rax
        cmpq  $0, %rsi
        je    L_kmp_invoke_pass_parms // jump ahead if no parms to push
        shlq  $3, %rcx        // argc*8 -> %rcx
        movq  %r8, %rdx       // p_argv -> %rdx
        addq  %rcx, %rdx      // &p_argv[argc] -> %rdx

        movq  %rsi, %rcx      // max(0, argc-4) -> %rcx

L_kmp_invoke_push_parms:      // push nth - 7th parms to pkfn on stack
        subq  $8, %rdx        // decrement p_argv pointer to previous parm
        movq  (%rdx), %rsi    // p_argv[%rcx-1] -> %rsi
        pushq %rsi            // push p_argv[%rcx-1] onto stack (reverse order)
        subl  $1, %ecx

// C69570: "X86_64_RELOC_BRANCH not supported" error at linking on mac_32e
// if the name of the label that is an operand of this jecxz starts with a dot (".");
// Apple's linker does not support 1-byte length relocation;
// Resolution: replace all .labelX entries with L_labelX.

        jecxz L_kmp_invoke_pass_parms // stop when four p_argv[] parms left
        jmp   L_kmp_invoke_push_parms

        ALIGN 3
L_kmp_invoke_pass_parms:      // put 1st - 6th parms to pkfn in registers.
                              // order here is important to avoid trashing
                              // registers used for both input and output parms!
        movq  %rdi, %rbx      // pkfn -> %rbx
        leaq  __gtid(%rbp), %rdi // &gtid -> %rdi (store 1st parm to pkfn)
        leaq  __tid(%rbp), %rsi  // &tid -> %rsi (store 2nd parm to pkfn)

        movq  %r8, %r11       // p_argv -> %r11

#if __MIC__ || __MIC2__
        cmpq  $4, %rax        // argc >= 4?
        jns   L_kmp_4         // jump to movq
        jmp   L_kmp_4_exit    // jump ahead
L_kmp_4:
        movq  24(%r11), %r9   // p_argv[3] -> %r9 (store 6th parm to pkfn)
L_kmp_4_exit:

        cmpq  $3, %rax        // argc >= 3?
        jns   L_kmp_3         // jump to movq
        jmp   L_kmp_3_exit    // jump ahead
L_kmp_3:
        movq  16(%r11), %r8   // p_argv[2] -> %r8 (store 5th parm to pkfn)
L_kmp_3_exit:

        cmpq  $2, %rax        // argc >= 2?
        jns   L_kmp_2         // jump to movq
        jmp   L_kmp_2_exit    // jump ahead
L_kmp_2:
        movq  8(%r11), %rcx   // p_argv[1] -> %rcx (store 4th parm to pkfn)
L_kmp_2_exit:

        cmpq  $1, %rax        // argc >= 1?
        jns   L_kmp_1         // jump to movq
        jmp   L_kmp_1_exit    // jump ahead
L_kmp_1:
        movq  (%r11), %rdx    // p_argv[0] -> %rdx (store 3rd parm to pkfn)
L_kmp_1_exit:
#else
        cmpq  $4, %rax        // argc >= 4?
        cmovnsq 24(%r11), %r9 // p_argv[3] -> %r9 (store 6th parm to pkfn)

        cmpq  $3, %rax        // argc >= 3?
        cmovnsq 16(%r11), %r8 // p_argv[2] -> %r8 (store 5th parm to pkfn)

        cmpq  $2, %rax        // argc >= 2?
        cmovnsq 8(%r11), %rcx // p_argv[1] -> %rcx (store 4th parm to pkfn)

        cmpq  $1, %rax        // argc >= 1?
        cmovnsq (%r11), %rdx  // p_argv[0] -> %rdx (store 3rd parm to pkfn)
#endif // __MIC__ || __MIC2__

        call  *%rbx           // call (*pkfn)();
        movq  $1, %rax        // move 1 into return register;

        movq  -8(%rbp), %rbx  // restore %rbx using %rbp since %rsp was modified
        movq  %rbp, %rsp      // restore stack pointer
        popq  %rbp            // restore frame pointer
        ret

        DEBUG_INFO __kmp_invoke_microtask
// -- End __kmp_invoke_microtask

// kmp_uint64
// __kmp_hardware_timestamp(void)
        .text
        PROC  __kmp_hardware_timestamp
        rdtsc                 // low 32 bits -> %eax, high 32 bits -> %edx
        shlq  $32, %rdx
        orq   %rdx, %rax      // merge into a single 64-bit result in %rax
        ret

        DEBUG_INFO __kmp_hardware_timestamp
// -- End __kmp_hardware_timestamp
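
// For reference, a hedged C-level equivalent of the routine above, using a
// compiler intrinsic (an illustration; the runtime exports the asm version):
//
//   #include <x86intrin.h>
//   kmp_uint64 hardware_timestamp( void )   // hypothetical name
//   {
//       return __rdtsc();   // rdtsc with %rdx:%rax merged into one value
//   }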

//------------------------------------------------------------------------
//
// FUNCTION __kmp_bsr32
//
// int
// __kmp_bsr32( int );
//

        .text
        PROC  __kmp_bsr32

        bsr   %edi,%eax       // bit index of the highest set bit of the argument
        ret

        DEBUG_INFO __kmp_bsr32
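
// Note: the result is undefined when the argument is 0 (bsr sets ZF and
// leaves the destination unchanged). A hedged C sketch of the same mapping
// via a GCC builtin (illustration only; assumes a nonzero argument):
//
//   int bsr32( int v )
//   {
//       return 31 - __builtin_clz( (unsigned) v );   // index of the MSB set
//   }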


// -----------------------------------------------------------------------
#endif /* KMP_ARCH_X86_64 */

#if KMP_ARCH_ARM
        .data
        .comm .gomp_critical_user_,32,8
        .data
        .align 4
        .global __kmp_unnamed_critical_addr
__kmp_unnamed_critical_addr:
        .4byte .gomp_critical_user_
        .size __kmp_unnamed_critical_addr,4
#endif /* KMP_ARCH_ARM */

#if KMP_ARCH_PPC64
        .data
        .comm .gomp_critical_user_,32,8
        .data
        .align 8
        .global __kmp_unnamed_critical_addr
__kmp_unnamed_critical_addr:
        .8byte .gomp_critical_user_
        .size __kmp_unnamed_critical_addr,8
#endif /* KMP_ARCH_PPC64 */

#if defined(__linux__)
# if KMP_ARCH_ARM
.section .note.GNU-stack,"",%progbits
# else
.section .note.GNU-stack,"",@progbits
# endif
#endif