// z_Linux_asm.s:  - microtasking routines specifically
//                   written for Intel platforms running Linux* OS
//

//
////===----------------------------------------------------------------------===//
////
////                     The LLVM Compiler Infrastructure
////
//// This file is dual licensed under the MIT and the University of Illinois Open
//// Source Licenses. See LICENSE.txt for details.
////
////===----------------------------------------------------------------------===//
//

// -----------------------------------------------------------------------
// macros
// -----------------------------------------------------------------------

#if KMP_ARCH_X86 || KMP_ARCH_X86_64

# if __MIC__ || __MIC2__
//
// On these architectures, 'delay r16/r32/r64' should be used instead of
// 'pause'. The delay operation removes the current thread from the
// round-robin HT mechanism, and therefore speeds up the issue rate of the
// other threads on the same core.
//
// A delay value of 0 works fine for <= 2 threads per core, but causes the
// EPCC barrier time to increase greatly for 3 or more threads per core.
//
// A value of 100 works pretty well for up to 4 threads per core, but isn't
// quite as fast as 0 for 2 threads per core.
//
// We still need to check what happens for oversubscription (> 4 threads
// per core). It is possible that we need to pass the delay value in as a
// parameter that the caller determines based on the total # threads /
// # cores.
//
//.macro pause_op
//        mov    $100, %rax
//        delay  %rax
//.endm
# else
# define pause_op   .byte 0xf3,0x90
# endif // __MIC__ || __MIC2__
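
//
// Note: 0xf3,0x90 is the raw byte encoding of the IA-32 'pause' instruction
// (rep; nop), spelled out so the macro assembles even with assemblers that
// predate the mnemonic. From C, the equivalent spin-wait hint would be the
// intrinsic shown below (an illustrative sketch, not part of this file;
// kmp_cpu_pause is a hypothetical helper name):
//
//   #include <immintrin.h>
//
//   static inline void kmp_cpu_pause(void) {
//       _mm_pause();  // emits f3 90; de-prioritizes the spinning hyperthread
//   }
//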

# if defined __APPLE__ && defined __MACH__
# define KMP_PREFIX_UNDERSCORE(x) _##x  // extra underscore for OS X* symbols
.macro ALIGN
        .align $0
.endmacro
.macro DEBUG_INFO
/* Not sure what .size does in icc; not sure whether we need to do
   something similar for OS X*.
*/
.endmacro
.macro PROC
        ALIGN 4
        .globl KMP_PREFIX_UNDERSCORE($0)
KMP_PREFIX_UNDERSCORE($0):
.endmacro
# else // defined __APPLE__ && defined __MACH__
# define KMP_PREFIX_UNDERSCORE(x) x  // no extra underscore for Linux* OS symbols
.macro ALIGN size
        .align 1<<(\size)
.endm
.macro DEBUG_INFO proc
// Not sure why we need .type and .size for the functions
        .align 16
        .type \proc,@function
        .size \proc,.-\proc
.endm
.macro PROC proc
        ALIGN 4
        .globl KMP_PREFIX_UNDERSCORE(\proc)
KMP_PREFIX_UNDERSCORE(\proc):
.endm
# endif // defined __APPLE__ && defined __MACH__
#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64


// -----------------------------------------------------------------------
// data
// -----------------------------------------------------------------------

#ifdef KMP_GOMP_COMPAT

//
// Support for unnamed common blocks.
//
// Because the symbol ".gomp_critical_user_" contains a ".", we have to
// put this stuff in assembly.
//
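//
// For illustration, the C side of the runtime refers to this storage roughly
// as below (a hedged sketch; the exact typedef lives in the runtime headers,
// where kmp_critical_name is a 32-byte lock placeholder):
//
//   typedef int kmp_critical_name[8];              // 8 * 4 = 32 bytes
//   extern kmp_critical_name *__kmp_unnamed_critical_addr;
//
// i.e. __kmp_unnamed_critical_addr is a pointer-sized object holding the
// address of the 32-byte .gomp_critical_user_ common block defined below.
//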

# if KMP_ARCH_X86
# if defined __APPLE__ && defined __MACH__
        .data
        .comm .gomp_critical_user_,32
        .data
        .globl ___kmp_unnamed_critical_addr
___kmp_unnamed_critical_addr:
        .long .gomp_critical_user_
# else /* Linux* OS */
        .data
        .comm .gomp_critical_user_,32,8
        .data
        ALIGN 4
        .global __kmp_unnamed_critical_addr
__kmp_unnamed_critical_addr:
        .4byte .gomp_critical_user_
        .type __kmp_unnamed_critical_addr,@object
        .size __kmp_unnamed_critical_addr,4
# endif /* defined __APPLE__ && defined __MACH__ */
# endif /* KMP_ARCH_X86 */

# if KMP_ARCH_X86_64
# if defined __APPLE__ && defined __MACH__
        .data
        .comm .gomp_critical_user_,32
        .data
        .globl ___kmp_unnamed_critical_addr
___kmp_unnamed_critical_addr:
        .quad .gomp_critical_user_
# else /* Linux* OS */
        .data
        .comm .gomp_critical_user_,32,8
        .data
        ALIGN 8
        .global __kmp_unnamed_critical_addr
__kmp_unnamed_critical_addr:
        .8byte .gomp_critical_user_
        .type __kmp_unnamed_critical_addr,@object
        .size __kmp_unnamed_critical_addr,8
# endif /* defined __APPLE__ && defined __MACH__ */
# endif /* KMP_ARCH_X86_64 */

#endif /* KMP_GOMP_COMPAT */


#if KMP_ARCH_X86 && !KMP_ARCH_PPC64

// -----------------------------------------------------------------------
// microtasking routines specifically written for IA-32 architecture
// running Linux* OS
// -----------------------------------------------------------------------
//

        .ident "Intel Corporation"
        .data
        ALIGN 4
// void
// __kmp_x86_pause( void );
//

        .text
        PROC  __kmp_x86_pause

        pause_op
        ret

        DEBUG_INFO __kmp_x86_pause

//
// void
// __kmp_x86_cpuid( int mode, int mode2, void *cpuid_buffer );
//
        PROC  __kmp_x86_cpuid

        pushl %ebp
        movl  %esp,%ebp
        pushl %edi
        pushl %ebx
        pushl %ecx
        pushl %edx

        movl  8(%ebp), %eax
        movl  12(%ebp), %ecx
        cpuid                  // Query the CPUID for the current processor

        movl  16(%ebp), %edi
        movl  %eax, 0(%edi)
        movl  %ebx, 4(%edi)
        movl  %ecx, 8(%edi)
        movl  %edx, 12(%edi)

        popl  %edx
        popl  %ecx
        popl  %ebx
        popl  %edi
        movl  %ebp, %esp
        popl  %ebp
        ret

        DEBUG_INFO __kmp_x86_cpuid
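
//
// For reference, a C caller would use this routine roughly as follows
// (a sketch; the struct mirrors the four dwords stored above, but the
// struct and field names here are illustrative):
//
//   struct kmp_cpuid_buf { unsigned eax, ebx, ecx, edx; };
//   extern void __kmp_x86_cpuid(int mode, int mode2, void *cpuid_buffer);
//
//   struct kmp_cpuid_buf buf;
//   __kmp_x86_cpuid(1, 0, &buf);  // leaf 1: feature flags in buf.ecx/buf.edx
//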


# if !KMP_ASM_INTRINS

//------------------------------------------------------------------------
//
// kmp_int32
// __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
//

        PROC  __kmp_test_then_add32

        movl  4(%esp), %ecx
        movl  8(%esp), %eax
        lock
        xaddl %eax,(%ecx)
        ret

        DEBUG_INFO __kmp_test_then_add32
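
//
// In C, this routine is equivalent to the GCC atomic builtin below; builds
// with KMP_ASM_INTRINS set use the compiler intrinsic instead, which is why
// this hand-written version is guarded by !KMP_ASM_INTRINS (sketch only):
//
//   kmp_int32 test_then_add32(volatile kmp_int32 *p, kmp_int32 d) {
//       return __sync_fetch_and_add(p, d);  // atomically *p += d, return old *p
//   }
//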

//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_fixed8
//
// kmp_int32
// __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
//
// parameters:
//    p: 4(%esp)
//    d: 8(%esp)
//
// return: %al

        PROC  __kmp_xchg_fixed8

        movl  4(%esp), %ecx    // "p"
        movb  8(%esp), %al     // "d"

        lock
        xchgb %al,(%ecx)
        ret

        DEBUG_INFO __kmp_xchg_fixed8


//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_fixed16
//
// kmp_int16
// __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
//
// parameters:
//    p: 4(%esp)
//    d: 8(%esp)
//
// return: %ax

        PROC  __kmp_xchg_fixed16

        movl  4(%esp), %ecx    // "p"
        movw  8(%esp), %ax     // "d"

        lock
        xchgw %ax,(%ecx)
        ret

        DEBUG_INFO __kmp_xchg_fixed16


//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_fixed32
//
// kmp_int32
// __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
//
// parameters:
//    p: 4(%esp)
//    d: 8(%esp)
//
// return: %eax

        PROC  __kmp_xchg_fixed32

        movl  4(%esp), %ecx    // "p"
        movl  8(%esp), %eax    // "d"

        lock
        xchgl %eax,(%ecx)
        ret

        DEBUG_INFO __kmp_xchg_fixed32


//
// kmp_int8
// __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
//

        PROC  __kmp_compare_and_store8

        movl  4(%esp), %ecx
        movb  8(%esp), %al
        movb  12(%esp), %dl
        lock
        cmpxchgb %dl,(%ecx)
        sete  %al              // if %al == (%ecx) set %al = 1 else set %al = 0
        and   $1, %eax         // zero-extend %al into the %eax return value
        ret

        DEBUG_INFO __kmp_compare_and_store8

//
// kmp_int16
// __kmp_compare_and_store16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
//

        PROC  __kmp_compare_and_store16

        movl  4(%esp), %ecx
        movw  8(%esp), %ax
        movw  12(%esp), %dx
        lock
        cmpxchgw %dx,(%ecx)
        sete  %al              // if %ax == (%ecx) set %al = 1 else set %al = 0
        and   $1, %eax         // zero-extend %al into the %eax return value
        ret

        DEBUG_INFO __kmp_compare_and_store16

//
// kmp_int32
// __kmp_compare_and_store32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
//

        PROC  __kmp_compare_and_store32

        movl  4(%esp), %ecx
        movl  8(%esp), %eax
        movl  12(%esp), %edx
        lock
        cmpxchgl %edx,(%ecx)
        sete  %al              // if %eax == (%ecx) set %al = 1 else set %al = 0
        and   $1, %eax         // zero-extend %al into the %eax return value
        ret

        DEBUG_INFO __kmp_compare_and_store32
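
//
// The three compare-and-store routines above return a boolean: 1 if the swap
// happened, 0 otherwise. A C sketch using the GCC builtin (illustration only):
//
//   kmp_int32 compare_and_store32(volatile kmp_int32 *p,
//                                 kmp_int32 cv, kmp_int32 sv) {
//       return __sync_bool_compare_and_swap(p, cv, sv);  // 1 on success
//   }
//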

//
// kmp_int32
// __kmp_compare_and_store64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
//
        PROC  __kmp_compare_and_store64

        pushl %ebp
        movl  %esp, %ebp
        pushl %ebx
        pushl %edi
        movl  8(%ebp), %edi
        movl  12(%ebp), %eax   // "cv" low order word
        movl  16(%ebp), %edx   // "cv" high order word
        movl  20(%ebp), %ebx   // "sv" low order word
        movl  24(%ebp), %ecx   // "sv" high order word
        lock
        cmpxchg8b (%edi)
        sete  %al              // if %edx:%eax == (%edi) set %al = 1 else set %al = 0
        and   $1, %eax         // zero-extend %al into the %eax return value
        popl  %edi
        popl  %ebx
        movl  %ebp, %esp
        popl  %ebp
        ret

        DEBUG_INFO __kmp_compare_and_store64

//
// kmp_int8
// __kmp_compare_and_store_ret8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
//

        PROC  __kmp_compare_and_store_ret8

        movl  4(%esp), %ecx
        movb  8(%esp), %al
        movb  12(%esp), %dl
        lock
        cmpxchgb %dl,(%ecx)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret8

//
// kmp_int16
// __kmp_compare_and_store_ret16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
//

        PROC  __kmp_compare_and_store_ret16

        movl  4(%esp), %ecx
        movw  8(%esp), %ax
        movw  12(%esp), %dx
        lock
        cmpxchgw %dx,(%ecx)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret16

//
// kmp_int32
// __kmp_compare_and_store_ret32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
//

        PROC  __kmp_compare_and_store_ret32

        movl  4(%esp), %ecx
        movl  8(%esp), %eax
        movl  12(%esp), %edx
        lock
        cmpxchgl %edx,(%ecx)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret32

//
// kmp_int64
// __kmp_compare_and_store_ret64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
//
        PROC  __kmp_compare_and_store_ret64

        pushl %ebp
        movl  %esp, %ebp
        pushl %ebx
        pushl %edi
        movl  8(%ebp), %edi
        movl  12(%ebp), %eax   // "cv" low order word
        movl  16(%ebp), %edx   // "cv" high order word
        movl  20(%ebp), %ebx   // "sv" low order word
        movl  24(%ebp), %ecx   // "sv" high order word
        lock
        cmpxchg8b (%edi)
        popl  %edi
        popl  %ebx
        movl  %ebp, %esp
        popl  %ebp
        ret

        DEBUG_INFO __kmp_compare_and_store_ret64
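
//
// Unlike the boolean variants above, the _ret routines return the value that
// was in *p before the operation (cmpxchg leaves it in the accumulator).
// A C sketch using the GCC builtin (illustration only):
//
//   kmp_int64 compare_and_store_ret64(volatile kmp_int64 *p,
//                                     kmp_int64 cv, kmp_int64 sv) {
//       return __sync_val_compare_and_swap(p, cv, sv);  // old value of *p
//   }
//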


//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_real32
//
// kmp_real32
// __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data );
//
// parameters:
//    addr: 4(%esp)
//    data: 8(%esp)
//
// return: old value of *addr, returned in %st(0)


        PROC  __kmp_xchg_real32

        pushl %ebp
        movl  %esp, %ebp
        subl  $4, %esp
        pushl %esi

        movl  8(%ebp), %esi    // "addr" (4(%esp) at entry; +4 for the saved %ebp)
        flds  (%esi)           // load <addr>
        fsts  -4(%ebp)         // store old value

        movl  12(%ebp), %eax   // "data"

        lock
        xchgl %eax, (%esi)

        flds  -4(%ebp)         // return old value

        popl  %esi
        movl  %ebp, %esp
        popl  %ebp
        ret

        DEBUG_INFO __kmp_xchg_real32

# endif /* !KMP_ASM_INTRINS */


//------------------------------------------------------------------------
//
// FUNCTION __kmp_load_x87_fpu_control_word
//
// void
// __kmp_load_x87_fpu_control_word( kmp_int16 *p );
//
// parameters:
//    p: 4(%esp)
//

        PROC  __kmp_load_x87_fpu_control_word

        movl  4(%esp), %eax
        fldcw (%eax)
        ret

        DEBUG_INFO __kmp_load_x87_fpu_control_word


//------------------------------------------------------------------------
//
// FUNCTION __kmp_store_x87_fpu_control_word
//
// void
// __kmp_store_x87_fpu_control_word( kmp_int16 *p );
//
// parameters:
//    p: 4(%esp)
//

        PROC  __kmp_store_x87_fpu_control_word

        movl  4(%esp), %eax
        fstcw (%eax)
        ret

        DEBUG_INFO __kmp_store_x87_fpu_control_word


//------------------------------------------------------------------------
//
// FUNCTION __kmp_clear_x87_fpu_status_word
//
// void
// __kmp_clear_x87_fpu_status_word();
//
//

        PROC  __kmp_clear_x87_fpu_status_word

        fnclex
        ret

        DEBUG_INFO __kmp_clear_x87_fpu_status_word


//------------------------------------------------------------------------
//
// typedef void (*microtask_t)( int *gtid, int *tid, ... );
//
// int
// __kmp_invoke_microtask( microtask_t pkfn, int gtid, int tid,
//                         int argc, void *p_argv[] ) {
//     (*pkfn)( & gtid, & tid, argv[0], ... );
//     return 1;
// }

// -- Begin __kmp_invoke_microtask
// mark_begin;
        PROC  __kmp_invoke_microtask

        pushl %ebp
        movl  %esp,%ebp        // establish the base pointer for this routine.
        subl  $8,%esp          // allocate space for two local variables.
                               // These variables are:
                               //    argv: -4(%ebp)
                               //    temp: -8(%ebp)
                               //
        pushl %ebx             // save %ebx to use during this routine
                               //
        movl  20(%ebp),%ebx    // Stack alignment - # args
        addl  $2,%ebx          // #args + 2: always pass at least 2 args (gtid and tid)
        shll  $2,%ebx          // Number of bytes used on stack: (#args+2)*4
        movl  %esp,%eax        //
        subl  %ebx,%eax        // %esp - ((#args+2)*4) -> %eax -- without mods, stack ptr would be this
        movl  %eax,%ebx        // Save to %ebx
        andl  $0xFFFFFF80,%eax // mask off the lower 7 bits
        subl  %eax,%ebx        // Amount to subtract from %esp
        subl  %ebx,%esp        // Prepare the stack ptr --
                               //   now it will be aligned on a 128-byte boundary at the call

        movl  24(%ebp),%eax    // copy from p_argv[]
        movl  %eax,-4(%ebp)    // into the local variable *argv.

        movl  20(%ebp),%ebx    // argc is 20(%ebp)
        shll  $2,%ebx

.invoke_2:
        cmpl  $0,%ebx
        jg    .invoke_4
        jmp   .invoke_3
        ALIGN 2
.invoke_4:
        movl  -4(%ebp),%eax
        subl  $4,%ebx          // decrement argc.
        addl  %ebx,%eax        // index into argv.
        movl  (%eax),%edx
        pushl %edx

        jmp   .invoke_2
        ALIGN 2
.invoke_3:
        leal  16(%ebp),%eax    // push & tid
        pushl %eax

        leal  12(%ebp),%eax    // push & gtid
        pushl %eax

        movl  8(%ebp),%ebx
        call  *%ebx            // call (*pkfn)();

        movl  $1,%eax          // return 1;

        movl  -12(%ebp),%ebx   // restore %ebx
        leave
        ret

        DEBUG_INFO __kmp_invoke_microtask
// -- End __kmp_invoke_microtask
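
//
// For clarity, the control flow above implements roughly the following C
// (an illustrative sketch only -- the real reason this lives in assembly is
// the 128-byte stack alignment required at the call and the variable
// argument count):
//
//   int invoke_microtask(microtask_t pkfn, int gtid, int tid,
//                        int argc, void *p_argv[]) {
//       // push p_argv[argc-1] .. p_argv[0], then &tid, then &gtid,
//       // then call pkfn; e.g. for argc == 2:
//       (*pkfn)(&gtid, &tid, p_argv[0], p_argv[1]);
//       return 1;
//   }
//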


// kmp_uint64
// __kmp_hardware_timestamp(void)
        PROC  __kmp_hardware_timestamp
        rdtsc                  // read the time-stamp counter into %edx:%eax,
        ret                    // which is exactly the IA-32 64-bit return convention

        DEBUG_INFO __kmp_hardware_timestamp
// -- End __kmp_hardware_timestamp

// -----------------------------------------------------------------------
#endif /* KMP_ARCH_X86 && !KMP_ARCH_PPC64 */


#if KMP_ARCH_X86_64

// -----------------------------------------------------------------------
// microtasking routines specifically written for IA-32 architecture and
// Intel(R) 64 running Linux* OS
// -----------------------------------------------------------------------

// -- Machine type P
// mark_description "Intel Corporation";
        .ident "Intel Corporation"
// --  .file "z_Linux_asm.s"
        .data
        ALIGN 4

// A .text directive is added before every routine definition for x86_64 to
// prevent the code from being placed in the .data section.
//------------------------------------------------------------------------
//
// FUNCTION __kmp_x86_cpuid
//
// void
// __kmp_x86_cpuid( int mode, int mode2, void *cpuid_buffer );
//
// parameters:
//    mode:         %edi
//    mode2:        %esi
//    cpuid_buffer: %rdx

        .text
        PROC  __kmp_x86_cpuid

        pushq %rbp
        movq  %rsp,%rbp
        pushq %rbx             // callee-save register

        movl  %esi, %ecx       // "mode2"
        movl  %edi, %eax       // "mode"
        movq  %rdx, %rsi       // cpuid_buffer
        cpuid                  // Query the CPUID for the current processor

        movl  %eax, 0(%rsi)    // store results into buffer
        movl  %ebx, 4(%rsi)
        movl  %ecx, 8(%rsi)
        movl  %edx, 12(%rsi)

        popq  %rbx             // callee-save register
        movq  %rbp, %rsp
        popq  %rbp
        ret

        DEBUG_INFO __kmp_x86_cpuid


# if !KMP_ASM_INTRINS

//------------------------------------------------------------------------
//
// FUNCTION __kmp_test_then_add32
//
// kmp_int32
// __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
//
// parameters:
//    p: %rdi
//    d: %esi
//
// return: %eax

        .text
        PROC  __kmp_test_then_add32

        movl  %esi, %eax       // "d"
        lock
        xaddl %eax,(%rdi)
        ret

        DEBUG_INFO __kmp_test_then_add32


//------------------------------------------------------------------------
//
// FUNCTION __kmp_test_then_add64
//
// kmp_int64
// __kmp_test_then_add64( volatile kmp_int64 *p, kmp_int64 d );
//
// parameters:
//    p: %rdi
//    d: %rsi
//
// return: %rax

        .text
        PROC  __kmp_test_then_add64

        movq  %rsi, %rax       // "d"
        lock
        xaddq %rax,(%rdi)
        ret

        DEBUG_INFO __kmp_test_then_add64


//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_fixed8
//
// kmp_int32
// __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
//
// parameters:
//    p: %rdi
//    d: %sil
//
// return: %al

        .text
        PROC  __kmp_xchg_fixed8

        movb  %sil, %al        // "d"

        lock
        xchgb %al,(%rdi)
        ret

        DEBUG_INFO __kmp_xchg_fixed8


//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_fixed16
//
// kmp_int16
// __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
//
// parameters:
//    p: %rdi
//    d: %si
//
// return: %ax

        .text
        PROC  __kmp_xchg_fixed16

        movw  %si, %ax         // "d"

        lock
        xchgw %ax,(%rdi)
        ret

        DEBUG_INFO __kmp_xchg_fixed16


//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_fixed32
//
// kmp_int32
// __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
//
// parameters:
//    p: %rdi
//    d: %esi
//
// return: %eax

        .text
        PROC  __kmp_xchg_fixed32

        movl  %esi, %eax       // "d"

        lock
        xchgl %eax,(%rdi)
        ret

        DEBUG_INFO __kmp_xchg_fixed32


//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_fixed64
//
// kmp_int64
// __kmp_xchg_fixed64( volatile kmp_int64 *p, kmp_int64 d );
//
// parameters:
//    p: %rdi
//    d: %rsi
//
// return: %rax

        .text
        PROC  __kmp_xchg_fixed64

        movq  %rsi, %rax       // "d"

        lock
        xchgq %rax,(%rdi)
        ret

        DEBUG_INFO __kmp_xchg_fixed64


//------------------------------------------------------------------------
//
// FUNCTION __kmp_compare_and_store8
//
// kmp_int8
// __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
//
// parameters:
//    p:  %rdi
//    cv: %esi
//    sv: %edx
//
// return: %eax

        .text
        PROC  __kmp_compare_and_store8

        movb  %sil, %al        // "cv"
        lock
        cmpxchgb %dl,(%rdi)
        sete  %al              // if %al == (%rdi) set %al = 1 else set %al = 0
        andq  $1, %rax         // zero-extend %al for the full-register return value
        ret

        DEBUG_INFO __kmp_compare_and_store8


//------------------------------------------------------------------------
//
// FUNCTION __kmp_compare_and_store16
//
// kmp_int16
// __kmp_compare_and_store16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
//
// parameters:
//    p:  %rdi
//    cv: %si
//    sv: %dx
//
// return: %eax

        .text
        PROC  __kmp_compare_and_store16

        movw  %si, %ax         // "cv"
        lock
        cmpxchgw %dx,(%rdi)
        sete  %al              // if %ax == (%rdi) set %al = 1 else set %al = 0
        andq  $1, %rax         // zero-extend %al for the full-register return value
        ret

        DEBUG_INFO __kmp_compare_and_store16


//------------------------------------------------------------------------
//
// FUNCTION __kmp_compare_and_store32
//
// kmp_int32
// __kmp_compare_and_store32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
//
// parameters:
//    p:  %rdi
//    cv: %esi
//    sv: %edx
//
// return: %eax

        .text
        PROC  __kmp_compare_and_store32

        movl  %esi, %eax       // "cv"
        lock
        cmpxchgl %edx,(%rdi)
        sete  %al              // if %eax == (%rdi) set %al = 1 else set %al = 0
        andq  $1, %rax         // zero-extend %al for the full-register return value
        ret

        DEBUG_INFO __kmp_compare_and_store32


//------------------------------------------------------------------------
//
// FUNCTION __kmp_compare_and_store64
//
// kmp_int32
// __kmp_compare_and_store64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
//
// parameters:
//    p:  %rdi
//    cv: %rsi
//    sv: %rdx
//
// return: %eax

        .text
        PROC  __kmp_compare_and_store64

        movq  %rsi, %rax       // "cv"
        lock
        cmpxchgq %rdx,(%rdi)
        sete  %al              // if %rax == (%rdi) set %al = 1 else set %al = 0
        andq  $1, %rax         // zero-extend %al for the full-register return value
        ret

        DEBUG_INFO __kmp_compare_and_store64

//------------------------------------------------------------------------
//
// FUNCTION __kmp_compare_and_store_ret8
//
// kmp_int8
// __kmp_compare_and_store_ret8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
//
// parameters:
//    p:  %rdi
//    cv: %esi
//    sv: %edx
//
// return: %eax

        .text
        PROC  __kmp_compare_and_store_ret8

        movb  %sil, %al        // "cv"
        lock
        cmpxchgb %dl,(%rdi)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret8


//------------------------------------------------------------------------
//
// FUNCTION __kmp_compare_and_store_ret16
//
// kmp_int16
// __kmp_compare_and_store_ret16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
//
// parameters:
//    p:  %rdi
//    cv: %si
//    sv: %dx
//
// return: %eax

        .text
        PROC  __kmp_compare_and_store_ret16

        movw  %si, %ax         // "cv"
        lock
        cmpxchgw %dx,(%rdi)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret16


//------------------------------------------------------------------------
//
// FUNCTION __kmp_compare_and_store_ret32
//
// kmp_int32
// __kmp_compare_and_store_ret32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
//
// parameters:
//    p:  %rdi
//    cv: %esi
//    sv: %edx
//
// return: %eax

        .text
        PROC  __kmp_compare_and_store_ret32

        movl  %esi, %eax       // "cv"
        lock
        cmpxchgl %edx,(%rdi)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret32


//------------------------------------------------------------------------
//
// FUNCTION __kmp_compare_and_store_ret64
//
// kmp_int64
// __kmp_compare_and_store_ret64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
//
// parameters:
//    p:  %rdi
//    cv: %rsi
//    sv: %rdx
//
// return: %rax

        .text
        PROC  __kmp_compare_and_store_ret64

        movq  %rsi, %rax       // "cv"
        lock
        cmpxchgq %rdx,(%rdi)
        ret

        DEBUG_INFO __kmp_compare_and_store_ret64

# endif /* !KMP_ASM_INTRINS */



# if !(__MIC__ || __MIC2__)

# if !KMP_ASM_INTRINS

//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_real32
//
// kmp_real32
// __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data );
//
// parameters:
//    addr: %rdi
//    data: %xmm0 (lower 4 bytes)
//
// return: %xmm0 (lower 4 bytes)

        .text
        PROC  __kmp_xchg_real32

        movd  %xmm0, %eax      // load "data" to eax

        lock
        xchgl %eax, (%rdi)

        movd  %eax, %xmm0      // load old value into return register

        ret

        DEBUG_INFO __kmp_xchg_real32


//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_real64
//
// kmp_real64
// __kmp_xchg_real64( volatile kmp_real64 *addr, kmp_real64 data );
//
// parameters:
//    addr: %rdi
//    data: %xmm0 (lower 8 bytes)
//
// return: %xmm0 (lower 8 bytes)
//

        .text
        PROC  __kmp_xchg_real64

        movd  %xmm0, %rax      // load "data" to rax

        lock
        xchgq %rax, (%rdi)

        movd  %rax, %xmm0      // load old value into return register
        ret

        DEBUG_INFO __kmp_xchg_real64


# endif /* !KMP_ASM_INTRINS */

# endif /* !(__MIC__ || __MIC2__) */


//------------------------------------------------------------------------
//
// FUNCTION __kmp_load_x87_fpu_control_word
//
// void
// __kmp_load_x87_fpu_control_word( kmp_int16 *p );
//
// parameters:
//    p: %rdi
//

        .text
        PROC  __kmp_load_x87_fpu_control_word

        fldcw (%rdi)
        ret

        DEBUG_INFO __kmp_load_x87_fpu_control_word


//------------------------------------------------------------------------
//
// FUNCTION __kmp_store_x87_fpu_control_word
//
// void
// __kmp_store_x87_fpu_control_word( kmp_int16 *p );
//
// parameters:
//    p: %rdi
//

        .text
        PROC  __kmp_store_x87_fpu_control_word

        fstcw (%rdi)
        ret

        DEBUG_INFO __kmp_store_x87_fpu_control_word


//------------------------------------------------------------------------
//
// FUNCTION __kmp_clear_x87_fpu_status_word
//
// void
// __kmp_clear_x87_fpu_status_word();
//
//

        .text
        PROC  __kmp_clear_x87_fpu_status_word

#if __MIC__ || __MIC2__
// TODO: remove the workaround for the problem with the fnclex instruction (no CQ known)
        fstenv -32(%rsp)           // store FP env
        andw  $~0x80ff, 4-32(%rsp) // clear bits 0-7 and 15 of the FP SW
        fldenv -32(%rsp)           // load FP env back
        ret
#else
        fnclex
        ret
#endif

        DEBUG_INFO __kmp_clear_x87_fpu_status_word


//------------------------------------------------------------------------
//
// typedef void (*microtask_t)( int *gtid, int *tid, ... );
//
// int
// __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
//                         int gtid, int tid,
//                         int argc, void *p_argv[] ) {
//     (*pkfn)( & gtid, & tid, argv[0], ... );
//     return 1;
// }
//
// note:
//     at the call to pkfn, %rsp must be 128-byte aligned for the compiler
//
// parameters:
//     %rdi: pkfn
//     %esi: gtid
//     %edx: tid
//     %ecx: argc
//     %r8:  p_argv
//
// locals:
//     __gtid: gtid parm pushed on stack so we can pass &gtid to pkfn
//     __tid:  tid parm pushed on stack so we can pass &tid to pkfn
//
// reg temps:
//     %rax: used all over the place
//     %rdx: used in stack pointer alignment calculation
//     %r11: used to traverse p_argv array
//     %rsi: used as temporary for stack parameters,
//           and as temporary for the number of pkfn parms to push
//     %rbx: used to hold pkfn address, and zero constant; callee-save
//
// return: %eax (always 1/TRUE)
//

__gtid = -16
__tid = -24

// -- Begin __kmp_invoke_microtask
// mark_begin;
        .text
        PROC  __kmp_invoke_microtask

        pushq %rbp             // save base pointer
        movq  %rsp,%rbp        // establish the base pointer for this routine.
        pushq %rbx             // %rbx is a callee-saved register

        pushq %rsi             // Put gtid on stack so we can pass &gtid to pkfn
        pushq %rdx             // Put tid on stack so we can pass &tid to pkfn

        movq  %rcx, %rax       // Stack alignment calculation begins; argc -> %rax
        movq  $0, %rbx         // constant for cmovs later
        subq  $4, %rax         // subtract four args passed in registers to pkfn
#if __MIC__ || __MIC2__
        js    L_kmp_0          // jump to movq
        jmp   L_kmp_0_exit     // jump ahead
L_kmp_0:
        movq  %rbx, %rax       // zero negative value in %rax <- max(0, argc-4)
L_kmp_0_exit:
#else
        cmovsq %rbx, %rax      // zero negative value in %rax <- max(0, argc-4)
#endif // __MIC__ || __MIC2__

        movq  %rax, %rsi       // save max(0, argc-4) -> %rsi for later
        shlq  $3, %rax         // Number of bytes used on stack: max(0, argc-4)*8

        movq  %rsp, %rdx       //
        subq  %rax, %rdx       // %rsp - (max(0,argc-4)*8) -> %rdx --
                               //   without alignment, stack ptr would be this
        movq  %rdx, %rax       // Save to %rax

        andq  $0xFFFFFFFFFFFFFF80, %rax  // mask off the lower 7 bits (128-byte alignment)
        subq  %rax, %rdx       // Amount to subtract from %rsp
        subq  %rdx, %rsp       // Prepare the stack ptr --
                               //   now %rsp will align to a 128-byte boundary at the call site

        // setup pkfn parameter reg and stack
        movq  %rcx, %rax       // argc -> %rax
        cmpq  $0, %rsi
        je    L_kmp_invoke_pass_parms  // jump ahead if no parms to push
        shlq  $3, %rcx         // argc*8 -> %rcx
        movq  %r8, %rdx        // p_argv -> %rdx
        addq  %rcx, %rdx       // &p_argv[argc] -> %rdx

        movq  %rsi, %rcx       // max(0, argc-4) -> %rcx

L_kmp_invoke_push_parms:       // push nth - 7th parms to pkfn on stack
        subq  $8, %rdx         // decrement p_argv pointer to previous parm
        movq  (%rdx), %rsi     // p_argv[%rcx-1] -> %rsi
        pushq %rsi             // push p_argv[%rcx-1] onto stack (reverse order)
        subl  $1, %ecx

// C69570: "X86_64_RELOC_BRANCH not supported" error at linking on mac_32e
// if the name of the label that is an operand of this jecxz starts with a dot (".");
// Apple's linker does not support 1-byte length relocation;
// Resolution: replace all .labelX entries with L_labelX.

        jecxz L_kmp_invoke_pass_parms  // stop when only four p_argv[] parms are left
        jmp   L_kmp_invoke_push_parms

        ALIGN 3
L_kmp_invoke_pass_parms:       // put 1st - 6th parms to pkfn in registers.
                               // order here is important to avoid trashing
                               // registers used for both input and output parms!
        movq  %rdi, %rbx       // pkfn -> %rbx
        leaq  __gtid(%rbp), %rdi  // &gtid -> %rdi (store 1st parm to pkfn)
        leaq  __tid(%rbp), %rsi   // &tid -> %rsi (store 2nd parm to pkfn)

        movq  %r8, %r11        // p_argv -> %r11

#if __MIC__ || __MIC2__
        cmpq  $4, %rax         // argc >= 4?
        jns   L_kmp_4          // jump to movq
        jmp   L_kmp_4_exit     // jump ahead
L_kmp_4:
        movq  24(%r11), %r9    // p_argv[3] -> %r9 (store 6th parm to pkfn)
L_kmp_4_exit:

        cmpq  $3, %rax         // argc >= 3?
        jns   L_kmp_3          // jump to movq
        jmp   L_kmp_3_exit     // jump ahead
L_kmp_3:
        movq  16(%r11), %r8    // p_argv[2] -> %r8 (store 5th parm to pkfn)
L_kmp_3_exit:

        cmpq  $2, %rax         // argc >= 2?
        jns   L_kmp_2          // jump to movq
        jmp   L_kmp_2_exit     // jump ahead
L_kmp_2:
        movq  8(%r11), %rcx    // p_argv[1] -> %rcx (store 4th parm to pkfn)
L_kmp_2_exit:

        cmpq  $1, %rax         // argc >= 1?
        jns   L_kmp_1          // jump to movq
        jmp   L_kmp_1_exit     // jump ahead
L_kmp_1:
        movq  (%r11), %rdx     // p_argv[0] -> %rdx (store 3rd parm to pkfn)
L_kmp_1_exit:
#else
        cmpq  $4, %rax         // argc >= 4?
        cmovnsq 24(%r11), %r9  // p_argv[3] -> %r9 (store 6th parm to pkfn)

        cmpq  $3, %rax         // argc >= 3?
        cmovnsq 16(%r11), %r8  // p_argv[2] -> %r8 (store 5th parm to pkfn)

        cmpq  $2, %rax         // argc >= 2?
        cmovnsq 8(%r11), %rcx  // p_argv[1] -> %rcx (store 4th parm to pkfn)

        cmpq  $1, %rax         // argc >= 1?
        cmovnsq (%r11), %rdx   // p_argv[0] -> %rdx (store 3rd parm to pkfn)
#endif // __MIC__ || __MIC2__

        call  *%rbx            // call (*pkfn)();
        movq  $1, %rax         // move 1 into return register;

        movq  -8(%rbp), %rbx   // restore %rbx using %rbp since %rsp was modified
        movq  %rbp, %rsp       // restore stack pointer
        popq  %rbp             // restore frame pointer
        ret

        DEBUG_INFO __kmp_invoke_microtask
// -- End __kmp_invoke_microtask

// kmp_uint64
// __kmp_hardware_timestamp(void)
        .text
        PROC  __kmp_hardware_timestamp
        rdtsc                  // time-stamp counter -> %edx:%eax
        shlq  $32, %rdx        // shift the high 32 bits into place
        orq   %rdx, %rax       // combine into a single 64-bit result in %rax
        ret

        DEBUG_INFO __kmp_hardware_timestamp
// -- End __kmp_hardware_timestamp
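
//
// The rdtsc sequence above assembles the 64-bit timestamp from the %edx:%eax
// halves. The same operation in C with inline assembly (a sketch for
// illustration, not part of this file):
//
//   unsigned long long hardware_timestamp(void) {
//       unsigned int lo, hi;
//       __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
//       return ((unsigned long long)hi << 32) | lo;  // same as shlq/orq above
//   }
//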

//------------------------------------------------------------------------
//
// FUNCTION __kmp_bsr32
//
// int
// __kmp_bsr32( int );
//

        .text
        PROC  __kmp_bsr32

        bsr   %edi,%eax        // bit-scan-reverse: index of the highest set bit
        ret

        DEBUG_INFO __kmp_bsr32
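
//
// bsr stores the bit index of the most significant set bit of %edi, so for a
// non-zero argument this routine is equivalent to the C below (bsr, like
// __builtin_clz, is undefined for an input of 0):
//
//   int bsr32(int v) {
//       return 31 - __builtin_clz((unsigned)v);  // index of highest set bit
//   }
//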


// -----------------------------------------------------------------------
#endif /* KMP_ARCH_X86_64 */

#if KMP_ARCH_ARM
        .data
        .comm .gomp_critical_user_,32,8
        .data
        .align 4
        .global __kmp_unnamed_critical_addr
__kmp_unnamed_critical_addr:
        .4byte .gomp_critical_user_
        .size __kmp_unnamed_critical_addr,4
#endif /* KMP_ARCH_ARM */

#if KMP_ARCH_PPC64 || KMP_ARCH_AARCH64
        .data
        .comm .gomp_critical_user_,32,8
        .data
        .align 8
        .global __kmp_unnamed_critical_addr
__kmp_unnamed_critical_addr:
        .8byte .gomp_critical_user_
        .size __kmp_unnamed_critical_addr,8
#endif /* KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 */

#if defined(__linux__)
# if KMP_ARCH_ARM
.section .note.GNU-stack,"",%progbits
# else
.section .note.GNU-stack,"",@progbits
# endif
#endif