blob: 2dcb3808cbdab6c91b9fbbf0d58790466780bb55 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/* Copyright 2002 Andi Kleen, SuSE Labs */
Jan Beulich8d379da2006-09-26 10:52:32 +02002
Jan Beulich8d379da2006-09-26 10:52:32 +02003#include <linux/linkage.h>
4#include <asm/dwarf2.h>
Fenghua Yu2f19e062011-05-17 15:29:18 -07005#include <asm/cpufeature.h>
6#include <asm/alternative-asm.h>
Jan Beulich8d379da2006-09-26 10:52:32 +02007
/*
 * ISO C memset - set a memory block to a byte value. This function uses fast
 * string to get better performance than the original function. The code is
 * simpler and shorter than the original function as well.
 *
 * rdi	destination
 * rsi	value (char)
 * rdx	count (bytes)
 *
 * rax	original destination
 *
 * NOTE(review): this body is copied over the start of memset() at boot by the
 * alternatives mechanism when the CPU has X86_FEATURE_REP_GOOD (see the
 * .altinstructions entries at the bottom of the file), so the replacement
 * length .Lmemset_e-.Lmemset_c must not grow — do not add instructions here.
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemset_c:
	movq %rdi,%r9			/* save dest; it is the return value */
	movq %rdx,%rcx
	andl $7,%edx			/* edx = count % 8 (tail bytes) */
	shrq $3,%rcx			/* rcx = count / 8 (qword stores) */
	/* expand byte value into all 8 bytes of %rax */
	movzbl %sil,%esi
	movabs $0x0101010101010101,%rax
	imulq %rsi,%rax			/* 0x...01 * byte broadcasts the byte */
	rep stosq			/* bulk fill, 8 bytes at a time */
	movl %edx,%ecx
	rep stosb			/* remaining 0..7 bytes */
	movq %r9,%rax			/* return original destination */
	ret
.Lmemset_e:
	.previous
/*
 * ISO C memset - set a memory block to a byte value. This function uses
 * enhanced rep stosb to override the fast string function.
 * The code is simpler and shorter than the fast string function as well.
 *
 * rdi	destination
 * rsi	value (char)
 * rdx	count (bytes)
 *
 * rax	original destination
 *
 * NOTE(review): patched in over memset() when the CPU advertises
 * X86_FEATURE_ERMS (Enhanced REP MOVSB/STOSB); with ERMS a plain
 * "rep stosb" is fast for any count, so no qword splitting is needed.
 * Replacement length .Lmemset_e_e-.Lmemset_c_e must not grow.
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemset_c_e:
	movq %rdi,%r9			/* save dest; it is the return value */
	movb %sil,%al			/* rep stosb stores %al */
	movq %rdx,%rcx			/* full byte count */
	rep stosb
	movq %r9,%rax			/* return original destination */
	ret
.Lmemset_e_e:
	.previous
58
/*
 * Generic memset for CPUs without REP_GOOD/ERMS (may be live-patched
 * by one of the .altinstr_replacement bodies above).
 *
 * rdi	destination
 * rsi	value (char)
 * rdx	count (bytes)
 *
 * rax	original destination
 */
ENTRY(memset)
ENTRY(__memset)
	CFI_STARTPROC
	movq %rdi,%r10			/* save dest; returned in %rax at .Lende */

	/* expand byte value into all 8 bytes of %rax */
	movzbl %sil,%ecx
	movabs $0x0101010101010101,%rax
	imulq %rcx,%rax			/* 0x...01 * byte broadcasts the byte */

	/* align dst */
	movl %edi,%r9d
	andl $7,%r9d			/* r9 = dest & 7 (misalignment) */
	jnz .Lbad_alignment
	CFI_REMEMBER_STATE
.Lafter_bad_alignment:

	movq %rdx,%rcx
	shrq $6,%rcx			/* rcx = count / 64 */
	jz .Lhandle_tail

	.p2align 4
.Lloop_64:				/* unrolled: 64 bytes per iteration */
	decq %rcx
	movq %rax,(%rdi)
	movq %rax,8(%rdi)
	movq %rax,16(%rdi)
	movq %rax,24(%rdi)
	movq %rax,32(%rdi)
	movq %rax,40(%rdi)
	movq %rax,48(%rdi)
	movq %rax,56(%rdi)
	leaq 64(%rdi),%rdi		/* lea: advances dest without touching ZF from decq */
	jnz .Lloop_64

	/* Handle tail in loops. The loops should be faster than hard
	   to predict jump tables. */
	.p2align 4
.Lhandle_tail:
	movl %edx,%ecx
	andl $63&(~7),%ecx		/* whole qwords left below 64: (count % 64) & ~7 */
	jz .Lhandle_7
	shrl $3,%ecx			/* qword count */
	.p2align 4
.Lloop_8:				/* one qword per iteration */
	decl %ecx
	movq %rax,(%rdi)
	leaq 8(%rdi),%rdi
	jnz .Lloop_8

.Lhandle_7:
	andl $7,%edx			/* final 0..7 single bytes */
	jz .Lende
	.p2align 4
.Lloop_1:				/* one byte per iteration */
	decl %edx
	movb %al,(%rdi)
	leaq 1(%rdi),%rdi
	jnz .Lloop_1

.Lende:
	movq %r10,%rax			/* ISO C: return original destination */
	ret

	CFI_RESTORE_STATE
.Lbad_alignment:
	cmpq $7,%rdx
	jbe .Lhandle_7			/* tiny fill: byte loop handles it (edx = count here, count <= 7) */
	movq %rax,(%rdi)		/* unaligned store covering the gap up to alignment */
	movq $8,%r8
	subq %r9,%r8			/* r8 = 8 - (dest & 7) = bytes to reach alignment */
	addq %r8,%rdi
	subq %r8,%rdx			/* those bytes were just written by the unaligned store */
	jmp .Lafter_bad_alignment
.Lfinal:
	CFI_ENDPROC
ENDPROC(memset)
ENDPROC(__memset)
Andi Kleen7bcd3f32006-02-03 21:51:02 +0100137
	/* Some CPUs support enhanced REP MOVSB/STOSB feature.
	 * It is recommended to use this when possible.
	 *
	 * If enhanced REP MOVSB/STOSB feature is not available, use fast string
	 * instructions.
	 *
	 * Otherwise, use original memset function.
	 *
	 * In .altinstructions section, ERMS feature is placed after REP_GOOD
	 * feature to implement the right patch order: later entries win, so an
	 * ERMS-capable CPU (which also sets REP_GOOD) ends up with the
	 * .Lmemset_c_e body.
	 */
	.section .altinstructions,"a"
	/* args: original start, replacement start, feature bit,
	   original length (.Lfinal-memset), replacement length */
	altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
			     .Lfinal-memset,.Lmemset_e-.Lmemset_c
	altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
			     .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e
	.previous