/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "cache.h"

/* Name of the exported routine.  A build may predefine MEMSET to emit
   the same code under a different symbol name.  */
#ifndef MEMSET
# define MEMSET			android_memset32
#endif

/* L(name) expands to .Lname; the .L prefix keeps the label local to
   the assembler (GAS convention for non-exported labels).  */
#ifndef L
# define L(label)		.L##label
#endif

/* ALIGN(n) pads to a 2^n-byte boundary.  */
#ifndef ALIGN
# define ALIGN(n)		.p2align n
#endif

/* Call-frame-information directives, overridable by the includer.  */
#ifndef cfi_startproc
# define cfi_startproc		.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc		.cfi_endproc
#endif

/* ENTRY(name): open a global function symbol.  Marks NAME as a global
   symbol of type function, aligns it to 16 bytes, emits the label and
   starts a CFI region.  */
#ifndef ENTRY
# define ENTRY(name)			\
	.globl	name;			\
	.type	name, @function;	\
	.p2align 4;			\
name:					\
	cfi_startproc
#endif

/* END(name): close the CFI region opened by ENTRY and record the
   size of the symbol as the distance from NAME to here.  */
#ifndef END
# define END(name)			\
	cfi_endproc;			\
	.size	name, .-name
#endif

/* One jump-table entry: the offset of label I relative to the table
   base B.  */
#define JMPTBL(I, B)	I - B

/* Branch to an entry in a jump table.  TABLE is a jump table holding
   32-bit offsets relative to its own base (see JMPTBL).  INDEX is a
   64-bit register that contains the index into the jump table; it is
   overwritten with the branch target.  SCALE is the scale applied to
   INDEX.  Clobbers %r11, which holds the table base.  */
#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
	leaq	TABLE(%rip), %r11;			\
	movslq	(%r11, INDEX, SCALE), INDEX;		\
	leaq	(%r11, INDEX), INDEX;			\
	jmp	*INDEX

/*
 * MEMSET (default name: android_memset32)
 *
 * Fills memory with a repeated 32-bit pattern using SSE2 stores.
 *
 * In:    rdi = destination address
 *        esi = 32-bit pattern
 *        rdx = length in bytes; it is shifted right by 2 on entry, so
 *              only whole dwords are written
 * Out:   nothing (rax is never written)
 * Clobb: rcx, rdx, rsi, rdi, r11 (via BRANCH_TO_JMPTBL_ENTRY), xmm0,
 *        flags
 *
 * Byte counts are unsigned, so every length comparison below uses the
 * unsigned condition codes (jae/ja/jb), consistent with the dword-count
 * check at entry.
 */
	.section .text.sse2,"ax",@progbits
	ALIGN (4)
ENTRY (MEMSET)				// Address in rdi
	shr	$2, %rdx		// Count (in dwords) in rdx
	movl	%esi, %ecx		// Pattern in ecx

	cmp	$16, %rdx
	jae	L(16dbwordsormore)

/* Fewer than 16 dwords: point rdi at the end of the buffer and jump
   into a chain of dword stores that falls through to the end.  */
L(write_less16dbwords):
	lea	(%rdi, %rdx, 4), %rdi
	BRANCH_TO_JMPTBL_ENTRY (L(table_less16dbwords), %rdx, 4)

	.pushsection .rodata.sse2,"a",@progbits
	ALIGN (2)
L(table_less16dbwords):
	.int	JMPTBL (L(write_0dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_1dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_2dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_3dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_4dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_5dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_6dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_7dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_8dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_9dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_10dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_11dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_12dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_13dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_14dbwords), L(table_less16dbwords))
	.int	JMPTBL (L(write_15dbwords), L(table_less16dbwords))
	.popsection

	/* rdi = one past the last dword; offsets count back from it.  */
	ALIGN (4)
L(write_15dbwords):
	movl	%ecx, -60(%rdi)
L(write_14dbwords):
	movl	%ecx, -56(%rdi)
L(write_13dbwords):
	movl	%ecx, -52(%rdi)
L(write_12dbwords):
	movl	%ecx, -48(%rdi)
L(write_11dbwords):
	movl	%ecx, -44(%rdi)
L(write_10dbwords):
	movl	%ecx, -40(%rdi)
L(write_9dbwords):
	movl	%ecx, -36(%rdi)
L(write_8dbwords):
	movl	%ecx, -32(%rdi)
L(write_7dbwords):
	movl	%ecx, -28(%rdi)
L(write_6dbwords):
	movl	%ecx, -24(%rdi)
L(write_5dbwords):
	movl	%ecx, -20(%rdi)
L(write_4dbwords):
	movl	%ecx, -16(%rdi)
L(write_3dbwords):
	movl	%ecx, -12(%rdi)
L(write_2dbwords):
	movl	%ecx, -8(%rdi)
L(write_1dbwords):
	movl	%ecx, -4(%rdi)
L(write_0dbwords):
	ret

	ALIGN (4)
L(16dbwordsormore):
	test	$3, %edi
	jz	L(aligned4bytes)
	/* Destination is not 4-byte aligned.  Store the first and the
	   last dword with unaligned moves, drop one dword from the
	   count, then advance rdi one byte at a time until it is
	   4-byte aligned.  Each step rotates the pattern right by one
	   byte (rol 24 is the same rotation as ror 8) so that aligned
	   dword stores continue the byte sequence already written.  */
	mov	%ecx, (%rdi)
	mov	%ecx, -4(%rdi, %rdx, 4)
	sub	$1, %rdx
	rol	$24, %ecx
	add	$1, %rdi
	test	$3, %edi
	jz	L(aligned4bytes)
	ror	$8, %ecx
	add	$1, %rdi
	test	$3, %edi
	jz	L(aligned4bytes)
	ror	$8, %ecx
	add	$1, %rdi
L(aligned4bytes):
	shl	$2, %rdx		// rdx = bytes left to fill

	/* Fill xmm0 with the pattern.  */
	movd	%ecx, %xmm0
	pshufd	$0, %xmm0, %xmm0

	testl	$0xf, %edi
	jz	L(aligned_16)
	/* RDX >= 60 (at least 15 dwords remain) and RDI is 4-byte but
	   not 16-byte aligned.  Store 16 bytes unaligned, round rdi up
	   to the next 16-byte boundary and shrink rdx by the number of
	   bytes skipped (rsi = old rdi - new rdi, a negative value).  */
	movdqu	%xmm0, (%rdi)
	mov	%rdi, %rsi
	and	$-16, %rdi
	add	$16, %rdi
	sub	%rdi, %rsi
	add	%rsi, %rdx

	ALIGN (4)
L(aligned_16):
	cmp	$128, %rdx
	jae	L(128bytesormore)

/* Fewer than 128 bytes left and rdi is 16-byte aligned: point rdi at
   the end and dispatch on the remaining dword count (0..31).  */
L(aligned_16_less128bytes):
	add	%rdx, %rdi
	shr	$2, %rdx
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes), %rdx, 4)

	ALIGN (4)
L(128bytesormore):
	/* Above SHARED_CACHE_SIZE (not defined in this file; presumably
	   supplied by cache.h) use non-temporal stores instead.  */
	cmp	$SHARED_CACHE_SIZE, %rdx
	ja	L(128bytesormore_nt)

/* Main loop, unrolled four times: each step stores 128 bytes with
   aligned moves and leaves early once fewer than 128 bytes remain.  */
L(128bytesormore_normal):
	sub	$128, %rdx
	movdqa	%xmm0, (%rdi)
	movdqa	%xmm0, 0x10(%rdi)
	movdqa	%xmm0, 0x20(%rdi)
	movdqa	%xmm0, 0x30(%rdi)
	movdqa	%xmm0, 0x40(%rdi)
	movdqa	%xmm0, 0x50(%rdi)
	movdqa	%xmm0, 0x60(%rdi)
	movdqa	%xmm0, 0x70(%rdi)
	lea	128(%rdi), %rdi
	cmp	$128, %rdx
	jb	L(128bytesless_normal)

	sub	$128, %rdx
	movdqa	%xmm0, (%rdi)
	movdqa	%xmm0, 0x10(%rdi)
	movdqa	%xmm0, 0x20(%rdi)
	movdqa	%xmm0, 0x30(%rdi)
	movdqa	%xmm0, 0x40(%rdi)
	movdqa	%xmm0, 0x50(%rdi)
	movdqa	%xmm0, 0x60(%rdi)
	movdqa	%xmm0, 0x70(%rdi)
	lea	128(%rdi), %rdi
	cmp	$128, %rdx
	jb	L(128bytesless_normal)

	sub	$128, %rdx
	movdqa	%xmm0, (%rdi)
	movdqa	%xmm0, 0x10(%rdi)
	movdqa	%xmm0, 0x20(%rdi)
	movdqa	%xmm0, 0x30(%rdi)
	movdqa	%xmm0, 0x40(%rdi)
	movdqa	%xmm0, 0x50(%rdi)
	movdqa	%xmm0, 0x60(%rdi)
	movdqa	%xmm0, 0x70(%rdi)
	lea	128(%rdi), %rdi
	cmp	$128, %rdx
	jb	L(128bytesless_normal)

	sub	$128, %rdx
	movdqa	%xmm0, (%rdi)
	movdqa	%xmm0, 0x10(%rdi)
	movdqa	%xmm0, 0x20(%rdi)
	movdqa	%xmm0, 0x30(%rdi)
	movdqa	%xmm0, 0x40(%rdi)
	movdqa	%xmm0, 0x50(%rdi)
	movdqa	%xmm0, 0x60(%rdi)
	movdqa	%xmm0, 0x70(%rdi)
	lea	128(%rdi), %rdi
	cmp	$128, %rdx
	jae	L(128bytesormore_normal)

L(128bytesless_normal):
	add	%rdx, %rdi
	shr	$2, %rdx
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes), %rdx, 4)

/* Large fill: 128 bytes per iteration with non-temporal stores.  */
	ALIGN (4)
L(128bytesormore_nt):
	sub	$128, %rdx
	movntdq	%xmm0, (%rdi)
	movntdq	%xmm0, 0x10(%rdi)
	movntdq	%xmm0, 0x20(%rdi)
	movntdq	%xmm0, 0x30(%rdi)
	movntdq	%xmm0, 0x40(%rdi)
	movntdq	%xmm0, 0x50(%rdi)
	movntdq	%xmm0, 0x60(%rdi)
	movntdq	%xmm0, 0x70(%rdi)
	lea	128(%rdi), %rdi
	cmp	$128, %rdx
	jae	L(128bytesormore_nt)

	/* Order the non-temporal stores before the ordinary tail
	   stores and the return.  */
	sfence
	add	%rdx, %rdi
	shr	$2, %rdx
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes), %rdx, 4)

	/* Tail dispatch table, indexed by remaining dwords (0..31).  */
	.pushsection .rodata.sse2,"a",@progbits
	ALIGN (2)
L(table_16_128bytes):
	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
	.popsection

/* Tail chains.  rdi points one past the end of the buffer and the
   remaining region starts on a 16-byte boundary, so within each chain
   every movdqa target (rdi minus the label size, then +16 per step) is
   16-byte aligned.  There is one chain per remainder modulo 16.  */

	/* Remainder is a multiple of 16 bytes.  */
	ALIGN (4)
L(aligned_16_112bytes):
	movdqa	%xmm0, -112(%rdi)
L(aligned_16_96bytes):
	movdqa	%xmm0, -96(%rdi)
L(aligned_16_80bytes):
	movdqa	%xmm0, -80(%rdi)
L(aligned_16_64bytes):
	movdqa	%xmm0, -64(%rdi)
L(aligned_16_48bytes):
	movdqa	%xmm0, -48(%rdi)
L(aligned_16_32bytes):
	movdqa	%xmm0, -32(%rdi)
L(aligned_16_16bytes):
	movdqa	%xmm0, -16(%rdi)
L(aligned_16_0bytes):
	ret

	/* Remainder is 16n + 4 bytes: finish with one dword.  */
	ALIGN (4)
L(aligned_16_116bytes):
	movdqa	%xmm0, -116(%rdi)
L(aligned_16_100bytes):
	movdqa	%xmm0, -100(%rdi)
L(aligned_16_84bytes):
	movdqa	%xmm0, -84(%rdi)
L(aligned_16_68bytes):
	movdqa	%xmm0, -68(%rdi)
L(aligned_16_52bytes):
	movdqa	%xmm0, -52(%rdi)
L(aligned_16_36bytes):
	movdqa	%xmm0, -36(%rdi)
L(aligned_16_20bytes):
	movdqa	%xmm0, -20(%rdi)
L(aligned_16_4bytes):
	movl	%ecx, -4(%rdi)
	ret

	/* Remainder is 16n + 8 bytes: finish with one qword.  */
	ALIGN (4)
L(aligned_16_120bytes):
	movdqa	%xmm0, -120(%rdi)
L(aligned_16_104bytes):
	movdqa	%xmm0, -104(%rdi)
L(aligned_16_88bytes):
	movdqa	%xmm0, -88(%rdi)
L(aligned_16_72bytes):
	movdqa	%xmm0, -72(%rdi)
L(aligned_16_56bytes):
	movdqa	%xmm0, -56(%rdi)
L(aligned_16_40bytes):
	movdqa	%xmm0, -40(%rdi)
L(aligned_16_24bytes):
	movdqa	%xmm0, -24(%rdi)
L(aligned_16_8bytes):
	movq	%xmm0, -8(%rdi)
	ret

	/* Remainder is 16n + 12 bytes: finish with a qword and a dword.  */
	ALIGN (4)
L(aligned_16_124bytes):
	movdqa	%xmm0, -124(%rdi)
L(aligned_16_108bytes):
	movdqa	%xmm0, -108(%rdi)
L(aligned_16_92bytes):
	movdqa	%xmm0, -92(%rdi)
L(aligned_16_76bytes):
	movdqa	%xmm0, -76(%rdi)
L(aligned_16_60bytes):
	movdqa	%xmm0, -60(%rdi)
L(aligned_16_44bytes):
	movdqa	%xmm0, -44(%rdi)
L(aligned_16_28bytes):
	movdqa	%xmm0, -28(%rdi)
L(aligned_16_12bytes):
	movq	%xmm0, -12(%rdi)
	movl	%ecx, -4(%rdi)
	ret

END (MEMSET)