blob: 140527a20e7df03cc0a0dd9e6a3438f44b432177 [file] [log] [blame]
David S. Millerae2c6ca2012-09-26 21:11:01 -07001/* NG4memcpy.S: Niagara-4 optimized memcpy.
2 *
3 * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
4 */
5
6#ifdef __KERNEL__
7#include <asm/visasm.h>
8#include <asm/asi.h>
9#define GLOBAL_SPARE %g7
10#else
11#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
12#define FPRS_FEF 0x04
13
14/* On T4 it is very expensive to access ASRs like %fprs and
15 * %asi, avoiding a read or a write can save ~50 cycles.
16 */
17#define FPU_ENTER \
18 rd %fprs, %o5; \
19 andcc %o5, FPRS_FEF, %g0; \
20 be,a,pn %icc, 999f; \
21 wr %g0, FPRS_FEF, %fprs; \
22 999:
23
24#ifdef MEMCPY_DEBUG
25#define VISEntryHalf FPU_ENTER; \
26 clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0;
27#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
28#else
29#define VISEntryHalf FPU_ENTER
30#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
31#endif
32
33#define GLOBAL_SPARE %g5
34#endif
35
36#ifndef STORE_ASI
37#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
38#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P
39#else
40#define STORE_ASI 0x80 /* ASI_P */
41#endif
42#endif
43
David S. Millerf4da3622014-10-14 19:37:58 -070044#if !defined(EX_LD) && !defined(EX_ST)
45#define NON_USER_COPY
46#endif
47
David S. Millerae2c6ca2012-09-26 21:11:01 -070048#ifndef EX_LD
49#define EX_LD(x) x
50#endif
51
52#ifndef EX_ST
53#define EX_ST(x) x
54#endif
55
56#ifndef EX_RETVAL
57#define EX_RETVAL(x) x
58#endif
59
60#ifndef LOAD
61#define LOAD(type,addr,dest) type [addr], dest
62#endif
63
64#ifndef STORE
65#ifndef MEMCPY_DEBUG
66#define STORE(type,src,addr) type src, [addr]
67#else
68#define STORE(type,src,addr) type##a src, [addr] %asi
69#endif
70#endif
71
72#ifndef STORE_INIT
73#define STORE_INIT(src,addr) stxa src, [addr] STORE_ASI
74#endif
75
76#ifndef FUNC_NAME
77#define FUNC_NAME NG4memcpy
78#endif
79#ifndef PREAMBLE
80#define PREAMBLE
81#endif
82
83#ifndef XCC
84#define XCC xcc
85#endif
86
87 .register %g2,#scratch
88 .register %g3,#scratch
89
90 .text
91 .align 64
92
93 .globl FUNC_NAME
94 .type FUNC_NAME,#function
95FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
96#ifdef MEMCPY_DEBUG
97 wr %g0, 0x80, %asi
98#endif
99 srlx %o2, 31, %g2
100 cmp %g2, 0
101 tne %XCC, 5
102 PREAMBLE
103 mov %o0, %o3
104 brz,pn %o2, .Lexit
105 cmp %o2, 3
106 ble,pn %icc, .Ltiny
107 cmp %o2, 19
108 ble,pn %icc, .Lsmall
109 or %o0, %o1, %g2
110 cmp %o2, 128
111 bl,pn %icc, .Lmedium
112 nop
113
114.Llarge:/* len >= 0x80 */
115 /* First get dest 8 byte aligned. */
116 sub %g0, %o0, %g1
117 and %g1, 0x7, %g1
118 brz,pt %g1, 51f
119 sub %o2, %g1, %o2
David S. Miller42a41722012-09-28 13:08:22 -0700120
David S. Millerae2c6ca2012-09-26 21:11:01 -07001211: EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
122 add %o1, 1, %o1
123 subcc %g1, 1, %g1
124 add %o0, 1, %o0
125 bne,pt %icc, 1b
126 EX_ST(STORE(stb, %g2, %o0 - 0x01))
127
12851: LOAD(prefetch, %o1 + 0x040, #n_reads_strong)
129 LOAD(prefetch, %o1 + 0x080, #n_reads_strong)
130 LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong)
131 LOAD(prefetch, %o1 + 0x100, #n_reads_strong)
132 LOAD(prefetch, %o1 + 0x140, #n_reads_strong)
133 LOAD(prefetch, %o1 + 0x180, #n_reads_strong)
134 LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong)
135 LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
136
137 /* Check if we can use the straight fully aligned
138 * loop, or we require the alignaddr/faligndata variant.
139 */
140 andcc %o1, 0x7, %o5
141 bne,pn %icc, .Llarge_src_unaligned
142 sub %g0, %o0, %g1
143
144 /* Legitimize the use of initializing stores by getting dest
145 * to be 64-byte aligned.
146 */
147 and %g1, 0x3f, %g1
148 brz,pt %g1, .Llarge_aligned
149 sub %o2, %g1, %o2
David S. Miller42a41722012-09-28 13:08:22 -0700150
David S. Millerae2c6ca2012-09-26 21:11:01 -07001511: EX_LD(LOAD(ldx, %o1 + 0x00, %g2))
152 add %o1, 8, %o1
153 subcc %g1, 8, %g1
154 add %o0, 8, %o0
155 bne,pt %icc, 1b
156 EX_ST(STORE(stx, %g2, %o0 - 0x08))
157
158.Llarge_aligned:
159 /* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */
160 andn %o2, 0x3f, %o4
161 sub %o2, %o4, %o2
162
1631: EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
164 add %o1, 0x40, %o1
165 EX_LD(LOAD(ldx, %o1 - 0x38, %g2))
166 subcc %o4, 0x40, %o4
167 EX_LD(LOAD(ldx, %o1 - 0x30, %g3))
168 EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE))
169 EX_LD(LOAD(ldx, %o1 - 0x20, %o5))
170 EX_ST(STORE_INIT(%g1, %o0))
171 add %o0, 0x08, %o0
172 EX_ST(STORE_INIT(%g2, %o0))
173 add %o0, 0x08, %o0
174 EX_LD(LOAD(ldx, %o1 - 0x18, %g2))
175 EX_ST(STORE_INIT(%g3, %o0))
176 add %o0, 0x08, %o0
177 EX_LD(LOAD(ldx, %o1 - 0x10, %g3))
178 EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
179 add %o0, 0x08, %o0
180 EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE))
181 EX_ST(STORE_INIT(%o5, %o0))
182 add %o0, 0x08, %o0
183 EX_ST(STORE_INIT(%g2, %o0))
184 add %o0, 0x08, %o0
185 EX_ST(STORE_INIT(%g3, %o0))
186 add %o0, 0x08, %o0
187 EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
188 add %o0, 0x08, %o0
189 bne,pt %icc, 1b
190 LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
191
192 membar #StoreLoad | #StoreStore
193
194 brz,pn %o2, .Lexit
195 cmp %o2, 19
196 ble,pn %icc, .Lsmall_unaligned
197 nop
198 ba,a,pt %icc, .Lmedium_noprefetch
199
200.Lexit: retl
201 mov EX_RETVAL(%o3), %o0
202
203.Llarge_src_unaligned:
David S. Millerf4da3622014-10-14 19:37:58 -0700204#ifdef NON_USER_COPY
205 VISEntryHalfFast(.Lmedium_vis_entry_fail)
206#else
207 VISEntryHalf
208#endif
David S. Millerae2c6ca2012-09-26 21:11:01 -0700209 andn %o2, 0x3f, %o4
210 sub %o2, %o4, %o2
David S. Millerae2c6ca2012-09-26 21:11:01 -0700211 alignaddr %o1, %g0, %g1
212 add %o1, %o4, %o1
213 EX_LD(LOAD(ldd, %g1 + 0x00, %f0))
2141: EX_LD(LOAD(ldd, %g1 + 0x08, %f2))
215 subcc %o4, 0x40, %o4
216 EX_LD(LOAD(ldd, %g1 + 0x10, %f4))
217 EX_LD(LOAD(ldd, %g1 + 0x18, %f6))
218 EX_LD(LOAD(ldd, %g1 + 0x20, %f8))
219 EX_LD(LOAD(ldd, %g1 + 0x28, %f10))
220 EX_LD(LOAD(ldd, %g1 + 0x30, %f12))
221 EX_LD(LOAD(ldd, %g1 + 0x38, %f14))
222 faligndata %f0, %f2, %f16
223 EX_LD(LOAD(ldd, %g1 + 0x40, %f0))
224 faligndata %f2, %f4, %f18
225 add %g1, 0x40, %g1
226 faligndata %f4, %f6, %f20
227 faligndata %f6, %f8, %f22
228 faligndata %f8, %f10, %f24
229 faligndata %f10, %f12, %f26
230 faligndata %f12, %f14, %f28
231 faligndata %f14, %f0, %f30
232 EX_ST(STORE(std, %f16, %o0 + 0x00))
233 EX_ST(STORE(std, %f18, %o0 + 0x08))
234 EX_ST(STORE(std, %f20, %o0 + 0x10))
235 EX_ST(STORE(std, %f22, %o0 + 0x18))
236 EX_ST(STORE(std, %f24, %o0 + 0x20))
237 EX_ST(STORE(std, %f26, %o0 + 0x28))
238 EX_ST(STORE(std, %f28, %o0 + 0x30))
239 EX_ST(STORE(std, %f30, %o0 + 0x38))
240 add %o0, 0x40, %o0
241 bne,pt %icc, 1b
242 LOAD(prefetch, %g1 + 0x200, #n_reads_strong)
243 VISExitHalf
244
245 brz,pn %o2, .Lexit
246 cmp %o2, 19
247 ble,pn %icc, .Lsmall_unaligned
248 nop
249 ba,a,pt %icc, .Lmedium_unaligned
250
David S. Millerf4da3622014-10-14 19:37:58 -0700251#ifdef NON_USER_COPY
252.Lmedium_vis_entry_fail:
253 or %o0, %o1, %g2
254#endif
David S. Millerae2c6ca2012-09-26 21:11:01 -0700255.Lmedium:
256 LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
257 andcc %g2, 0x7, %g0
258 bne,pn %icc, .Lmedium_unaligned
259 nop
260.Lmedium_noprefetch:
261 andncc %o2, 0x20 - 1, %o5
262 be,pn %icc, 2f
263 sub %o2, %o5, %o2
2641: EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
265 EX_LD(LOAD(ldx, %o1 + 0x08, %g2))
266 EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE))
267 EX_LD(LOAD(ldx, %o1 + 0x18, %o4))
268 add %o1, 0x20, %o1
269 subcc %o5, 0x20, %o5
270 EX_ST(STORE(stx, %g1, %o0 + 0x00))
271 EX_ST(STORE(stx, %g2, %o0 + 0x08))
272 EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10))
273 EX_ST(STORE(stx, %o4, %o0 + 0x18))
274 bne,pt %icc, 1b
275 add %o0, 0x20, %o0
2762: andcc %o2, 0x18, %o5
277 be,pt %icc, 3f
278 sub %o2, %o5, %o2
2791: EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
280 add %o1, 0x08, %o1
281 add %o0, 0x08, %o0
282 subcc %o5, 0x08, %o5
283 bne,pt %icc, 1b
284 EX_ST(STORE(stx, %g1, %o0 - 0x08))
2853: brz,pt %o2, .Lexit
286 cmp %o2, 0x04
287 bl,pn %icc, .Ltiny
288 nop
289 EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
290 add %o1, 0x04, %o1
291 add %o0, 0x04, %o0
292 subcc %o2, 0x04, %o2
293 bne,pn %icc, .Ltiny
294 EX_ST(STORE(stw, %g1, %o0 - 0x04))
295 ba,a,pt %icc, .Lexit
296.Lmedium_unaligned:
297 /* First get dest 8 byte aligned. */
298 sub %g0, %o0, %g1
299 and %g1, 0x7, %g1
300 brz,pt %g1, 2f
301 sub %o2, %g1, %o2
David S. Miller42a41722012-09-28 13:08:22 -0700302
David S. Millerae2c6ca2012-09-26 21:11:01 -07003031: EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
304 add %o1, 1, %o1
305 subcc %g1, 1, %g1
306 add %o0, 1, %o0
307 bne,pt %icc, 1b
308 EX_ST(STORE(stb, %g2, %o0 - 0x01))
3092:
310 and %o1, 0x7, %g1
311 brz,pn %g1, .Lmedium_noprefetch
312 sll %g1, 3, %g1
313 mov 64, %g2
314 sub %g2, %g1, %g2
315 andn %o1, 0x7, %o1
316 EX_LD(LOAD(ldx, %o1 + 0x00, %o4))
317 sllx %o4, %g1, %o4
318 andn %o2, 0x08 - 1, %o5
319 sub %o2, %o5, %o2
3201: EX_LD(LOAD(ldx, %o1 + 0x08, %g3))
321 add %o1, 0x08, %o1
322 subcc %o5, 0x08, %o5
323 srlx %g3, %g2, GLOBAL_SPARE
324 or GLOBAL_SPARE, %o4, GLOBAL_SPARE
325 EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00))
326 add %o0, 0x08, %o0
327 bne,pt %icc, 1b
328 sllx %g3, %g1, %o4
329 srl %g1, 3, %g1
330 add %o1, %g1, %o1
331 brz,pn %o2, .Lexit
332 nop
333 ba,pt %icc, .Lsmall_unaligned
334
335.Ltiny:
336 EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
337 subcc %o2, 1, %o2
338 be,pn %icc, .Lexit
339 EX_ST(STORE(stb, %g1, %o0 + 0x00))
340 EX_LD(LOAD(ldub, %o1 + 0x01, %g1))
341 subcc %o2, 1, %o2
342 be,pn %icc, .Lexit
343 EX_ST(STORE(stb, %g1, %o0 + 0x01))
344 EX_LD(LOAD(ldub, %o1 + 0x02, %g1))
345 ba,pt %icc, .Lexit
346 EX_ST(STORE(stb, %g1, %o0 + 0x02))
347
348.Lsmall:
349 andcc %g2, 0x3, %g0
350 bne,pn %icc, .Lsmall_unaligned
351 andn %o2, 0x4 - 1, %o5
352 sub %o2, %o5, %o2
3531:
354 EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
355 add %o1, 0x04, %o1
356 subcc %o5, 0x04, %o5
357 add %o0, 0x04, %o0
358 bne,pt %icc, 1b
359 EX_ST(STORE(stw, %g1, %o0 - 0x04))
360 brz,pt %o2, .Lexit
361 nop
362 ba,a,pt %icc, .Ltiny
363
364.Lsmall_unaligned:
3651: EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
366 add %o1, 1, %o1
367 add %o0, 1, %o0
368 subcc %o2, 1, %o2
369 bne,pt %icc, 1b
370 EX_ST(STORE(stb, %g1, %o0 - 0x01))
371 ba,a,pt %icc, .Lexit
372 .size FUNC_NAME, .-FUNC_NAME