/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2012
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#define STACKFRAMESIZE	256
#define STK_REG(i)	(112 + ((i)-14)*8)
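
/*
 * STACKFRAMESIZE is the frame pushed on the paths that need non-volatile
 * GPRs.  STK_REG(i) maps non-volatile register ri (r14 and up) to a save
 * slot starting at offset 112 of that frame, i.e. just past the 48-byte
 * frame header and the 64-byte minimum parameter save area of the ABI.
 */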

_GLOBAL(memcpy_power7)
#ifdef CONFIG_ALTIVEC
	cmpldi	r5,16
	cmpldi	cr1,r5,4096

	std	r3,48(r1)

	blt	.Lshort_copy
	bgt	cr1,.Lvmx_copy
#else
	cmpldi	r5,16

	std	r3,48(r1)

	blt	.Lshort_copy
#endif

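/*
 * On entry r3 = destination, r4 = source, r5 = length.  The original
 * destination (the memcpy return value) is stashed at 48(r1), in the
 * parameter save area of the caller's frame, so it can be reloaded and
 * returned at the end.  Copies under 16 bytes go straight to
 * .Lshort_copy; with ALTIVEC built in, copies larger than 4096 bytes
 * take the VMX path instead.
 */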
.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)

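	/*
	 * r6 = -src mod 8 is the number of leading bytes needed to reach
	 * 8B alignment.  mtocrf 0x01 copies the low nibble of r6 into cr7,
	 * so the cr7*4+3/+2/+1 tests below check the 1, 2 and 4 byte bits
	 * and the matching copies are skipped when those bits are clear.
	 */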
	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)
	std	r17,STK_REG(r17)(r1)
	std	r18,STK_REG(r18)(r1)
	std	r19,STK_REG(r19)(r1)
	std	r20,STK_REG(r20)(r1)
	std	r21,STK_REG(r21)(r1)
	std	r22,STK_REG(r22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)
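	/*
	 * r14-r22 are non-volatile, so they are saved before the unrolled
	 * loop below uses them; LR is stored in its standard ABI slot,
	 * 16 bytes above the incoming stack pointer, which is
	 * STACKFRAMESIZE+16(r1) now that the frame has been pushed.
	 */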

	srdi	r6,r5,7
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	ld	r14,64(r4)
	ld	r15,72(r4)
	ld	r16,80(r4)
	ld	r17,88(r4)
	ld	r18,96(r4)
	ld	r19,104(r4)
	ld	r20,112(r4)
	ld	r21,120(r4)
	addi	r4,r4,128
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	std	r14,64(r3)
	std	r15,72(r3)
	std	r16,80(r3)
	std	r17,88(r3)
	std	r18,96(r3)
	std	r19,104(r3)
	std	r20,112(r3)
	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)
	ld	r17,STK_REG(r17)(r1)
	ld	r18,STK_REG(r18)(r1)
	ld	r19,STK_REG(r19)(r1)
	ld	r20,STK_REG(r20)(r1)
	ld	r21,STK_REG(r21)(r1)
	ld	r22,STK_REG(r22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6

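	/*
	 * Fewer than 128 bytes remain.  r6 = r5 >> 4, so after mtocrf the
	 * cr7 bits tested below correspond to an outstanding 64, 32 and
	 * 16 byte chunk respectively; the final 0-15 bytes fall through
	 * to .Lshort_copy.
	 */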
6:	bf	cr7*4+1,7f
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	addi	r4,r4,64
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	addi	r4,r4,32
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
	ld	r0,0(r4)
	ld	r6,8(r4)
	addi	r4,r4,16
	std	r0,0(r3)
	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

15:	ld	r3,48(r1)
	blr

.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

#ifdef CONFIG_ALTIVEC
.Lvmx_copy:
	mflr	r0
	std	r4,56(r1)
	std	r5,64(r1)
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	.enter_vmx_copy
	cmpwi	r3,0
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STACKFRAMESIZE+48(r1)
	ld	r4,STACKFRAMESIZE+56(r1)
	ld	r5,STACKFRAMESIZE+64(r1)
	mtlr	r0
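	/*
	 * enter_vmx_copy may clobber the volatile registers, so the source,
	 * length and LR were saved in the caller's frame above and are
	 * reloaded here (the destination was already stashed at 48(r1) on
	 * entry).  Its return value in r3 is nonzero if VMX can be used;
	 * cr0 from the cmpwi survives the prefetch setup below, and the
	 * beq after it falls back to the integer copy.
	 */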

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	cr1,r7,0x3FF
	ble	cr1,1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32

.machine push
.machine "power4"
	dcbt	r0,r6,0b01000
	dcbt	r0,r7,0b01010
	dcbtst	r0,r9,0b01000
	dcbtst	r0,r10,0b01010
	eieio
	dcbt	r0,r8,0b01010	/* GO */
.machine pop
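	/*
	 * In the TH (enhanced touch) forms above, 0b01000 nominates the
	 * block addressed by the second register as the start of a stream
	 * (dcbt for the load stream, dcbtst for the store stream), while
	 * 0b01010 passes stream attributes: the cacheline count, depth and
	 * stream ID built up in r7/r10, or the GO bit in r8 that starts the
	 * nominated streams once the eieio has ordered their description.
	 * The .machine push/pop appears to be needed so the assembler
	 * accepts the three-operand dcbt form.
	 */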

	beq	.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy
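
	/*
	 * "Relatively aligned" means (src ^ dst) has no bits set in the
	 * low 4 bits, i.e. the two pointers agree modulo 16.  In that case
	 * aligning the destination below also aligns the source, and plain
	 * lvx/stvx can handle the bulk of the copy.
	 */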

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48
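
	/*
	 * lvx/stvx use indexed addressing: with r0 as the first address
	 * operand the base is taken as zero, so r9/r10/r11 (and later
	 * r12/r14/r15/r16) simply supply the 16-byte offsets within the
	 * block being copied.
	 */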

	bf	cr7*4+3,5f
	lvx	vr1,r0,r4
	addi	r4,r4,16
	stvx	vr1,r0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
	lvx	vr1,r0,r4
	lvx	vr0,r4,r9
	addi	r4,r4,32
	stvx	vr1,r0,r3
	stvx	vr0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
	lvx	vr3,r0,r4
	lvx	vr2,r4,r9
	lvx	vr1,r4,r10
	lvx	vr0,r4,r11
	addi	r4,r4,64
	stvx	vr3,r0,r3
	stvx	vr2,r3,r9
	stvx	vr1,r3,r10
	stvx	vr0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
	lvx	vr7,r0,r4
	lvx	vr6,r4,r9
	lvx	vr5,r4,r10
	lvx	vr4,r4,r11
	lvx	vr3,r4,r12
	lvx	vr2,r4,r14
	lvx	vr1,r4,r15
	lvx	vr0,r4,r16
	addi	r4,r4,128
	stvx	vr7,r0,r3
	stvx	vr6,r3,r9
	stvx	vr5,r3,r10
	stvx	vr4,r3,r11
	stvx	vr3,r3,r12
	stvx	vr2,r3,r14
	stvx	vr1,r3,r15
	stvx	vr0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
	lvx	vr3,r0,r4
	lvx	vr2,r4,r9
	lvx	vr1,r4,r10
	lvx	vr0,r4,r11
	addi	r4,r4,64
	stvx	vr3,r0,r3
	stvx	vr2,r3,r9
	stvx	vr1,r3,r10
	stvx	vr0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
	lvx	vr1,r0,r4
	lvx	vr0,r4,r9
	addi	r4,r4,32
	stvx	vr1,r0,r3
	stvx	vr0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
	lvx	vr1,r0,r4
	addi	r4,r4,16
	stvx	vr1,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,48(r1)
	b	.exit_vmx_copy		/* tail call optimise */

.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r7,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	lvsl	vr16,0,r4	/* Setup permute control vector */
	lvx	vr0,0,r4
	addi	r4,r4,16
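
	/*
	 * lvsl builds a permute control vector from the low four bits of
	 * the source address, and lvx always fetches the aligned quadword
	 * containing its effective address.  vr0 holds the previously
	 * loaded quadword, and each vperm below combines it with the next
	 * one to produce the 16 unaligned source bytes spanning the pair.
	 * This is why the source pointer runs 16 bytes ahead and is wound
	 * back before the final scalar tail.
	 */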

	bf	cr7*4+3,5f
	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	addi	r4,r4,16
	stvx	vr8,r0,r3
	addi	r3,r3,16
	vor	vr0,vr1,vr1

5:	bf	cr7*4+2,6f
	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	lvx	vr0,r4,r9
	vperm	vr9,vr1,vr0,vr16
	addi	r4,r4,32
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
	lvx	vr3,r0,r4
	vperm	vr8,vr0,vr3,vr16
	lvx	vr2,r4,r9
	vperm	vr9,vr3,vr2,vr16
	lvx	vr1,r4,r10
	vperm	vr10,vr2,vr1,vr16
	lvx	vr0,r4,r11
	vperm	vr11,vr1,vr0,vr16
	addi	r4,r4,64
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
	stvx	vr10,r3,r10
	stvx	vr11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
	lvx	vr7,r0,r4
	vperm	vr8,vr0,vr7,vr16
	lvx	vr6,r4,r9
	vperm	vr9,vr7,vr6,vr16
	lvx	vr5,r4,r10
	vperm	vr10,vr6,vr5,vr16
	lvx	vr4,r4,r11
	vperm	vr11,vr5,vr4,vr16
	lvx	vr3,r4,r12
	vperm	vr12,vr4,vr3,vr16
	lvx	vr2,r4,r14
	vperm	vr13,vr3,vr2,vr16
	lvx	vr1,r4,r15
	vperm	vr14,vr2,vr1,vr16
	lvx	vr0,r4,r16
	vperm	vr15,vr1,vr0,vr16
	addi	r4,r4,128
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
	stvx	vr10,r3,r10
	stvx	vr11,r3,r11
	stvx	vr12,r3,r12
	stvx	vr13,r3,r14
	stvx	vr14,r3,r15
	stvx	vr15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
	lvx	vr3,r0,r4
	vperm	vr8,vr0,vr3,vr16
	lvx	vr2,r4,r9
	vperm	vr9,vr3,vr2,vr16
	lvx	vr1,r4,r10
	vperm	vr10,vr2,vr1,vr16
	lvx	vr0,r4,r11
	vperm	vr11,vr1,vr0,vr16
	addi	r4,r4,64
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
	stvx	vr10,r3,r10
	stvx	vr11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	lvx	vr0,r4,r9
	vperm	vr9,vr1,vr0,vr16
	addi	r4,r4,32
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	addi	r4,r4,16
	stvx	vr8,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,48(r1)
	b	.exit_vmx_copy		/* tail call optimise */
#endif /* CONFIG_ALTIVEC */