/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#define STACKFRAMESIZE	256
#define STK_REG(i)	(112 + ((i)-14)*8)
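
/*
 * STK_REG() maps the nonvolatile GPRs r14 and up onto the save area of
 * our STACKFRAMESIZE (256 byte) frame: r14 is kept at offset 112, r15
 * at 120, and so on, 8 bytes per register.
 *
 * The errN macros below tag the user-memory access that immediately
 * follows them with an __ex_table fixup entry: if that load or store
 * faults, execution resumes at the matching .Ldo_errN handler.  The
 * four variants differ only in how much saved state the handler must
 * unwind before bailing out.
 */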

	.macro err1
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Ldo_err1
	.previous
	.endm

	.macro err2
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldo_err2
	.previous
	.endm

#ifdef CONFIG_ALTIVEC
	.macro err3
300:
	.section __ex_table,"a"
	.align 3
	.llong 300b,.Ldo_err3
	.previous
	.endm

	.macro err4
400:
	.section __ex_table,"a"
	.align 3
	.llong 400b,.Ldo_err4
	.previous
	.endm


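/*
 * Fault handlers, from most to least state to undo: err4 restores the
 * nonvolatile GPRs used by the VMX loops, err3 leaves VMX mode, err2
 * restores the GPRs used by the scalar unrolled loop, and err1 has
 * nothing to restore.  All of them reload the original arguments and
 * retry through __copy_tofrom_user_base, which computes the number of
 * bytes left uncopied.
 */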
.Ldo_err4:
	ld	r16,STK_REG(r16)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r14,STK_REG(r14)(r1)
.Ldo_err3:
	bl	.exit_vmx_copy
	ld	r0,STACKFRAMESIZE+16(r1)
	mtlr	r0
	b	.Lexit
#endif /* CONFIG_ALTIVEC */

.Ldo_err2:
	ld	r22,STK_REG(r22)(r1)
	ld	r21,STK_REG(r21)(r1)
	ld	r20,STK_REG(r20)(r1)
	ld	r19,STK_REG(r19)(r1)
	ld	r18,STK_REG(r18)(r1)
	ld	r17,STK_REG(r17)(r1)
	ld	r16,STK_REG(r16)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r14,STK_REG(r14)(r1)
.Lexit:
	addi	r1,r1,STACKFRAMESIZE
.Ldo_err1:
	ld	r3,48(r1)
	ld	r4,56(r1)
	ld	r5,64(r1)
	b	__copy_tofrom_user_base


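/*
 * __copy_tofrom_user_power7 - POWER7 optimised copy to/from user space.
 *
 * Roughly the following C interface (an editor's sketch, not a
 * declaration from this file):
 *
 *	unsigned long __copy_tofrom_user_power7(void *to,
 *			const void *from, unsigned long n);
 *
 * r3 = destination, r4 = source, r5 = byte count.  Copies under 16
 * bytes take .Lshort_copy; copies over 4kB use VMX when it is
 * available.  On a fault the fixup path reruns the copy through
 * __copy_tofrom_user_base, which returns the number of bytes not
 * copied (0 meaning complete success).
 */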
_GLOBAL(__copy_tofrom_user_power7)
#ifdef CONFIG_ALTIVEC
	cmpldi	r5,16
	cmpldi	cr1,r5,4096

	std	r3,48(r1)
	std	r4,56(r1)
	std	r5,64(r1)

	blt	.Lshort_copy
	bgt	cr1,.Lvmx_copy
#else
	cmpldi	r5,16

	std	r3,48(r1)
	std	r4,56(r1)
	std	r5,64(r1)

	blt	.Lshort_copy
#endif

.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)

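	/*
	 * mtocrf above moved the low nibble of r6 (bytes needed to reach
	 * 8B alignment) into CR7, so each bf below skips its 1, 2 or 4
	 * byte copy unless the corresponding misalignment bit is set.
	 */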
	bf	cr7*4+3,1f
err1;	lbz	r0,0(r4)
	addi	r4,r4,1
err1;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)
	std	r17,STK_REG(r17)(r1)
	std	r18,STK_REG(r18)(r1)
	std	r19,STK_REG(r19)(r1)
	std	r20,STK_REG(r20)(r1)
	std	r21,STK_REG(r21)(r1)
	std	r22,STK_REG(r22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7
	mtctr	r6

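	/*
	 * ctr = r5 / 128.  Each iteration of 4: moves a full cacheline,
	 * issuing all sixteen loads (r0, r6-r12, r14-r21) before the
	 * matching stores, presumably so the load stream can run ahead
	 * of the store stream.
	 */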
	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
err2;	ld	r0,0(r4)
err2;	ld	r6,8(r4)
err2;	ld	r7,16(r4)
err2;	ld	r8,24(r4)
err2;	ld	r9,32(r4)
err2;	ld	r10,40(r4)
err2;	ld	r11,48(r4)
err2;	ld	r12,56(r4)
err2;	ld	r14,64(r4)
err2;	ld	r15,72(r4)
err2;	ld	r16,80(r4)
err2;	ld	r17,88(r4)
err2;	ld	r18,96(r4)
err2;	ld	r19,104(r4)
err2;	ld	r20,112(r4)
err2;	ld	r21,120(r4)
	addi	r4,r4,128
err2;	std	r0,0(r3)
err2;	std	r6,8(r3)
err2;	std	r7,16(r3)
err2;	std	r8,24(r3)
err2;	std	r9,32(r3)
err2;	std	r10,40(r3)
err2;	std	r11,48(r3)
err2;	std	r12,56(r3)
err2;	std	r14,64(r3)
err2;	std	r15,72(r3)
err2;	std	r16,80(r3)
err2;	std	r17,88(r3)
err2;	std	r18,96(r3)
err2;	std	r19,104(r3)
err2;	std	r20,112(r3)
err2;	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)
	ld	r17,STK_REG(r17)(r1)
	ld	r18,STK_REG(r18)(r1)
	ld	r19,STK_REG(r19)(r1)
	ld	r20,STK_REG(r20)(r1)
	ld	r21,STK_REG(r21)(r1)
	ld	r22,STK_REG(r22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6
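	/*
	 * CR7 now holds r5 / 16: bit cr7*4+1 selects a 64B block,
	 * cr7*4+2 a 32B block and cr7*4+3 a 16B block; the final 0-15
	 * bytes fall through to .Lshort_copy.
	 */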

6:	bf	cr7*4+1,7f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
err1;	ld	r9,32(r4)
err1;	ld	r10,40(r4)
err1;	ld	r11,48(r4)
err1;	ld	r12,56(r4)
	addi	r4,r4,64
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
err1;	std	r9,32(r3)
err1;	std	r10,40(r3)
err1;	std	r11,48(r3)
err1;	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
	addi	r4,r4,32
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
	addi	r4,r4,16
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err1;	lwz	r6,4(r4)
	addi	r4,r4,8
err1;	stw	r0,0(r3)
err1;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err1;	lbz	r0,0(r4)
err1;	stb	r0,0(r3)

15:	li	r3,0
	blr

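/*
 * Reached when VMX turned out to be unusable after .Lvmx_copy had
 * already allocated its stack frame: pop the frame and do the copy
 * with GPRs instead.
 */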
.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

#ifdef CONFIG_ALTIVEC
.Lvmx_copy:
	mflr	r0
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	.enter_vmx_copy
	cmpwi	r3,0
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STACKFRAMESIZE+48(r1)
	ld	r4,STACKFRAMESIZE+56(r1)
	ld	r5,STACKFRAMESIZE+64(r1)
	mtlr	r0

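	/* r3 was 0 if enter_vmx_copy() could not give us VMX */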
	beq	.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48
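	/*
	 * r9/r10/r11 (and r12, r14-r16 in the main loop) are the 16B
	 * step offsets for the indexed lvx/stvx forms; an index of "r0"
	 * in those instructions reads as a literal zero, not GPR0.
	 */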

	bf	cr7*4+3,5f
err3;	lvx	vr1,r0,r4
	addi	r4,r4,16
err3;	stvx	vr1,r0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
err3;	lvx	vr1,r0,r4
err3;	lvx	vr0,r4,r9
	addi	r4,r4,32
err3;	stvx	vr1,r0,r3
err3;	stvx	vr0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	vr3,r0,r4
err3;	lvx	vr2,r4,r9
err3;	lvx	vr1,r4,r10
err3;	lvx	vr0,r4,r11
	addi	r4,r4,64
err3;	stvx	vr3,r0,r3
err3;	stvx	vr2,r3,r9
err3;	stvx	vr1,r3,r10
err3;	stvx	vr0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	vr7,r0,r4
err4;	lvx	vr6,r4,r9
err4;	lvx	vr5,r4,r10
err4;	lvx	vr4,r4,r11
err4;	lvx	vr3,r4,r12
err4;	lvx	vr2,r4,r14
err4;	lvx	vr1,r4,r15
err4;	lvx	vr0,r4,r16
	addi	r4,r4,128
err4;	stvx	vr7,r0,r3
err4;	stvx	vr6,r3,r9
err4;	stvx	vr5,r3,r10
err4;	stvx	vr4,r3,r11
err4;	stvx	vr3,r3,r12
err4;	stvx	vr2,r3,r14
err4;	stvx	vr1,r3,r15
err4;	stvx	vr0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	vr3,r0,r4
err3;	lvx	vr2,r4,r9
err3;	lvx	vr1,r4,r10
err3;	lvx	vr0,r4,r11
	addi	r4,r4,64
err3;	stvx	vr3,r0,r3
err3;	stvx	vr2,r3,r9
err3;	stvx	vr1,r3,r10
err3;	stvx	vr0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	vr1,r0,r4
err3;	lvx	vr0,r4,r9
	addi	r4,r4,32
err3;	stvx	vr1,r0,r3
err3;	stvx	vr0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	vr1,r0,r4
	addi	r4,r4,16
err3;	stvx	vr1,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	.exit_vmx_copy		/* tail call optimise */

.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r7,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

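	/*
	 * Unaligned case: lvsl builds a permute control vector from the
	 * low bits of the source address, and each step vperm-merges two
	 * adjacent 16B loads into one aligned 16B store.  The most
	 * recent load is carried forward in vr0 (the vor below) so no
	 * byte is loaded twice.
	 */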
	lvsl	vr16,0,r4	/* Setup permute control vector */
err3;	lvx	vr0,0,r4
	addi	r4,r4,16

	bf	cr7*4+3,5f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	addi	r4,r4,16
err3;	stvx	vr8,r0,r3
	addi	r3,r3,16
	vor	vr0,vr1,vr1

5:	bf	cr7*4+2,6f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
err3;	lvx	vr0,r4,r9
	vperm	vr9,vr1,vr0,vr16
	addi	r4,r4,32
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	vr3,r0,r4
	vperm	vr8,vr0,vr3,vr16
err3;	lvx	vr2,r4,r9
	vperm	vr9,vr3,vr2,vr16
err3;	lvx	vr1,r4,r10
	vperm	vr10,vr2,vr1,vr16
err3;	lvx	vr0,r4,r11
	vperm	vr11,vr1,vr0,vr16
	addi	r4,r4,64
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
err3;	stvx	vr10,r3,r10
err3;	stvx	vr11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

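	/*
	 * Permute variant of the cacheline loop: each iteration consumes
	 * eight new source vectors plus the one carried in vr0 and
	 * stores eight realigned vectors (vr8-vr15).
	 */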
	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	vr7,r0,r4
	vperm	vr8,vr0,vr7,vr16
err4;	lvx	vr6,r4,r9
	vperm	vr9,vr7,vr6,vr16
err4;	lvx	vr5,r4,r10
	vperm	vr10,vr6,vr5,vr16
err4;	lvx	vr4,r4,r11
	vperm	vr11,vr5,vr4,vr16
err4;	lvx	vr3,r4,r12
	vperm	vr12,vr4,vr3,vr16
err4;	lvx	vr2,r4,r14
	vperm	vr13,vr3,vr2,vr16
err4;	lvx	vr1,r4,r15
	vperm	vr14,vr2,vr1,vr16
err4;	lvx	vr0,r4,r16
	vperm	vr15,vr1,vr0,vr16
	addi	r4,r4,128
err4;	stvx	vr8,r0,r3
err4;	stvx	vr9,r3,r9
err4;	stvx	vr10,r3,r10
err4;	stvx	vr11,r3,r11
err4;	stvx	vr12,r3,r12
err4;	stvx	vr13,r3,r14
err4;	stvx	vr14,r3,r15
err4;	stvx	vr15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	vr3,r0,r4
	vperm	vr8,vr0,vr3,vr16
err3;	lvx	vr2,r4,r9
	vperm	vr9,vr3,vr2,vr16
err3;	lvx	vr1,r4,r10
	vperm	vr10,vr2,vr1,vr16
err3;	lvx	vr0,r4,r11
	vperm	vr11,vr1,vr0,vr16
	addi	r4,r4,64
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
err3;	stvx	vr10,r3,r10
err3;	stvx	vr11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
err3;	lvx	vr0,r4,r9
	vperm	vr9,vr1,vr0,vr16
	addi	r4,r4,32
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	addi	r4,r4,16
err3;	stvx	vr8,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r6,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	.exit_vmx_copy		/* tail call optimise */
#endif /* CONFIG_ALTIVEC */