/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#define STACKFRAMESIZE	256
#define STK_REG(i)	(112 + ((i)-14)*8)

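/*
 * A note on the layout assumed above (editor's sketch, based on the 64-bit
 * ELFv1 ABI): the caller's parameter save area starts at 48(r1), which is
 * why the incoming dst/src/len in r3/r4/r5 can be stashed at 48/56/64(r1)
 * without allocating a frame.  When a frame is needed, STACKFRAMESIZE (256)
 * bytes are allocated and STK_REG() places the non-volatile GPRs r14-r22
 * from offset 112 upwards within that frame; the saved LR lives in the
 * usual 16(r1) slot of the previous frame, i.e. STACKFRAMESIZE+16 after
 * the stdu.
 *
 * The errN macros below implement the user-access fixup scheme: each use
 * drops a numbered local label in front of the potentially faulting load
 * or store and records a (faulting address, fixup address) pair in the
 * __ex_table section.  If the access faults, the exception code branches
 * to the matching .Ldo_errN label, which unwinds whatever state that stage
 * of the copy had built up and falls back to __copy_tofrom_user_base.
 */
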
	.macro err1
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Ldo_err1
	.previous
	.endm

	.macro err2
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldo_err2
	.previous
	.endm

#ifdef CONFIG_ALTIVEC
	.macro err3
300:
	.section __ex_table,"a"
	.align 3
	.llong 300b,.Ldo_err3
	.previous
	.endm

	.macro err4
400:
	.section __ex_table,"a"
	.align 3
	.llong 400b,.Ldo_err4
	.previous
	.endm


.Ldo_err4:
	ld	r16,STK_REG(r16)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r14,STK_REG(r14)(r1)
.Ldo_err3:
	bl	.exit_vmx_usercopy
	ld	r0,STACKFRAMESIZE+16(r1)
	mtlr	r0
	b	.Lexit
#endif /* CONFIG_ALTIVEC */

.Ldo_err2:
	ld	r22,STK_REG(r22)(r1)
	ld	r21,STK_REG(r21)(r1)
	ld	r20,STK_REG(r20)(r1)
	ld	r19,STK_REG(r19)(r1)
	ld	r18,STK_REG(r18)(r1)
	ld	r17,STK_REG(r17)(r1)
	ld	r16,STK_REG(r16)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r14,STK_REG(r14)(r1)
.Lexit:
	addi	r1,r1,STACKFRAMESIZE
.Ldo_err1:
	ld	r3,48(r1)
	ld	r4,56(r1)
	ld	r5,64(r1)
	b	__copy_tofrom_user_base


_GLOBAL(__copy_tofrom_user_power7)
#ifdef CONFIG_ALTIVEC
	cmpldi	r5,16
	cmpldi	cr1,r5,4096

	std	r3,48(r1)
	std	r4,56(r1)
	std	r5,64(r1)

	blt	.Lshort_copy
	bgt	cr1,.Lvmx_copy
#else
	cmpldi	r5,16

	std	r3,48(r1)
	std	r4,56(r1)
	std	r5,64(r1)

	blt	.Lshort_copy
#endif
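
/*
 * Rough dispatch summary (editor's note): copies under 16 bytes go straight
 * to .Lshort_copy; with CONFIG_ALTIVEC, copies larger than 4096 bytes take
 * the VMX path below, provided enter_vmx_usercopy() allows it; everything
 * else uses the scalar .Lnonvmx_copy loop that follows.
 */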

.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)

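	/*
	 * Editor's note on a recurring idiom: "mtocrf 0x01,rN" copies the low
	 * four bits of rN into cr7, and each "bf cr7*4+n" then tests one of
	 * those bits, so the variously sized fix-up copies below are selected
	 * without extra compare instructions.  Here r6 = -src, so its low
	 * bits give the number of bytes needed to get the source 8B aligned.
	 */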
	bf	cr7*4+3,1f
err1;	lbz	r0,0(r4)
	addi	r4,r4,1
err1;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)
	std	r17,STK_REG(r17)(r1)
	std	r18,STK_REG(r18)(r1)
	std	r19,STK_REG(r19)(r1)
	std	r20,STK_REG(r20)(r1)
	std	r21,STK_REG(r21)(r1)
	std	r22,STK_REG(r22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
err2;	ld	r0,0(r4)
err2;	ld	r6,8(r4)
err2;	ld	r7,16(r4)
err2;	ld	r8,24(r4)
err2;	ld	r9,32(r4)
err2;	ld	r10,40(r4)
err2;	ld	r11,48(r4)
err2;	ld	r12,56(r4)
err2;	ld	r14,64(r4)
err2;	ld	r15,72(r4)
err2;	ld	r16,80(r4)
err2;	ld	r17,88(r4)
err2;	ld	r18,96(r4)
err2;	ld	r19,104(r4)
err2;	ld	r20,112(r4)
err2;	ld	r21,120(r4)
	addi	r4,r4,128
err2;	std	r0,0(r3)
err2;	std	r6,8(r3)
err2;	std	r7,16(r3)
err2;	std	r8,24(r3)
err2;	std	r9,32(r3)
err2;	std	r10,40(r3)
err2;	std	r11,48(r3)
err2;	std	r12,56(r3)
err2;	std	r14,64(r3)
err2;	std	r15,72(r3)
err2;	std	r16,80(r3)
err2;	std	r17,88(r3)
err2;	std	r18,96(r3)
err2;	std	r19,104(r3)
err2;	std	r20,112(r3)
err2;	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)
	ld	r17,STK_REG(r17)(r1)
	ld	r18,STK_REG(r18)(r1)
	ld	r19,STK_REG(r19)(r1)
	ld	r20,STK_REG(r20)(r1)
	ld	r21,STK_REG(r21)(r1)
	ld	r22,STK_REG(r22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6

6:	bf	cr7*4+1,7f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
err1;	ld	r9,32(r4)
err1;	ld	r10,40(r4)
err1;	ld	r11,48(r4)
err1;	ld	r12,56(r4)
	addi	r4,r4,64
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
err1;	std	r9,32(r3)
err1;	std	r10,40(r3)
err1;	std	r11,48(r3)
err1;	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
	addi	r4,r4,32
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
	addi	r4,r4,16
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err1;	lwz	r6,4(r4)
	addi	r4,r4,8
err1;	stw	r0,0(r3)
err1;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err1;	lbz	r0,0(r4)
err1;	stb	r0,0(r3)

15:	li	r3,0
	blr

.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

#ifdef CONFIG_ALTIVEC
.Lvmx_copy:
	mflr	r0
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	.enter_vmx_usercopy
	cmpwi	r3,0
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STACKFRAMESIZE+48(r1)
	ld	r4,STACKFRAMESIZE+56(r1)
	ld	r5,STACKFRAMESIZE+64(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32

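	/*
	 * Editor's note: the dcbt/dcbtst forms below use the enhanced data
	 * stream touch encoding.  The 0b01000 variants describe the starting
	 * address (and stream ID) of a read or write stream, the 0b01010
	 * variants set the stream's length and depth, and the final dcbt with
	 * the GO bit set in r8 starts all of the defined streams.
	 */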
.machine push
.machine "power4"
	dcbt	r0,r6,0b01000
	dcbt	r0,r7,0b01010
	dcbtst	r0,r9,0b01000
	dcbtst	r0,r10,0b01010
	eieio
	dcbt	r0,r8,0b01010	/* GO */
.machine pop

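	/*
	 * Editor's note: enter_vmx_usercopy() (called above) is expected to
	 * return 0 when VMX cannot be used for this copy; the cmpwi on its
	 * return value was done before the prefetch setup, and the branch
	 * below falls back to the scalar copy in that case.
	 */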
	beq	.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
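	/*
	 * "Relatively aligned" here means the source and destination share
	 * the same offset within a 16B quadword (editor's note): the rldicl.
	 * below keeps the low four bits of (src ^ dst), and a non-zero
	 * result sends us to the vperm-based realignment loop.
	 */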
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f
err3;	lvx	vr1,r0,r4
	addi	r4,r4,16
err3;	stvx	vr1,r0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
err3;	lvx	vr1,r0,r4
err3;	lvx	vr0,r4,r9
	addi	r4,r4,32
err3;	stvx	vr1,r0,r3
err3;	stvx	vr0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	vr3,r0,r4
err3;	lvx	vr2,r4,r9
err3;	lvx	vr1,r4,r10
err3;	lvx	vr0,r4,r11
	addi	r4,r4,64
err3;	stvx	vr3,r0,r3
err3;	stvx	vr2,r3,r9
err3;	stvx	vr1,r3,r10
err3;	stvx	vr0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	vr7,r0,r4
err4;	lvx	vr6,r4,r9
err4;	lvx	vr5,r4,r10
err4;	lvx	vr4,r4,r11
err4;	lvx	vr3,r4,r12
err4;	lvx	vr2,r4,r14
err4;	lvx	vr1,r4,r15
err4;	lvx	vr0,r4,r16
	addi	r4,r4,128
err4;	stvx	vr7,r0,r3
err4;	stvx	vr6,r3,r9
err4;	stvx	vr5,r3,r10
err4;	stvx	vr4,r3,r11
err4;	stvx	vr3,r3,r12
err4;	stvx	vr2,r3,r14
err4;	stvx	vr1,r3,r15
err4;	stvx	vr0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	vr3,r0,r4
err3;	lvx	vr2,r4,r9
err3;	lvx	vr1,r4,r10
err3;	lvx	vr0,r4,r11
	addi	r4,r4,64
err3;	stvx	vr3,r0,r3
err3;	stvx	vr2,r3,r9
err3;	stvx	vr1,r3,r10
err3;	stvx	vr0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	vr1,r0,r4
err3;	lvx	vr0,r4,r9
	addi	r4,r4,32
err3;	stvx	vr1,r0,r3
err3;	stvx	vr0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	vr1,r0,r4
	addi	r4,r4,16
err3;	stvx	vr1,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	.exit_vmx_usercopy	/* tail call optimise */

.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r7,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

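	/*
	 * Editor's note: the unaligned path keeps the destination 16B aligned
	 * and realigns the source in registers.  lvsl builds a permute control
	 * vector from the source address, one extra 16B quadword is pre-loaded
	 * into vr0, and each vperm below combines the previous and current
	 * loads into one aligned quadword for stvx.  The pre-load is why the
	 * tail code later rewinds r4 by 16 ("Unwind the +16 load offset").
	 */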
	lvsl	vr16,0,r4	/* Setup permute control vector */
err3;	lvx	vr0,0,r4
	addi	r4,r4,16

	bf	cr7*4+3,5f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	addi	r4,r4,16
err3;	stvx	vr8,r0,r3
	addi	r3,r3,16
	vor	vr0,vr1,vr1

5:	bf	cr7*4+2,6f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
err3;	lvx	vr0,r4,r9
	vperm	vr9,vr1,vr0,vr16
	addi	r4,r4,32
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	vr3,r0,r4
	vperm	vr8,vr0,vr3,vr16
err3;	lvx	vr2,r4,r9
	vperm	vr9,vr3,vr2,vr16
err3;	lvx	vr1,r4,r10
	vperm	vr10,vr2,vr1,vr16
err3;	lvx	vr0,r4,r11
	vperm	vr11,vr1,vr0,vr16
	addi	r4,r4,64
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
err3;	stvx	vr10,r3,r10
err3;	stvx	vr11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	vr7,r0,r4
	vperm	vr8,vr0,vr7,vr16
err4;	lvx	vr6,r4,r9
	vperm	vr9,vr7,vr6,vr16
err4;	lvx	vr5,r4,r10
	vperm	vr10,vr6,vr5,vr16
err4;	lvx	vr4,r4,r11
	vperm	vr11,vr5,vr4,vr16
err4;	lvx	vr3,r4,r12
	vperm	vr12,vr4,vr3,vr16
err4;	lvx	vr2,r4,r14
	vperm	vr13,vr3,vr2,vr16
err4;	lvx	vr1,r4,r15
	vperm	vr14,vr2,vr1,vr16
err4;	lvx	vr0,r4,r16
	vperm	vr15,vr1,vr0,vr16
	addi	r4,r4,128
err4;	stvx	vr8,r0,r3
err4;	stvx	vr9,r3,r9
err4;	stvx	vr10,r3,r10
err4;	stvx	vr11,r3,r11
err4;	stvx	vr12,r3,r12
err4;	stvx	vr13,r3,r14
err4;	stvx	vr14,r3,r15
err4;	stvx	vr15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	vr3,r0,r4
	vperm	vr8,vr0,vr3,vr16
err3;	lvx	vr2,r4,r9
	vperm	vr9,vr3,vr2,vr16
err3;	lvx	vr1,r4,r10
	vperm	vr10,vr2,vr1,vr16
err3;	lvx	vr0,r4,r11
	vperm	vr11,vr1,vr0,vr16
	addi	r4,r4,64
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
err3;	stvx	vr10,r3,r10
err3;	stvx	vr11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
err3;	lvx	vr0,r4,r9
	vperm	vr9,vr1,vr0,vr16
	addi	r4,r4,32
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	addi	r4,r4,16
err3;	stvx	vr8,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r6,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	.exit_vmx_usercopy	/* tail call optimise */
#endif /* CONFIG_ALTIVEC */