/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

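/*
 * LVS/VPERM hide the endian difference in the unaligned vector path:
 * big endian uses lvsl and vperm with the data operands in source order,
 * little endian uses lvsr and swaps the two data operands, so the same
 * code sequences below work for both endiannesses.
 */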
#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)          lvsl VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)  vperm VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)          lvsr VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)  vperm VRT,VRB,VRA,VRC
#endif

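/*
 * Each errN macro places a local label on the user access it annotates
 * and emits an __ex_table entry pairing that address with the matching
 * .Ldo_errN fixup, so a fault in that access branches to the unwind
 * path that matches the registers live at that point.
 */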
        .macro err1
100:
        .section __ex_table,"a"
        .align 3
        .llong 100b,.Ldo_err1
        .previous
        .endm

        .macro err2
200:
        .section __ex_table,"a"
        .align 3
        .llong 200b,.Ldo_err2
        .previous
        .endm

#ifdef CONFIG_ALTIVEC
        .macro err3
300:
        .section __ex_table,"a"
        .align 3
        .llong 300b,.Ldo_err3
        .previous
        .endm

        .macro err4
400:
        .section __ex_table,"a"
        .align 3
        .llong 400b,.Ldo_err4
        .previous
        .endm


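/*
 * Fault fixups: each .Ldo_errN restores whatever was live at that point
 * (saved non-volatiles, the VMX context, the link register), and
 * .Ldo_err1 then reloads the original dest/src/len and hands the copy
 * to __copy_tofrom_user_base so its exception handling can work out how
 * many bytes remain uncopied.
 */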
.Ldo_err4:
        ld r16,STK_REG(R16)(r1)
        ld r15,STK_REG(R15)(r1)
        ld r14,STK_REG(R14)(r1)
.Ldo_err3:
        bl exit_vmx_usercopy
        ld r0,STACKFRAMESIZE+16(r1)
        mtlr r0
        b .Lexit
#endif /* CONFIG_ALTIVEC */

.Ldo_err2:
        ld r22,STK_REG(R22)(r1)
        ld r21,STK_REG(R21)(r1)
        ld r20,STK_REG(R20)(r1)
        ld r19,STK_REG(R19)(r1)
        ld r18,STK_REG(R18)(r1)
        ld r17,STK_REG(R17)(r1)
        ld r16,STK_REG(R16)(r1)
        ld r15,STK_REG(R15)(r1)
        ld r14,STK_REG(R14)(r1)
.Lexit:
        addi r1,r1,STACKFRAMESIZE
.Ldo_err1:
        ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        ld r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
        ld r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
        b __copy_tofrom_user_base


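/*
 * __copy_tofrom_user_power7(r3 = to, r4 = from, r5 = n)
 *
 * Returns the number of bytes not copied (0 on success). Copies under
 * 16 bytes go straight to .Lshort_copy; with CONFIG_ALTIVEC, copies of
 * more than 4096 bytes take the VMX path (falling back to the GPR loop
 * if the VMX context cannot be used); everything else uses the GPR loop.
 */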
_GLOBAL(__copy_tofrom_user_power7)
#ifdef CONFIG_ALTIVEC
        cmpldi r5,16
        cmpldi cr1,r5,4096

        std r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        std r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
        std r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

        blt .Lshort_copy
        bgt cr1,.Lvmx_copy
#else
        cmpldi r5,16

        std r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
        std r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
        std r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

        blt .Lshort_copy
#endif

.Lnonvmx_copy:
        /* Get the source 8B aligned */
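        /*
         * neg leaves the byte count up to the next 8B boundary in its low
         * bits; mtocrf 0x01 copies them into cr7 so the bf tests below
         * peel off a 1, 2 and 4 byte copy as needed.
         */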
        neg r6,r4
        mtocrf 0x01,r6
        clrldi r6,r6,(64-3)

        bf cr7*4+3,1f
err1;   lbz r0,0(r4)
        addi r4,r4,1
err1;   stb r0,0(r3)
        addi r3,r3,1

1:      bf cr7*4+2,2f
err1;   lhz r0,0(r4)
        addi r4,r4,2
err1;   sth r0,0(r3)
        addi r3,r3,2

2:      bf cr7*4+1,3f
err1;   lwz r0,0(r4)
        addi r4,r4,4
err1;   stw r0,0(r3)
        addi r3,r3,4

3:      sub r5,r5,r6
        cmpldi r5,128
        blt 5f

        mflr r0
        stdu r1,-STACKFRAMESIZE(r1)
        std r14,STK_REG(R14)(r1)
        std r15,STK_REG(R15)(r1)
        std r16,STK_REG(R16)(r1)
        std r17,STK_REG(R17)(r1)
        std r18,STK_REG(R18)(r1)
        std r19,STK_REG(R19)(r1)
        std r20,STK_REG(R20)(r1)
        std r21,STK_REG(R21)(r1)
        std r22,STK_REG(R22)(r1)
        std r0,STACKFRAMESIZE+16(r1)

        srdi r6,r5,7
        mtctr r6

        /* Now do cacheline (128B) sized loads and stores. */
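        /*
         * ctr holds the cacheline count (len >> 7); each pass moves 128B
         * through sixteen GPRs (r0, r6-r12, r14-r21).
         */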
        .align 5
4:
err2;   ld r0,0(r4)
err2;   ld r6,8(r4)
err2;   ld r7,16(r4)
err2;   ld r8,24(r4)
err2;   ld r9,32(r4)
err2;   ld r10,40(r4)
err2;   ld r11,48(r4)
err2;   ld r12,56(r4)
err2;   ld r14,64(r4)
err2;   ld r15,72(r4)
err2;   ld r16,80(r4)
err2;   ld r17,88(r4)
err2;   ld r18,96(r4)
err2;   ld r19,104(r4)
err2;   ld r20,112(r4)
err2;   ld r21,120(r4)
        addi r4,r4,128
err2;   std r0,0(r3)
err2;   std r6,8(r3)
err2;   std r7,16(r3)
err2;   std r8,24(r3)
err2;   std r9,32(r3)
err2;   std r10,40(r3)
err2;   std r11,48(r3)
err2;   std r12,56(r3)
err2;   std r14,64(r3)
err2;   std r15,72(r3)
err2;   std r16,80(r3)
err2;   std r17,88(r3)
err2;   std r18,96(r3)
err2;   std r19,104(r3)
err2;   std r20,112(r3)
err2;   std r21,120(r3)
        addi r3,r3,128
        bdnz 4b

        clrldi r5,r5,(64-7)

        ld r14,STK_REG(R14)(r1)
        ld r15,STK_REG(R15)(r1)
        ld r16,STK_REG(R16)(r1)
        ld r17,STK_REG(R17)(r1)
        ld r18,STK_REG(R18)(r1)
        ld r19,STK_REG(R19)(r1)
        ld r20,STK_REG(R20)(r1)
        ld r21,STK_REG(R21)(r1)
        ld r22,STK_REG(R22)(r1)
        addi r1,r1,STACKFRAMESIZE

        /* Up to 127B to go */
5:      srdi r6,r5,4
        mtocrf 0x01,r6

6:      bf cr7*4+1,7f
err1;   ld r0,0(r4)
err1;   ld r6,8(r4)
err1;   ld r7,16(r4)
err1;   ld r8,24(r4)
err1;   ld r9,32(r4)
err1;   ld r10,40(r4)
err1;   ld r11,48(r4)
err1;   ld r12,56(r4)
        addi r4,r4,64
err1;   std r0,0(r3)
err1;   std r6,8(r3)
err1;   std r7,16(r3)
err1;   std r8,24(r3)
err1;   std r9,32(r3)
err1;   std r10,40(r3)
err1;   std r11,48(r3)
err1;   std r12,56(r3)
        addi r3,r3,64

        /* Up to 63B to go */
7:      bf cr7*4+2,8f
err1;   ld r0,0(r4)
err1;   ld r6,8(r4)
err1;   ld r7,16(r4)
err1;   ld r8,24(r4)
        addi r4,r4,32
err1;   std r0,0(r3)
err1;   std r6,8(r3)
err1;   std r7,16(r3)
err1;   std r8,24(r3)
        addi r3,r3,32

        /* Up to 31B to go */
8:      bf cr7*4+3,9f
err1;   ld r0,0(r4)
err1;   ld r6,8(r4)
        addi r4,r4,16
err1;   std r0,0(r3)
err1;   std r6,8(r3)
        addi r3,r3,16

9:      clrldi r5,r5,(64-4)

        /* Up to 15B to go */
.Lshort_copy:
        mtocrf 0x01,r5
        bf cr7*4+0,12f
err1;   lwz r0,0(r4)    /* Less chance of a reject with word ops */
err1;   lwz r6,4(r4)
        addi r4,r4,8
err1;   stw r0,0(r3)
err1;   stw r6,4(r3)
        addi r3,r3,8

12:     bf cr7*4+1,13f
err1;   lwz r0,0(r4)
        addi r4,r4,4
err1;   stw r0,0(r3)
        addi r3,r3,4

13:     bf cr7*4+2,14f
err1;   lhz r0,0(r4)
        addi r4,r4,2
err1;   sth r0,0(r3)
        addi r3,r3,2

14:     bf cr7*4+3,15f
err1;   lbz r0,0(r4)
err1;   stb r0,0(r3)

15:     li r3,0
        blr

.Lunwind_stack_nonvmx_copy:
        addi r1,r1,STACKFRAMESIZE
        b .Lnonvmx_copy

#ifdef CONFIG_ALTIVEC
.Lvmx_copy:
        mflr r0
        std r0,16(r1)
        stdu r1,-STACKFRAMESIZE(r1)
        bl enter_vmx_usercopy
        cmpwi cr1,r3,0
        ld r0,STACKFRAMESIZE+16(r1)
        ld r3,STK_REG(R31)(r1)
        ld r4,STK_REG(R30)(r1)
        ld r5,STK_REG(R29)(r1)
        mtlr r0

        /*
         * We prefetch both the source and destination using enhanced touch
         * instructions. We use a stream ID of 0 for the load side and
         * 1 for the store side.
         */
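        /*
         * The dcbt/dcbtst forms with TH=0b01000 describe the stream start
         * address, the TH=0b01010 forms supply the length (in cachelines)
         * and depth, and the final dcbt on r8 sets GO to start all the
         * configured streams.
         */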
        clrrdi r6,r4,7
        clrrdi r9,r3,7
        ori r9,r9,1             /* stream=1 */

        srdi r7,r5,7            /* length in cachelines, capped at 0x3FF */
        cmpldi r7,0x3FF
        ble 1f
        li r7,0x3FF
1:      lis r0,0x0E00           /* depth=7 */
        sldi r7,r7,7
        or r7,r7,r0
        ori r10,r7,1            /* stream=1 */

        lis r8,0x8000           /* GO=1 */
        clrldi r8,r8,32

.machine push
.machine "power4"
        /* setup read stream 0 */
        dcbt r0,r6,0b01000      /* addr from */
        dcbt r0,r7,0b01010      /* length and depth from */
        /* setup write stream 1 */
        dcbtst r0,r9,0b01000    /* addr to */
        dcbtst r0,r10,0b01010   /* length and depth to */
        eieio
        dcbt r0,r8,0b01010      /* all streams GO */
.machine pop

        beq cr1,.Lunwind_stack_nonvmx_copy

        /*
         * If source and destination are not relatively aligned we use a
         * slower permute loop.
         */
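        /*
         * "Relatively aligned" means the low 4 bits of source and
         * destination match, so aligned 16B vector loads and stores line
         * up; otherwise every load would straddle two quadwords.
         */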
        xor r6,r4,r3
        rldicl. r6,r6,0,(64-4)
        bne .Lvmx_unaligned_copy

        /* Get the destination 16B aligned */
        neg r6,r3
        mtocrf 0x01,r6
        clrldi r6,r6,(64-4)

        bf cr7*4+3,1f
err3;   lbz r0,0(r4)
        addi r4,r4,1
err3;   stb r0,0(r3)
        addi r3,r3,1

1:      bf cr7*4+2,2f
err3;   lhz r0,0(r4)
        addi r4,r4,2
err3;   sth r0,0(r3)
        addi r3,r3,2

2:      bf cr7*4+1,3f
err3;   lwz r0,0(r4)
        addi r4,r4,4
err3;   stw r0,0(r3)
        addi r3,r3,4

3:      bf cr7*4+0,4f
err3;   ld r0,0(r4)
        addi r4,r4,8
err3;   std r0,0(r3)
        addi r3,r3,8

4:      sub r5,r5,r6

        /* Get the destination 128B aligned */
        neg r6,r3
        srdi r7,r6,4
        mtocrf 0x01,r7
        clrldi r6,r6,(64-7)

        li r9,16
        li r10,32
        li r11,48

        bf cr7*4+3,5f
err3;   lvx v1,r0,r4
        addi r4,r4,16
err3;   stvx v1,r0,r3
        addi r3,r3,16

5:      bf cr7*4+2,6f
err3;   lvx v1,r0,r4
err3;   lvx v0,r4,r9
        addi r4,r4,32
err3;   stvx v1,r0,r3
err3;   stvx v0,r3,r9
        addi r3,r3,32

6:      bf cr7*4+1,7f
err3;   lvx v3,r0,r4
err3;   lvx v2,r4,r9
err3;   lvx v1,r4,r10
err3;   lvx v0,r4,r11
        addi r4,r4,64
err3;   stvx v3,r0,r3
err3;   stvx v2,r3,r9
err3;   stvx v1,r3,r10
err3;   stvx v0,r3,r11
        addi r3,r3,64

7:      sub r5,r5,r6
        srdi r6,r5,7

        std r14,STK_REG(R14)(r1)
        std r15,STK_REG(R15)(r1)
        std r16,STK_REG(R16)(r1)

        li r12,64
        li r14,80
        li r15,96
        li r16,112

        mtctr r6

        /*
         * Now do cacheline sized loads and stores. By this stage the
         * cacheline stores are also cacheline aligned.
         */
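        /*
         * Each pass copies one 128B line with eight vector loads and
         * stores (v0-v7) at offsets 0,16,...,112 indexed through
         * r0/r9-r12/r14-r16.
         */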
        .align 5
8:
err4;   lvx v7,r0,r4
err4;   lvx v6,r4,r9
err4;   lvx v5,r4,r10
err4;   lvx v4,r4,r11
err4;   lvx v3,r4,r12
err4;   lvx v2,r4,r14
err4;   lvx v1,r4,r15
err4;   lvx v0,r4,r16
        addi r4,r4,128
err4;   stvx v7,r0,r3
err4;   stvx v6,r3,r9
err4;   stvx v5,r3,r10
err4;   stvx v4,r3,r11
err4;   stvx v3,r3,r12
err4;   stvx v2,r3,r14
err4;   stvx v1,r3,r15
err4;   stvx v0,r3,r16
        addi r3,r3,128
        bdnz 8b

        ld r14,STK_REG(R14)(r1)
        ld r15,STK_REG(R15)(r1)
        ld r16,STK_REG(R16)(r1)

        /* Up to 127B to go */
        clrldi r5,r5,(64-7)
        srdi r6,r5,4
        mtocrf 0x01,r6

        bf cr7*4+1,9f
err3;   lvx v3,r0,r4
err3;   lvx v2,r4,r9
err3;   lvx v1,r4,r10
err3;   lvx v0,r4,r11
        addi r4,r4,64
err3;   stvx v3,r0,r3
err3;   stvx v2,r3,r9
err3;   stvx v1,r3,r10
err3;   stvx v0,r3,r11
        addi r3,r3,64

9:      bf cr7*4+2,10f
err3;   lvx v1,r0,r4
err3;   lvx v0,r4,r9
        addi r4,r4,32
err3;   stvx v1,r0,r3
err3;   stvx v0,r3,r9
        addi r3,r3,32

10:     bf cr7*4+3,11f
err3;   lvx v1,r0,r4
        addi r4,r4,16
err3;   stvx v1,r0,r3
        addi r3,r3,16

        /* Up to 15B to go */
11:     clrldi r5,r5,(64-4)
        mtocrf 0x01,r5
        bf cr7*4+0,12f
err3;   ld r0,0(r4)
        addi r4,r4,8
err3;   std r0,0(r3)
        addi r3,r3,8

12:     bf cr7*4+1,13f
err3;   lwz r0,0(r4)
        addi r4,r4,4
err3;   stw r0,0(r3)
        addi r3,r3,4

13:     bf cr7*4+2,14f
err3;   lhz r0,0(r4)
        addi r4,r4,2
err3;   sth r0,0(r3)
        addi r3,r3,2

14:     bf cr7*4+3,15f
err3;   lbz r0,0(r4)
err3;   stb r0,0(r3)

15:     addi r1,r1,STACKFRAMESIZE
        b exit_vmx_usercopy     /* tail call optimise */

.Lvmx_unaligned_copy:
        /* Get the destination 16B aligned */
        neg r6,r3
        mtocrf 0x01,r6
        clrldi r6,r6,(64-4)

        bf cr7*4+3,1f
err3;   lbz r0,0(r4)
        addi r4,r4,1
err3;   stb r0,0(r3)
        addi r3,r3,1

1:      bf cr7*4+2,2f
err3;   lhz r0,0(r4)
        addi r4,r4,2
err3;   sth r0,0(r3)
        addi r3,r3,2

2:      bf cr7*4+1,3f
err3;   lwz r0,0(r4)
        addi r4,r4,4
err3;   stw r0,0(r3)
        addi r3,r3,4

3:      bf cr7*4+0,4f
err3;   lwz r0,0(r4)    /* Less chance of a reject with word ops */
err3;   lwz r7,4(r4)
        addi r4,r4,8
err3;   stw r0,0(r3)
err3;   stw r7,4(r3)
        addi r3,r3,8

4:      sub r5,r5,r6

        /* Get the destination 128B aligned */
        neg r6,r3
        srdi r7,r6,4
        mtocrf 0x01,r7
        clrldi r6,r6,(64-7)

        li r9,16
        li r10,32
        li r11,48

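        /*
         * Software-pipelined permute copy: v16 is the permute control from
         * LVS, and v0 always holds the aligned quadword loaded on the
         * previous step, so each VPERM merges the previous and current
         * loads into one aligned store. The priming load below is why r4
         * runs 16 bytes ahead until label 11 unwinds it.
         */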
        LVS(v16,0,r4)           /* Setup permute control vector */
err3;   lvx v0,0,r4
        addi r4,r4,16

        bf cr7*4+3,5f
err3;   lvx v1,r0,r4
        VPERM(v8,v0,v1,v16)
        addi r4,r4,16
err3;   stvx v8,r0,r3
        addi r3,r3,16
        vor v0,v1,v1

5:      bf cr7*4+2,6f
err3;   lvx v1,r0,r4
        VPERM(v8,v0,v1,v16)
err3;   lvx v0,r4,r9
        VPERM(v9,v1,v0,v16)
        addi r4,r4,32
err3;   stvx v8,r0,r3
err3;   stvx v9,r3,r9
        addi r3,r3,32

6:      bf cr7*4+1,7f
err3;   lvx v3,r0,r4
        VPERM(v8,v0,v3,v16)
err3;   lvx v2,r4,r9
        VPERM(v9,v3,v2,v16)
err3;   lvx v1,r4,r10
        VPERM(v10,v2,v1,v16)
err3;   lvx v0,r4,r11
        VPERM(v11,v1,v0,v16)
        addi r4,r4,64
err3;   stvx v8,r0,r3
err3;   stvx v9,r3,r9
err3;   stvx v10,r3,r10
err3;   stvx v11,r3,r11
        addi r3,r3,64

7:      sub r5,r5,r6
        srdi r6,r5,7

        std r14,STK_REG(R14)(r1)
        std r15,STK_REG(R15)(r1)
        std r16,STK_REG(R16)(r1)

        li r12,64
        li r14,80
        li r15,96
        li r16,112

        mtctr r6

        /*
         * Now do cacheline sized loads and stores. By this stage the
         * cacheline stores are also cacheline aligned.
         */
        .align 5
8:
err4;   lvx v7,r0,r4
        VPERM(v8,v0,v7,v16)
err4;   lvx v6,r4,r9
        VPERM(v9,v7,v6,v16)
err4;   lvx v5,r4,r10
        VPERM(v10,v6,v5,v16)
err4;   lvx v4,r4,r11
        VPERM(v11,v5,v4,v16)
err4;   lvx v3,r4,r12
        VPERM(v12,v4,v3,v16)
err4;   lvx v2,r4,r14
        VPERM(v13,v3,v2,v16)
err4;   lvx v1,r4,r15
        VPERM(v14,v2,v1,v16)
err4;   lvx v0,r4,r16
        VPERM(v15,v1,v0,v16)
        addi r4,r4,128
err4;   stvx v8,r0,r3
err4;   stvx v9,r3,r9
err4;   stvx v10,r3,r10
err4;   stvx v11,r3,r11
err4;   stvx v12,r3,r12
err4;   stvx v13,r3,r14
err4;   stvx v14,r3,r15
err4;   stvx v15,r3,r16
        addi r3,r3,128
        bdnz 8b

        ld r14,STK_REG(R14)(r1)
        ld r15,STK_REG(R15)(r1)
        ld r16,STK_REG(R16)(r1)

        /* Up to 127B to go */
        clrldi r5,r5,(64-7)
        srdi r6,r5,4
        mtocrf 0x01,r6

        bf cr7*4+1,9f
err3;   lvx v3,r0,r4
        VPERM(v8,v0,v3,v16)
err3;   lvx v2,r4,r9
        VPERM(v9,v3,v2,v16)
err3;   lvx v1,r4,r10
        VPERM(v10,v2,v1,v16)
err3;   lvx v0,r4,r11
        VPERM(v11,v1,v0,v16)
        addi r4,r4,64
err3;   stvx v8,r0,r3
err3;   stvx v9,r3,r9
err3;   stvx v10,r3,r10
err3;   stvx v11,r3,r11
        addi r3,r3,64

9:      bf cr7*4+2,10f
err3;   lvx v1,r0,r4
        VPERM(v8,v0,v1,v16)
err3;   lvx v0,r4,r9
        VPERM(v9,v1,v0,v16)
        addi r4,r4,32
err3;   stvx v8,r0,r3
err3;   stvx v9,r3,r9
        addi r3,r3,32

10:     bf cr7*4+3,11f
err3;   lvx v1,r0,r4
        VPERM(v8,v0,v1,v16)
        addi r4,r4,16
err3;   stvx v8,r0,r3
        addi r3,r3,16

        /* Up to 15B to go */
11:     clrldi r5,r5,(64-4)
        addi r4,r4,-16          /* Unwind the +16 load offset */
        mtocrf 0x01,r5
        bf cr7*4+0,12f
err3;   lwz r0,0(r4)    /* Less chance of a reject with word ops */
err3;   lwz r6,4(r4)
        addi r4,r4,8
err3;   stw r0,0(r3)
err3;   stw r6,4(r3)
        addi r3,r3,8

12:     bf cr7*4+1,13f
err3;   lwz r0,0(r4)
        addi r4,r4,4
err3;   stw r0,0(r3)
        addi r3,r3,4

13:     bf cr7*4+2,14f
err3;   lhz r0,0(r4)
        addi r4,r4,2
err3;   sth r0,0(r3)
        addi r3,r3,2

14:     bf cr7*4+3,15f
err3;   lbz r0,0(r4)
err3;   stb r0,0(r3)

15:     addi r1,r1,STACKFRAMESIZE
        b exit_vmx_usercopy     /* tail call optimise */
#endif /* CONFIG_ALTIVEC */