/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2012
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#ifndef SELFTEST_CASE
/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE	0
#endif
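/*
 * SELFTEST_CASE only matters when this file is built outside the kernel
 * (the powerpc copyloops selftests appear to compile it with each value
 * to exercise both paths). In the kernel build the path is chosen at
 * runtime by the CPU feature section below.
 */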

#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif
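/*
 * LVS/VPERM hide the endian difference in the unaligned vector copy:
 * on big endian lvsl builds the permute control vector and vperm merges
 * (previous, current); on little endian lvsr is used and the vperm
 * inputs are swapped so the same byte selection falls out.
 */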

_GLOBAL(memcpy_power7)
	cmpldi	r5,16
	cmpldi	cr1,r5,4096
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	blt	.Lshort_copy

#ifdef CONFIG_ALTIVEC
test_feature = SELFTEST_CASE
BEGIN_FTR_SECTION
	bgt	cr1, .Lvmx_copy
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif

.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)

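	/*
	 * mtocrf 0x01 put the low nibble of r6 into CR7, so each bf below
	 * skips the 1/2/4 byte alignment step whose bit is clear.
	 */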
	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)
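	/*
	 * r14-r22 are non-volatile, hence the saves above; the unrolled
	 * loop below keeps a full 128B line in r0, r6-r12 and r14-r21.
	 * LR (in r0) goes into the LR save slot of the caller's frame.
	 */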

	srdi	r6,r5,7
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	ld	r14,64(r4)
	ld	r15,72(r4)
	ld	r16,80(r4)
	ld	r17,88(r4)
	ld	r18,96(r4)
	ld	r19,104(r4)
	ld	r20,112(r4)
	ld	r21,120(r4)
	addi	r4,r4,128
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	std	r14,64(r3)
	std	r15,72(r3)
	std	r16,80(r3)
	std	r17,88(r3)
	std	r18,96(r3)
	std	r19,104(r3)
	std	r20,112(r3)
	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6
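	/*
	 * CR7 now holds (remaining length >> 4) & 0xf; the bits tested
	 * below select a 64B (value 4), 32B (value 2) and 16B (value 1)
	 * tail chunk.
	 */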

6:	bf	cr7*4+1,7f
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	addi	r4,r4,64
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	addi	r4,r4,32
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
	ld	r0,0(r4)
	ld	r6,8(r4)
	addi	r4,r4,16
	std	r0,0(r3)
	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

15:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	blr

.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

.Lvmx_copy:
#ifdef CONFIG_ALTIVEC
	mflr	r0
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	enter_vmx_ops
	cmpwi	cr1,r3,0
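	/*
	 * enter_vmx_ops returns 0 in r3 when VMX cannot be used here; the
	 * result is kept in cr1 and only checked (beq below) once the
	 * prefetch streams have been started, falling back to the GPR
	 * copy in that case.
	 */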
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STK_REG(R31)(r1)
	ld	r4,STK_REG(R30)(r1)
	ld	r5,STK_REG(R29)(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32

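	/*
	 * Enhanced dcbt/dcbtst forms: the 0b01000 variants nominate the
	 * start address and stream ID for the load (r6) and store (r9)
	 * streams, the 0b01010 variants supply length and depth (r7, r10),
	 * and the final dcbt on r8 with GO=1 sets the streams going.
	 */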
	dcbt	0,r6,0b01000
	dcbt	0,r7,0b01010
	dcbtst	0,r9,0b01000
	dcbtst	0,r10,0b01010
	eieio
	dcbt	0,r8,0b01010	/* GO */

	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f
	lvx	v1,0,r4
	addi	r4,r4,16
	stvx	v1,0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
	lvx	v1,0,r4
	lvx	v0,r4,r9
	addi	r4,r4,32
	stvx	v1,0,r3
	stvx	v0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
	lvx	v3,0,r4
	lvx	v2,r4,r9
	lvx	v1,r4,r10
	lvx	v0,r4,r11
	addi	r4,r4,64
	stvx	v3,0,r3
	stvx	v2,r3,r9
	stvx	v1,r3,r10
	stvx	v0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112
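	/*
	 * r9-r11, r12 and r14-r16 hold offsets 16..112; together with
	 * offset 0 they index the eight quadwords of a 128B cacheline in
	 * the lvx/stvx loop below.
	 */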

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
	lvx	v7,0,r4
	lvx	v6,r4,r9
	lvx	v5,r4,r10
	lvx	v4,r4,r11
	lvx	v3,r4,r12
	lvx	v2,r4,r14
	lvx	v1,r4,r15
	lvx	v0,r4,r16
	addi	r4,r4,128
	stvx	v7,0,r3
	stvx	v6,r3,r9
	stvx	v5,r3,r10
	stvx	v4,r3,r11
	stvx	v3,r3,r12
	stvx	v2,r3,r14
	stvx	v1,r3,r15
	stvx	v0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
	lvx	v3,0,r4
	lvx	v2,r4,r9
	lvx	v1,r4,r10
	lvx	v0,r4,r11
	addi	r4,r4,64
	stvx	v3,0,r3
	stvx	v2,r3,r9
	stvx	v1,r3,r10
	stvx	v0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
	lvx	v1,0,r4
	lvx	v0,r4,r9
	addi	r4,r4,32
	stvx	v1,0,r3
	stvx	v0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
	lvx	v1,0,r4
	addi	r4,r4,16
	stvx	v1,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	b	exit_vmx_ops		/* tail call optimise */
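	/*
	 * The tail call above relies on LR still pointing at
	 * memcpy_power7's caller; r3 was reloaded with the original
	 * destination so exit_vmx_ops can hand back the usual memcpy
	 * return value.
	 */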

.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r7,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	LVS(v16,0,r4)		/* Setup permute control vector */
	lvx	v0,0,r4
	addi	r4,r4,16
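	/*
	 * Software-pipelined unaligned copy: v16 is the permute control
	 * vector from LVS, v0 always holds the previously loaded source
	 * quadword, and each VPERM merges (previous, current) into one
	 * aligned 16B result. The source pointer therefore runs 16 bytes
	 * ahead and is wound back at label 11.
	 */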

	bf	cr7*4+3,5f
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
	stvx	v8,0,r3
	addi	r3,r3,16
	vor	v0,v1,v1

5:	bf	cr7*4+2,6f
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
	stvx	v8,0,r3
	stvx	v9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
	stvx	v8,0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
	lvx	v7,0,r4
	VPERM(v8,v0,v7,v16)
	lvx	v6,r4,r9
	VPERM(v9,v7,v6,v16)
	lvx	v5,r4,r10
	VPERM(v10,v6,v5,v16)
	lvx	v4,r4,r11
	VPERM(v11,v5,v4,v16)
	lvx	v3,r4,r12
	VPERM(v12,v4,v3,v16)
	lvx	v2,r4,r14
	VPERM(v13,v3,v2,v16)
	lvx	v1,r4,r15
	VPERM(v14,v2,v1,v16)
	lvx	v0,r4,r16
	VPERM(v15,v1,v0,v16)
	addi	r4,r4,128
	stvx	v8,0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	stvx	v12,r3,r12
	stvx	v13,r3,r14
	stvx	v14,r3,r15
	stvx	v15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
	stvx	v8,0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
	stvx	v8,0,r3
	stvx	v9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
	stvx	v8,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	b	exit_vmx_ops		/* tail call optimise */
#endif /* CONFIG_ALTIVEC */