blob: 459fa92a7c5311b02c1e1f21c74536eb27432ed2 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * "memcpy" implementation of SuperH
3 *
4 * Copyright (C) 1999 Niibe Yutaka
5 * Copyright (c) 2002 STMicroelectronics Ltd
6 * Modified from memcpy.S and micro-optimised for SH4
7 * Stuart Menefy (stuart.menefy@st.com)
8 *
9 */
10#include <linux/linkage.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070011
12/*
13 * void *memcpy(void *dst, const void *src, size_t n);
14 *
15 * It is assumed that there is no overlap between src and dst.
16 * If there is an overlap, then the results are undefined.
17 */
18
19 !
20 ! GHIJ KLMN OPQR --> ...G HIJK LMNO PQR.
21 !
22
23 ! Size is 16 or greater, and may have trailing bytes
24
/*
 * .Lcase1 - long-word-at-a-time copy for a source that is misaligned
 * relative to the destination (see the byte diagram above).
 *
 * The copy runs from the end of the buffers towards the start: r0 is the
 * destination pointer (pre-decremented by every store) and r0+r5 addresses
 * the source.  Each iteration loads one long word from the source, merges
 * one byte of the previously loaded long word (kept in r7) with three
 * bytes of the new one using shifts, and stores the result at @-r0.
 * The long word loop ends with the store that was started while
 * r0 <= r4 + 7; whatever is left above r4 is then copied a byte at a time.
 *
 * NOTE(review): presumably entered with r0 long word aligned and
 * (src - dst) % 4 == 1, which would make the long word loads below
 * aligned - confirm against the dispatch code in memcpy.
 *
 * Scratch registers written here: r1, r2, r3, r6, r7.  r4 is only read.
 */
25 .balign 32
26.Lcase1:
27 ! Read a long word and write a long word at once
28 ! At the start of each iteration, r7 contains last long load
 /* Move the source offset back by 1 and prime r7 with the first long word. */
29 add #-1,r5 ! 79 EX
30 mov r4,r2 ! 5 MT (0 cycles latency)
31
32 mov.l @(r0,r5),r7 ! 21 LS (2 cycles latency)
33 add #-4,r5 ! 50 EX
34
 /* r2 = r4 + 7 is the loop bound: cmp/hi r2,r0 sets T while r0 > r2 (unsigned). */
35 add #7,r2 ! 79 EX
36 !
37#ifdef CONFIG_CPU_LITTLE_ENDIAN
38 ! 6 cycles, 4 bytes per iteration
393: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! NMLK
40 mov r7, r3 ! 5 MT (latency=0) ! RQPO
41
42 cmp/hi r2,r0 ! 57 MT
43 shll16 r3 ! 103 EX
44
45 mov r1,r6 ! 5 MT (latency=0)
46 shll8 r3 ! 102 EX ! Oxxx
47
48 shlr8 r6 ! 106 EX ! xNML
49 mov r1, r7 ! 5 MT (latency=0)
50
51 or r6,r3 ! 82 EX ! ONML
 /* bt/s is a delayed branch: the store after it runs whether or not the branch is taken. */
52 bt/s 3b ! 109 BR
53
54 mov.l r3,@-r0 ! 30 LS
55#else
563: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! KLMN
57 mov r7,r3 ! 5 MT (latency=0) ! OPQR
58
59 cmp/hi r2,r0 ! 57 MT
60 shlr16 r3 ! 107 EX
61
62 shlr8 r3 ! 106 EX ! xxxO
63 mov r1,r6 ! 5 MT (latency=0)
64
65 shll8 r6 ! 102 EX ! LMNx
66 mov r1,r7 ! 5 MT (latency=0)
67
68 or r6,r3 ! 82 EX ! LMNO
 /* bt/s is a delayed branch: the store after it runs whether or not the branch is taken. */
69 bt/s 3b ! 109 BR
70
71 mov.l r3,@-r0 ! 30 LS
72#endif
73 ! Finally, copy a byte at once, if necessary
74
 /*
  * Undo the -4 applied before the loop, leaving r5 at its entry value
  * minus 1, and lower the bound to r2 = r4 + 1.  Skip the byte loop
  * entirely when r0 has already reached r4.
  */
75 add #4,r5 ! 50 EX
76 cmp/eq r4,r0 ! 54 MT
77
78 add #-6,r2 ! 50 EX
79 bt 9f ! 109 BR
80
 /* Byte loop: repeats while r0 > r4 + 1 as tested before the (delay slot) store. */
818: cmp/hi r2,r0 ! 57 MT
82 mov.b @(r0,r5),r1 ! 20 LS (latency=2)
83
84 bt/s 8b ! 109 BR
85
86 mov.b r1,@-r0 ! 29 LS
87
889: rts
89 nop
90
91
92 !
93 ! GHIJ KLMN OPQR --> .GHI JKLM NOPQ R...
94 !
95
96 ! Size is 16 or greater, and may have trailing bytes
97
/*
 * .Lcase3 - long-word-at-a-time copy for a source that is misaligned
 * relative to the destination (see the byte diagram above).
 *
 * Same structure as .Lcase1, but the source offset is first moved back
 * by 3 and each stored long word is built from three bytes of the
 * previously loaded long word (r7) and one byte of the newly loaded one.
 * The copy runs from the end of the buffers towards the start: r0 is the
 * destination pointer (pre-decremented by every store) and r0+r5 addresses
 * the source.  After the long word loop the remaining bytes above r4 are
 * copied one at a time.
 *
 * NOTE(review): presumably entered with r0 long word aligned and
 * (src - dst) % 4 == 3, which would make the long word loads below
 * aligned - confirm against the dispatch code in memcpy.
 *
 * Scratch registers written here: r1, r2, r3, r6, r7.  r4 is only read.
 */
98 .balign 32
99.Lcase3:
100 ! Read a long word and write a long word at once
101 ! At the start of each iteration, r7 contains last long load
 /* Move the source offset back by 3 and prime r7 with the first long word. */
102 add #-3,r5 ! 79 EX
103 mov r4,r2 ! 5 MT (0 cycles latency)
104
105 mov.l @(r0,r5),r7 ! 21 LS (2 cycles latency)
106 add #-4,r5 ! 50 EX
107
 /* r2 = r4 + 7 is the loop bound: cmp/hi r2,r0 sets T while r0 > r2 (unsigned). */
108 add #7,r2 ! 79 EX
109 !
110#ifdef CONFIG_CPU_LITTLE_ENDIAN
111 ! 6 cycles, 4 bytes per iteration
1123: mov.l @(r0,r5),r1 ! 21 LS (latency=2) ! NMLK
113 mov r7, r3 ! 5 MT (latency=0) ! RQPO
114
115 cmp/hi r2,r0 ! 57 MT
116 shll8 r3 ! 102 EX ! QPOx
117
118 mov r1,r6 ! 5 MT (latency=0)
119 shlr16 r6 ! 107 EX
120
121 shlr8 r6 ! 106 EX ! xxxN
122 mov r1, r7 ! 5 MT (latency=0)
123
124 or r6,r3 ! 82 EX ! QPON
 /* bt/s is a delayed branch: the store after it runs whether or not the branch is taken. */
125 bt/s 3b ! 109 BR
126
127 mov.l r3,@-r0 ! 30 LS
128#else
 /*
  * Big-endian variant: the new long word is loaded straight into r7
  * (after the old value has been copied to r3), so no separate
  * "mov r1,r7" is needed to carry it into the next iteration.
  */
Hideo Saitoe08b9542008-05-15 13:28:46 +09001293: mov r7,r3 ! OPQR
Linus Torvalds1da177e2005-04-16 15:20:36 -0700130 shlr8 r3 ! xOPQ
Hideo Saitoe08b9542008-05-15 13:28:46 +0900131 mov.l @(r0,r5),r7 ! KLMN
132 mov r7,r6
Linus Torvalds1da177e2005-04-16 15:20:36 -0700133 shll16 r6
134 shll8 r6 ! Nxxx
135 or r6,r3 ! NOPQ
136 cmp/hi r2,r0
137 bt/s 3b
138 mov.l r3,@-r0
139#endif
140
141 ! Finally, copy a byte at once, if necessary
142
 /*
  * -3 and -4 were applied to r5 above; +6 leaves it at its entry value
  * minus 1.  The bound is lowered to r2 = r4 + 1, and the byte loop is
  * skipped entirely when r0 has already reached r4.
  */
143 add #6,r5 ! 50 EX
144 cmp/eq r4,r0 ! 54 MT
145
146 add #-6,r2 ! 50 EX
147 bt 9f ! 109 BR
148
 /* Byte loop: repeats while r0 > r4 + 1 as tested before the (delay slot) store. */
1498: cmp/hi r2,r0 ! 57 MT
150 mov.b @(r0,r5),r1 ! 20 LS (latency=2)
151
152 bt/s 8b ! 109 BR
153
154 mov.b r1,@-r0 ! 29 LS
155
1569: rts
157 nop
158
/*
 * memcpy entry point.
 *
 * Per the prototype comment near the top of the file the arguments are
 * (dst, src, n); the code below reads them from r4, r5 and r6
 * respectively (NOTE(review): presumably the standard SH argument
 * registers - confirm against the ABI).
 *
 * This prologue:
 *   - returns through 99: when n == 0,
 *   - sets r0 = dst + n and r5 = src - dst, so that r0 + r5 addresses the
 *     source data matching destination address r0,
 *   - branches to .Lcase00 when dst, src and n are all multiples of 4,
 *   - otherwise falls through with r1 = 16 for the small-copy test.
 */
159ENTRY(memcpy)
160
161 ! Calculate the invariants which will be used in the remainder
162 ! of the code:
163 !
164 ! r4 --> [ ... ] DST [ ... ] SRC
165 ! [ ... ] [ ... ]
166 ! : :
167 ! r0 --> [ ... ] r0+r5 --> [ ... ]
168 !
169 !
170
171 ! Short circuit the common case of src, dst and len being 32 bit aligned
172 ! and test for zero length move
173
 /* r0 = n | dst | src: its low two bits are clear only if all three are multiples of 4. */
174 mov r6, r0 ! 5 MT (0 cycle latency)
175 or r4, r0 ! 82 EX
176
177 or r5, r0 ! 82 EX
178 tst r6, r6 ! 86 MT
179
 /*
  * bt/s acts on the T bit from "tst r6, r6"; the "tst #3, r0" in its delay
  * slot then sets T for the later "bt/s .Lcase00" (the mov and add
  * instructions in between do not touch T).
  */
180 bt/s 99f ! 111 BR (zero len)
181 tst #3, r0 ! 87 MT
182
 /* r0 = dst + n, one past the end of the destination. */
183 mov r4, r0 ! 5 MT (0 cycle latency)
184 add r6, r0 ! 49 EX
185
186 mov #16, r1 ! 6 EX
187 bt/s .Lcase00 ! 111 BR (aligned)
188
 /* Delay slot, executed on both paths: r5 = src - dst. */
189 sub r4, r5 ! 75 EX
190
191 ! Arguments are not nicely long word aligned or zero len.
192 ! Check for small copies, and if so do a simple byte at a time copy.
193 !
194 ! Deciding on an exact value of 'small' is not easy, as the point at which
195 ! using the optimised routines become worthwhile varies (these are the
196 ! cycle counts for different sizes using byte-at-a-time vs. optimised):
197 ! size byte-at-time long word byte
198 ! 16 42 39-40 46-50 50-55
199 ! 24 58 43-44 54-58 62-67
200 ! 36 82 49-50 66-70 80-85
201 ! However the penalty for getting it 'wrong' is much higher for long word
202 ! aligned data (and this is more common), so use a value of 16.
203
/*
 * Small-copy path of memcpy: fewer than 16 bytes are copied a byte at a
 * time, from the end of the buffers towards the start, two bytes per loop
 * iteration.  Copies of 16 bytes or more leave through 6:.
 *
 * State on arrival (set up by the code after ENTRY(memcpy)):
 *   r0 = dst + len, r5 = src - dst, r6 = len (non-zero), r1 = 16.
 */
 /* T = (16 > len), signed compare. */
204 cmp/gt r6,r1 ! 56 MT
205
 /* r5 = src - dst - 1, so @(r0,r5) is the source byte for the next @-r0 store. */
206 add #-1,r5 ! 50 EX
207 bf/s 6f ! 108 BR (not small)
208
209 mov r5, r3 ! 5 MT (latency=0)
 /* r6 = len / 2 (byte pairs); T = bit shifted out, i.e. len is odd. */
210 shlr r6 ! 104 EX
211
212 mov.b @(r0,r5),r1 ! 20 LS (latency=2)
 /* Even length: enter the loop at its second load.  Delay slot: r3 = r5 - 1. */
213 bf/s 4f ! 111 BR
214
215 add #-1,r3 ! 50 EX
 /* Odd length: store the single odd byte (delay slot); done if no pairs remain. */
216 tst r6, r6 ! 86 MT
217
218 bt/s 98f ! 110 BR
219 mov.b r1,@-r0 ! 29 LS
220
221 ! 4 cycles, 2 bytes per iteration
2223: mov.b @(r0,r5),r1 ! 20 LS (latency=2)
223
2244: mov.b @(r0,r3),r2 ! 20 LS (latency=2)
 /* dt decrements the pair counter and sets T when it reaches zero. */
225 dt r6 ! 67 EX
226
227 mov.b r1,@-r0 ! 29 LS
228 bf/s 3b ! 111 BR
229
230 mov.b r2,@-r0 ! 29 LS
 /*
  * NOTE(review): r0 started at dst + len and was decremented once per
  * byte stored, so it should equal r4 (dst) here - confirm that this is
  * relied on as the return value.
  */
23198:
232 rts
233 nop
234
 /* Zero-length exit: the delay slot loads r0 with r4 (dst). */
23599: rts
236 mov r4, r0
237
238 ! Size is not small, so it's worthwhile looking for optimisations.
239 ! First align destination to a long word boundary.
240 !
241 ! r5 = normal value -1
242
/*
 * Destination alignment and dispatch for copies of 16 bytes or more that
 * did not take the .Lcase00 short cut.
 *
 * 6: copies (r0 & 3) single bytes so that r0, the destination pointer,
 *    becomes a multiple of 4; r6 is decremented once per byte copied.
 * 2: adds 1 back to r5 (it arrives here as "normal value -1") and selects
 *    the bulk copy routine from the low two bits of r5 and from whether
 *    r6 >= 64:
 *
 *      r5 & 3    r6 < 64     r6 >= 64
 *        0       .Lcase0     .Lcase0b
 *        2       .Lcase2     .Lcase2b
 *        1       .Lcase1     .Lcase1
 *        3       .Lcase3     .Lcase3
 */
2436: tst #3, r0 ! 87 MT
244 mov #3, r3 ! 6 EX
245
 /* Already aligned: skip the byte loop.  Delay slot: r3 = r0 & 3 = bytes to copy. */
246 bt/s 2f ! 111 BR
247 and r0,r3 ! 78 EX
248
249 ! 3 cycles, 1 byte per iteration
2501: dt r3 ! 67 EX
251 mov.b @(r0,r5),r1 ! 19 LS (latency=2)
252
253 add #-1, r6 ! 79 EX
254 bf/s 1b ! 109 BR
255
256 mov.b r1,@-r0 ! 28 LS
257
2582: add #1, r5 ! 79 EX
259
260 ! Now select the appropriate bulk transfer code based on relative
261 ! alignment of src and dst.
262
 /*
  * The immediate form of tst is applied to r0, so the destination pointer
  * is parked in r3 while r0 temporarily holds r5.
  */
263 mov r0, r3 ! 5 MT (latency=0)
264
265 mov r5, r0 ! 5 MT (latency=0)
266 tst #1, r0 ! 87 MT
267
268 bf/s 1f ! 111 BR
269 mov #64, r7 ! 6 EX
270
271 ! bit 0 clear
272
 /* T = (r6 >= 64), signed compare. */
273 cmp/ge r7, r6 ! 55 MT
274
 /* The "tst #2, r0" in the delay slot sets T for the bt/s that follows on either path. */
275 bt/s 2f ! 111 BR
276 tst #2, r0 ! 87 MT
277
278 ! small
 /*
  * In each bt/s below the delay slot "mov r3, r0" restores the destination
  * pointer whether or not the branch is taken, so r0 is also valid on the
  * fall-through "bra" paths.
  */
279 bt/s .Lcase0
280 mov r3, r0
281
282 bra .Lcase2
283 nop
284
285 ! big
2862: bt/s .Lcase0b
287 mov r3, r0
288
289 bra .Lcase2b
290 nop
291
292 ! bit 0 set
2931: tst #2, r0 ! 87 MT
294
295 bt/s .Lcase1
296 mov r3, r0
297
298 bra .Lcase3
299 nop
300
301
302 !
303 ! GHIJ KLMN OPQR --> GHIJ KLMN OPQR
304 !
305
306 ! src, dst and size are all long word aligned
307 ! size is non-zero
308
/*
 * .Lcase00 - src, dst and size are all multiples of 4 and size is non-zero
 * (branched to directly from the memcpy prologue).
 *
 * Sizes of 64 bytes or more are handed to .Lcase00b.  Smaller sizes are
 * copied here, from the end of the buffers towards the start, two long
 * words per loop iteration with r6 as the iteration counter.
 *
 * On arrival r0 = dst + len, r5 = src - dst, r6 = len.
 */
309 .balign 32
310.Lcase00:
311 mov #64, r1 ! 6 EX
312 mov r5, r3 ! 5 MT (latency=0)
313
 /* T = (64 > len), signed compare. */
314 cmp/gt r6, r1 ! 56 MT
 /* r5 = src - dst - 4, so @(r0,r5) is the source long word for the next @-r0 store. */
315 add #-4, r5 ! 50 EX
316
 /* Plain bf (no delay slot): the shlr2 below is not executed when the branch is taken. */
317 bf .Lcase00b ! 108 BR (big loop)
 /* r6 = len / 4 = number of long words. */
318 shlr2 r6 ! 105 EX
319
 /* r6 = number of long word pairs; T = bit shifted out, i.e. odd long word count. */
320 shlr r6 ! 104 EX
321 mov.l @(r0, r5), r1 ! 21 LS (latency=2)
322
 /* Even count: enter the loop at its second load.  Delay slot: r3 = src - dst - 8. */
323 bf/s 4f ! 111 BR
324 add #-8, r3 ! 50 EX
325
 /* Odd count: store the single odd long word (delay slot); done if no pairs remain. */
326 tst r6, r6 ! 86 MT
327 bt/s 5f ! 110 BR
328
329 mov.l r1,@-r0 ! 30 LS
330
331 ! 4 cycles, 2 long words per iteration
3323: mov.l @(r0, r5), r1 ! 21 LS (latency=2)
333
3344: mov.l @(r0, r3), r2 ! 21 LS (latency=2)
 /* dt decrements the pair counter and sets T when it reaches zero. */
335 dt r6 ! 67 EX
336
337 mov.l r1, @-r0 ! 30 LS
338 bf/s 3b ! 109 BR
339
340 mov.l r2, @-r0 ! 30 LS
341
3425: rts
343 nop
344
345
346 ! Size is 16 or greater and less than 64, but may have trailing bytes
347
/*
 * .Lcase0 - long word copy for sizes below 64 bytes that may have
 * trailing bytes (see the comment above this label).
 *
 * Works from the end of the buffers towards the start: r0 is the
 * destination pointer, r0+r5 addresses the source, r4 is the start of the
 * destination.  Two long words are copied per loop iteration; the loop
 * ends with the pair of stores started while r0 <= r4 + 11, after which
 * the remaining bytes above r4 are copied one at a time.
 *
 * NOTE(review): r6 is read as the number of bytes still to copy
 * (presumably r0 - r4) - confirm against the dispatch code.
 */
348 .balign 32
349.Lcase0:
 /* r5 moved back by 4 so that @(r0,r5) is the source long word for the next @-r0 store. */
350 add #-4, r5 ! 50 EX
351 mov r4, r7 ! 5 MT (latency=0)
352
353 mov.l @(r0, r5), r1 ! 21 LS (latency=2)
354 mov #4, r2 ! 6 EX
355
 /* r7 = r4 + 11: loop bound for the cmp/hi below. */
356 add #11, r7 ! 50 EX
 /* T = ((r6 & 4) == 0), i.e. an even number of whole long words. */
357 tst r2, r6 ! 86 MT
358
359 mov r5, r3 ! 5 MT (latency=0)
 /* Even count: enter the loop at its second load.  Delay slot: r3 = r5 - 4. */
360 bt/s 4f ! 111 BR
361
362 add #-4, r3 ! 50 EX
 /* Odd count: store one long word first, then fall into the loop. */
363 mov.l r1,@-r0 ! 30 LS
364
365 ! 4 cycles, 2 long words per iteration
3663: mov.l @(r0, r5), r1 ! 21 LS (latency=2)
367
3684: mov.l @(r0, r3), r2 ! 21 LS (latency=2)
 /* T = (r0 > r4 + 11), tested before the two stores of this iteration. */
369 cmp/hi r7, r0
370
371 mov.l r1, @-r0 ! 30 LS
372 bt/s 3b ! 109 BR
373
374 mov.l r2, @-r0 ! 30 LS
375
376 ! Copy the final 0-3 bytes
377
 /* r5 goes from "-4" to "-1" for byte addressing; r7 becomes r4 + 1. */
378 add #3,r5 ! 50 EX
379
380 cmp/eq r0, r4 ! 54 MT
381 add #-10, r7 ! 50 EX
382
383 bt 9f ! 110 BR
384
385 ! 3 cycles, 1 byte per iteration
3861: mov.b @(r0,r5),r1 ! 19 LS
 /* Repeats while r0 > r4 + 1 as tested before the (delay slot) store. */
387 cmp/hi r7,r0 ! 57 MT
388
389 bt/s 1b ! 111 BR
390 mov.b r1,@-r0 ! 28 LS
391
3929: rts
393 nop
394
395 ! Size is at least 64 bytes, so will be going round the big loop at least once.
396 !
397 ! r2 = rounded up r4
398 ! r3 = rounded down r0
399
/*
 * .Lcase0b / .Lcase00b - long word copy for sizes of 64 bytes or more.
 *
 * .Lcase0b moves r5 back by 4 and falls into .Lcase00b (.Lcase00 applies
 * the same adjustment itself before branching to .Lcase00b).
 *
 * Phases, all working from the end of the buffers towards the start:
 *   1. copy long words until r0 is a multiple of 32,
 *   2. copy 32 bytes per iteration until the destination pointer reaches
 *      r2 (r4 rounded up to a multiple of 32); r8-r11 are saved on the
 *      stack around this loop and used as extra data registers,
 *   3. return if r0 == r4, otherwise continue with the trailing bytes in
 *      the code that follows the final 1: label.
 */
400 .balign 32
401.Lcase0b:
402 add #-4, r5 ! 50 EX
403
404.Lcase00b:
405 mov r0, r3 ! 5 MT (latency=0)
406 mov #(~0x1f), r1 ! 6 EX
407
 /* r3 = r0 rounded down to a multiple of 32. */
408 and r1, r3 ! 78 EX
409 mov r4, r2 ! 5 MT (latency=0)
410
 /* T = r0 is already a multiple of 32. */
411 cmp/eq r3, r0 ! 54 MT
412 add #0x1f, r2 ! 50 EX
413
 /* Delay slot (executed on both paths): r2 = r4 rounded up to a multiple of 32. */
414 bt/s 1f ! 110 BR
415 and r1, r2 ! 78 EX
416
417 ! copy initial words until cache line aligned
418
419 mov.l @(r0, r5), r1 ! 21 LS (latency=2)
 /* T = bit 2 of r0 clear, i.e. an even number of long words down to r3. */
420 tst #4, r0 ! 87 MT
421
422 mov r5, r6 ! 5 MT (latency=0)
423 add #-4, r6 ! 50 EX
424
 /*
  * Even count: enter the loop at its second load.  Delay slot: r3 += 8,
  * because the loop compares r0 with r3 before its two stores.
  */
425 bt/s 4f ! 111 BR
426 add #8, r3 ! 50 EX
427
 /*
  * Odd count: one long word is stored in the delay slot below.  With bit 2
  * of r0 set, T from this test means bits 3 and 4 are clear, i.e. that
  * single store reaches the 32-byte boundary.
  */
428 tst #0x18, r0 ! 87 MT
429
430 bt/s 1f ! 109 BR
431 mov.l r1,@-r0 ! 30 LS
432
433 ! 4 cycles, 2 long words per iteration
4343: mov.l @(r0, r5), r1 ! 21 LS (latency=2)
435
4364: mov.l @(r0, r6), r7 ! 21 LS (latency=2)
437 cmp/eq r3, r0 ! 54 MT
438
439 mov.l r1, @-r0 ! 30 LS
440 bf/s 3b ! 109 BR
441
442 mov.l r7, @-r0 ! 30 LS
443
444 ! Copy the cache line aligned blocks
445 !
446 ! In use: r0, r2, r4, r5
447 ! Scratch: r1, r3, r6, r7
448 !
449 ! We could do this with the four scratch registers, but if src
450 ! and dest hit the same cache line, this will thrash, so make
451 ! use of additional registers.
452 !
453 ! We also need r0 as a temporary (for movca), so 'undo' the invariant:
454 ! r5: src (was r0+r5)
455 ! r1: dest (was r0)
456 ! this can be reversed at the end, so we don't need to save any extra
457 ! state.
458 !
 /*
  * r8-r11 are pushed on the stack here and popped after the loop.
  * r5 was (offset - 4); adding r0 and then -0x1c makes it the absolute
  * source address matching destination address r0 - 0x20, i.e. the start
  * of the first 32-byte block to copy.
  */
4591: mov.l r8, @-r15 ! 30 LS
460 add r0, r5 ! 49 EX
461
462 mov.l r9, @-r15 ! 30 LS
463 mov r0, r1 ! 5 MT (latency=0)
464
465 mov.l r10, @-r15 ! 30 LS
466 add #-0x1c, r5 ! 50 EX
467
468 mov.l r11, @-r15 ! 30 LS
469
470 ! 16 cycles, 32 bytes per iteration
 /* Load eight long words from the source block, then store them to the block at r1. */
4712: mov.l @(0x00,r5),r0 ! 18 LS (latency=2)
472 add #-0x20, r1 ! 50 EX
473 mov.l @(0x04,r5),r3 ! 18 LS (latency=2)
474 mov.l @(0x08,r5),r6 ! 18 LS (latency=2)
475 mov.l @(0x0c,r5),r7 ! 18 LS (latency=2)
476 mov.l @(0x10,r5),r8 ! 18 LS (latency=2)
477 mov.l @(0x14,r5),r9 ! 18 LS (latency=2)
478 mov.l @(0x18,r5),r10 ! 18 LS (latency=2)
479 mov.l @(0x1c,r5),r11 ! 18 LS (latency=2)
480 movca.l r0,@r1 ! 40 LS (latency=3-7)
481 mov.l r3,@(0x04,r1) ! 33 LS
482 mov.l r6,@(0x08,r1) ! 33 LS
483 mov.l r7,@(0x0c,r1) ! 33 LS
484
485 mov.l r8,@(0x10,r1) ! 33 LS
486 add #-0x20, r5 ! 50 EX
487
488 mov.l r9,@(0x14,r1) ! 33 LS
 /* Loop until the destination pointer reaches r2 (r4 rounded up to 32). */
489 cmp/eq r2,r1 ! 54 MT
490
491 mov.l r10,@(0x18,r1) ! 33 LS
492 bf/s 2b ! 109 BR
493
494 mov.l r11,@(0x1c,r1) ! 33 LS
495
 /* Restore the invariant: r0 = destination pointer, r5 relative to r0 again (now offset - 0x20). */
496 mov r1, r0 ! 5 MT (latency=0)
497
498 mov.l @r15+, r11 ! 15 LS
499 sub r1, r5 ! 75 EX
500
501 mov.l @r15+, r10 ! 15 LS
 /* T = nothing left to copy. */
502 cmp/eq r4, r0 ! 54 MT
503
504 bf/s 1f ! 109 BR
505 mov.l @r15+, r9 ! 15 LS
506
 /*
  * The instruction at 1: is both the delay slot of this rts (copy
  * complete, r8 restored on the way out) and the target of the bf/s above
  * (more bytes to copy, execution continues after it).
  */
507 rts
5081: mov.l @r15+, r8 ! 15 LS
/*
 * Trailing bytes after the 32-byte block loop of .Lcase0b / .Lcase00b.
 *
 * Reached only when r0 != r4, with r1 == r0 and r5 = (src - dst) - 0x20.
 * The code below computes r1 = r0 - r4 (bytes remaining), puts r5 back to
 * (src - dst) - 4, copies whole long words (one single long word if their
 * count is odd, then two per loop iteration) and finishes with a
 * byte-at-a-time loop for what is left above r4.
 *
 * Registers: r0 destination pointer (pre-decremented by every store),
 * r0+r5 / r0+r3 source addresses, r4 start of destination, r7 loop bound,
 * r1, r2, r6 data/scratch.
 */
509 sub r4, r1 ! 75 EX (len remaining)
510
511 ! number of trailing bytes is non-zero
512 !
513 ! invariants restored (r5 already decremented by 4)
514 ! also r1=num bytes remaining
515
516 mov #4, r2 ! 6 EX
517 mov r4, r7 ! 5 MT (latency=0)
518
519 add #0x1c, r5 ! 50 EX (back to -4)
 /* T = (r1 >= 4), unsigned: at least one whole long word remains. */
520 cmp/hs r2, r1 ! 58 MT
521
 /* Delay slot (executed on both paths): r7 = r4 + 11, the long word loop bound. */
522 bf/s 5f ! 108 BR
523 add #11, r7 ! 50 EX
524
525 mov.l @(r0, r5), r6 ! 21 LS (latency=2)
 /* T = ((r1 & 4) == 0), i.e. an even number of whole long words. */
526 tst r2, r1 ! 86 MT
527
528 mov r5, r3 ! 5 MT (latency=0)
 /* Even count: enter the loop at its second load.  Delay slot: r3 = r5 - 4. */
529 bt/s 4f ! 111 BR
530
531 add #-4, r3 ! 50 EX
 /*
  * Odd count: one long word is stored in the delay slot below.  Continue
  * into the two-long-word loop only if more than 11 bytes remain before
  * that store (T = r0 > r4 + 11), i.e. at least 8 remain after it;
  * otherwise go straight to the byte loop.  This is the same bound the
  * loop itself uses.  Re-testing "r1 >= 4" here would always be true
  * (r1 and r2 are unchanged since the test above) and would force every
  * remaining long word through the byte loop.
  */
532 cmp/hi r7, r0 ! 57 MT
533
534 bf/s 5f ! 108 BR
535 mov.l r6,@-r0 ! 30 LS
536
537 ! 4 cycles, 2 long words per iteration
5383: mov.l @(r0, r5), r6 ! 21 LS (latency=2)
539
5404: mov.l @(r0, r3), r2 ! 21 LS (latency=2)
 /* T = (r0 > r4 + 11), tested before the two stores of this iteration. */
541 cmp/hi r7, r0
542
543 mov.l r6, @-r0 ! 30 LS
544 bt/s 3b ! 109 BR
545
546 mov.l r2, @-r0 ! 30 LS
547
548 ! Copy the final 0-3 bytes
549
 /* r7 becomes r4 + 1; nothing to do if r0 has already reached r4. */
5505: cmp/eq r0, r4 ! 54 MT
550 add #-10, r7 ! 50 EX
551
 /* Plain bt (no delay slot): the add below only runs when bytes remain. */
552 bt 9f ! 110 BR
 /* r5 goes from "-4" to "-1" for byte addressing. */
553 add #3,r5 ! 50 EX
554
555 ! 3 cycles, 1 byte per iteration
5561: mov.b @(r0,r5),r1 ! 19 LS
 /* Repeats while r0 > r4 + 1 as tested before the (delay slot) store. */
557 cmp/hi r7,r0 ! 57 MT
558
559 bt/s 1b ! 111 BR
560 mov.b r1,@-r0 ! 28 LS
561
5639: rts
564 nop
565
566 !
567 ! GHIJ KLMN OPQR --> ..GH IJKL MNOP QR..
568 !
569
570 .balign 32
571.Lcase2:
572 ! Size is 16 or greater and less than 64, but may have trailing bytes
573
/*
 * Body of .Lcase2: copy 4 bytes per iteration as two 16-bit words,
 * working from the end of the buffers towards the start.  r0 is the
 * destination pointer (pre-decremented by every store), r0+r5 and r0+r6
 * address the source, r4 is the start of the destination.  The loop ends
 * with the pair of stores started while r0 <= r4 + 7; the rest is left to
 * the short word / byte tail at 10:, which expects r5 to be 2 below its
 * value on entry here.
 *
 * NOTE(review): presumably entered with r0 long word aligned and
 * (src - dst) % 4 == 2, which would make the word loads aligned - confirm
 * against the dispatch code in memcpy.
 */
5742: mov r5, r6 ! 5 MT (latency=0)
 /* r5 = entry value - 2: source offset for the first word of each pair. */
575 add #-2,r5 ! 50 EX
576
577 mov r4,r2 ! 5 MT (latency=0)
 /* r6 = entry r5 - 4: source offset for the second word of each pair. */
578 add #-4,r6 ! 50 EX
579
 /* r2 = r4 + 7: loop bound for the cmp/hi below. */
580 add #7,r2 ! 50 EX
5813: mov.w @(r0,r5),r1 ! 20 LS (latency=2)
582
583 mov.w @(r0,r6),r3 ! 20 LS (latency=2)
 /* T = (r0 > r4 + 7), tested before the two stores of this iteration. */
584 cmp/hi r2,r0 ! 57 MT
585
586 mov.w r1,@-r0 ! 29 LS
587 bt/s 3b ! 111 BR
588
589 mov.w r3,@-r0 ! 29 LS
590
591 bra 10f
592 nop
593
594
/*
 * .Lcase2b - copy of 64 bytes or more where the source is offset from the
 * destination by half a long word (see the byte diagram above .Lcase2).
 *
 * Phases, all working from the end of the buffers towards the start:
 *   1. copy 16-bit words until r0 is a multiple of 32,
 *   2. copy 32 bytes per iteration until the destination pointer reaches
 *      r2 (r4 rounded up to a multiple of 32).  Each destination long word
 *      is assembled with xtrct from two neighbouring source loads; r8-r12
 *      are saved on the stack around this loop and used as extra data
 *      registers,
 *   3. return if r0 == r4, otherwise fall through to the short word / byte
 *      tail at 10: with r5 restored to "invariant - 2".
 *
 * NOTE(review): presumably entered with r0 long word aligned and
 * (src - dst) % 4 == 2 - confirm against the dispatch code in memcpy.
 */
595 .balign 32
596.Lcase2b:
597 ! Size is at least 64 bytes, so will be going round the big loop at least once.
598 !
599 ! r2 = rounded up r4
600 ! r3 = rounded down r0
601
602 mov r0, r3 ! 5 MT (latency=0)
603 mov #(~0x1f), r1 ! 6 EX
604
605 and r1, r3 ! 78 EX
606 mov r4, r2 ! 5 MT (latency=0)
607
 /* T = r0 is already a multiple of 32. */
608 cmp/eq r3, r0 ! 54 MT
609 add #0x1f, r2 ! 50 EX
610
611 add #-2, r5 ! 50 EX
 /* Delay slot (executed on both paths): r2 = r4 rounded up to a multiple of 32. */
612 bt/s 1f ! 110 BR
613 and r1, r2 ! 78 EX
614
615 ! Copy a short word one at a time until we are cache line aligned
616 ! Normal values: r0, r2, r3, r4
617 ! Unused: r1, r6, r7
618 ! Mod: r5 (=r5-2)
619 !
 /* r3 += 2 because the loop compares r0 with r3 before its (delay slot) store. */
620 add #2, r3 ! 50 EX
621
6222: mov.w @(r0,r5),r1 ! 20 LS (latency=2)
623 cmp/eq r3,r0 ! 54 MT
624
625 bf/s 2b ! 111 BR
626
627 mov.w r1,@-r0 ! 29 LS
628
629 ! Copy the cache line aligned blocks
630 !
631 ! In use: r0, r2, r4, r5 (=r5-2)
632 ! Scratch: r1, r3, r6, r7
633 !
634 ! We could do this with the four scratch registers, but if src
635 ! and dest hit the same cache line, this will thrash, so make
636 ! use of additional registers.
637 !
638 ! We also need r0 as a temporary (for movca), so 'undo' the invariant:
639 ! r5: src (was r0+r5)
640 ! r1: dest (was r0)
641 ! this can be reversed at the end, so we don't need to save any extra
642 ! state.
643 !
 /*
  * r8-r12 are pushed on the stack here and popped after the loop.
  * r5 was (offset - 2); adding r0 and then -0x1e makes it the absolute
  * source address matching destination address r0 - 0x20.
  */
6441: mov.l r8, @-r15 ! 30 LS
645 add r0, r5 ! 49 EX
646
647 mov.l r9, @-r15 ! 30 LS
648 mov r0, r1 ! 5 MT (latency=0)
649
650 mov.l r10, @-r15 ! 30 LS
651 add #-0x1e, r5 ! 50 EX
652
653 mov.l r11, @-r15 ! 30 LS
654
655 mov.l r12, @-r15 ! 30 LS
656
657 ! 17 cycles, 32 bytes per iteration
658#ifdef CONFIG_CPU_LITTLE_ENDIAN
 /*
  * Little-endian: read one word, seven long words and one more word
  * (32 bytes) with post-increment, moving upwards through the source
  * block; xtrct then combines each pair of neighbouring loads into one
  * destination long word.
  */
6592: mov.w @r5+, r0 ! 14 LS (latency=2) ..JI
660 add #-0x20, r1 ! 50 EX
661
662 mov.l @r5+, r3 ! 15 LS (latency=2) NMLK
663
664 mov.l @r5+, r6 ! 15 LS (latency=2) RQPO
665 shll16 r0 ! 103 EX JI..
666
667 mov.l @r5+, r7 ! 15 LS (latency=2)
668 xtrct r3, r0 ! 48 EX LKJI
669
670 mov.l @r5+, r8 ! 15 LS (latency=2)
671 xtrct r6, r3 ! 48 EX PONM
672
673 mov.l @r5+, r9 ! 15 LS (latency=2)
674 xtrct r7, r6 ! 48 EX
675
676 mov.l @r5+, r10 ! 15 LS (latency=2)
677 xtrct r8, r7 ! 48 EX
678
679 mov.l @r5+, r11 ! 15 LS (latency=2)
680 xtrct r9, r8 ! 48 EX
681
682 mov.w @r5+, r12 ! 15 LS (latency=2)
683 xtrct r10, r9 ! 48 EX
684
685 movca.l r0,@r1 ! 40 LS (latency=3-7)
686 xtrct r11, r10 ! 48 EX
687
688 mov.l r3, @(0x04,r1) ! 33 LS
689 xtrct r12, r11 ! 48 EX
690
691 mov.l r6, @(0x08,r1) ! 33 LS
692
693 mov.l r7, @(0x0c,r1) ! 33 LS
694
695 mov.l r8, @(0x10,r1) ! 33 LS
 /* The post-increments advanced r5 by 0x20; -0x40 leaves it 0x20 below where this iteration started. */
696 add #-0x40, r5 ! 50 EX
697
698 mov.l r9, @(0x14,r1) ! 33 LS
699 cmp/eq r2,r1 ! 54 MT
700
701 mov.l r10, @(0x18,r1) ! 33 LS
702 bf/s 2b ! 109 BR
703
704 mov.l r11, @(0x1c,r1) ! 33 LS
705#else
 /*
  * Big-endian: the source block is read from its top word downwards using
  * displacement addressing, and the destination block is written from its
  * last long word (with movca.l) down to its first.  The -2 and -0x1e
  * applied to r5 and the -4 and -0x1c applied to r1 each add up to -0x20
  * per iteration.
  */
7062: mov.w @(0x1e,r5), r0 ! 17 LS (latency=2)
707 add #-2, r5 ! 50 EX
708
709 mov.l @(0x1c,r5), r3 ! 18 LS (latency=2)
710 add #-4, r1 ! 50 EX
711
712 mov.l @(0x18,r5), r6 ! 18 LS (latency=2)
713 shll16 r0 ! 103 EX
714
715 mov.l @(0x14,r5), r7 ! 18 LS (latency=2)
716 xtrct r3, r0 ! 48 EX
717
718 mov.l @(0x10,r5), r8 ! 18 LS (latency=2)
719 xtrct r6, r3 ! 48 EX
720
721 mov.l @(0x0c,r5), r9 ! 18 LS (latency=2)
722 xtrct r7, r6 ! 48 EX
723
724 mov.l @(0x08,r5), r10 ! 18 LS (latency=2)
725 xtrct r8, r7 ! 48 EX
726
727 mov.l @(0x04,r5), r11 ! 18 LS (latency=2)
728 xtrct r9, r8 ! 48 EX
729
Nobuhiro Iwamatsuc7afb7e2006-09-27 17:50:03 +0900730 mov.l @(0x00,r5), r12 ! 18 LS (latency=2)
731 xtrct r10, r9 ! 48 EX
Linus Torvalds1da177e2005-04-16 15:20:36 -0700732
733 movca.l r0,@r1 ! 40 LS (latency=3-7)
734 add #-0x1c, r1 ! 50 EX
735
Hideo Saitoe08b9542008-05-15 13:28:46 +0900736 mov.l r3, @(0x18,r1) ! 33 LS
Linus Torvalds1da177e2005-04-16 15:20:36 -0700737 xtrct r11, r10 ! 48 EX
738
Hideo Saitoe08b9542008-05-15 13:28:46 +0900739 mov.l r6, @(0x14,r1) ! 33 LS
Linus Torvalds1da177e2005-04-16 15:20:36 -0700740 xtrct r12, r11 ! 48 EX
741
Hideo Saitoe08b9542008-05-15 13:28:46 +0900742 mov.l r7, @(0x10,r1) ! 33 LS
Linus Torvalds1da177e2005-04-16 15:20:36 -0700743
Hideo Saitoe08b9542008-05-15 13:28:46 +0900744 mov.l r8, @(0x0c,r1) ! 33 LS
745 add #-0x1e, r5 ! 50 EX
Linus Torvalds1da177e2005-04-16 15:20:36 -0700746
Hideo Saitoe08b9542008-05-15 13:28:46 +0900747 mov.l r9, @(0x08,r1) ! 33 LS
Linus Torvalds1da177e2005-04-16 15:20:36 -0700748 cmp/eq r2,r1 ! 54 MT
749
Hideo Saitoe08b9542008-05-15 13:28:46 +0900750 mov.l r10, @(0x04,r1) ! 33 LS
Linus Torvalds1da177e2005-04-16 15:20:36 -0700751 bf/s 2b ! 109 BR
752
Hideo Saitoe08b9542008-05-15 13:28:46 +0900753 mov.l r11, @(0x00,r1) ! 33 LS
Linus Torvalds1da177e2005-04-16 15:20:36 -0700754#endif
755
 /* Pop r12-r8 and restore the invariant: r0 = destination pointer, r5 relative to r0 (offset - 0x20). */
756 mov.l @r15+, r12
757 mov r1, r0 ! 5 MT (latency=0)
758
759 mov.l @r15+, r11 ! 15 LS
760 sub r1, r5 ! 75 EX
761
762 mov.l @r15+, r10 ! 15 LS
 /* T = nothing left to copy. */
763 cmp/eq r4, r0 ! 54 MT
764
765 bf/s 1f ! 109 BR
766 mov.l @r15+, r9 ! 15 LS
767
 /*
  * The instruction at 1: is both the delay slot of this rts (copy
  * complete) and the target of the bf/s above (more bytes to copy).
  */
768 rts
7691: mov.l @r15+, r8 ! 15 LS
770
 /* r5 back from (offset - 0x20) to (offset - 2), as the tail at 10: requires. */
771 add #0x1e, r5 ! 50 EX
772
773 ! Finish off a short word at a time
774 ! r5 must be invariant - 2
/*
 * Shared tail for .Lcase2 and .Lcase2b: copy the bytes still left above
 * r4, first as 16-bit words and then, if one byte remains, as a single
 * byte.  r0 is the destination pointer (pre-decremented by every store),
 * r4 the start of the destination, and r5 must be "invariant - 2" on
 * arrival so that @(r0,r5) is the source word for the next @-r0 store.
 */
 /* r2 = r4 + 1: skip the word loop unless r0 > r4 + 1, i.e. at least 2 bytes remain. */
77510: mov r4,r2 ! 5 MT (latency=0)
776 add #1,r2 ! 50 EX
777
778 cmp/hi r2, r0 ! 57 MT
779 bf/s 1f ! 109 BR
780
 /* Delay slot (executed on both paths): r2 = r4 + 3, the word loop bound. */
781 add #2, r2 ! 50 EX
782
 /* Word loop: repeats while r0 > r4 + 3 as tested before the (delay slot) store. */
7833: mov.w @(r0,r5),r1 ! 20 LS
784 cmp/hi r2,r0 ! 57 MT
785
786 bt/s 3b ! 109 BR
787
788 mov.w r1,@-r0 ! 29 LS
7891:
790
791 !
792 ! Finally, copy the last byte if necessary
 /*
  * If r0 == r4 the copy is complete: branch back to the "rts; nop" at the
  * preceding 9: label.  The delay slot moves r5 from "-2" to "-1" for the
  * byte load; the final store sits in the rts delay slot.
  */
793 cmp/eq r4,r0 ! 54 MT
794 bt/s 9b
795 add #1,r5
796 mov.b @(r0,r5),r1
797 rts
798 mov.b r1,@-r0
799