/*
 * "memcpy" implementation of SuperH
 *
 * Copyright (C) 1999  Niibe Yutaka
 * Copyright (c) 2002  STMicroelectronics Ltd
 *   Modified from memcpy.S and micro-optimised for SH4
 *   Stuart Menefy (stuart.menefy@st.com)
 *
 */
10#include <linux/linkage.h>
11#include <linux/config.h>
12
/*
 * void *memcpy(void *dst, const void *src, size_t n);
 *
 * It is assumed that there is no overlap between src and dst.
 * If there is an overlap, then the results are undefined.
 */
19
/*
 * .Lcase1 - bulk copy for (src - dst) & 3 == 1.
 *
 * Reached from the dispatch code in memcpy with:
 *   r0 = long word aligned end-of-destination pointer (the copy runs
 *        downwards, towards r4)
 *   r4 = dst
 *   r5 = src - dst, so r0 + r5 is the source address matching r0
 *
 * The main loop reads the source with aligned mov.l only and builds each
 * destination long word from two neighbouring source long words.
 * Scratch: r1, r2, r3, r6, r7.  r5 is adjusted.  Returns with r0 == r4.
 */
	!
	!	GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
	!

	! Size is 16 or greater, and may have trailing bytes

	.balign	32
.Lcase1:
	! Read a long word and write a long word at once
	! At the start of each iteration, r7 contains last long load
	add	#-1,r5		! 79 EX		(r0+r5 is now long aligned)
	mov	r4,r2		!  5 MT (0 cycles latency)

	mov.l	@(r0,r5),r7	! 21 LS (2 cycles latency)
	add	#-4,r5		! 50 EX		(r0+r5 = next long down)

	add	#7,r2		! 79 EX		(r2 = dst + 7, loop limit)
	!
#ifdef CONFIG_CPU_LITTLE_ENDIAN
	! 6 cycles, 4 bytes per iteration
3:	mov.l	@(r0,r5),r1	! 21 LS (latency=2)	! NMLK
	mov	r7, r3		!  5 MT (latency=0)	! RQPO

	cmp/hi	r2,r0		! 57 MT		(>= 4 bytes left after this store?)
	shll16	r3		! 103 EX

	mov	r1,r6		!  5 MT (latency=0)
	shll8	r3		! 102 EX		! Oxxx

	shlr8	r6		! 106 EX		! xNML
	mov	r1, r7		!  5 MT (latency=0)

	or	r6,r3		! 82 EX			! ONML
	bt/s	3b		! 109 BR

	 mov.l	r3,@-r0		! 30 LS		(delay slot)
#else
3:	mov.l	@(r0,r5),r1	! 21 LS (latency=2)	! KLMN
	mov	r7,r3		!  5 MT (latency=0)	! OPQR

	cmp/hi	r2,r0		! 57 MT		(>= 4 bytes left after this store?)
	shlr16	r3		! 107 EX

	shlr8	r3		! 106 EX		! xxxO
	mov	r1,r6		!  5 MT (latency=0)

	shll8	r6		! 102 EX		! LMNx
	mov	r1,r7		!  5 MT (latency=0)

	or	r6,r3		! 82 EX			! LMNO
	bt/s	3b		! 109 BR

	 mov.l	r3,@-r0		! 30 LS		(delay slot)
#endif
	! Finally, copy a byte at once, if necessary

	add	#4,r5		! 50 EX		(r0+r5 = source byte for r0-1)
	cmp/eq	r4,r0		! 54 MT		(nothing left?)

	add	#-6,r2		! 50 EX		(r2 = dst + 1)
	bt	9f		! 109 BR

8:	cmp/hi	r2,r0		! 57 MT		(more bytes after this one?)
	mov.b	@(r0,r5),r1	! 20 LS (latency=2)

	bt/s	8b		! 109 BR

	 mov.b	r1,@-r0		! 29 LS		(delay slot)

9:	rts
	 nop
91
92
/*
 * .Lcase3 - bulk copy for (src - dst) & 3 == 3.
 *
 * Reached from the dispatch code in memcpy with:
 *   r0 = long word aligned end-of-destination pointer (the copy runs
 *        downwards, towards r4)
 *   r4 = dst
 *   r5 = src - dst, so r0 + r5 is the source address matching r0
 *
 * The main loop reads the source with aligned mov.l only and builds each
 * destination long word from two neighbouring source long words.  In both
 * endian variants r7 carries the previously loaded source long word into
 * the next iteration; the preamble below primes it.
 * Scratch: r1, r2, r3, r6, r7.  r5 is adjusted.  Returns with r0 == r4.
 */
	!
	!	GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
	!

	! Size is 16 or greater, and may have trailing bytes

	.balign	32
.Lcase3:
	! Read a long word and write a long word at once
	! At the start of each iteration, r7 contains last long load
	add	#-3,r5		! 79 EX		(r0+r5 is now long aligned)
	mov	r4,r2		!  5 MT (0 cycles latency)

	mov.l	@(r0,r5),r7	! 21 LS (2 cycles latency)
	add	#-4,r5		! 50 EX		(r0+r5 = next long down)

	add	#7,r2		! 79 EX		(r2 = dst + 7, loop limit)
	!
#ifdef CONFIG_CPU_LITTLE_ENDIAN
	! 6 cycles, 4 bytes per iteration
3:	mov.l	@(r0,r5),r1	! 21 LS (latency=2)	! NMLK
	mov	r7, r3		!  5 MT (latency=0)	! RQPO

	cmp/hi	r2,r0		! 57 MT		(>= 4 bytes left after this store?)
	shll8	r3		! 102 EX		! QPOx

	mov	r1,r6		!  5 MT (latency=0)
	shlr16	r6		! 107 EX

	shlr8	r6		! 106 EX		! xxxN
	mov	r1, r7		!  5 MT (latency=0)

	or	r6,r3		! 82 EX			! QPON
	bt/s	3b		! 109 BR

	 mov.l	r3,@-r0		! 30 LS		(delay slot)
#else
	! The previous long word must come from r7: that is where the
	! preamble put it.  Starting from r1 would merge whatever the
	! dispatch code last left in r1 into the first long word stored.
3:	mov	r7,r3		! OPQR
	shlr8	r3		! xOPQ
	mov.l	@(r0,r5),r7	! KLMN		(kept in r7 for next iteration)
	mov	r7,r6
	shll16	r6
	shll8	r6		! Nxxx
	or	r6,r3		! NOPQ
	cmp/hi	r2,r0		! (>= 4 bytes left after this store?)
	bt/s	3b
	 mov.l	r3,@-r0		! (delay slot)
#endif

	! Finally, copy a byte at once, if necessary

	add	#6,r5		! 50 EX		(r0+r5 = source byte for r0-1)
	cmp/eq	r4,r0		! 54 MT		(nothing left?)

	add	#-6,r2		! 50 EX		(r2 = dst + 1)
	bt	9f		! 109 BR

8:	cmp/hi	r2,r0		! 57 MT		(more bytes after this one?)
	mov.b	@(r0,r5),r1	! 20 LS (latency=2)

	bt/s	8b		! 109 BR

	 mov.b	r1,@-r0		! 29 LS		(delay slot)

9:	rts
	 nop
159
/*
 * memcpy entry point and dispatcher.
 *
 * In:   r4 = dst, r5 = src, r6 = n
 * Out:  r0 = dst (the zero-length exit loads r4 into r0; every other path
 *       copies downwards until r0 has come back to r4)
 *
 * The code here handles zero length and small unaligned copies itself and
 * otherwise branches to one of the bulk routines (.Lcase00, .Lcase0,
 * .Lcase0b, .Lcase1, .Lcase2, .Lcase2b, .Lcase3) chosen by the size and by
 * the low two bits of (src - dst).
 */
ENTRY(memcpy)

	! Calculate the invariants which will be used in the remainder
	! of the code:
	!
	!      r4   -->  [ ...  ] DST             [ ...  ] SRC
	!	         [ ...  ]                 [ ...  ]
	!	           :                        :
	!      r0   -->  [ ...  ]       r0+r5 --> [ ...  ]
	!
	!

	! Short circuit the common case of src, dst and len being 32 bit aligned
	! and test for zero length move

	mov	r6, r0		!   5 MT (0 cycle latency)
	or	r4, r0		!  82 EX

	or	r5, r0		!  82 EX
	tst	r6, r6		!  86 MT

	bt/s	99f		! 111 BR		(zero len)
	 tst	#3, r0		!  87 MT	(delay slot: T = all three aligned)

	mov	r4, r0		!   5 MT (0 cycle latency)
	add	r6, r0		!  49 EX	(r0 = dst + len)

	mov	#16, r1		!   6 EX
	bt/s	.Lcase00	! 111 BR		(aligned)

	 sub	r4, r5		!  75 EX	(delay slot: r5 = src - dst)

	! Arguments are not nicely long word aligned or zero len.
	! Check for small copies, and if so do a simple byte at a time copy.
	!
	! Deciding on an exact value of 'small' is not easy, as the point at which
	! using the optimised routines become worthwhile varies (these are the
	! cycle counts for different sizes using byte-at-a-time vs. optimised):
	!	size	byte-at-time	long	word	byte
	!	16	42		39-40	46-50	50-55
	!	24	58		43-44	54-58	62-67
	!	36	82		49-50	66-70	80-85
	! However the penalty for getting it 'wrong' is much higher for long word
	! aligned data (and this is more common), so use a value of 16.

	cmp/gt	r6,r1		!  56 MT	(T = len < 16)

	add	#-1,r5		!  50 EX	(r0+r5 = source byte for r0-1)
	bf/s	6f		! 108 BR		(not small)

	 mov	r5, r3		!   5 MT (latency=0)	(delay slot)
	shlr	r6		! 104 EX	(r6 = byte pairs, T = len odd)

	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
	bf/s	4f		! 111 BR	(even: straight into pair loop)

	 add	#-1,r3		!  50 EX	(delay slot: r3 = r5 - 1)
	tst	r6, r6		!  86 MT

	bt/s	98f		! 110 BR	(len was 1: done after the store)
	 mov.b	r1,@-r0		!  29 LS	(delay slot: store the odd byte)

	! 4 cycles, 2 bytes per iteration
3:	mov.b	@(r0,r5),r1	!  20 LS (latency=2)

4:	mov.b	@(r0,r3),r2	!  20 LS (latency=2)
	dt	r6		!  67 EX

	mov.b	r1,@-r0		!  29 LS
	bf/s	3b		! 111 BR

	 mov.b	r2,@-r0		!  29 LS	(delay slot)
98:
	rts
	 nop

99:	rts
	 mov	r4, r0		! (delay slot: return dst)

	! Size is not small, so it's worthwhile looking for optimisations.
	! First align destination to a long word boundary.
	!
	! r5 = normal value -1

6:	tst	#3, r0		!  87 MT
	mov	#3, r3		!   6 EX

	bt/s	2f		! 111 BR	(r0 already aligned)
	 and	r0,r3		!  78 EX	(delay slot: r3 = bytes to align)

	! 3 cycles, 1 byte per iteration
1:	dt	r3		!  67 EX
	mov.b	@(r0,r5),r1	!  19 LS (latency=2)

	add	#-1, r6		!  79 EX
	bf/s	1b		! 109 BR

	 mov.b	r1,@-r0		!  28 LS	(delay slot)

2:	add	#1, r5		!  79 EX	(r5 back to src - dst)

	! Now select the appropriate bulk transfer code based on relative
	! alignment of src and dst.

	mov	r0, r3		!   5 MT (latency=0)	(park r0: tst #imm needs r0)

	mov	r5, r0		!   5 MT (latency=0)
	tst	#1, r0		!  87 MT

	bf/s	1f		! 111 BR
	 mov	#64, r7		!   6 EX	(delay slot)

	! bit 0 clear

	cmp/ge	r7, r6		!  55 MT	(T = remaining length >= 64)

	bt/s	2f		! 111 BR
	 tst	#2, r0		!  87 MT	(delay slot: T = bit 1 clear)

	! small
	bt/s	.Lcase0
	 mov	r3, r0		! (delay slot: restore r0)

	bra	.Lcase2
	 nop

	! big
2:	bt/s	.Lcase0b
	 mov	r3, r0		! (delay slot: restore r0)

	bra	.Lcase2b
	 nop

	! bit 0 set
1:	tst	#2, r0		!  87 MT

	bt/s	.Lcase1
	 mov	r3, r0		! (delay slot: restore r0)

	bra	.Lcase3
	 nop
301
302
/*
 * .Lcase00 - src, dst and length are all long word aligned, length != 0.
 *
 * Reached from the memcpy entry code with:
 *   r0 = dst + len, r4 = dst, r5 = src - dst, r6 = len
 * Lengths of 64 bytes or more are handed to .Lcase00b (with r5 already
 * reduced by 4, as that code expects); shorter ones are copied here two
 * long words per iteration.
 * Scratch: r1, r2, r3.  r5 and r6 are modified.  Returns with r0 == r4.
 */
	!
	!	GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
	!

	! src, dst and size are all long word aligned
	! size is non-zero

	.balign	32
.Lcase00:
	mov	#64, r1		!   6 EX
	mov	r5, r3		!   5 MT (latency=0)

	cmp/gt	r6, r1		!  56 MT	(T = len < 64)
	add	#-4, r5		!  50 EX	(r0+r5 = source long for r0-4)

	bf	.Lcase00b	! 108 BR		(big loop)
	shlr2	r6		! 105 EX	(r6 = number of long words)

	shlr	r6		! 104 EX	(r6 = long word pairs, T = odd count)
	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

	bf/s	4f		! 111 BR	(even: straight into pair loop)
	 add	#-8, r3		!  50 EX	(delay slot: r0+r3 = source long for r0-8)

	tst	r6, r6		!  86 MT
	bt/s	5f		! 110 BR	(only one long word: done after the store)

	 mov.l	r1,@-r0		!  30 LS	(delay slot: store the odd long word)

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
	dt	r6		!  67 EX

	mov.l	r1, @-r0	!  30 LS
	bf/s	3b		! 109 BR

	 mov.l	r2, @-r0	!  30 LS	(delay slot)

5:	rts
	 nop
345
346
/*
 * .Lcase0 - (src - dst) & 3 == 0, fewer than 64 bytes remaining.
 *
 * Reached from the dispatch code in memcpy with:
 *   r0 = long word aligned end-of-destination pointer, r4 = dst,
 *   r5 = src - dst, r6 = bytes remaining (r0 - r4)
 * Copies long words downwards, two per iteration, then finishes the last
 * 0-3 bytes one at a time.
 * Scratch: r1, r2, r3, r7.  r5 is adjusted.  Returns with r0 == r4.
 */
	! Size is 16 or greater and less than 64, but may have trailing bytes

	.balign	32
.Lcase0:
	add	#-4, r5		!  50 EX	(r0+r5 = source long for r0-4)
	mov	r4, r7		!   5 MT (latency=0)

	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
	mov	#4, r2		!   6 EX

	add	#11, r7		!  50 EX	(r7 = dst + 11, loop limit)
	tst	r2, r6		!  86 MT	(T = even number of long words)

	mov	r5, r3		!   5 MT (latency=0)
	bt/s	4f		! 111 BR

	 add	#-4, r3		!  50 EX	(delay slot: r0+r3 = source long for r0-8)
	mov.l	r1,@-r0		!  30 LS	(store the odd long word)

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
	cmp/hi	r7, r0		! (another pair left after these two stores?)

	mov.l	r1, @-r0	!  30 LS
	bt/s	3b		! 109 BR

	 mov.l	r2, @-r0	!  30 LS	(delay slot)

	! Copy the final 0-3 bytes

	add	#3,r5		!  50 EX	(r0+r5 = source byte for r0-1)

	cmp/eq	r0, r4		!  54 MT	(nothing left?)
	add	#-10, r7	!  50 EX	(r7 = dst + 1)

	bt	9f		! 110 BR

	! 3 cycles, 1 byte per iteration
1:	mov.b	@(r0,r5),r1	!  19 LS
	cmp/hi	r7,r0		!  57 MT	(more bytes after this one?)

	bt/s	1b		! 111 BR
	 mov.b	r1,@-r0		!  28 LS	(delay slot)

9:	rts
	 nop
395
/*
 * .Lcase0b / .Lcase00b - (src - dst) & 3 == 0, at least 64 bytes remaining.
 *
 * .Lcase0b is reached from the dispatch code in memcpy with r5 = src - dst;
 * .Lcase00b is reached from .Lcase00 with r5 = src - dst - 4.  In both cases
 * r0 = long word aligned end-of-destination pointer and r4 = dst.
 *
 * Three phases, all working downwards:
 *   1. long word copies until r0 is 32-byte (cache line) aligned,
 *   2. 32 bytes per iteration using movca.l for the first store to each
 *      destination line, down to r2 = dst rounded up to 32,
 *   3. remaining long words, then the final 0-3 bytes.
 * r8-r11 are saved on the stack around phase 2 and restored afterwards.
 * Scratch: r1, r2, r3, r6, r7.  Returns with r0 == r4.
 */
	! Size is at least 64 bytes, so will be going round the big loop at least once.
	!
	!   r2 = rounded up r4
	!   r3 = rounded down r0

	.balign	32
.Lcase0b:
	add	#-4, r5		!  50 EX	(r0+r5 = source long for r0-4)

.Lcase00b:
	mov	r0, r3		!   5 MT (latency=0)
	mov	#(~0x1f), r1	!   6 EX

	and	r1, r3		!  78 EX
	mov	r4, r2		!   5 MT (latency=0)

	cmp/eq	r3, r0		!  54 MT	(r0 already cache line aligned?)
	add	#0x1f, r2	!  50 EX

	bt/s	1f		! 110 BR
	 and	r1, r2		!  78 EX	(delay slot)

	! copy initial words until cache line aligned

	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
	tst	#4, r0		!  87 MT	(T = even number of long words to go)

	mov	r5, r6		!   5 MT (latency=0)
	add	#-4, r6		!  50 EX	(r0+r6 = source long for r0-8)

	bt/s	4f		! 111 BR
	 add	#8, r3		!  50 EX	(delay slot: pair loop stops at r0 == r3)

	tst	#0x18, r0	!  87 MT	(only the one odd long word to do?)

	bt/s	1f		! 109 BR
	 mov.l	r1,@-r0		!  30 LS	(delay slot: store the odd long word)

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

4:	mov.l	@(r0, r6), r7	!  21 LS (latency=2)
	cmp/eq	r3, r0		!  54 MT

	mov.l	r1, @-r0	!  30 LS
	bf/s	3b		! 109 BR

	 mov.l	r7, @-r0	!  30 LS	(delay slot)

	! Copy the cache line aligned blocks
	!
	! In use: r0, r2, r4, r5
	! Scratch: r1, r3, r6, r7
	!
	! We could do this with the four scratch registers, but if src
	! and dest hit the same cache line, this will thrash, so make
	! use of additional registers.
	!
	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
	!   r5:	 src (was r0+r5)
	!   r1:	 dest (was r0)
	! this can be reversed at the end, so we don't need to save any extra
	! state.
	!
1:	mov.l	r8, @-r15	!  30 LS
	add	r0, r5		!  49 EX

	mov.l	r9, @-r15	!  30 LS
	mov	r0, r1		!   5 MT (latency=0)

	mov.l	r10, @-r15	!  30 LS
	add	#-0x1c, r5	!  50 EX	(r5 = start of the source block)

	mov.l	r11, @-r15	!  30 LS

	! 16 cycles, 32 bytes per iteration
2:	mov.l	@(0x00,r5),r0	! 18 LS (latency=2)
	add	#-0x20, r1	! 50 EX
	mov.l	@(0x04,r5),r3	! 18 LS (latency=2)
	mov.l	@(0x08,r5),r6	! 18 LS (latency=2)
	mov.l	@(0x0c,r5),r7	! 18 LS (latency=2)
	mov.l	@(0x10,r5),r8	! 18 LS (latency=2)
	mov.l	@(0x14,r5),r9	! 18 LS (latency=2)
	mov.l	@(0x18,r5),r10	! 18 LS (latency=2)
	mov.l	@(0x1c,r5),r11	! 18 LS (latency=2)
	movca.l	r0,@r1		! 40 LS (latency=3-7)
	mov.l	r3,@(0x04,r1)	! 33 LS
	mov.l	r6,@(0x08,r1)	! 33 LS
	mov.l	r7,@(0x0c,r1)	! 33 LS

	mov.l	r8,@(0x10,r1)	! 33 LS
	add	#-0x20, r5	! 50 EX

	mov.l	r9,@(0x14,r1)	! 33 LS
	cmp/eq	r2,r1		! 54 MT

	mov.l	r10,@(0x18,r1)	!  33 LS
	bf/s	2b		! 109 BR

	 mov.l	r11,@(0x1c,r1)	!  33 LS	(delay slot)

	mov	r1, r0		!   5 MT (latency=0)

	mov.l	@r15+, r11	!  15 LS
	sub	r1, r5		!  75 EX

	mov.l	@r15+, r10	!  15 LS
	cmp/eq	r4, r0		!  54 MT	(nothing left?)

	bf/s	1f		! 109 BR
	 mov.l  @r15+, r9	!  15 LS	(delay slot)

	! The instruction at 1: does double duty: it is the delay slot of
	! this rts and also the first instruction of the trailing-bytes path.
	rts
1:	 mov.l	@r15+, r8	!  15 LS
	sub	r4, r1		!  75 EX		(len remaining)

	! number of trailing bytes is non-zero
	!
	! invariants restored (r5 already decremented by 4)
	! also r1=num bytes remaining

	mov	#4, r2		!   6 EX
	mov	r4, r7		!   5 MT (latency=0)

	add	#0x1c, r5	!  50 EX		(back to -4)
	cmp/hs	r2, r1		!  58 MT	(T = at least one long word left)

	bf/s	5f		! 108 BR
	 add	 #11, r7	!  50 EX	(delay slot: r7 = dst + 11)

	mov.l	@(r0, r5), r6	!  21 LS (latency=2)
	tst	r2, r1		!  86 MT	(T = even number of long words)

	mov	r5, r3		!   5 MT (latency=0)
	bt/s	4f		! 111 BR

	 add	#-4, r3		!  50 EX	(delay slot)
	! NOTE(review): r1 and r2 are unchanged since the cmp/hs above, so
	! this branch is always taken: with an odd number of long words left,
	! one is stored here and the rest is copied by the byte loop at 5:.
	! That is correct but slower than it could be - confirm intent
	! before changing.
	cmp/hs	r2, r1		!  58 MT

	bt/s	5f		! 111 BR
	 mov.l	r6,@-r0		!  30 LS	(delay slot: store the odd long word)

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r6	!  21 LS (latency=2)

4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
	cmp/hi	r7, r0		! (another pair left after these two stores?)

	mov.l	r6, @-r0	!  30 LS
	bt/s	3b		! 109 BR

	 mov.l	r2, @-r0	!  30 LS	(delay slot)

	! Copy the final 0-3 bytes

5:	cmp/eq	r0, r4		!  54 MT	(nothing left?)
	add	#-10, r7	!  50 EX	(r7 = dst + 1)

	bt	9f		! 110 BR	(no delay slot on bt)
	add	#3,r5		!  50 EX	(r0+r5 = source byte for r0-1)

	! 3 cycles, 1 byte per iteration
1:	mov.b	@(r0,r5),r1	!  19 LS
	cmp/hi	r7,r0		!  57 MT	(more bytes after this one?)

	bt/s	1b		! 111 BR
	 mov.b	r1,@-r0		!  28 LS	(delay slot)

9:	rts
	 nop
566
/*
 * .Lcase2 - (src - dst) & 3 == 2, fewer than 64 bytes remaining.
 *
 * Reached from the dispatch code in memcpy with:
 *   r0 = long word aligned end-of-destination pointer, r4 = dst,
 *   r5 = src - dst
 * Copies two 16-bit words per iteration, downwards, then branches to the
 * shared tail at label 10 (located after .Lcase2b) with r5 = src - dst - 2.
 * Scratch: r1, r2, r3, r6.
 */
	!
	!	GHIJ KLMN OPQR -->  ..GH IJKL MNOP QR..
	!

	.balign	32
.Lcase2:
	! Size is 16 or greater and less than 64, but may have trailing bytes

2:	mov	r5, r6		!   5 MT (latency=0)
	add	#-2,r5		!  50 EX	(r0+r5 = source word for r0-2)

	mov	r4,r2		!   5 MT (latency=0)
	add	#-4,r6		!  50 EX	(r0+r6 = source word for r0-4)

	add	#7,r2		!  50 EX	(r2 = dst + 7, loop limit)
3:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)

	mov.w	@(r0,r6),r3	!  20 LS (latency=2)
	cmp/hi	r2,r0		!  57 MT	(>= 4 bytes left after these stores?)

	mov.w	r1,@-r0		!  29 LS
	bt/s	3b		! 111 BR

	 mov.w	r3,@-r0		!  29 LS	(delay slot)

	bra	10f
	 nop
594
595
/*
 * .Lcase2b - (src - dst) & 3 == 2, at least 64 bytes remaining; followed by
 * the halfword/byte tail shared with .Lcase2 (label 10).
 *
 * Reached from the dispatch code in memcpy with:
 *   r0 = long word aligned end-of-destination pointer, r4 = dst,
 *   r5 = src - dst
 *
 * Phases, all working downwards:
 *   1. 16-bit copies until r0 is 32-byte (cache line) aligned,
 *   2. 32 bytes per iteration: the source block is read as one 16-bit
 *      word plus long words and re-assembled into destination long words
 *      with xtrct; movca.l does the first store to each destination line,
 *   3. (label 10) remaining 16-bit words, then a final byte if needed.
 * r8-r12 are saved on the stack around phase 2 and restored afterwards.
 * Scratch: r1, r2, r3, r6, r7.  Returns with r0 == r4.
 */
	.balign	32
.Lcase2b:
	! Size is at least 64 bytes, so will be going round the big loop at least once.
	!
	!   r2 = rounded up r4
	!   r3 = rounded down r0

	mov	r0, r3		!   5 MT (latency=0)
	mov	#(~0x1f), r1	!   6 EX

	and	r1, r3		!  78 EX
	mov	r4, r2		!   5 MT (latency=0)

	cmp/eq	r3, r0		!  54 MT	(r0 already cache line aligned?)
	add	#0x1f, r2	!  50 EX

	add	#-2, r5		!  50 EX	(r0+r5 = source word for r0-2)
	bt/s	1f		! 110 BR
	 and	r1, r2		!  78 EX	(delay slot)

	! Copy a short word one at a time until we are cache line aligned
	!   Normal values: r0, r2, r3, r4
	!   Unused: r1, r6, r7
	!   Mod: r5 (=r5-2)
	!
	add	#2, r3		!  50 EX	(loop stops when r0 == r3 before the store)

2:	mov.w	@(r0,r5),r1	!  20 LS (latency=2)
	cmp/eq	r3,r0		!  54 MT

	bf/s	2b		! 111 BR

	 mov.w	r1,@-r0		!  29 LS	(delay slot)

	! Copy the cache line aligned blocks
	!
	! In use: r0, r2, r4, r5 (=r5-2)
	! Scratch: r1, r3, r6, r7
	!
	! We could do this with the four scratch registers, but if src
	! and dest hit the same cache line, this will thrash, so make
	! use of additional registers.
	!
	! We also need r0 as a temporary (for movca), so 'undo' the invariant:
	!   r5:	 src (was r0+r5)
	!   r1:	 dest (was r0)
	! this can be reversed at the end, so we don't need to save any extra
	! state.
	!
1:	mov.l	r8, @-r15	!  30 LS
	add	r0, r5		!  49 EX

	mov.l	r9, @-r15	!  30 LS
	mov	r0, r1		!   5 MT (latency=0)

	mov.l	r10, @-r15	!  30 LS
	add	#-0x1e, r5	!  50 EX	(r5 = start of the source block)

	mov.l	r11, @-r15	!  30 LS

	mov.l	r12, @-r15	!  30 LS

	! 17 cycles, 32 bytes per iteration
#ifdef CONFIG_CPU_LITTLE_ENDIAN
2:	mov.w	@r5+, r0	!  14 LS (latency=2)		..JI
	add	#-0x20, r1	!  50 EX

	mov.l	@r5+, r3	!  15 LS (latency=2)		NMLK

	mov.l	@r5+, r6	!  15 LS (latency=2)		RQPO
	shll16	r0		! 103 EX			JI..

	mov.l	@r5+, r7	!  15 LS (latency=2)
	xtrct	r3, r0		!  48 EX			LKJI

	mov.l	@r5+, r8	!  15 LS (latency=2)
	xtrct	r6, r3		!  48 EX			PONM

	mov.l	@r5+, r9	!  15 LS (latency=2)
	xtrct	r7, r6		!  48 EX

	mov.l	@r5+, r10	!  15 LS (latency=2)
	xtrct	r8, r7		!  48 EX

	mov.l	@r5+, r11	!  15 LS (latency=2)
	xtrct	r9, r8		!  48 EX

	mov.w	@r5+, r12	!  15 LS (latency=2)
	xtrct	r10, r9		!  48 EX

	movca.l	r0,@r1		!  40 LS (latency=3-7)
	xtrct	r11, r10	!  48 EX

	mov.l	r3, @(0x04,r1)	!  33 LS
	xtrct	r12, r11	!  48 EX

	mov.l	r6, @(0x08,r1)	!  33 LS

	mov.l	r7, @(0x0c,r1)	!  33 LS

	mov.l	r8, @(0x10,r1)	!  33 LS
	add	#-0x40, r5	!  50 EX	(undo the 0x20 advance, step down 0x20)

	mov.l	r9, @(0x14,r1)	!  33 LS
	cmp/eq	r2,r1		!  54 MT

	mov.l	r10, @(0x18,r1)	!  33 LS
	bf/s	2b		! 109 BR

	 mov.l	r11, @(0x1c,r1)	!  33 LS	(delay slot)
#else
	! Big endian: the block is assembled from its top word downwards.
	!
	! After the "add #-2, r5" below, r5 is long word aligned and sits
	! 2 bytes below the source block, so:
	!  - the lowest source halfword is fetched as the low half of the
	!    long word at @(0x00,r5).  (mov.w with a displacement can only
	!    load into r0, so it cannot be used to fill r12.)  The two extra
	!    bytes read lie in the same aligned long word as the data.
	!  - movca.l writes destination offset 0x1c; once r1 has been moved
	!    down to the line base the remaining seven long words go to
	!    offsets 0x18 .. 0x00.
	!  - r5 only moved by -2 in this iteration, so a further -0x1e steps
	!    it to the start of the next (lower) 32 byte source block.
2:	mov.w	@(0x1e,r5), r0	!  17 LS (latency=2)
	add	#-2, r5		!  50 EX

	mov.l	@(0x1c,r5), r3	!  18 LS (latency=2)
	add	#-4, r1		!  50 EX

	mov.l	@(0x18,r5), r6	!  18 LS (latency=2)
	shll16	r0		! 103 EX

	mov.l	@(0x14,r5), r7	!  18 LS (latency=2)
	xtrct	r3, r0		!  48 EX

	mov.l	@(0x10,r5), r8	!  18 LS (latency=2)
	xtrct	r6, r3		!  48 EX

	mov.l	@(0x0c,r5), r9	!  18 LS (latency=2)
	xtrct	r7, r6		!  48 EX

	mov.l	@(0x08,r5), r10	!  18 LS (latency=2)
	xtrct	r8, r7		!  48 EX

	mov.l	@(0x04,r5), r11	!  18 LS (latency=2)
	xtrct	r9, r8		!  48 EX

	mov.l	@(0x00,r5), r12	!  18 LS (latency=2)
	xtrct	r10, r9		!  48 EX

	movca.l	r0,@r1		!  40 LS (latency=3-7)
	add	#-0x1c, r1	!  50 EX	(r1 = base of the destination line)

	mov.l	r3, @(0x18,r1)	!  33 LS
	xtrct	r11, r10	!  48 EX

	mov.l	r6, @(0x14,r1)	!  33 LS
	xtrct	r12, r11	!  48 EX

	mov.l	r7, @(0x10,r1)	!  33 LS

	mov.l	r8, @(0x0c,r1)	!  33 LS
	add	#-0x1e, r5	!  50 EX

	mov.l	r9, @(0x08,r1)	!  33 LS
	cmp/eq	r2,r1		!  54 MT

	mov.l	r10, @(0x04,r1)	!  33 LS
	bf/s	2b		! 109 BR

	 mov.l	r11, @(0x00,r1)	!  33 LS	(delay slot)
#endif

	mov.l	@r15+, r12
	mov	r1, r0		!   5 MT (latency=0)

	mov.l	@r15+, r11	!  15 LS
	sub	r1, r5		!  75 EX

	mov.l	@r15+, r10	!  15 LS
	cmp/eq	r4, r0		!  54 MT	(nothing left?)

	bf/s	1f		! 109 BR
	 mov.l  @r15+, r9	!  15 LS	(delay slot)

	! The instruction at 1: does double duty: it is the delay slot of
	! this rts and also the first instruction of the trailing-bytes path.
	rts
1:	 mov.l	@r15+, r8	!  15 LS

	add	#0x1e, r5	!  50 EX	(back to invariant - 2)

	! Finish off a short word at a time
	! r5 must be invariant - 2
10:	mov	r4,r2		!   5 MT (latency=0)
	add	#1,r2		!  50 EX

	cmp/hi	r2, r0		!  57 MT	(at least 2 bytes left?)
	bf/s	1f		! 109 BR

	 add	#2, r2		!  50 EX	(delay slot: r2 = dst + 3)

3:	mov.w	@(r0,r5),r1	!  20 LS
	cmp/hi	r2,r0		!  57 MT	(>= 2 bytes left after this store?)

	bt/s	3b		! 109 BR

	 mov.w	r1,@-r0		!  29 LS	(delay slot)
1:

	!
	! Finally, copy the last byte if necessary
	cmp/eq	r4,r0		!  54 MT
	bt/s	9b		! (done: return through the nearest earlier "9: rts")
	 add	#1,r5		! (delay slot: r0+r5 = source byte for r0-1)
	mov.b	@(r0,r5),r1
	rts
	 mov.b	r1,@-r0		! (delay slot)
800