blob: 9ccacdf5bcb952f6a2664cd44b38fc5dee15dafd [file] [log] [blame]
/*
 * arch/ppc64/lib/memcpy.S
 *
 * Copyright (C) 2002 Paul Mackerras, IBM Corp.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
11#include <asm/processor.h>
12#include <asm/ppc_asm.h>
13
/*
 * void *memcpy(void *dest, const void *src, size_t n)
 *
 * In:    r3 = dest, r4 = src, r5 = n (byte count)
 * Out:   r3 = dest (the value passed in)
 * Clobb: r0, r4-r12, ctr, cr0, cr1, cr6, cr7, XER[CA]
 * Stack: stores the incoming r3 at 48(r1) and reloads it before
 *        returning.  This assumes the caller's frame provides a
 *        parameter save area at that offset (64-bit PowerPC ELF ABI
 *        stack layout).
 *
 * Strategy:
 *   - n < 16: .Lshort_copy moves 8/4/2/1 bytes as selected by the low
 *     four bits of n, which are kept in cr7.
 *   - otherwise dest is first advanced to an 8-byte boundary
 *     (.Ldst_unaligned), then
 *       * src also 8-byte aligned: copy 16 bytes per loop iteration;
 *       * src misaligned: load aligned doublewords and merge adjacent
 *         pairs with shifts (.Lsrc_unaligned).
 *   - .Ldo_tail stores the last 1..7 bytes out of r9, most significant
 *     bytes first.
 *
 * cr7 bit usage throughout: cr7*4+0 -> 8, +1 -> 4, +2 -> 2, +3 -> 1.
 */
	.align	7
_GLOBAL(memcpy)
	std	r3,48(r1)	/* keep dest; r3 is the store cursor below */
	mtcrf	0x01,r5		/* cr7 = low 4 bits of n */
	cmpldi	cr1,r5,16	/* cr1 = n compared with 16 (unsigned) */
	neg	r6,r3		# LS 3 bits = # bytes to 8-byte dest bdry
	andi.	r6,r6,7
	dcbt	0,r4
	blt	cr1,.Lshort_copy
	bne	.Ldst_unaligned
.Ldst_aligned:
	/* dest is 8-byte aligned here; cr1 and cr7 describe the count in r5 */
	andi.	r0,r4,7		/* r0 = src offset within its doubleword */
	addi	r3,r3,-16	/* bias dest for the 8(r3)/16(r3) stores */
	bne	.Lsrc_unaligned
	srdi	r7,r5,4		/* ctr = number of 16-byte chunks */
	ld	r9,0(r4)
	addi	r4,r4,-8
	mtctr	r7
	andi.	r5,r5,7		/* cr0.eq <=> no 1..7 byte tail */
	bf	cr7*4+0,2f	/* no odd doubleword: enter loop at 2 */
	addi	r3,r3,8
	addi	r4,r4,8
	mr	r8,r9
	blt	cr1,3f		/* fewer than 16 bytes left: one store only */
1:	ld	r9,8(r4)
	std	r8,8(r3)
2:	ldu	r8,16(r4)
	stdu	r9,16(r3)
	bdnz	1b
3:	std	r8,8(r3)
	beq	.Lexit		/* cr0 still from the andi. above */
	addi	r3,r3,16
	ld	r9,8(r4)
.Ldo_tail:
	/* r9 holds the tail bytes in its most significant end */
	bf	cr7*4+1,1f
	rotldi	r9,r9,32
	stw	r9,0(r3)
	addi	r3,r3,4
1:	bf	cr7*4+2,2f
	rotldi	r9,r9,16
	sth	r9,0(r3)
	addi	r3,r3,2
2:	bf	cr7*4+3,.Lexit
	rotldi	r9,r9,8
	stb	r9,0(r3)
.Lexit:
	ld	r3,48(r1)	/* return the original dest pointer */
	blr

.Lsrc_unaligned:
	/*
	 * r0  = src & 7 (set at .Ldst_aligned)
	 * r4  = src rounded down to an 8-byte boundary
	 * r10 = 8 * r0      (left shift count, bits)
	 * r11 = 64 - r10    (right shift count, bits)
	 * r6  = n / 8, compared against 3 in cr6
	 * r5  = (n & 7) + r0 after the add below; cr0.eq <=> (n & 7) == 0
	 */
	srdi	r6,r5,3
	addi	r5,r5,-16
	subf	r4,r0,r4
	srdi	r7,r5,4
	sldi	r10,r0,3
	cmpdi	cr6,r6,3
	andi.	r5,r5,7
	mtctr	r7
	subfic	r11,r10,64
	add	r5,r5,r0

	bt	cr7*4+0,0f

	ld	r9,0(r4)	# 3+2n loads, 2+2n stores
	ld	r0,8(r4)
	sld	r6,r9,r10
	ldu	r9,16(r4)
	srd	r7,r0,r11
	sld	r8,r0,r10
	or	r7,r7,r6
	blt	cr6,4f
	ld	r0,8(r4)
	# s1<< in r8, d0=(s0<<|s1>>) in r7, s3 in r0, s2 in r9, nix in r6 & r12
	b	2f

0:	ld	r0,0(r4)	# 4+2n loads, 3+2n stores
	ldu	r9,8(r4)
	sld	r8,r0,r10
	addi	r3,r3,-8
	blt	cr6,5f
	ld	r0,8(r4)
	srd	r12,r9,r11
	sld	r6,r9,r10
	ldu	r9,16(r4)
	or	r12,r8,r12
	srd	r7,r0,r11
	sld	r8,r0,r10
	addi	r3,r3,16
	beq	cr6,3f

	# d0=(s0<<|s1>>) in r12, s1<< in r6, s2>> in r7, s2<< in r8, s3 in r9
1:	or	r7,r7,r6
	ld	r0,8(r4)
	std	r12,8(r3)
2:	srd	r12,r9,r11
	sld	r6,r9,r10
	ldu	r9,16(r4)
	or	r12,r8,r12
	stdu	r7,16(r3)
	srd	r7,r0,r11
	sld	r8,r0,r10
	bdnz	1b

3:	std	r12,8(r3)
	or	r7,r7,r6
4:	std	r7,16(r3)
5:	srd	r12,r9,r11
	or	r12,r8,r12
	std	r12,24(r3)
	beq	.Lexit		/* cr0 from the andi. above: no tail bytes */
	cmpwi	cr1,r5,8
	addi	r3,r3,32
	sld	r9,r9,r10
	ble	cr1,.Ldo_tail	/* tail lies within the doubleword in r9 */
	ld	r0,8(r4)	/* tail runs into the next source doubleword */
	srd	r7,r0,r11
	or	r9,r7,r9
	b	.Ldo_tail

.Ldst_unaligned:
	/* r6 = 1..7 bytes needed to bring dest to an 8-byte boundary */
	mtcrf	0x01,r6		# put #bytes to 8B bdry into cr7
	subf	r5,r6,r5	/* r5 = bytes left after the alignment copy */
	li	r7,0		/* r7 = running offset into src/dest */
	cmpldi	cr1,r5,16	/* refresh cr1 for blt cr1 in .Ldst_aligned */
	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	stb	r0,0(r3)
	addi	r7,r7,1
1:	bf	cr7*4+2,2f
	lhzx	r0,r7,r4
	sthx	r0,r7,r3
	addi	r7,r7,2
2:	bf	cr7*4+1,3f
	lwzx	r0,r7,r4
	stwx	r0,r7,r3
3:	mtcrf	0x01,r5		/* cr7 = low 4 bits of the remaining count */
	add	r4,r6,r4
	add	r3,r6,r3
	b	.Ldst_aligned

.Lshort_copy:
	/* n < 16: copy 8, 4, 2, 1 bytes as selected by the bits in cr7 */
	bf	cr7*4+0,1f
	lwz	r0,0(r4)
	lwz	r9,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r9,4(r3)
	addi	r3,r3,8
1:	bf	cr7*4+1,2f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4
2:	bf	cr7*4+2,3f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2
3:	bf	cr7*4+3,4f
	lbz	r0,0(r4)
	stb	r0,0(r3)
4:	ld	r3,48(r1)	/* return the original dest pointer */
	blr