/*
 * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
 * Copyright (C) 2008-2009 PetaLogix
 * Copyright (C) 2008 Jim Law - Iris LP  All rights reserved.
 *
 * This file is subject to the terms and conditions of the GNU General
 * Public License. See the file COPYING in the main directory of this
 * archive for more details.
 *
 * Written by Jim Law <jlaw@irispower.com>
 *
 * intended to replace:
 *	memcpy in memcpy.c and
 *	memmove in memmove.c
 * ... in arch/microblaze/lib
 *
 *
 * assly_fastcopy.S
 *
 * Attempt at quicker memcpy and memmove for MicroBlaze
 *	Input : Operand1 in Reg r5 - destination address
 *		Operand2 in Reg r6 - source address
 *		Operand3 in Reg r7 - number of bytes to transfer
 *	Output: Result in Reg r3 - starting destination address
 *
 *
 * Explanation:
 *	Perform (possibly unaligned) copy of a block of memory
 *	between mem locations with size of xfer spec'd in bytes
 */
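
/*
 * For orientation, a rough C equivalent of the ascending copy below
 * (an illustrative sketch for readers, not part of the build; the
 * names here are made up).  The asm additionally unrolls the copy
 * into 32-byte blocks and, when the source is misaligned, rebuilds
 * each destination word from two aligned loads with shifts instead
 * of falling back to byte accesses:
 *
 *	void *fast_memcpy(void *v_dst, const void *v_src, size_t c)
 *	{
 *		char *d = v_dst;
 *		const char *s = v_src;
 *
 *		if (c >= 4)
 *			while ((unsigned long)d & 3) {
 *				*d++ = *s++;
 *				c--;
 *			}
 *		for (; c >= 4; c -= 4, d += 4, s += 4)
 *			*(unsigned long *)d = *(const unsigned long *)s;
 *		while (c--)
 *			*d++ = *s++;
 *		return v_dst;
 *	}
 */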

#include <linux/linkage.h>

	.globl	memcpy
	.ent	memcpy

memcpy:
fast_memcpy_ascending:
	/* move d to return register as value of function */
	addi	r3, r5, 0

	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n (unsigned) */
	blti	r4, a_xfer_end	/* if n < 0, less than one word to transfer */

	/* transfer first 0~3 bytes to get aligned dest address */
	andi	r4, r5, 3	/* n = d & 3 */
	/* if zero, destination already aligned */
	beqi	r4, a_dalign_done
	/* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */
	rsubi	r4, r4, 4
	rsub	r7, r4, r7	/* c = c - n adjust c */

a_xfer_first_loop:
	/* if no bytes left to transfer, transfer the bulk */
	beqi	r4, a_dalign_done
	lbui	r11, r6, 0	/* h = *s */
	sbi	r11, r5, 0	/* *d = h */
	addi	r6, r6, 1	/* s++ */
	addi	r5, r5, 1	/* d++ */
	brid	a_xfer_first_loop	/* loop */
	addi	r4, r4, -1	/* n-- (IN DELAY SLOT) */

a_dalign_done:
	addi	r4, r0, 32	/* n = 32 */
	cmpu	r4, r4, r7	/* n = c - n (unsigned) */
	/* if n < 0, less than one block to transfer */
	blti	r4, a_block_done

a_block_xfer:
	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
	rsub	r7, r4, r7	/* c = c - n */

	andi	r9, r6, 3	/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, a_block_unaligned

a_block_aligned:
	lwi	r9, r6, 0	/* t1 = *(s + 0) */
	lwi	r10, r6, 4	/* t2 = *(s + 4) */
	lwi	r11, r6, 8	/* t3 = *(s + 8) */
	lwi	r12, r6, 12	/* t4 = *(s + 12) */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	swi	r10, r5, 4	/* *(d + 4) = t2 */
	swi	r11, r5, 8	/* *(d + 8) = t3 */
	swi	r12, r5, 12	/* *(d + 12) = t4 */
	lwi	r9, r6, 16	/* t1 = *(s + 16) */
	lwi	r10, r6, 20	/* t2 = *(s + 20) */
	lwi	r11, r6, 24	/* t3 = *(s + 24) */
	lwi	r12, r6, 28	/* t4 = *(s + 28) */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	swi	r10, r5, 20	/* *(d + 20) = t2 */
	swi	r11, r5, 24	/* *(d + 24) = t3 */
	swi	r12, r5, 28	/* *(d + 28) = t4 */
	addi	r6, r6, 32	/* s = s + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_block_aligned	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

a_block_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	add	r6, r6, r4	/* s = s + n */
	lwi	r11, r8, 0	/* h = *(as + 0) */

	addi	r9, r9, -1
	beqi	r9, a_block_u1	/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, a_block_u2	/* t1 was 2 => 2 byte offset */

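/*
 * Merge technique used by the three unaligned loops below, assuming a
 * big-endian configuration: with the source k bytes (k = 1, 2, 3) past
 * a word boundary, every aligned destination word is assembled from
 * two aligned source words, h (lower address) and v (the next one up):
 *
 *	*d = (h << 8*k) | (v >> 8*(4-k));
 *
 * e.g. for k = 3 and source words 00112233 44556677, the first word
 * stored is 33445566.  Only h is carried between steps, so each word
 * costs one load, two shifts, an or and a store.
 */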
a_block_u3:
	bslli	r11, r11, 24	/* h = h << 24 */
a_bu3_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu3_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

a_block_u1:
	bslli	r11, r11, 8	/* h = h << 8 */
a_bu1_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu1_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

a_block_u2:
	bslli	r11, r11, 16	/* h = h << 16 */
a_bu2_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu2_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */

a_block_done:
	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n (unsigned) */
	blti	r4, a_xfer_end	/* if n < 0, less than one word to transfer */

a_word_xfer:
	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
	addi	r10, r0, 0	/* offset = 0 */

	andi	r9, r6, 3	/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, a_word_unaligned

a_word_aligned:
	lw	r9, r6, r10	/* t1 = *(s+offset) */
	sw	r9, r5, r10	/* *(d+offset) = t1 */
	addi	r4, r4, -4	/* n = n - 4 */
	bneid	r4, a_word_aligned	/* loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	lwi	r11, r8, 0	/* h = *(as + 0) */
	addi	r8, r8, 4	/* as = as + 4 */

	addi	r9, r9, -1
	beqi	r9, a_word_u1	/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, a_word_u2	/* t1 was 2 => 2 byte offset */

a_word_u3:
	bslli	r11, r11, 24	/* h = h << 24 */
a_wu3_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	addi	r4, r4, -4	/* n = n - 4 */
	bneid	r4, a_wu3_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_u1:
	bslli	r11, r11, 8	/* h = h << 8 */
a_wu1_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	addi	r4, r4, -4	/* n = n - 4 */
	bneid	r4, a_wu1_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_u2:
	bslli	r11, r11, 16	/* h = h << 16 */
a_wu2_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	addi	r4, r4, -4	/* n = n - 4 */
	bneid	r4, a_wu2_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */

a_word_done:
	add	r5, r5, r10	/* d = d + offset */
	add	r6, r6, r10	/* s = s + offset */
	rsub	r7, r10, r7	/* c = c - offset */

a_xfer_end:
a_xfer_end_loop:
	beqi	r7, a_done	/* while (c) */
	lbui	r9, r6, 0	/* t1 = *s */
	addi	r6, r6, 1	/* s++ */
	sbi	r9, r5, 0	/* *d = t1 */
	addi	r7, r7, -1	/* c-- */
	brid	a_xfer_end_loop	/* loop */
	addi	r5, r5, 1	/* d++ (IN DELAY SLOT) */

a_done:
	rtsd	r15, 8
	nop

.end memcpy
/*----------------------------------------------------------------------------*/
	.globl	memmove
	.ent	memmove

memmove:
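	/*
	 * Overlap check (sketch): if the source sits at or above the
	 * destination, an ascending copy never reads a byte it has
	 * already written, so the memcpy path above can be reused;
	 * otherwise copy descending from the end of both buffers:
	 *
	 *	if (s >= d)
	 *		ascending copy
	 *	else
	 *		descending copy
	 */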
	cmpu	r4, r5, r6	/* n = s - d */
	bgei	r4, fast_memcpy_ascending

fast_memcpy_descending:
	/* move d to return register as value of function */
	addi	r3, r5, 0

	add	r5, r5, r7	/* d = d + c */
	add	r6, r6, r7	/* s = s + c */

	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n (unsigned) */
	blti	r4, d_xfer_end	/* if n < 0, less than one word to transfer */

	/* transfer first 0~3 bytes to get aligned dest address */
	andi	r4, r5, 3	/* n = d & 3 */
	/* if zero, destination already aligned */
	beqi	r4, d_dalign_done
	rsub	r7, r4, r7	/* c = c - n adjust c */

d_xfer_first_loop:
	/* if no bytes left to transfer, transfer the bulk */
	beqi	r4, d_dalign_done
	addi	r6, r6, -1	/* s-- */
	addi	r5, r5, -1	/* d-- */
	lbui	r11, r6, 0	/* h = *s */
	sbi	r11, r5, 0	/* *d = h */
	brid	d_xfer_first_loop	/* loop */
	addi	r4, r4, -1	/* n-- (IN DELAY SLOT) */

d_dalign_done:
	addi	r4, r0, 32	/* n = 32 */
	cmpu	r4, r4, r7	/* n = c - n (unsigned) */
	/* if n < 0, less than one block to transfer */
	blti	r4, d_block_done

d_block_xfer:
	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
	rsub	r7, r4, r7	/* c = c - n */

	andi	r9, r6, 3	/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, d_block_unaligned

d_block_aligned:
	addi	r6, r6, -32	/* s = s - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r9, r6, 28	/* t1 = *(s + 28) */
	lwi	r10, r6, 24	/* t2 = *(s + 24) */
	lwi	r11, r6, 20	/* t3 = *(s + 20) */
	lwi	r12, r6, 16	/* t4 = *(s + 16) */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	swi	r10, r5, 24	/* *(d + 24) = t2 */
	swi	r11, r5, 20	/* *(d + 20) = t3 */
	swi	r12, r5, 16	/* *(d + 16) = t4 */
	lwi	r9, r6, 12	/* t1 = *(s + 12) */
	lwi	r10, r6, 8	/* t2 = *(s + 8) */
	lwi	r11, r6, 4	/* t3 = *(s + 4) */
	lwi	r12, r6, 0	/* t4 = *(s + 0) */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	swi	r10, r5, 8	/* *(d + 8) = t2 */
	swi	r11, r5, 4	/* *(d + 4) = t3 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_block_aligned	/* while (n) loop */
	swi	r12, r5, 0	/* *(d + 0) = t4 (IN DELAY SLOT) */
	bri	d_block_done

d_block_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	rsub	r6, r4, r6	/* s = s - n */
	lwi	r11, r8, 0	/* h = *(as + 0) */

	addi	r9, r9, -1
	beqi	r9, d_block_u1	/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, d_block_u2	/* t1 was 2 => 2 byte offset */

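/*
 * Descending mirror of the ascending merge sketched above: h is now
 * the source word at the higher address and v the aligned word just
 * below it, so each store (again assuming big-endian, with k the
 * source byte offset) is
 *
 *	*d = (v << 8*k) | (h >> 8*(4-k));
 */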
d_block_u3:
	bsrli	r11, r11, 8	/* h = h >> 8 */
d_bu3_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu3_loop	/* while (n) loop */
	bsrli	r11, r12, 8	/* h = v >> 8 (IN DELAY SLOT) */
	bri	d_block_done

d_block_u1:
	bsrli	r11, r11, 24	/* h = h >> 24 */
d_bu1_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu1_loop	/* while (n) loop */
	bsrli	r11, r12, 24	/* h = v >> 24 (IN DELAY SLOT) */
	bri	d_block_done

d_block_u2:
	bsrli	r11, r11, 16	/* h = h >> 16 */
d_bu2_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu2_loop	/* while (n) loop */
	bsrli	r11, r12, 16	/* h = v >> 16 (IN DELAY SLOT) */

d_block_done:
	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n (unsigned) */
	blti	r4, d_xfer_end	/* if n < 0, less than one word to transfer */

d_word_xfer:
	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
	rsub	r5, r4, r5	/* d = d - n */
	rsub	r6, r4, r6	/* s = s - n */
	rsub	r7, r4, r7	/* c = c - n */

	andi	r9, r6, 3	/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, d_word_unaligned

d_word_aligned:
	addi	r4, r4, -4	/* n = n - 4 */
	lw	r9, r6, r4	/* t1 = *(s+n) */
	bneid	r4, d_word_aligned	/* loop */
	sw	r9, r5, r4	/* *(d+n) = t1 (IN DELAY SLOT) */

	bri	d_word_done

d_word_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	lw	r11, r8, r4	/* h = *(as + n) */

	addi	r9, r9, -1
	beqi	r9, d_word_u1	/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, d_word_u2	/* t1 was 2 => 2 byte offset */

d_word_u3:
	bsrli	r11, r11, 8	/* h = h >> 8 */
d_wu3_loop:
	addi	r4, r4, -4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu3_loop	/* while (n) loop */
	bsrli	r11, r12, 8	/* h = v >> 8 (IN DELAY SLOT) */

	bri	d_word_done

d_word_u1:
	bsrli	r11, r11, 24	/* h = h >> 24 */
d_wu1_loop:
	addi	r4, r4, -4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu1_loop	/* while (n) loop */
	bsrli	r11, r12, 24	/* h = v >> 24 (IN DELAY SLOT) */

	bri	d_word_done

d_word_u2:
	bsrli	r11, r11, 16	/* h = h >> 16 */
d_wu2_loop:
	addi	r4, r4, -4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu2_loop	/* while (n) loop */
	bsrli	r11, r12, 16	/* h = v >> 16 (IN DELAY SLOT) */

d_word_done:

d_xfer_end:
d_xfer_end_loop:
	beqi	r7, d_done	/* while (c) */
	addi	r6, r6, -1	/* s-- */
	lbui	r9, r6, 0	/* t1 = *s */
	addi	r5, r5, -1	/* d-- */
	sbi	r9, r5, 0	/* *d = t1 */
	brid	d_xfer_end_loop	/* loop */
	addi	r7, r7, -1	/* c-- (IN DELAY SLOT) */

d_done:
	rtsd	r15, 8
	nop

.end memmove