/*
 * arch/alpha/lib/ev6-memset.S
 *
 * This is an efficient (and relatively small) implementation of the C library
 * "memset()" function for the 21264 implementation of Alpha.
 *
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * The algorithm for the leading and trailing quadwords remains the same,
 * however the loop has been unrolled to enable better memory throughput,
 * and the code has been replicated for each of the entry points: __memset
 * and __memsetw to permit better scheduling to eliminate the stalling
 * encountered during the mask replication.
 * A future enhancement might be to put in a byte store loop for really
 * small (say < 32 bytes) memset()s.  Whether or not that change would be
 * a win in the kernel would depend upon the contextual usage.
 * WARNING: Maintaining this is going to be more work than the above version,
 * as fixes will need to be made in multiple places.  The performance gain
 * is worth it.
 */

30 .set noat
31 .set noreorder
32.text
33 .globl __memset
34 .globl __memsetw
35 .globl __constant_c_memset
36 .globl memset
37
/*
 * __memset - fill a region of memory with one byte value.
 *
 * On entry:
 *	$16	destination address (any alignment)
 *	$17	fill value; only the low byte is used
 *	$18	length in bytes, treated as signed (<= 0 stores nothing)
 *	$26	return address
 * On exit:
 *	$0	the destination address as passed in
 * Registers written: $0-$7, $16, $17, $18.  No stack is used.
 *
 * Outline:
 *	1. replicate the fill byte across all eight bytes of $17;
 *	2. if the whole region lies inside one quadword, merge and store
 *	   that single quadword (within_quad_b);
 *	3. otherwise merge a leading partial quadword if the destination is
 *	   misaligned, store whole quadwords (unrolled with wh64 when at
 *	   least 16 quadwords are available), then merge a trailing
 *	   partial quadword.
 */
	.ent __memset
.align 5
__memset:
	.frame $30,0,$26,0
	.prologue 0

	/*
	 * Serious stalling happens.  The only way to mitigate this is to
	 * undertake a major re-write to interleave the constant
	 * materialization with other parts of the fall-through code.
	 * This is important, even though it makes maintenance tougher.
	 * Do this later.
	 *
	 * The pattern comments below show the quadword being built, with
	 * "ch" standing for the fill byte.
	 */
	and $17,255,$1		# E : 00000000000000ch
	insbl $17,1,$2		# U : 000000000000ch00
	bis $16,$16,$0		# E : return value
	ble $18,end_b		# U : zero (or negative) length requested?

	addq $18,$16,$6		# E : end address (one past the last byte)
	bis $1,$2,$17		# E : 000000000000chch
	insbl $1,2,$3		# U : 0000000000ch0000
	insbl $1,3,$4		# U : 00000000ch000000

	or $3,$4,$3		# E : 00000000chch0000
	inswl $17,4,$5		# U : 0000chch00000000
	xor $16,$6,$1		# E : will complete write be within one quadword?
	inswl $17,6,$2		# U : chch000000000000

	or $17,$3,$17		# E : 00000000chchchch
	or $2,$5,$2		# E : chchchch00000000
	bic $1,7,$1		# E : fit within a single quadword?
	and $16,7,$3		# E : Target addr misalignment

	or $17,$2,$17		# E : chchchchchchchch
	beq $1,within_quad_b	# U :
	nop			# E :
	beq $3,aligned_b	# U : target is 0mod8

	/*
	 * Target address is misaligned, and won't fit within a quadword.
	 * Merge the new bytes into the upper part of the first quadword,
	 * then advance $16 to the next quadword boundary and reduce the
	 * count by the number of bytes just written.
	 */
	ldq_u $4,0($16)		# L : Fetch first partial
	bis $16,$16,$5		# E : Save the address
	insql $17,$16,$2	# U : Insert new bytes
	subq $3,8,$3		# E : Invert (for addressing uses)

	addq $18,$3,$18		# E : $18 is new count ($3 is negative)
	mskql $4,$16,$4		# U : clear relevant parts of the quad
	subq $16,$3,$16		# E : $16 is new aligned destination
	bis $2,$4,$1		# E : Final bytes

	nop
	stq_u $1,0($5)		# L : Store result
	nop
	nop

.align 4
aligned_b:
	/*
	 * We are now guaranteed to be quad aligned, with at least
	 * one partial quad to write.
	 */

	sra $18,3,$3		# U : Number of remaining quads to write
	and $18,7,$18		# E : Number of trailing bytes to write
	bis $16,$16,$5		# E : Save dest address
	beq $3,no_quad_b	# U : tail stuff only

	/*
	 * it's worth the effort to unroll this and use wh64 if possible
	 * Lifted a bunch of code from clear_user.S
	 * At this point, entry values are:
	 * $16	Current destination address
	 * $5	A copy of $16
	 * $6	The end address (one past the last byte to write)
	 * $18	Number trailer bytes
	 * $3	Number quads to write
	 */

	and $16, 0x3f, $2	# E : Forward work (only useful for unrolled loop)
	subq $3, 16, $4		# E : Only try to unroll if >= 16 quads (128 bytes)
	subq $2, 0x40, $1	# E : bias counter (aligning stuff 0mod64)
	blt $4, loop_b		# U :

	/*
	 * We know we've got at least 16 quads, minimum of one trip
	 * through unrolled loop.  Do a quad at a time to get us 0mod64
	 * aligned.
	 *
	 * Note: $16 is quad aligned here, so $1 = ($16 & 0x3f) - 0x40 is
	 * in the range [-64, -8] and the beq below is never taken as
	 * written.  The alignment loop therefore always runs at least once
	 * (eight times when the destination is already 0mod64), which is
	 * also what leaves a valid address in $4 for the first wh64.
	 */

	nop			# E :
	nop			# E :
	nop			# E :
	beq $1, $bigalign_b	# U :

$alignmod64_b:
	stq $17, 0($5)		# L :
	subq $3, 1, $3		# E : For consistency later
	addq $1, 8, $1		# E : Increment towards zero for alignment
	addq $5, 8, $4		# E : Initial wh64 address (filler instruction)

	nop
	nop
	addq $5, 8, $5		# E : Inc address
	blt $1, $alignmod64_b	# U :

$bigalign_b:
	/*
	 * $3 - number quads left to go
	 * $5 - target address (aligned 0mod64)
	 * $17 - mask of stuff to store
	 * Scratch registers available: $7, $2, $4, $1
	 * we know that we'll be taking a minimum of one trip through
	 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
	 * Assumes the wh64 needs to be for 2 trips through the loop in the future
	 * The wh64 is issued for the starting destination address for trip +2
	 * through the loop, and if there are less than two trips left, the target
	 * address will be for the current trip.
	 */

$do_wh64_b:
	wh64 ($4)		# L1 : memory subsystem write hint
	subq $3, 24, $2		# E : For determining future wh64 addresses
	stq $17, 0($5)		# L :
	nop			# E :

	addq $5, 128, $4	# E : speculative target of next wh64
	stq $17, 8($5)		# L :
	stq $17, 16($5)		# L :
	addq $5, 64, $7		# E : Fallback address for wh64 (== next trip addr)

	stq $17, 24($5)		# L :
	stq $17, 32($5)		# L :
	cmovlt $2, $7, $4	# E : Latency 2, extra mapping cycle
	nop

	stq $17, 40($5)		# L :
	stq $17, 48($5)		# L :
	subq $3, 16, $2		# E : Repeat the loop at least once more?
	nop

	stq $17, 56($5)		# L :
	addq $5, 64, $5		# E :
	subq $3, 8, $3		# E :
	bge $2, $do_wh64_b	# U :

	nop
	nop
	nop
	beq $3, no_quad_b	# U : Might have finished already

.align 4
	/*
	 * Simple loop for trailing quadwords, or for small amounts
	 * of data (where we can't use an unrolled loop and wh64)
	 */
loop_b:
	stq $17,0($5)		# L :
	subq $3,1,$3		# E : Decrement number quads left
	addq $5,8,$5		# E : Inc address
	bne $3,loop_b		# U : more?

no_quad_b:
	/*
	 * Write 0..7 trailing bytes.  $5 is quad aligned, so the low three
	 * bits of the end address in $6 give the number of bytes to merge
	 * into the low end of the final quadword.
	 */
	nop			# E :
	beq $18,end_b		# U : All done?
	ldq $7,0($5)		# L :
	mskqh $7,$6,$2		# U : Mask final quad

	insqh $17,$6,$4		# U : New bits
	bis $2,$4,$1		# E : Put it all together
	stq $1,0($5)		# L : And back to memory
	ret $31,($26),1		# L0 :

within_quad_b:
	/*
	 * The whole region lies inside one quadword: build the quadword as
	 * if writing from $16 to the end of the quad, then restore the
	 * original bytes from the end address ($6) upwards.
	 */
	ldq_u $1,0($16)		# L :
	insql $17,$16,$2	# U : New bits
	mskql $1,$16,$4		# U : Clear old
	bis $2,$4,$2		# E : New result

	mskql $2,$6,$4		# U : keep bytes below the end address
	mskqh $1,$6,$2		# U : original bytes from the end address up
	bis $2,$4,$1		# E :
	stq_u $1,0($16)		# L :

end_b:
	nop
	nop
	nop
	ret $31,($26),1		# L0 :
	.end __memset

232 /*
233 * This is the original body of code, prior to replication and
234 * rescheduling. Leave it here, as there may be calls to this
235 * entry point.
236 */
237.align 4
238 .ent __constant_c_memset
239__constant_c_memset:
240 .frame $30,0,$26,0
241 .prologue 0
242
243 addq $18,$16,$6 # E : max address to write to
244 bis $16,$16,$0 # E : return value
245 xor $16,$6,$1 # E : will complete write be within one quadword?
246 ble $18,end # U : zero length requested?
247
248 bic $1,7,$1 # E : fit within a single quadword
249 beq $1,within_one_quad # U :
250 and $16,7,$3 # E : Target addr misalignment
251 beq $3,aligned # U : target is 0mod8
252
253 /*
254 * Target address is misaligned, and won't fit within a quadword
255 */
256 ldq_u $4,0($16) # L : Fetch first partial
257 bis $16,$16,$5 # E : Save the address
258 insql $17,$16,$2 # U : Insert new bytes
259 subq $3,8,$3 # E : Invert (for addressing uses)
260
261 addq $18,$3,$18 # E : $18 is new count ($3 is negative)
262 mskql $4,$16,$4 # U : clear relevant parts of the quad
263 subq $16,$3,$16 # E : $16 is new aligned destination
264 bis $2,$4,$1 # E : Final bytes
265
266 nop
267 stq_u $1,0($5) # L : Store result
268 nop
269 nop
270
271.align 4
272aligned:
273 /*
274 * We are now guaranteed to be quad aligned, with at least
275 * one partial quad to write.
276 */
277
278 sra $18,3,$3 # U : Number of remaining quads to write
279 and $18,7,$18 # E : Number of trailing bytes to write
280 bis $16,$16,$5 # E : Save dest address
281 beq $3,no_quad # U : tail stuff only
282
283 /*
284 * it's worth the effort to unroll this and use wh64 if possible
285 * Lifted a bunch of code from clear_user.S
286 * At this point, entry values are:
287 * $16 Current destination address
288 * $5 A copy of $16
289 * $6 The max quadword address to write to
290 * $18 Number trailer bytes
291 * $3 Number quads to write
292 */
293
294 and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop)
295 subq $3, 16, $4 # E : Only try to unroll if > 128 bytes
296 subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64)
297 blt $4, loop # U :
298
299 /*
300 * We know we've got at least 16 quads, minimum of one trip
301 * through unrolled loop. Do a quad at a time to get us 0mod64
302 * aligned.
303 */
304
305 nop # E :
306 nop # E :
307 nop # E :
308 beq $1, $bigalign # U :
309
310$alignmod64:
311 stq $17, 0($5) # L :
312 subq $3, 1, $3 # E : For consistency later
313 addq $1, 8, $1 # E : Increment towards zero for alignment
314 addq $5, 8, $4 # E : Initial wh64 address (filler instruction)
315
316 nop
317 nop
318 addq $5, 8, $5 # E : Inc address
319 blt $1, $alignmod64 # U :
320
321$bigalign:
322 /*
323 * $3 - number quads left to go
324 * $5 - target address (aligned 0mod64)
325 * $17 - mask of stuff to store
326 * Scratch registers available: $7, $2, $4, $1
327 * we know that we'll be taking a minimum of one trip through
328 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
329 * Assumes the wh64 needs to be for 2 trips through the loop in the future
330 * The wh64 is issued on for the starting destination address for trip +2
331 * through the loop, and if there are less than two trips left, the target
332 * address will be for the current trip.
333 */
334
335$do_wh64:
336 wh64 ($4) # L1 : memory subsystem write hint
337 subq $3, 24, $2 # E : For determining future wh64 addresses
338 stq $17, 0($5) # L :
339 nop # E :
340
341 addq $5, 128, $4 # E : speculative target of next wh64
342 stq $17, 8($5) # L :
343 stq $17, 16($5) # L :
344 addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
345
346 stq $17, 24($5) # L :
347 stq $17, 32($5) # L :
348 cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle
349 nop
350
351 stq $17, 40($5) # L :
352 stq $17, 48($5) # L :
353 subq $3, 16, $2 # E : Repeat the loop at least once more?
354 nop
355
356 stq $17, 56($5) # L :
357 addq $5, 64, $5 # E :
358 subq $3, 8, $3 # E :
359 bge $2, $do_wh64 # U :
360
361 nop
362 nop
363 nop
364 beq $3, no_quad # U : Might have finished already
365
366.align 4
367 /*
368 * Simple loop for trailing quadwords, or for small amounts
369 * of data (where we can't use an unrolled loop and wh64)
370 */
371loop:
372 stq $17,0($5) # L :
373 subq $3,1,$3 # E : Decrement number quads left
374 addq $5,8,$5 # E : Inc address
375 bne $3,loop # U : more?
376
377no_quad:
378 /*
379 * Write 0..7 trailing bytes.
380 */
381 nop # E :
382 beq $18,end # U : All done?
383 ldq $7,0($5) # L :
384 mskqh $7,$6,$2 # U : Mask final quad
385
386 insqh $17,$6,$4 # U : New bits
387 bis $2,$4,$1 # E : Put it all together
388 stq $1,0($5) # L : And back to memory
389 ret $31,($26),1 # L0 :
390
391within_one_quad:
392 ldq_u $1,0($16) # L :
393 insql $17,$16,$2 # U : New bits
394 mskql $1,$16,$4 # U : Clear old
395 bis $2,$4,$2 # E : New result
396
397 mskql $2,$6,$4 # U :
398 mskqh $1,$6,$2 # U :
399 bis $2,$4,$1 # E :
400 stq_u $1,0($16) # L :
401
402end:
403 nop
404 nop
405 nop
406 ret $31,($26),1 # L0 :
407 .end __constant_c_memset
408
409 /*
410 * This is a replicant of the __constant_c_memset code, rescheduled
411 * to mask stalls. Note that entry point names also had to change
412 */
413 .align 5
414 .ent __memsetw
415
416__memsetw:
417 .frame $30,0,$26,0
418 .prologue 0
419
420 inswl $17,0,$5 # U : 000000000000c1c2
421 inswl $17,2,$2 # U : 00000000c1c20000
422 bis $16,$16,$0 # E : return value
423 addq $18,$16,$6 # E : max address to write to
424
425 ble $18, end_w # U : zero length requested?
426 inswl $17,4,$3 # U : 0000c1c200000000
427 inswl $17,6,$4 # U : c1c2000000000000
428 xor $16,$6,$1 # E : will complete write be within one quadword?
429
430 or $2,$5,$2 # E : 00000000c1c2c1c2
431 or $3,$4,$17 # E : c1c2c1c200000000
432 bic $1,7,$1 # E : fit within a single quadword
433 and $16,7,$3 # E : Target addr misalignment
434
435 or $17,$2,$17 # E : c1c2c1c2c1c2c1c2
436 beq $1,within_quad_w # U :
437 nop
438 beq $3,aligned_w # U : target is 0mod8
439
440 /*
441 * Target address is misaligned, and won't fit within a quadword
442 */
443 ldq_u $4,0($16) # L : Fetch first partial
444 bis $16,$16,$5 # E : Save the address
445 insql $17,$16,$2 # U : Insert new bytes
446 subq $3,8,$3 # E : Invert (for addressing uses)
447
448 addq $18,$3,$18 # E : $18 is new count ($3 is negative)
449 mskql $4,$16,$4 # U : clear relevant parts of the quad
450 subq $16,$3,$16 # E : $16 is new aligned destination
451 bis $2,$4,$1 # E : Final bytes
452
453 nop
454 stq_u $1,0($5) # L : Store result
455 nop
456 nop
457
458.align 4
459aligned_w:
460 /*
461 * We are now guaranteed to be quad aligned, with at least
462 * one partial quad to write.
463 */
464
465 sra $18,3,$3 # U : Number of remaining quads to write
466 and $18,7,$18 # E : Number of trailing bytes to write
467 bis $16,$16,$5 # E : Save dest address
468 beq $3,no_quad_w # U : tail stuff only
469
470 /*
471 * it's worth the effort to unroll this and use wh64 if possible
472 * Lifted a bunch of code from clear_user.S
473 * At this point, entry values are:
474 * $16 Current destination address
475 * $5 A copy of $16
476 * $6 The max quadword address to write to
477 * $18 Number trailer bytes
478 * $3 Number quads to write
479 */
480
481 and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop)
482 subq $3, 16, $4 # E : Only try to unroll if > 128 bytes
483 subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64)
484 blt $4, loop_w # U :
485
486 /*
487 * We know we've got at least 16 quads, minimum of one trip
488 * through unrolled loop. Do a quad at a time to get us 0mod64
489 * aligned.
490 */
491
492 nop # E :
493 nop # E :
494 nop # E :
495 beq $1, $bigalign_w # U :
496
497$alignmod64_w:
498 stq $17, 0($5) # L :
499 subq $3, 1, $3 # E : For consistency later
500 addq $1, 8, $1 # E : Increment towards zero for alignment
501 addq $5, 8, $4 # E : Initial wh64 address (filler instruction)
502
503 nop
504 nop
505 addq $5, 8, $5 # E : Inc address
506 blt $1, $alignmod64_w # U :
507
508$bigalign_w:
509 /*
510 * $3 - number quads left to go
511 * $5 - target address (aligned 0mod64)
512 * $17 - mask of stuff to store
513 * Scratch registers available: $7, $2, $4, $1
514 * we know that we'll be taking a minimum of one trip through
515 * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
516 * Assumes the wh64 needs to be for 2 trips through the loop in the future
517 * The wh64 is issued on for the starting destination address for trip +2
518 * through the loop, and if there are less than two trips left, the target
519 * address will be for the current trip.
520 */
521
522$do_wh64_w:
523 wh64 ($4) # L1 : memory subsystem write hint
524 subq $3, 24, $2 # E : For determining future wh64 addresses
525 stq $17, 0($5) # L :
526 nop # E :
527
528 addq $5, 128, $4 # E : speculative target of next wh64
529 stq $17, 8($5) # L :
530 stq $17, 16($5) # L :
531 addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr)
532
533 stq $17, 24($5) # L :
534 stq $17, 32($5) # L :
535 cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle
536 nop
537
538 stq $17, 40($5) # L :
539 stq $17, 48($5) # L :
540 subq $3, 16, $2 # E : Repeat the loop at least once more?
541 nop
542
543 stq $17, 56($5) # L :
544 addq $5, 64, $5 # E :
545 subq $3, 8, $3 # E :
546 bge $2, $do_wh64_w # U :
547
548 nop
549 nop
550 nop
551 beq $3, no_quad_w # U : Might have finished already
552
553.align 4
554 /*
555 * Simple loop for trailing quadwords, or for small amounts
556 * of data (where we can't use an unrolled loop and wh64)
557 */
558loop_w:
559 stq $17,0($5) # L :
560 subq $3,1,$3 # E : Decrement number quads left
561 addq $5,8,$5 # E : Inc address
562 bne $3,loop_w # U : more?
563
564no_quad_w:
565 /*
566 * Write 0..7 trailing bytes.
567 */
568 nop # E :
569 beq $18,end_w # U : All done?
570 ldq $7,0($5) # L :
571 mskqh $7,$6,$2 # U : Mask final quad
572
573 insqh $17,$6,$4 # U : New bits
574 bis $2,$4,$1 # E : Put it all together
575 stq $1,0($5) # L : And back to memory
576 ret $31,($26),1 # L0 :
577
578within_quad_w:
579 ldq_u $1,0($16) # L :
580 insql $17,$16,$2 # U : New bits
581 mskql $1,$16,$4 # U : Clear old
582 bis $2,$4,$2 # E : New result
583
584 mskql $2,$6,$4 # U :
585 mskqh $1,$6,$2 # U :
586 bis $2,$4,$1 # E :
587 stq_u $1,0($16) # L :
588
589end_w:
590 nop
591 nop
592 nop
593 ret $31,($26),1 # L0 :
594
595 .end __memsetw
596
/* memset is the same entry point as __memset (byte fill, replicates $17). */
memset = __memset