Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame] | 1 | /* |
| 2 | * strlen.S (c) 1995 David Mosberger (davidm@cs.arizona.edu) |
| 3 | * |
| 4 | * Finds length of a 0-terminated string. Optimized for the |
| 5 | * Alpha architecture: |
| 6 | * |
| 7 | * - memory accessed as aligned quadwords only |
| 8 | * - uses cmpbge to compare 8 bytes in parallel |
| 9 | * - does binary search to find 0 byte in last |
| 10 | * quadword (HAKMEM needed 12 instructions to |
| 11 | * do this instead of the 9 instructions that |
| 12 | * binary search needs). |
| 13 | */ |
| 14 | |
# Assembler setup for the hand-scheduled code that follows.
| 15 | .set noreorder | # ask the assembler to keep instructions in the order written
| 16 | .set noat | # the assembler must not use the $at temporary register on its own
| 17 | |
| 18 | .align 3 | # align what follows to 2^3 = 8 bytes
| 19 | |
# strlen -- return the length of a 0-terminated string.
#
# In:       $16 = address of the string (any alignment)
# Out:      $0  = number of bytes before the terminating 0 byte
# Scratch:  $1, $2, $3 are overwritten
# Return:   through the address held in $26
#
# Memory is only ever read as whole aligned quadwords, so bytes that lie
# before the start of the string or after its terminator, but inside the
# same aligned quadword, are loaded as well.
#
# While scanning, $0 is the address of the aligned quadword under test and
# $2 is the cmpbge mask for it (bit i set <==> byte i of that quadword is 0).
| 20 | .globl strlen | # export the symbol
| 21 | .ent strlen | # start of the procedure (paired with .end at the bottom)
| 22 | |
| 23 | strlen: |
| 24 | ldq_u $1, 0($16) # load first quadword ($16 may be misaligned); ldq_u ignores the low 3 address bits |
| 25 | lda $2, -1($31) | # $2 = -1, i.e. 0xff in every byte
| 26 | insqh $2, $16, $2 | # $2 = 0xff in each of the ($16 & 7) low bytes, 0 elsewhere
| 27 | andnot $16, 7, $0 | # $0 = $16 rounded down to a quadword boundary
| 28 | or $2, $1, $1 | # force the bytes that precede the string to be non-zero
| 29 | cmpbge $31, $1, $2 # $2 <- bitmask: bit i == 1 <==> i-th byte == 0 |
| 30 | bne $2, found | # the terminator is already in the first quadword
| 31 | |
| 32 | loop: ldq $1, 8($0) | # fetch the next aligned quadword
| 33 | addq $0, 8, $0 # addr += 8 |
| 34 | nop # helps dual issue last two insns |
| 35 | cmpbge $31, $1, $2 | # same zero-byte mask as computed for the first quadword
| 36 | beq $2, loop | # no 0 byte in this quadword: keep scanning
| 37 | |
| 38 | found: blbs $2, done # make aligned case fast: bit 0 set means the 0 byte is at $0 itself |
| 39 | negq $2, $3 | # $3 = -$2
| 40 | and $2, $3, $2 | # $2 & -$2 keeps only the lowest set bit, i.e. the first 0 byte
| 41 | |
| 42 | and $2, 0x0f, $1 | # $1 == 0 <==> the remaining bit is in positions 4..7
| 43 | addq $0, 4, $3 | # candidate address: $0 + 4
| 44 | cmoveq $1, $3, $0 | # take the candidate only when $1 == 0
| 45 | |
| 46 | and $2, 0x33, $1 | # $1 == 0 <==> the bit is in position 2, 3, 6 or 7
| 47 | addq $0, 2, $3 | # candidate address: $0 + 2
| 48 | cmoveq $1, $3, $0 | # take the candidate only when $1 == 0
| 49 | |
| 50 | and $2, 0x55, $1 | # $1 == 0 <==> the bit is in an odd position
| 51 | addq $0, 1, $3 | # candidate address: $0 + 1
| 52 | cmoveq $1, $3, $0 | # take the candidate only when $1 == 0
| 53 | |
| 54 | done: subq $0, $16, $0 | # length = address of the 0 byte - start of the string
| 55 | ret $31, ($26) | # return to the address in $26
| 56 | |
| 57 | .end strlen | # end of the procedure opened by .ent