Paul Mundt | 180ae20 | 2008-12-12 16:53:14 +0900 | [diff] [blame] | 1 | .section .text..SHmedia32,"ax" |
| 2 | .align 2 |
| 3 | .global __udivdi3 |
| 4 | __udivdi3: /* Division helper built around a reciprocal approximation of r3.
The symbol name follows the libgcc convention for unsigned 64-bit
division - NOTE(review): confirm the exact contract against the SH-5 ABI.
In:     r2 = value being divided (the comments below call what is left
        of it "the rest"), r3 = divisor, r18 = address branched to on exit.
Out:    r2 (the last register written before each blink).
Writes: r0, r1, r2, r4-r9, r19-r22, r25, tr0.  r3 and r18 are only read.
r63 is used throughout as a constant-zero source operand.
No test for r3 == 0 is made (see the remark before the first blink).
Two paths: the fall-through "small divisor" path performs three divide
steps; large_divisor performs two followed by fix-ups. */ |
| 5 | shlri r3,1,r4 |
| 6 | nsb r4,r22 /* r22 = shift count taken from r3 >> 1 */ |
| 7 | shlld r3,r22,r6 /* r6 = r3 << r22; presumably the left-justified divisor - TODO confirm nsb semantics */ |
| 8 | shlri r6,49,r5 |
| 9 | movi 0xffffffffffffbaf1,r21 /* .l shift count 17. */ |
| 10 | sub r21,r5,r1 |
| 11 | mmulfx.w r1,r1,r4 |
| 12 | mshflo.w r1,r63,r1 |
| 13 | sub r63,r22,r20 // r63 == 64 % 64 |
| 14 | mmulfx.w r5,r4,r4 |
| 15 | pta large_divisor,tr0 /* prepare the branch target used by the bgt below */ |
| 16 | addi r20,32,r9 /* r9 = 32 - r22 */ |
| 17 | msub.w r1,r4,r1 |
| 18 | madd.w r1,r1,r1 |
| 19 | mmulfx.w r1,r1,r4 |
| 20 | shlri r6,32,r7 /* r7 = upper 32 bits of r6 */ |
| 21 | bgt/u r9,r63,tr0 // large_divisor |
| 22 | mmulfx.w r5,r4,r4 |
| 23 | shlri r2,32+14,r19 |
| 24 | addi r22,-31,r0 /* r0 = r22 - 31, the shlld count used below */ |
| 25 | msub.w r1,r4,r1 |
| 26 | |
| 27 | mulu.l r1,r7,r4 |
| 28 | addi r1,-3,r5 |
| 29 | mulu.l r5,r19,r5 |
| 30 | sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2 |
| 31 | shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
the case may be, %0000000000000000 000.11111111111, still */ |
| 33 | muls.l r1,r4,r4 /* leaving at least one sign bit. */ |
| 34 | mulu.l r5,r3,r8 |
| 35 | mshalds.l r1,r21,r1 |
| 36 | shari r4,26,r4 |
| 37 | shlld r8,r0,r8 |
| 38 | add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5) |
| 39 | sub r2,r8,r2 |
| 40 | /* Can do second step of 64 : 32 div now, using r1 and the rest in r2. */ |
| 41 | |
| 42 | shlri r2,22,r21 |
| 43 | mulu.l r21,r1,r21 |
| 44 | shlld r5,r0,r8 /* r8 collects the partial results of the divide steps */ |
| 45 | addi r20,30-22,r0 |
| 46 | shlrd r21,r0,r21 |
| 47 | mulu.l r21,r3,r5 |
| 48 | add r8,r21,r8 |
| 49 | mcmpgt.l r21,r63,r21 // See Note 1 |
| 50 | addi r20,30,r0 |
| 51 | mshfhi.l r63,r21,r21 |
| 52 | sub r2,r5,r2 |
| 53 | andc r2,r21,r2 |
| 54 | |
| 55 | /* small divisor: need a third divide step */ |
| 56 | mulu.l r2,r1,r7 |
| 57 | ptabs r18,tr0 /* tr0 = exit target taken from r18 */ |
| 58 | addi r2,1,r2 |
| 59 | shlrd r7,r0,r7 |
| 60 | mulu.l r7,r3,r5 |
| 61 | add r8,r7,r8 |
| 62 | sub r2,r3,r2 |
| 63 | cmpgt r2,r5,r5 |
| 64 | add r8,r5,r2 /* r2 = r8 + r5: value left in r2 on exit from this path */ |
| 65 | /* could test r3 here to check for divide by zero. */ |
| 66 | blink tr0,r63 |
| 67 | |
| 68 | large_divisor: /* reached from the bgt above when r9 = 32 - r22 is greater than zero */ |
| 69 | mmulfx.w r5,r4,r4 |
| 70 | shlrd r2,r9,r25 /* r25 = r2 >> r9; "the rest" is tracked in r25 on this path */ |
| 71 | shlri r25,32,r8 |
| 72 | msub.w r1,r4,r1 |
| 73 | |
| 74 | mulu.l r1,r7,r4 |
| 75 | addi r1,-3,r5 |
| 76 | mulu.l r5,r8,r5 |
| 77 | sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2 |
| 78 | shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
the case may be, %0000000000000000 000.11111111111, still */ |
| 80 | muls.l r1,r4,r4 /* leaving at least one sign bit. */ |
| 81 | shlri r5,14-1,r8 |
| 82 | mulu.l r8,r7,r5 |
| 83 | mshalds.l r1,r21,r1 |
| 84 | shari r4,26,r4 |
| 85 | add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5) |
| 86 | sub r25,r5,r25 |
| 87 | /* Can do second step of 64 : 32 div now, using r1 and the rest in r25. */ |
| 88 | |
| 89 | shlri r25,22,r21 |
| 90 | mulu.l r21,r1,r21 |
| 91 | pta no_lo_adj,tr0 /* prepare the branch target used by the bgtu below */ |
| 92 | addi r22,32,r0 |
| 93 | shlri r21,40,r21 |
| 94 | mulu.l r21,r7,r5 |
| 95 | add r8,r21,r8 |
| 96 | shlld r2,r0,r2 |
| 97 | sub r25,r5,r25 |
| 98 | bgtu/u r7,r25,tr0 // no_lo_adj |
| 99 | addi r8,1,r8 /* fall-through (r7 not above r25): bump r8 by one and take r7 out of r25 */ |
| 100 | sub r25,r7,r25 |
| 101 | no_lo_adj: |
| 102 | mextr4 r2,r25,r2 |
| 103 | |
| 104 | /* large_divisor: only needs a few adjustments. */ |
| 105 | mulu.l r8,r6,r5 |
| 106 | ptabs r18,tr0 /* tr0 = exit target taken from r18 */ |
| 107 | /* bubble */ |
| 108 | cmpgtu r5,r2,r5 |
| 109 | sub r8,r5,r2 /* r2 = r8 - r5: value left in r2 on exit from this path */ |
| 110 | blink tr0,r63 |
| 111 | |
| 112 | /* Note 1: To shift the result of the second divide stage so that the result |
| 113 | always fits into 32 bits, yet we still reduce the rest sufficiently |
| 114 | would require a lot of instructions to do the shifts just right. Using |
| 115 | the full 64 bit shift result to multiply with the divisor would require |
| 116 | four extra instructions for the upper 32 bits (shift / mulu / shift / sub). |
| 117 | Fortunately, if the upper 32 bits of the shift result are nonzero, we |
| 118 | know that the rest after taking this partial result into account will |
| 119 | fit into 32 bits. So we just clear the upper 32 bits of the rest if the |
| 120 | upper 32 bits of the partial result are nonzero. */ |