/*
 * umul.S: This routine was taken from glibc-1.09 and is covered
 * by the GNU Library General Public License Version 2.
 */


/*
 * Unsigned multiply. Returns %o0 * %o1 in %o1%o0 (i.e., %o1 holds the
 * upper 32 bits of the 64-bit product).
 *
 * This code optimizes short (less than 13-bit) multiplies. Short
 * multiplies require 25 instruction cycles, and long ones require
 * 45 instruction cycles.
 *
 * On return, overflow has occurred (%o1 is not zero) if and only if
 * the Z condition code is clear, allowing, e.g., the following:
 *
 *	call	.umul
 *	nop
 *	bnz	overflow	(or tnz)
 */

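/*
 * In C terms the contract above amounts to the following sketch
 * (illustration only, not part of the original source; the helper
 * name and register-named variables are hypothetical):
 *
 *	#include <stdint.h>
 *
 *	// Returns nonzero exactly when the product does not fit in 32
 *	// bits, i.e. when the routine returns with the Z flag clear.
 *	static int umul_overflows(uint32_t o0, uint32_t o1,
 *				  uint32_t *lo, uint32_t *hi)
 *	{
 *		uint64_t p = (uint64_t)o0 * (uint64_t)o1;
 *		*lo = (uint32_t)p;		// comes back in %o0
 *		*hi = (uint32_t)(p >> 32);	// comes back in %o1
 *		return *hi != 0;		// Z clear <=> overflow
 *	}
 */
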
	.globl .umul
	.globl _Umul
.umul:
_Umul:	/* needed for export */
	or	%o0, %o1, %o4
	mov	%o0, %y		! multiplier -> Y

	andncc	%o4, 0xfff, %g0	! test bits 12..31 of *both* args
	be	Lmul_shortway	! if zero, can do it the short way
	andcc	%g0, %g0, %o4	! zero the partial product and clear N and V

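	/*
	 * A sketch of the dispatch test above, in C (illustration only,
	 * not part of the original source): both operands fit in 12 bits
	 * exactly when their OR has no bits set at position 12 or above,
	 * so a single andncc on the OR decides between the two paths.
	 *
	 *	#include <stdint.h>
	 *
	 *	static int use_short_way(uint32_t a, uint32_t b)
	 *	{
	 *		return ((a | b) & ~0xfffu) == 0; // bits 12..31 clear
	 *	}
	 */
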
	/*
	 * Long multiply. 32 steps, followed by a final shift step.
	 */
	mulscc	%o4, %o1, %o4	! 1
	mulscc	%o4, %o1, %o4	! 2
	mulscc	%o4, %o1, %o4	! 3
	mulscc	%o4, %o1, %o4	! 4
	mulscc	%o4, %o1, %o4	! 5
	mulscc	%o4, %o1, %o4	! 6
	mulscc	%o4, %o1, %o4	! 7
	mulscc	%o4, %o1, %o4	! 8
	mulscc	%o4, %o1, %o4	! 9
	mulscc	%o4, %o1, %o4	! 10
	mulscc	%o4, %o1, %o4	! 11
	mulscc	%o4, %o1, %o4	! 12
	mulscc	%o4, %o1, %o4	! 13
	mulscc	%o4, %o1, %o4	! 14
	mulscc	%o4, %o1, %o4	! 15
	mulscc	%o4, %o1, %o4	! 16
	mulscc	%o4, %o1, %o4	! 17
	mulscc	%o4, %o1, %o4	! 18
	mulscc	%o4, %o1, %o4	! 19
	mulscc	%o4, %o1, %o4	! 20
	mulscc	%o4, %o1, %o4	! 21
	mulscc	%o4, %o1, %o4	! 22
	mulscc	%o4, %o1, %o4	! 23
	mulscc	%o4, %o1, %o4	! 24
	mulscc	%o4, %o1, %o4	! 25
	mulscc	%o4, %o1, %o4	! 26
	mulscc	%o4, %o1, %o4	! 27
	mulscc	%o4, %o1, %o4	! 28
	mulscc	%o4, %o1, %o4	! 29
	mulscc	%o4, %o1, %o4	! 30
	mulscc	%o4, %o1, %o4	! 31
	mulscc	%o4, %o1, %o4	! 32
	mulscc	%o4, %g0, %o4	! final shift


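	/*
	 * A rough C model of the multiply-step sequence above, based on
	 * the SPARC V8 definition of mulscc (illustration only, not part
	 * of the original source; names are hypothetical): each step
	 * conditionally adds the multiplicand to the partial product,
	 * shifts the partial-product/%y pair right by one bit, and shifts
	 * N xor V of the addition in at the top. After 32 steps plus the
	 * final shift with %g0, the upper word is in %o4 and the lower
	 * word in %y.
	 *
	 *	#include <stdint.h>
	 *
	 *	struct mstate { uint32_t acc, y; int nv; };	// %o4, %y, N^V
	 *
	 *	static void mulscc_step(struct mstate *s, uint32_t rs2)
	 *	{
	 *		uint32_t addend = (s->y & 1) ? rs2 : 0;	// keyed on Y[0]
	 *		uint32_t op1 = (s->acc >> 1) | ((uint32_t)s->nv << 31);
	 *		uint32_t sum = op1 + addend;
	 *		int n = sum >> 31;				// N of the add
	 *		int v = ((~(op1 ^ addend)) & (op1 ^ sum)) >> 31; // V of the add
	 *		s->y = (s->y >> 1) | (s->acc << 31);	// old acc bit 0 -> Y[31]
	 *		s->acc = sum;
	 *		s->nv = n ^ v;
	 *	}
	 *
	 *	// Raw result of the loop above: multiplier (unsigned, from %y)
	 *	// times multiplicand (treated as signed); the code after the
	 *	// loop corrects this for unsigned use.
	 *	static void umul_raw(uint32_t multiplier, uint32_t multiplicand,
	 *			     uint32_t *hi, uint32_t *lo)
	 *	{
	 *		struct mstate s = { 0, multiplier, 0 };
	 *		for (int i = 0; i < 32; i++)
	 *			mulscc_step(&s, multiplicand);
	 *		mulscc_step(&s, 0);		// final shift step
	 *		*hi = s.acc;
	 *		*lo = s.y;
	 *	}
	 */
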
	/*
	 * Normally, with the shift-and-add approach, if both numbers are
	 * positive you get the correct result. With 32-bit two's-complement
	 * numbers, -x is represented as
	 *
	 *	        x                  32
	 *	( 2 - ------ ) mod 2  *  2
	 *	         32
	 *	        2
	 *
	 * (the `mod 2' subtracts 1 from 1.bbbb). To avoid lots of 2^32s,
	 * we can treat this as if the radix point were just to the left
	 * of the sign bit (multiply by 2^32), and get
	 *
	 *	-x = (2 - x) mod 2
	 *
	 * Then, ignoring the `mod 2's for convenience:
	 *
	 *	 x *  y	= xy
	 *	-x *  y	= 2y - xy
	 *	 x * -y	= 2x - xy
	 *	-x * -y	= 4 - 2x - 2y + xy
	 *
	 * For signed multiplies, we subtract (x << 32) from the partial
	 * product to fix this problem for negative multipliers (see mul.s).
	 * Because of the way the shift into the partial product is calculated
	 * (N xor V), this term is automatically removed for the multiplicand,
	 * so we don't have to adjust.
	 *
	 * But for unsigned multiplies, the high order bit wasn't a sign bit,
	 * and the correction is wrong. So for unsigned multiplies where the
	 * high order bit is one, we end up with xy - (y << 32). To fix it
	 * we add y << 32.
	 */
#if 0
	tst	%o1
	bl,a	1f		! if %o1 < 0 (high order bit = 1),
	add	%o4, %o0, %o4	! %o4 += %o0 (add y to upper half)

1:
	rd	%y, %o0		! get lower half of product
	retl
	addcc	%o4, %g0, %o1	! put upper half in place and set Z for %o1==0
#else
	/* Faster code from tege@sics.se. */
	sra	%o1, 31, %o2	! make mask from sign bit
	and	%o0, %o2, %o2	! %o2 = 0 or %o0, depending on sign of %o1
	rd	%y, %o0		! get lower half of product
	retl
	addcc	%o4, %o2, %o1	! add compensation and put upper half in place
#endif
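
	/*
	 * A small C model of the branch-free fix-up above (illustration
	 * only, not part of the original source; the helper name is
	 * hypothetical): sra turns the multiplicand's sign bit into an
	 * all-ones/all-zeros mask, so the multiplier is added to the
	 * upper word exactly when the multiplicand's top bit is set,
	 * cancelling the unwanted -(y << 32) term described above.
	 *
	 *	#include <stdint.h>
	 *
	 *	static uint32_t fix_upper_word(uint32_t hi_raw, uint32_t multiplier,
	 *				       uint32_t multiplicand)
	 *	{
	 *		// 0xffffffff if multiplicand has its top bit set, else 0
	 *		uint32_t mask = (uint32_t)((int32_t)multiplicand >> 31);
	 *		return hi_raw + (multiplier & mask);
	 *	}
	 */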

Lmul_shortway:
	/*
	 * Short multiply. 12 steps, followed by a final shift step.
	 * The resulting bits are off by 12 and (32-12) = 20 bit positions,
	 * but there is no problem with %o0 being negative (unlike above),
	 * and overflow is impossible (the answer is at most 24 bits long).
	 */
	mulscc	%o4, %o1, %o4	! 1
	mulscc	%o4, %o1, %o4	! 2
	mulscc	%o4, %o1, %o4	! 3
	mulscc	%o4, %o1, %o4	! 4
	mulscc	%o4, %o1, %o4	! 5
	mulscc	%o4, %o1, %o4	! 6
	mulscc	%o4, %o1, %o4	! 7
	mulscc	%o4, %o1, %o4	! 8
	mulscc	%o4, %o1, %o4	! 9
	mulscc	%o4, %o1, %o4	! 10
	mulscc	%o4, %o1, %o4	! 11
	mulscc	%o4, %o1, %o4	! 12
	mulscc	%o4, %g0, %o4	! final shift

	/*
	 * %o4 has 20 of the bits that should be in the result; %y has
	 * the bottom 12 (as %y's top 12). That is:
	 *
	 *	          %o4              %y
	 *	+----------------+----------------+
	 *	| -12- |   -20-  | -12- |   -20-  |
	 *	+------(---------+------)---------+
	 *	        -----result-----
	 *
	 * The 12 bits of %o4 left of the `result' area are all zero;
	 * in fact, all top 20 bits of %o4 are zero.
	 */
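
	/*
	 * The reassembly below, sketched in C (illustration only, not
	 * part of the original source; the helper name is hypothetical):
	 * the 24-bit product is rebuilt from the low 20 bits of %o4 and
	 * the top 12 bits of %y.
	 *
	 *	#include <stdint.h>
	 *
	 *	static uint32_t short_way_result(uint32_t o4, uint32_t y)
	 *	{
	 *		return (o4 << 12) | (y >> 20);	// middle bits | low bits
	 *	}
	 */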

	rd	%y, %o5
	sll	%o4, 12, %o0	! shift middle bits left 12
	srl	%o5, 20, %o5	! shift low bits right 20
	or	%o5, %o0, %o0
	retl
	addcc	%g0, %g0, %o1	! %o1 = zero, and set Z

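/*
 * Patch template: on processors that implement the hardware umul
 * instruction, the kernel presumably copies this short sequence over
 * the start of .umul, so the full 64-bit product comes straight from
 * the hardware (low word in %o0, high word read back from %y into %o1).
 */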
	.globl .umul_patch
.umul_patch:
	umul	%o0, %o1, %o0
	retl
	rd	%y, %o1
	nop