|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|MOTOROLA MICROPROCESSOR & MEMORY TECHNOLOGY GROUP
|M68000 Hi-Performance Microprocessor Division
|M68060 Software Package
|Production Release P1.00 -- October 10, 1994
|
|M68060 Software Package Copyright © 1993, 1994 Motorola Inc.  All rights reserved.
|
|THE SOFTWARE is provided on an "AS IS" basis and without warranty.
|To the maximum extent permitted by applicable law,
|MOTOROLA DISCLAIMS ALL WARRANTIES WHETHER EXPRESS OR IMPLIED,
|INCLUDING IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE
|and any warranty against infringement with regard to the SOFTWARE
|(INCLUDING ANY MODIFIED VERSIONS THEREOF) and any accompanying written materials.
|
|To the maximum extent permitted by applicable law,
|IN NO EVENT SHALL MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER
|(INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS PROFITS,
|BUSINESS INTERRUPTION, LOSS OF BUSINESS INFORMATION, OR OTHER PECUNIARY LOSS)
|ARISING OF THE USE OR INABILITY TO USE THE SOFTWARE.
|Motorola assumes no responsibility for the maintenance and support of the SOFTWARE.
|
|You are hereby granted a copyright license to use, modify, and distribute the SOFTWARE
|so long as this entire notice is retained without alteration in any modified and/or
|redistributed versions, and that such modified versions are clearly identified as such.
|No licenses are granted by implication, estoppel or otherwise under any patents
|or trademarks of Motorola, Inc.
|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# litop.s:
#	This file is appended to the top of the 060FPLSP package
# and contains the entry points into the package. The user, in
# effect, branches to one of the branch table entries located here.
#

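# Note (added, not part of the original Motorola sources): each table
# entry below is eight bytes long (a six-byte bra.l plus a two-byte pad
# word), so callers reach the routines at fixed offsets from the start
# of the table: +0x00 idivs64, +0x08 idivu64, +0x10 imuls64,
# +0x18 imulu64, and +0x20 through +0x48 for the six cmp2 entries.
#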
	bra.l	_060LSP__idivs64_
	short	0x0000
	bra.l	_060LSP__idivu64_
	short	0x0000

	bra.l	_060LSP__imuls64_
	short	0x0000
	bra.l	_060LSP__imulu64_
	short	0x0000

	bra.l	_060LSP__cmp2_Ab_
	short	0x0000
	bra.l	_060LSP__cmp2_Aw_
	short	0x0000
	bra.l	_060LSP__cmp2_Al_
	short	0x0000
	bra.l	_060LSP__cmp2_Db_
	short	0x0000
	bra.l	_060LSP__cmp2_Dw_
	short	0x0000
	bra.l	_060LSP__cmp2_Dl_
	short	0x0000

# leave room for future possible additions.
	align	0x200

#########################################################################
# XDEF **************************************************************** #
# _060LSP__idivu64_(): Emulate 64-bit unsigned div instruction. #
# _060LSP__idivs64_(): Emulate 64-bit signed div instruction. #
# #
# This is the library version which is accessed as a subroutine #
# and therefore does not work exactly like the 680X0 div{s,u}.l #
# 64-bit divide instruction. #
# #
# XREF **************************************************************** #
# None. #
# #
# INPUT *************************************************************** #
# 0x4(sp) = divisor #
# 0x8(sp) = hi(dividend) #
# 0xc(sp) = lo(dividend) #
# 0x10(sp) = pointer to location to place quotient/remainder #
# #
# OUTPUT ************************************************************** #
# 0x10(sp) = points to location of remainder/quotient. #
# remainder is in first longword, quotient is in 2nd. #
# #
# ALGORITHM *********************************************************** #
# If the operands are signed, make them unsigned and save the #
# sign info for later. Separate out special cases like divide-by-zero #
# or 32-bit divides if possible. Else, use a special math algorithm #
# to calculate the result. #
# Restore sign info if signed instruction. Set the condition #
# codes before performing the final "rts". If the divisor was equal to #
# zero, then perform a divide-by-zero using a 16-bit implemented #
# divide instruction. This way, the operating system can record that #
# the event occurred even though it may not point to the correct place. #
# #
#########################################################################
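
# Note (added, not part of the original Motorola sources): a rough C
# model of the unsigned entry point, following the INPUT/OUTPUT layout
# above; the function and parameter names are illustrative only, and
# the divide-by-zero and overflow paths (exception / 'V' bit) are left
# out:
#
#	#include <stdint.h>
#
#	void lsp_idivu64(uint32_t divisor, uint32_t hi, uint32_t lo,
#			 uint32_t result[2])
#	{
#		uint64_t dividend = ((uint64_t)hi << 32) | lo;
#
#		result[0] = (uint32_t)(dividend % divisor);	/* remainder first */
#		result[1] = (uint32_t)(dividend / divisor);	/* quotient second */
#	}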

	set	POSNEG,		-1
	set	NDIVISOR,	-2
	set	NDIVIDEND,	-3
	set	DDSECOND,	-4
	set	DDNORMAL,	-8
	set	DDQUOTIENT,	-12
	set	DIV64_CC,	-16

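# Note (added for reference): the condition-code masks used throughout
# this file follow the standard CCR bit layout: X = 0x10, N = 0x08,
# Z = 0x04, V = 0x02, C = 0x01.
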
##########
# divs.l #
##########
	global		_060LSP__idivs64_
_060LSP__idivs64_:
# PROLOGUE BEGIN ########################################################
	link.w		%a6,&-16
	movm.l		&0x3f00,-(%sp)		# save d2-d7
#	fmovm.l		&0x0,-(%sp)		# save no fpregs
# PROLOGUE END ##########################################################

	mov.w		%cc,DIV64_CC(%a6)
	st		POSNEG(%a6)		# signed operation
	bra.b		ldiv64_cont

##########
# divu.l #
##########
	global		_060LSP__idivu64_
_060LSP__idivu64_:
# PROLOGUE BEGIN ########################################################
	link.w		%a6,&-16
	movm.l		&0x3f00,-(%sp)		# save d2-d7
#	fmovm.l		&0x0,-(%sp)		# save no fpregs
# PROLOGUE END ##########################################################

	mov.w		%cc,DIV64_CC(%a6)
	sf		POSNEG(%a6)		# unsigned operation

ldiv64_cont:
	mov.l		0x8(%a6),%d7		# fetch divisor

	beq.w		ldiv64eq0		# divisor is = 0!!!

	mov.l		0xc(%a6),%d5		# get dividend hi
	mov.l		0x10(%a6),%d6		# get dividend lo

# separate signed and unsigned divide
	tst.b		POSNEG(%a6)		# signed or unsigned?
	beq.b		ldspecialcases		# use positive divide

# save the sign of the divisor
# make divisor unsigned if it's negative
	tst.l		%d7			# chk sign of divisor
	slt		NDIVISOR(%a6)		# save sign of divisor
	bpl.b		ldsgndividend
	neg.l		%d7			# complement negative divisor

# save the sign of the dividend
# make dividend unsigned if it's negative
ldsgndividend:
	tst.l		%d5			# chk sign of hi(dividend)
	slt		NDIVIDEND(%a6)		# save sign of dividend
	bpl.b		ldspecialcases

	mov.w		&0x0,%cc		# clear 'X' cc bit
	negx.l		%d6			# complement signed dividend
	negx.l		%d5
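# (Added note: the mov.w/negx.l/negx.l sequence above forms the 64-bit
# two's complement of %d5:%d6 -- clear 'X', negate the low longword,
# then negate-with-extend the high longword so the borrow propagates.)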

# extract some special cases:
#	- is (dividend == 0) ?
#	- is (hi(dividend) == 0 && (divisor <= lo(dividend))) ? (32-bit div)
ldspecialcases:
	tst.l		%d5			# is (hi(dividend) == 0)
	bne.b		ldnormaldivide		# no, so try it the long way

	tst.l		%d6			# is (lo(dividend) == 0), too
	beq.w		lddone			# yes, so (dividend == 0)

	cmp.l		%d7,%d6			# is (divisor <= lo(dividend))
	bls.b		ld32bitdivide		# yes, so use 32 bit divide

	exg		%d5,%d6			# q = 0, r = dividend
	bra.w		ldivfinish		# can't divide, we're done.

ld32bitdivide:
	tdivu.l		%d7,%d5:%d6		# it's only a 32/32 bit div!

	bra.b		ldivfinish

ldnormaldivide:
# last special case:
#	- is hi(dividend) >= divisor ? if yes, then overflow
	cmp.l		%d7,%d5
	bls.b		lddovf			# answer won't fit in 32 bits

# perform the divide algorithm:
	bsr.l		ldclassical		# do int divide

# separate into signed and unsigned finishes.
ldivfinish:
	tst.b		POSNEG(%a6)		# do divs, divu separately
	beq.b		lddone			# divu has no processing!!!

# it was a divs.l, so ccode setting is a little more complicated...
	tst.b		NDIVIDEND(%a6)		# remainder has same sign
	beq.b		ldcc			# as dividend.
	neg.l		%d5			# sgn(rem) = sgn(dividend)
ldcc:
	mov.b		NDIVISOR(%a6),%d0
	eor.b		%d0,NDIVIDEND(%a6)	# chk if quotient is negative
	beq.b		ldqpos			# branch to quot positive

# 0x80000000 (-2^31) is the most negative number representable in 32
# bits; negating 0x80000000 yields 0x80000000 again.
	cmpi.l		%d6,&0x80000000		# will (-quot) fit in 32 bits?
	bhi.b		lddovf

	neg.l		%d6			# make (-quot) 2's comp

	bra.b		lddone

ldqpos:
	btst		&0x1f,%d6		# will (+quot) fit in 32 bits?
	bne.b		lddovf

lddone:
# if the register numbers are the same, only the quotient gets saved.
# so, if we always save the quotient second, we save ourselves a cmp&beq
	andi.w		&0x10,DIV64_CC(%a6)
	mov.w		DIV64_CC(%a6),%cc
	tst.l		%d6			# may set 'N' ccode bit

# here, the result is in %d5 (remainder) and %d6 (quotient). save the
# values at the location pointed to by 0x14(%a6).
# use movm here to not disturb the condition codes.
ldexit:
	movm.l		&0x0060,([0x14,%a6])	# save result

# EPILOGUE BEGIN ########################################################
#	fmovm.l		(%sp)+,&0x0		# restore no fpregs
	movm.l		(%sp)+,&0x00fc		# restore d2-d7
	unlk		%a6
# EPILOGUE END ##########################################################

	rts

# the result should be the unchanged dividend
lddovf:
	mov.l		0xc(%a6),%d5		# get dividend hi
	mov.l		0x10(%a6),%d6		# get dividend lo

	andi.w		&0x1c,DIV64_CC(%a6)
	ori.w		&0x02,DIV64_CC(%a6)	# set 'V' ccode bit
	mov.w		DIV64_CC(%a6),%cc

	bra.b		ldexit

ldiv64eq0:
	mov.l		0xc(%a6),([0x14,%a6])
	mov.l		0x10(%a6),([0x14,%a6],0x4)

	mov.w		DIV64_CC(%a6),%cc

# EPILOGUE BEGIN ########################################################
#	fmovm.l		(%sp)+,&0x0		# restore no fpregs
	movm.l		(%sp)+,&0x00fc		# restore d2-d7
	unlk		%a6
# EPILOGUE END ##########################################################

	divu.w		&0x0,%d0		# force a divbyzero exception
	rts

###########################################################################
#########################################################################
# This routine uses the 'classical' Algorithm D from Donald Knuth's #
# Art of Computer Programming, vol II, Seminumerical Algorithms. #
# For this implementation b=2**16, and the target is U1U2U3U4/V1V2, #
# where U,V are words of the quadword dividend and longword divisor, #
# and U1, V1 are the most significant words. #
# #
# The most sig. longword of the 64 bit dividend must be in %d5, least #
# in %d6. The divisor must be in %d7, and both operands must already #
# be unsigned (the caller strips the signs and restores them later). #
# The quotient is returned in %d6, remainder in %d5, unless the #
# v (overflow) bit is set in the saved %ccr. If overflow, the dividend #
# is unchanged. #
#########################################################################
ldclassical:
# if the divisor msw is 0, use a simpler algorithm than the full-blown
# one at lddknuth:

	cmpi.l		%d7,&0xffff
	bhi.b		lddknuth		# go use D. Knuth algorithm

# Since the divisor is only a word (and larger than the mslw of the dividend),
# a simpler algorithm may be used:
# In the general case, four quotient words would be created by
# dividing the divisor word into each dividend word. In this case,
# the first two quotient words must be zero, or overflow would occur.
# Since we already checked this case above, we can treat the most significant
# longword of the dividend as (0) remainder (see Knuth) and merely complete
# the last two divisions to get a quotient longword and word remainder:

	clr.l		%d1
	swap		%d5			# same as r*b if previous step rqd
	swap		%d6			# get u3 to lsw position
	mov.w		%d6,%d5			# rb + u3

	divu.w		%d7,%d5

	mov.w		%d5,%d1			# first quotient word
	swap		%d6			# get u4
	mov.w		%d6,%d5			# rb + u4

	divu.w		%d7,%d5

	swap		%d1
	mov.w		%d5,%d1			# 2nd quotient 'digit'
	clr.w		%d5
	swap		%d5			# now remainder
	mov.l		%d1,%d6			# and quotient

	rts
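
# Note (added, not part of the original Motorola sources): a rough C
# sketch of the fast path above; hi/lo/div are illustrative names for
# the values in %d5/%d6/%d7.  Because hi < div <= 0xffff, each partial
# dividend fits a 32/16 divu.w and the quotient fits in 32 bits:
#
#	uint32_t u2 = hi & 0xffff;			/* hi < 0x10000 here   */
#	uint32_t t1 = (u2 << 16) | (lo >> 16);		/* r*b + u3            */
#	uint32_t q1 = t1 / div, r1 = t1 % div;		/* 1st quotient word   */
#	uint32_t t2 = (r1 << 16) | (lo & 0xffff);	/* r*b + u4            */
#	uint32_t q2 = t2 / div, rem = t2 % div;		/* 2nd word, remainder */
#	uint32_t quot = (q1 << 16) | q2;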

lddknuth:
# In this algorithm, the divisor is treated as a 2 digit (word) number
# which is divided into a 3 digit (word) dividend to get one quotient
# digit (word). After subtraction, the dividend is shifted and the
# process repeated. Before beginning, the divisor and dividend are
# 'normalized' so that the process of estimating the quotient digit
# will yield verifiably correct results.

	clr.l		DDNORMAL(%a6)		# count of shifts for normalization
	clr.b		DDSECOND(%a6)		# clear flag for quotient digits
	clr.l		%d1			# %d1 will hold trial quotient
lddnchk:
	btst		&31,%d7			# must we normalize? first word of
	bne.b		lddnormalized		# divisor (V1) must be >= 65536/2
	addq.l		&0x1,DDNORMAL(%a6)	# count normalization shifts
	lsl.l		&0x1,%d7		# shift the divisor
	lsl.l		&0x1,%d6		# shift u4,u3 with overflow to u2
	roxl.l		&0x1,%d5		# shift u1,u2
	bra.w		lddnchk
lddnormalized:

# Now calculate an estimate of the quotient words (msw first, then lsw).
# The comments use subscripts for the first quotient digit determination.
	mov.l		%d7,%d3			# divisor
	mov.l		%d5,%d2			# dividend mslw
	swap		%d2
	swap		%d3
	cmp.w		%d2,%d3			# V1 = U1 ?
	bne.b		lddqcalc1
	mov.w		&0xffff,%d1		# use max trial quotient word
	bra.b		lddadj0
lddqcalc1:
	mov.l		%d5,%d1

	divu.w		%d3,%d1			# use quotient of mslw/msw

	andi.l		&0x0000ffff,%d1		# zero any remainder
lddadj0:

# now test the trial quotient and adjust. This step plus the
# normalization assures (according to Knuth) that the trial
# quotient will be at worst 1 too large.
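# (Added note: in Knuth's notation the estimate and correction below
# compute
#	qhat = min((U1*b + U2) / V1, b - 1),	b = 2^16
# and decrement qhat while
#	V2*qhat > ((U1*b + U2) - V1*qhat)*b + U3,
# which together with the normalization V1 >= b/2 leaves qhat at most
# one too large before the full multiply/subtract check.)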
	mov.l		%d6,-(%sp)
	clr.w		%d6			# word u3 left
	swap		%d6			# in lsw position
lddadj1:	mov.l	%d7,%d3
	mov.l		%d1,%d2
	mulu.w		%d7,%d2			# V2q
	swap		%d3
	mulu.w		%d1,%d3			# V1q
	mov.l		%d5,%d4			# U1U2
	sub.l		%d3,%d4			# U1U2 - V1q

	swap		%d4

	mov.w		%d4,%d0
	mov.w		%d6,%d4			# insert lower word (U3)

	tst.w		%d0			# is upper word set?
	bne.w		lddadjd1

#	add.l		%d6,%d4			# (U1U2 - V1q) + U3

	cmp.l		%d2,%d4
	bls.b		lddadjd1		# is V2q > (U1U2-V1q) + U3 ?
	subq.l		&0x1,%d1		# yes, decrement and recheck
	bra.b		lddadj1
lddadjd1:
# now test the word by multiplying it by the divisor (V1V2) and comparing
# the 3 digit (word) result with the current dividend words
	mov.l		%d5,-(%sp)		# save %d5 (%d6 already saved)
	mov.l		%d1,%d6
	swap		%d6			# shift answer to ms 3 words
	mov.l		%d7,%d5
	bsr.l		ldmm2
	mov.l		%d5,%d2			# now %d2,%d3 are trial*divisor
	mov.l		%d6,%d3
	mov.l		(%sp)+,%d5		# restore dividend
	mov.l		(%sp)+,%d6
	sub.l		%d3,%d6
	subx.l		%d2,%d5			# subtract double precision
	bcc		ldd2nd			# no carry, do next quotient digit
	subq.l		&0x1,%d1		# q is one too large
# need to add back divisor longword to current ms 3 digits of dividend
# - according to Knuth, this is done only 2 out of 65536 times for random
# divisor, dividend selection.
	clr.l		%d2
	mov.l		%d7,%d3
	swap		%d3
	clr.w		%d3			# %d3 now ls word of divisor
	add.l		%d3,%d6			# aligned with 3rd word of dividend
	addx.l		%d2,%d5
	mov.l		%d7,%d3
	clr.w		%d3			# %d3 now ms word of divisor
	swap		%d3			# aligned with 2nd word of dividend
	add.l		%d3,%d5
ldd2nd:
	tst.b		DDSECOND(%a6)		# both q words done?
	bne.b		lddremain
# first quotient digit now correct. store digit and shift the
# (subtracted) dividend
	mov.w		%d1,DDQUOTIENT(%a6)
	clr.l		%d1
	swap		%d5
	swap		%d6
	mov.w		%d6,%d5
	clr.w		%d6
	st		DDSECOND(%a6)		# second digit
	bra.w		lddnormalized
lddremain:
# add 2nd word to quotient, get the remainder.
	mov.w		%d1,DDQUOTIENT+2(%a6)
# shift down one word/digit to renormalize remainder.
	mov.w		%d5,%d6
	swap		%d6
	swap		%d5
	mov.l		DDNORMAL(%a6),%d7	# get norm shift count
	beq.b		lddrn
	subq.l		&0x1,%d7		# set for loop count
lddnlp:
	lsr.l		&0x1,%d5		# shift into %d6
	roxr.l		&0x1,%d6
	dbf		%d7,lddnlp
lddrn:
	mov.l		%d6,%d5			# remainder
	mov.l		DDQUOTIENT(%a6),%d6	# quotient

	rts
ldmm2:
# factors for the 32X32->64 multiplication are in %d5 and %d6.
# returns 64 bit result in %d5 (hi) %d6 (lo).
# destroys %d2,%d3,%d4.

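# (Added note: this evaluates
#	(a_hi*2^16 + a_lo)*(b_hi*2^16 + b_lo)
#	   = a_hi*b_hi*2^32 + (a_hi*b_lo + a_lo*b_hi)*2^16 + a_lo*b_lo
# with four mulu.w products, folding the carries of the two middle
# terms into the upper longword with addx.)
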
# multiply hi,lo words of each factor to get 4 intermediate products
	mov.l		%d6,%d2
	mov.l		%d6,%d3
	mov.l		%d5,%d4
	swap		%d3
	swap		%d4
	mulu.w		%d5,%d6			# %d6 <- lsw*lsw
	mulu.w		%d3,%d5			# %d5 <- msw-dest*lsw-source
	mulu.w		%d4,%d2			# %d2 <- msw-source*lsw-dest
	mulu.w		%d4,%d3			# %d3 <- msw*msw
# now use swap and addx to consolidate to two longwords
	clr.l		%d4
	swap		%d6
	add.w		%d5,%d6			# add msw of l*l to lsw of m*l product
	addx.w		%d4,%d3			# add any carry to m*m product
	add.w		%d2,%d6			# add in lsw of other m*l product
	addx.w		%d4,%d3			# add any carry to m*m product
	swap		%d6			# %d6 is low 32 bits of final product
	clr.w		%d5
	clr.w		%d2			# lsw of two mixed products used,
	swap		%d5			# now use msws of longwords
	swap		%d2
	add.l		%d2,%d5
	add.l		%d3,%d5			# %d5 now ms 32 bits of final product
	rts

#########################################################################
# XDEF **************************************************************** #
# _060LSP__imulu64_(): Emulate 64-bit unsigned mul instruction. #
# _060LSP__imuls64_(): Emulate 64-bit signed mul instruction. #
# #
# This is the library version which is accessed as a subroutine #
# and therefore does not work exactly like the 680X0 mul{s,u}.l #
# 64-bit multiply instruction. #
# #
# XREF **************************************************************** #
# None #
# #
# INPUT *************************************************************** #
# 0x4(sp) = multiplier #
# 0x8(sp) = multiplicand #
# 0xc(sp) = pointer to location to place 64-bit result #
# #
# OUTPUT ************************************************************** #
# 0xc(sp) = points to location of 64-bit result #
# #
# ALGORITHM *********************************************************** #
# Perform the multiply in pieces using 16x16->32 unsigned #
# multiplies and "add" instructions. #
# Set the condition codes as appropriate before performing an #
# "rts". #
# #
#########################################################################

	set	MUL64_CC,	-4

	global		_060LSP__imulu64_
_060LSP__imulu64_:

# PROLOGUE BEGIN ########################################################
	link.w		%a6,&-4
	movm.l		&0x3800,-(%sp)		# save d2-d4
#	fmovm.l		&0x0,-(%sp)		# save no fpregs
# PROLOGUE END ##########################################################

	mov.w		%cc,MUL64_CC(%a6)	# save incoming ccodes

	mov.l		0x8(%a6),%d0		# store multiplier in d0
	beq.w		mulu64_zero		# handle zero separately

	mov.l		0xc(%a6),%d1		# get multiplicand in d1
	beq.w		mulu64_zero		# handle zero separately

#########################################################################
# 63                          32                             0         #
#  ----------------------------                                        #
#  | hi(mplier) * hi(mplicand)|                                        #
#  ----------------------------                                        #
#               -----------------------------                          #
#               | hi(mplier) * lo(mplicand) |                          #
#               -----------------------------                          #
#               -----------------------------                          #
#               | lo(mplier) * hi(mplicand) |                          #
#               -----------------------------                          #
#    |                        -----------------------------            #
#  --|--                      | lo(mplier) * lo(mplicand) |            #
#    |                        -----------------------------            #
#  ========================================================            #
#  --------------------------------------------------------            #
#  |        hi(result)        |         lo(result)        |            #
#  --------------------------------------------------------            #
#########################################################################
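# Note (added, not part of the original Motorola sources): a rough C
# equivalent of the partial-product scheme drawn above; mr/md are
# illustrative names for the 32-bit multiplier and multiplicand:
#
#	uint32_t lo_lo = (mr & 0xffff) * (md & 0xffff);	/* [1] */
#	uint32_t hi_lo = (mr >> 16)    * (md & 0xffff);	/* [2] */
#	uint32_t lo_hi = (mr & 0xffff) * (md >> 16);	/* [3] */
#	uint32_t hi_hi = (mr >> 16)    * (md >> 16);	/* [4] */
#	uint64_t result = ((uint64_t)hi_hi << 32) + (uint64_t)lo_lo
#			+ ((uint64_t)hi_lo << 16) + ((uint64_t)lo_hi << 16);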
mulu64_alg:
# load temp registers with operands
	mov.l		%d0,%d2			# mr in d2
	mov.l		%d0,%d3			# mr in d3
	mov.l		%d1,%d4			# md in d4
	swap		%d3			# hi(mr) in lo d3
	swap		%d4			# hi(md) in lo d4

# complete necessary multiplies:
	mulu.w		%d1,%d0			# [1] lo(mr) * lo(md)
	mulu.w		%d3,%d1			# [2] hi(mr) * lo(md)
	mulu.w		%d4,%d2			# [3] lo(mr) * hi(md)
	mulu.w		%d4,%d3			# [4] hi(mr) * hi(md)

# add lo portions of [2],[3] to hi portion of [1].
# add carries produced from these adds to [4].
# lo([1]) is the final lo 16 bits of the result.
	clr.l		%d4			# load d4 w/ zero value
	swap		%d0			# hi([1]) <==> lo([1])
	add.w		%d1,%d0			# hi([1]) + lo([2])
	addx.l		%d4,%d3			# [4] + carry
	add.w		%d2,%d0			# hi([1]) + lo([3])
	addx.l		%d4,%d3			# [4] + carry
	swap		%d0			# lo([1]) <==> hi([1])

# lo portions of [2],[3] have been added in to final result.
# now, clear lo, put hi in lo reg, and add to [4]
	clr.w		%d1			# clear lo([2])
	clr.w		%d2			# clear lo([3])
	swap		%d1			# hi([2]) in lo d1
	swap		%d2			# hi([3]) in lo d2
	add.l		%d2,%d1			# [4] + hi([2])
	add.l		%d3,%d1			# [4] + hi([3])

# now, grab the condition codes. only one that can be set is 'N'.
# 'N' CAN be set even though the operation is unsigned, if bit 63 of
# the result is set.
	mov.w		MUL64_CC(%a6),%d4
	andi.b		&0x10,%d4		# keep old 'X' bit
	tst.l		%d1			# may set 'N' bit
	bpl.b		mulu64_ddone
	ori.b		&0x8,%d4		# set 'N' bit
mulu64_ddone:
	mov.w		%d4,%cc

# here, the result is in d1 and d0. the current strategy is to save
# the values at the location pointed to by 0x10(%a6).
# use movm here to not disturb the condition codes.
mulu64_end:
	exg		%d1,%d0
	movm.l		&0x0003,([0x10,%a6])	# save result

# EPILOGUE BEGIN ########################################################
#	fmovm.l		(%sp)+,&0x0		# restore no fpregs
	movm.l		(%sp)+,&0x001c		# restore d2-d4
	unlk		%a6
# EPILOGUE END ##########################################################

	rts

# one or both of the operands is zero so the result is also zero.
# save the zero result to the destination and set the 'Z' ccode bit.
mulu64_zero:
	clr.l		%d0
	clr.l		%d1

	mov.w		MUL64_CC(%a6),%d4
	andi.b		&0x10,%d4
	ori.b		&0x4,%d4
	mov.w		%d4,%cc			# set 'Z' ccode bit

	bra.b		mulu64_end

##########
# muls.l #
##########
	global		_060LSP__imuls64_
_060LSP__imuls64_:

# PROLOGUE BEGIN ########################################################
	link.w		%a6,&-4
	movm.l		&0x3c00,-(%sp)		# save d2-d5
#	fmovm.l		&0x0,-(%sp)		# save no fpregs
# PROLOGUE END ##########################################################

	mov.w		%cc,MUL64_CC(%a6)	# save incoming ccodes

	mov.l		0x8(%a6),%d0		# store multiplier in d0
	beq.b		mulu64_zero		# handle zero separately

	mov.l		0xc(%a6),%d1		# get multiplicand in d1
	beq.b		mulu64_zero		# handle zero separately

	clr.b		%d5			# clear sign tag
	tst.l		%d0			# is multiplier negative?
	bge.b		muls64_chk_md_sgn	# no
	neg.l		%d0			# make multiplier positive

	ori.b		&0x1,%d5		# save multiplier sgn

# the result sign is the exclusive or of the operand sign bits.
muls64_chk_md_sgn:
	tst.l		%d1			# is multiplicand negative?
	bge.b		muls64_alg		# no
	neg.l		%d1			# make multiplicand positive

	eori.b		&0x1,%d5		# calculate correct sign

#########################################################################
# 63                          32                             0         #
#  ----------------------------                                        #
#  | hi(mplier) * hi(mplicand)|                                        #
#  ----------------------------                                        #
#               -----------------------------                          #
#               | hi(mplier) * lo(mplicand) |                          #
#               -----------------------------                          #
#               -----------------------------                          #
#               | lo(mplier) * hi(mplicand) |                          #
#               -----------------------------                          #
#    |                        -----------------------------            #
#  --|--                      | lo(mplier) * lo(mplicand) |            #
#    |                        -----------------------------            #
#  ========================================================            #
#  --------------------------------------------------------            #
#  |        hi(result)        |         lo(result)        |            #
#  --------------------------------------------------------            #
#########################################################################
muls64_alg:
# load temp registers with operands
	mov.l		%d0,%d2			# mr in d2
	mov.l		%d0,%d3			# mr in d3
	mov.l		%d1,%d4			# md in d4
	swap		%d3			# hi(mr) in lo d3
	swap		%d4			# hi(md) in lo d4

# complete necessary multiplies:
	mulu.w		%d1,%d0			# [1] lo(mr) * lo(md)
	mulu.w		%d3,%d1			# [2] hi(mr) * lo(md)
	mulu.w		%d4,%d2			# [3] lo(mr) * hi(md)
	mulu.w		%d4,%d3			# [4] hi(mr) * hi(md)

# add lo portions of [2],[3] to hi portion of [1].
# add carries produced from these adds to [4].
# lo([1]) is the final lo 16 bits of the result.
	clr.l		%d4			# load d4 w/ zero value
	swap		%d0			# hi([1]) <==> lo([1])
	add.w		%d1,%d0			# hi([1]) + lo([2])
	addx.l		%d4,%d3			# [4] + carry
	add.w		%d2,%d0			# hi([1]) + lo([3])
	addx.l		%d4,%d3			# [4] + carry
	swap		%d0			# lo([1]) <==> hi([1])

# lo portions of [2],[3] have been added in to final result.
# now, clear lo, put hi in lo reg, and add to [4]
	clr.w		%d1			# clear lo([2])
	clr.w		%d2			# clear lo([3])
	swap		%d1			# hi([2]) in lo d1
	swap		%d2			# hi([3]) in lo d2
	add.l		%d2,%d1			# [4] + hi([2])
	add.l		%d3,%d1			# [4] + hi([3])

	tst.b		%d5			# should result be signed?
	beq.b		muls64_done		# no

# result should be a signed negative number.
# compute 2's complement of the unsigned number:
#	- negate all bits and add 1
muls64_neg:
	not.l		%d0			# negate lo(result) bits
	not.l		%d1			# negate hi(result) bits
	addq.l		&1,%d0			# add 1 to lo(result)
	addx.l		%d4,%d1			# add carry to hi(result)

muls64_done:
	mov.w		MUL64_CC(%a6),%d4
	andi.b		&0x10,%d4		# keep old 'X' bit
	tst.l		%d1			# may set 'N' bit
	bpl.b		muls64_ddone
	ori.b		&0x8,%d4		# set 'N' bit
muls64_ddone:
	mov.w		%d4,%cc

# here, the result is in d1 and d0. the current strategy is to save
# the values at the location pointed to by 0x10(%a6).
# use movm here to not disturb the condition codes.
muls64_end:
	exg		%d1,%d0
	movm.l		&0x0003,([0x10,%a6])	# save result

# EPILOGUE BEGIN ########################################################
#	fmovm.l		(%sp)+,&0x0		# restore no fpregs
	movm.l		(%sp)+,&0x003c		# restore d2-d5
	unlk		%a6
# EPILOGUE END ##########################################################

	rts

# one or both of the operands is zero so the result is also zero.
# save the zero result to the destination and set the 'Z' ccode bit.
muls64_zero:
	clr.l		%d0
	clr.l		%d1

	mov.w		MUL64_CC(%a6),%d4
	andi.b		&0x10,%d4
	ori.b		&0x4,%d4
	mov.w		%d4,%cc			# set 'Z' ccode bit

	bra.b		muls64_end

#########################################################################
# XDEF **************************************************************** #
# _060LSP__cmp2_Ab_(): Emulate "cmp2.b An,<ea>". #
# _060LSP__cmp2_Aw_(): Emulate "cmp2.w An,<ea>". #
# _060LSP__cmp2_Al_(): Emulate "cmp2.l An,<ea>". #
# _060LSP__cmp2_Db_(): Emulate "cmp2.b Dn,<ea>". #
# _060LSP__cmp2_Dw_(): Emulate "cmp2.w Dn,<ea>". #
# _060LSP__cmp2_Dl_(): Emulate "cmp2.l Dn,<ea>". #
# #
# This is the library version which is accessed as a subroutine #
# and therefore does not work exactly like the 680X0 "cmp2" #
# instruction. #
# #
# XREF **************************************************************** #
# None #
# #
# INPUT *************************************************************** #
# 0x4(sp) = Rn #
# 0x8(sp) = pointer to boundary pair #
# #
# OUTPUT ************************************************************** #
# cc = condition codes are set correctly #
# #
# ALGORITHM *********************************************************** #
# In the interest of simplicity, all operands are converted to #
# longword size whether the operation is byte, word, or long. The #
# bounds are sign extended accordingly. If Rn is a data register, Rn is #
# also sign extended. If Rn is an address register, it need not be sign #
# extended since the full register is always used. #
# The condition codes are set correctly before the final "rts". #
# #
#########################################################################

	set	CMP2_CC,	-4

	global		_060LSP__cmp2_Ab_
_060LSP__cmp2_Ab_:

# PROLOGUE BEGIN ########################################################
	link.w		%a6,&-4
	movm.l		&0x3800,-(%sp)		# save d2-d4
#	fmovm.l		&0x0,-(%sp)		# save no fpregs
# PROLOGUE END ##########################################################

	mov.w		%cc,CMP2_CC(%a6)
	mov.l		0x8(%a6),%d2		# get regval

	mov.b		([0xc,%a6],0x0),%d0
	mov.b		([0xc,%a6],0x1),%d1

	extb.l		%d0			# sign extend lo bnd
	extb.l		%d1			# sign extend hi bnd
	bra.w		l_cmp2_cmp		# go do the compare emulation

	global		_060LSP__cmp2_Aw_
_060LSP__cmp2_Aw_:

# PROLOGUE BEGIN ########################################################
	link.w		%a6,&-4
	movm.l		&0x3800,-(%sp)		# save d2-d4
#	fmovm.l		&0x0,-(%sp)		# save no fpregs
# PROLOGUE END ##########################################################

	mov.w		%cc,CMP2_CC(%a6)
	mov.l		0x8(%a6),%d2		# get regval

	mov.w		([0xc,%a6],0x0),%d0
	mov.w		([0xc,%a6],0x2),%d1

	ext.l		%d0			# sign extend lo bnd
	ext.l		%d1			# sign extend hi bnd
	bra.w		l_cmp2_cmp		# go do the compare emulation

	global		_060LSP__cmp2_Al_
_060LSP__cmp2_Al_:

# PROLOGUE BEGIN ########################################################
	link.w		%a6,&-4
	movm.l		&0x3800,-(%sp)		# save d2-d4
#	fmovm.l		&0x0,-(%sp)		# save no fpregs
# PROLOGUE END ##########################################################

	mov.w		%cc,CMP2_CC(%a6)
	mov.l		0x8(%a6),%d2		# get regval

	mov.l		([0xc,%a6],0x0),%d0
	mov.l		([0xc,%a6],0x4),%d1
	bra.w		l_cmp2_cmp		# go do the compare emulation

	global		_060LSP__cmp2_Db_
_060LSP__cmp2_Db_:

# PROLOGUE BEGIN ########################################################
	link.w		%a6,&-4
	movm.l		&0x3800,-(%sp)		# save d2-d4
#	fmovm.l		&0x0,-(%sp)		# save no fpregs
# PROLOGUE END ##########################################################

	mov.w		%cc,CMP2_CC(%a6)
	mov.l		0x8(%a6),%d2		# get regval

	mov.b		([0xc,%a6],0x0),%d0
	mov.b		([0xc,%a6],0x1),%d1

	extb.l		%d0			# sign extend lo bnd
	extb.l		%d1			# sign extend hi bnd

# operation is a data register compare.
# sign extend byte to long so we can do simple longword compares.
	extb.l		%d2			# sign extend data byte
	bra.w		l_cmp2_cmp		# go do the compare emulation

	global		_060LSP__cmp2_Dw_
_060LSP__cmp2_Dw_:

# PROLOGUE BEGIN ########################################################
	link.w		%a6,&-4
	movm.l		&0x3800,-(%sp)		# save d2-d4
#	fmovm.l		&0x0,-(%sp)		# save no fpregs
# PROLOGUE END ##########################################################

	mov.w		%cc,CMP2_CC(%a6)
	mov.l		0x8(%a6),%d2		# get regval

	mov.w		([0xc,%a6],0x0),%d0
	mov.w		([0xc,%a6],0x2),%d1

	ext.l		%d0			# sign extend lo bnd
	ext.l		%d1			# sign extend hi bnd

# operation is a data register compare.
# sign extend word to long so we can do simple longword compares.
	ext.l		%d2			# sign extend data word
	bra.w		l_cmp2_cmp		# go emulate compare

	global		_060LSP__cmp2_Dl_
_060LSP__cmp2_Dl_:

# PROLOGUE BEGIN ########################################################
	link.w		%a6,&-4
	movm.l		&0x3800,-(%sp)		# save d2-d4
#	fmovm.l		&0x0,-(%sp)		# save no fpregs
# PROLOGUE END ##########################################################

	mov.w		%cc,CMP2_CC(%a6)
	mov.l		0x8(%a6),%d2		# get regval

	mov.l		([0xc,%a6],0x0),%d0
	mov.l		([0xc,%a6],0x4),%d1

#
# To set the ccodes correctly:
#	(1) save 'Z' bit from (Rn - lo)
#	(2) save 'Z' and 'C' bits from ((hi - lo) - (Rn - lo))
#	(3) keep 'X', 'N', and 'V' from before instruction
#	(4) combine ccodes
#
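# Note (added, not part of the original Motorola sources): a rough C
# model of the recipe above; rn/lo/hi stand for the already-extended
# values in %d2/%d0/%d1:
#
#	uint32_t diff = rn - lo, range = hi - lo;
#	int Z = (rn == lo) || (diff == range);	/* Rn equals a bound   */
#	int C = (diff > range);			/* Rn is out of bounds */
#	/* new CCR = (old CCR & (X|N|V)) | (Z ? 0x04 : 0) | (C ? 0x01 : 0) */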
l_cmp2_cmp:
	sub.l		%d0,%d2			# (Rn - lo)
	mov.w		%cc,%d3			# fetch resulting ccodes
	andi.b		&0x4,%d3		# keep 'Z' bit
	sub.l		%d0,%d1			# (hi - lo)
	cmp.l		%d1,%d2			# ((hi - lo) - (Rn - lo))

	mov.w		%cc,%d4			# fetch resulting ccodes
	or.b		%d4,%d3			# combine w/ earlier ccodes
	andi.b		&0x5,%d3		# keep 'Z' and 'C'

	mov.w		CMP2_CC(%a6),%d4	# fetch old ccodes
	andi.b		&0x1a,%d4		# keep 'X','N','V' bits
	or.b		%d3,%d4			# insert new ccodes
	mov.w		%d4,%cc			# save new ccodes

# EPILOGUE BEGIN ########################################################
#	fmovm.l		(%sp)+,&0x0		# restore no fpregs
	movm.l		(%sp)+,&0x001c		# restore d2-d4
	unlk		%a6
# EPILOGUE END ##########################################################

	rts