/*
 * Copyright 2008 Analog Devices Inc.
 *
 * Licensed under the Clear BSD license or the GPL-2 (or later)
 */

.global ___muldi3;
.type ___muldi3, STT_FUNC;

#ifdef CONFIG_ARITHMETIC_OPS_L1
.section .l1.text
#else
.text
#endif

/* Ask for alignment only once the section has been selected: .align acts
   on the section that is current when it is seen, so placing it ahead of
   the #ifdef would never reach .l1.text. */
.align 2

/*
 * ___muldi3 - multiply two 64-bit integers, returning the low 64 bits
 * of the product.
 *
 * In:   R1:R0      first operand (R1 = high word, R0 = low word)
 *       R2         low word of the second operand
 *       [SP + 12]  high word of the second operand (loaded into R3)
 * Out:  R1:R0      low 64 bits of the product
 * Uses: R3, A0 and A1 are overwritten.  R4 is used as a temporary; it is
 *       stored to [SP] on entry and reloaded from there before RTS.
 */

/*
	   R1:R0 * R3:R2
	 = R1.h:R1.l:R0.h:R0.l * R3.h:R3.l:R2.h:R2.l
[X]	 = (R1.h * R3.h) * 2^96
[X]	   + (R1.h * R3.l + R1.l * R3.h) * 2^80
[X]	   + (R1.h * R2.h + R1.l * R3.l + R3.h * R0.h) * 2^64
[E1]	   + (R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h) * 2^48
[E2]	   + (R1.l * R2.l + R3.l * R0.l + R0.h * R2.h) * 2^32
[E3]	   + (R0.l * R2.h + R2.l * R0.h) * 2^16
[E4]	   + (R0.l * R2.l)

   We can discard the first three lines marked "X" since we produce
   only a 64 bit result.  So, we need ten 16-bit multiplies.

   Individual mul-acc results:
[E1]	 = R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h
[E2]	 = R1.l * R2.l + R3.l * R0.l + R0.h * R2.h
[E3]	 = R0.l * R2.h + R2.l * R0.h
[E4]	 = R0.l * R2.l

   We also need to add high parts from lower-level results to higher ones:
   E[n]c = E[n] + (E[n+1]c >> 16), where E4c := E4

   One interesting property is that all parts of the result that depend
   on the sign of the multiplication are discarded.  Those would be the
   multiplications involving R1.h and R3.h, but only the top 16 bits of
   each 32-bit product depend on the sign.  R1.h and R3.h only occur in
   E1, and E1 is scaled by 2^48, so only its low 16 bits reach the 64-bit
   result: the top half of these products is cut off.
   So, we can just use FU mode for all of the 16-bit multiplies, and
   ignore questions of when to use mixed mode.  */

___muldi3:
	/* [SP] technically is part of the caller's frame, but we can
	   use it as scratch space.  */

	/* E1: four cross products, two per accumulator.  The parallel slots
	   fetch the high word of the second operand and park R4.  */
	A0 = R2.H * R1.L, A1 = R2.L * R1.H (FU) || R3 = [SP + 12];	/* E1 */
	A0 += R3.H * R0.L, A1 += R3.L * R0.H (FU) || [SP] = R4;	/* E1 */
	A0 += A1;				/* E1 */
	R4 = A0.w;				/* only R4.l (low 16 bits of E1) is used below */

	/* E2: first two of its three products.  This is the last read of
	   R3 as an operand, so R3 can be reused afterwards.  */
	A0 = R0.l * R3.l (FU);			/* E2 */
	A0 += R2.l * R1.l (FU);			/* E2 */

	/* E4, then E3c = E3 + (E4 >> 16), built in A1.  */
	A1 = R2.L * R0.L (FU);			/* E4 */
	R3 = A1.w;				/* keep E4; R3.l is the lowest result halfword */
	A1 = A1 >> 16;				/* E3c: start from the high part of E4 */
	A0 += R2.H * R0.H, A1 += R2.L * R0.H (FU);	/* E2, E3c */
	A1 += R0.L * R2.H (FU);			/* E3c */
	R0 = A1.w;				/* R0.l = low 16 bits of E3c; operand R0 is dead now */

	/* E2c = E2 + (E3c >> 16).  The shift is done on the accumulator, not
	   on the 32-bit copy just taken into R0.  */
	A1 = A1 >> 16;				/* E2c */
	A0 += A1;				/* E2c */
	R1 = A0.w;

	/* low(result) = low(E3c):low(E4) */
	R0 = PACK (R0.l, R3.l);
	/* high(result) = E2c + (E1 << 16); the parallel slot restores R4 */
	R1.h = R1.h + R4.l (NS) || R4 = [SP];
	RTS;

.size ___muldi3, .-___muldi3