Blame - arch/m68k/fpsp040/setox.S - kernel/msm-4.9

blob: f1acf7e36d6b826c77c167efdefaa5729b140977 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	\|
				2	\| setox.sa 3.1 12/10/90
				3	\|
				4	\| The entry point setox computes the exponential of a value.
				5	\| setoxd does the same except the input value is a denormalized
				6	\| number. setoxm1 computes exp(X)-1, and setoxm1d computes
				7	\| exp(X)-1 for denormalized X.
				8	\|
				9	\| INPUT
				10	\| -----
				11	\| Double-extended value in memory location pointed to by address
				12	\| register a0.
				13	\|
				14	\| OUTPUT
				15	\| ------
				16	\| exp(X) or exp(X)-1 returned in floating-point register fp0.
				17	\|
				18	\| ACCURACY and MONOTONICITY
				19	\| -------------------------
				20	\| The returned result is within 0.85 ulps in 64 significant bit, i.e.
				21	\| within 0.5001 ulp to 53 bits if the result is subsequently rounded
				22	\| to double precision. The result is provably monotonic in double
				23	\| precision.
				24	\|
				25	\| SPEED
				26	\| -----
				27	\| Two timings are measured, both in the copy-back mode. The
				28	\| first one is measured when the function is invoked the first time
				29	\| (so the instructions and data are not in cache), and the
				30	\| second one is measured when the function is reinvoked at the same
				31	\| input argument.
				32	\|
				33	\| The program setox takes approximately 210/190 cycles for input
				34	\| argument X whose magnitude is less than 16380 log2, which
				35	\| is the usual situation. For the less common arguments,
				36	\| depending on their values, the program may run faster or slower --
				37	\| but no worse than 10% slower even in the extreme cases.
				38	\|
Finn Thain	646819e	2008-11-18 20:45:22 +0100	[diff] [blame]	39	\| The program setoxm1 takes approximately ??? / ??? cycles for input
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	40	\| argument X, 0.25 <= \|X\| < 70log2. For \|X\| < 0.25, it takes
Finn Thain	646819e	2008-11-18 20:45:22 +0100	[diff] [blame]	41	\| approximately ??? / ??? cycles. For the less common arguments,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	42	\| depending on their values, the program may run faster or slower --
				43	\| but no worse than 10% slower even in the extreme cases.
				44	\|
				45	\| ALGORITHM and IMPLEMENTATION NOTES
				46	\| ----------------------------------
				47	\|
				48	\| setoxd
				49	\| ------
				50	\| Step 1. Set ans := 1.0
				51	\|
				52	\| Step 2. Return ans := ans + sign(X)*2^(-126). Exit.
				53	\| Notes: This will always generate one exception -- inexact.
				54	\|
				55	\|
				56	\| setox
				57	\| -----
				58	\|
				59	\| Step 1. Filter out extreme cases of input argument.
				60	\| 1.1 If \|X\| >= 2^(-65), go to Step 1.3.
				61	\| 1.2 Go to Step 7.
				62	\| 1.3 If \|X\| < 16380 log(2), go to Step 2.
				63	\| 1.4 Go to Step 8.
				64	\| Notes: The usual case should take the branches 1.1 -> 1.3 -> 2.
				65	\| To avoid the use of floating-point comparisons, a
				66	\| compact representation of \|X\| is used. This format is a
				67	\| 32-bit integer, the upper (more significant) 16 bits are
				68	\| the sign and biased exponent field of \|X\|; the lower 16
				69	\| bits are the 16 most significant fraction (including the
				70	\| explicit bit) bits of \|X\|. Consequently, the comparisons
				71	\| in Steps 1.1 and 1.3 can be performed by integer comparison.
				72	\| Note also that the constant 16380 log(2) used in Step 1.3
				73	\| is also in the compact form. Thus taking the branch
				74	\| to Step 2 guarantees \|X\| < 16380 log(2). There is no harm
				75	\| to have a small number of cases where \|X\| is less than,
				76	\| but close to, 16380 log(2) and the branch to Step 9 is
				77	\| taken.
				78	\|
				79	\| Step 2. Calculate N = round-to-nearest-int( X * 64/log2 ).
				80	\| 2.1 Set AdjFlag := 0 (indicates the branch 1.3 -> 2 was taken)
				81	\| 2.2 N := round-to-nearest-integer( X * 64/log2 ).
				82	\| 2.3 Calculate J = N mod 64; so J = 0,1,2,..., or 63.
				83	\| 2.4 Calculate M = (N - J)/64; so N = 64M + J.
				84	\| 2.5 Calculate the address of the stored value of 2^(J/64).
				85	\| 2.6 Create the value Scale = 2^M.
				86	\| Notes: The calculation in 2.2 is really performed by
				87	\|
				88	\| Z := X * constant
				89	\| N := round-to-nearest-integer(Z)
				90	\|
				91	\| where
				92	\|
				93	\| constant := single-precision( 64/log 2 ).
				94	\|
				95	\| Using a single-precision constant avoids memory access.
				96	\| Another effect of using a single-precision "constant" is
				97	\| that the calculated value Z is
				98	\|
				99	\| Z = X(64/log2)(1+eps), \|eps\| <= 2^(-24).
				100	\|
				101	\| This error has to be considered later in Steps 3 and 4.
				102	\|
				103	\| Step 3. Calculate X - N*log2/64.
				104	\| 3.1 R := X + N*L1, where L1 := single-precision(-log2/64).
				105	\| 3.2 R := R + N*L2, L2 := extended-precision(-log2/64 - L1).
				106	\| Notes: a) The way L1 and L2 are chosen ensures L1+L2 approximate
				107	\| the value -log2/64 to 88 bits of accuracy.
				108	\| b) N*L1 is exact because N is no longer than 22 bits and
				109	\| L1 is no longer than 24 bits.
				110	\| c) The calculation X+N*L1 is also exact due to cancellation.
				111	\| Thus, R is practically X+N(L1+L2) to full 64 bits.
				112	\| d) It is important to estimate how large can \|R\| be after
				113	\| Step 3.2.
				114	\|
				115	\| N = rnd-to-int( X*64/log2 (1+eps) ), \|eps\|<=2^(-24)
				116	\| X*64/log2 (1+eps) = N + f, \|f\| <= 0.5
				117	\| X64/log2 - N = f - epsX 64/log2
				118	\| X - Nlog2/64 = flog2/64 - eps*X
				119	\|
				120	\|
				121	\| Now \|X\| <= 16446 log2, thus
				122	\|
				123	\| \|X - Nlog2/64\| <= (0.5 + 16446/2^(18))log2/64
				124	\| <= 0.57 log2/64.
				125	\| This bound will be used in Step 4.
				126	\|
				127	\| Step 4. Approximate exp(R)-1 by a polynomial
				128	\| p = R + RR(A1 + R(A2 + R(A3 + R(A4 + RA5))))
				129	\| Notes: a) In order to reduce memory access, the coefficients are
				130	\| made as "short" as possible: A1 (which is 1/2), A4 and A5
				131	\| are single precision; A2 and A3 are double precision.
				132	\| b) Even with the restrictions above,
				133	\| \|p - (exp(R)-1)\| < 2^(-68.8) for all \|R\| <= 0.0062.
				134	\| Note that 0.0062 is slightly bigger than 0.57 log2/64.
				135	\| c) To fully utilize the pipeline, p is separated into
				136	\| two independent pieces of roughly equal complexities
				137	\| p = [ R + RS(A2 + S*A4) ] +
				138	\| [ S(A1 + S(A3 + S*A5)) ]
				139	\| where S = R*R.
				140	\|
				141	\| Step 5. Compute 2^(J/64)exp(R) = 2^(J/64)(1+p) by
				142	\| ans := T + ( T*p + t)
				143	\| where T and t are the stored values for 2^(J/64).
				144	\| Notes: 2^(J/64) is stored as T and t where T+t approximates
				145	\| 2^(J/64) to roughly 85 bits; T is in extended precision
				146	\| and t is in single precision. Note also that T is rounded
				147	\| to 62 bits so that the last two bits of T are zero. The
				148	\| reason for such a special form is that T-1, T-2, and T-8
				149	\| will all be exact --- a property that will give much
				150	\| more accurate computation of the function EXPM1.
				151	\|
				152	\| Step 6. Reconstruction of exp(X)
				153	\| exp(X) = 2^M * 2^(J/64) * exp(R).
				154	\| 6.1 If AdjFlag = 0, go to 6.3
				155	\| 6.2 ans := ans * AdjScale
				156	\| 6.3 Restore the user FPCR
				157	\| 6.4 Return ans := ans * Scale. Exit.
				158	\| Notes: If AdjFlag = 0, we have X = Mlog2 + Jlog2/64 + R,
				159	\| \|M\| <= 16380, and Scale = 2^M. Moreover, exp(X) will
				160	\| neither overflow nor underflow. If AdjFlag = 1, that
				161	\| means that
				162	\| X = (M1+M)log2 + Jlog2/64 + R, \|M1+M\| >= 16380.
				163	\| Hence, exp(X) may overflow or underflow or neither.
				164	\| When that is the case, AdjScale = 2^(M1) where M1 is
				165	\| approximately M. Thus 6.2 will never cause over/underflow.
				166	\| Possible exception in 6.4 is overflow or underflow.
				167	\| The inexact exception is not generated in 6.4. Although
				168	\| one can argue that the inexact flag should always be
				169	\| raised, to simulate that exception cost to much than the
				170	\| flag is worth in practical uses.
				171	\|
				172	\| Step 7. Return 1 + X.
				173	\| 7.1 ans := X
				174	\| 7.2 Restore user FPCR.
				175	\| 7.3 Return ans := 1 + ans. Exit
				176	\| Notes: For non-zero X, the inexact exception will always be
				177	\| raised by 7.3. That is the only exception raised by 7.3.
				178	\| Note also that we use the FMOVEM instruction to move X
				179	\| in Step 7.1 to avoid unnecessary trapping. (Although
				180	\| the FMOVEM may not seem relevant since X is normalized,
				181	\| the precaution will be useful in the library version of
				182	\| this code where the separate entry for denormalized inputs
				183	\| will be done away with.)
				184	\|
				185	\| Step 8. Handle exp(X) where \|X\| >= 16380log2.
				186	\| 8.1 If \|X\| > 16480 log2, go to Step 9.
				187	\| (mimic 2.2 - 2.6)
				188	\| 8.2 N := round-to-integer( X * 64/log2 )
				189	\| 8.3 Calculate J = N mod 64, J = 0,1,...,63
				190	\| 8.4 K := (N-J)/64, M1 := truncate(K/2), M = K-M1, AdjFlag := 1.
				191	\| 8.5 Calculate the address of the stored value 2^(J/64).
				192	\| 8.6 Create the values Scale = 2^M, AdjScale = 2^M1.
				193	\| 8.7 Go to Step 3.
				194	\| Notes: Refer to notes for 2.2 - 2.6.
				195	\|
				196	\| Step 9. Handle exp(X), \|X\| > 16480 log2.
				197	\| 9.1 If X < 0, go to 9.3
				198	\| 9.2 ans := Huge, go to 9.4
				199	\| 9.3 ans := Tiny.
				200	\| 9.4 Restore user FPCR.
				201	\| 9.5 Return ans := ans * ans. Exit.
				202	\| Notes: Exp(X) will surely overflow or underflow, depending on
				203	\| X's sign. "Huge" and "Tiny" are respectively large/tiny
				204	\| extended-precision numbers whose square over/underflow
				205	\| with an inexact result. Thus, 9.5 always raises the
				206	\| inexact together with either overflow or underflow.
				207	\|
				208	\|
				209	\| setoxm1d
				210	\| --------
				211	\|
				212	\| Step 1. Set ans := 0
				213	\|
				214	\| Step 2. Return ans := X + ans. Exit.
				215	\| Notes: This will return X with the appropriate rounding
				216	\| precision prescribed by the user FPCR.
				217	\|
				218	\| setoxm1
				219	\| -------
				220	\|
				221	\| Step 1. Check \|X\|
				222	\| 1.1 If \|X\| >= 1/4, go to Step 1.3.
				223	\| 1.2 Go to Step 7.
				224	\| 1.3 If \|X\| < 70 log(2), go to Step 2.
				225	\| 1.4 Go to Step 10.
				226	\| Notes: The usual case should take the branches 1.1 -> 1.3 -> 2.
				227	\| However, it is conceivable \|X\| can be small very often
				228	\| because EXPM1 is intended to evaluate exp(X)-1 accurately
				229	\| when \|X\| is small. For further details on the comparisons,
				230	\| see the notes on Step 1 of setox.
				231	\|
				232	\| Step 2. Calculate N = round-to-nearest-int( X * 64/log2 ).
				233	\| 2.1 N := round-to-nearest-integer( X * 64/log2 ).
				234	\| 2.2 Calculate J = N mod 64; so J = 0,1,2,..., or 63.
				235	\| 2.3 Calculate M = (N - J)/64; so N = 64M + J.
				236	\| 2.4 Calculate the address of the stored value of 2^(J/64).
				237	\| 2.5 Create the values Sc = 2^M and OnebySc := -2^(-M).
				238	\| Notes: See the notes on Step 2 of setox.
				239	\|
				240	\| Step 3. Calculate X - N*log2/64.
				241	\| 3.1 R := X + N*L1, where L1 := single-precision(-log2/64).
				242	\| 3.2 R := R + N*L2, L2 := extended-precision(-log2/64 - L1).
				243	\| Notes: Applying the analysis of Step 3 of setox in this case
				244	\| shows that \|R\| <= 0.0055 (note that \|X\| <= 70 log2 in
				245	\| this case).
				246	\|
				247	\| Step 4. Approximate exp(R)-1 by a polynomial
				248	\| p = R+RR(A1+R(A2+R(A3+R(A4+R(A5+R*A6)))))
				249	\| Notes: a) In order to reduce memory access, the coefficients are
				250	\| made as "short" as possible: A1 (which is 1/2), A5 and A6
				251	\| are single precision; A2, A3 and A4 are double precision.
				252	\| b) Even with the restriction above,
				253	\| \|p - (exp(R)-1)\| < \|R\| * 2^(-72.7)
				254	\| for all \|R\| <= 0.0055.
				255	\| c) To fully utilize the pipeline, p is separated into
				256	\| two independent pieces of roughly equal complexity
				257	\| p = [ RS(A2 + S(A4 + SA6)) ] +
				258	\| [ R + S(A1 + S(A3 + S*A5)) ]
				259	\| where S = R*R.
				260	\|
				261	\| Step 5. Compute 2^(J/64)*p by
				262	\| p := T*p
				263	\| where T and t are the stored values for 2^(J/64).
				264	\| Notes: 2^(J/64) is stored as T and t where T+t approximates
				265	\| 2^(J/64) to roughly 85 bits; T is in extended precision
				266	\| and t is in single precision. Note also that T is rounded
				267	\| to 62 bits so that the last two bits of T are zero. The
				268	\| reason for such a special form is that T-1, T-2, and T-8
				269	\| will all be exact --- a property that will be exploited
				270	\| in Step 6 below. The total relative error in p is no
				271	\| bigger than 2^(-67.7) compared to the final result.
				272	\|
				273	\| Step 6. Reconstruction of exp(X)-1
				274	\| exp(X)-1 = 2^M * ( 2^(J/64) + p - 2^(-M) ).
				275	\| 6.1 If M <= 63, go to Step 6.3.
				276	\| 6.2 ans := T + (p + (t + OnebySc)). Go to 6.6
				277	\| 6.3 If M >= -3, go to 6.5.
				278	\| 6.4 ans := (T + (p + t)) + OnebySc. Go to 6.6
				279	\| 6.5 ans := (T + OnebySc) + (p + t).
				280	\| 6.6 Restore user FPCR.
				281	\| 6.7 Return ans := Sc * ans. Exit.
				282	\| Notes: The various arrangements of the expressions give accurate
				283	\| evaluations.
				284	\|
				285	\| Step 7. exp(X)-1 for \|X\| < 1/4.
				286	\| 7.1 If \|X\| >= 2^(-65), go to Step 9.
				287	\| 7.2 Go to Step 8.
				288	\|
				289	\| Step 8. Calculate exp(X)-1, \|X\| < 2^(-65).
				290	\| 8.1 If \|X\| < 2^(-16312), goto 8.3
				291	\| 8.2 Restore FPCR; return ans := X - 2^(-16382). Exit.
				292	\| 8.3 X := X * 2^(140).
				293	\| 8.4 Restore FPCR; ans := ans - 2^(-16382).
				294	\| Return ans := ans*2^(140). Exit
				295	\| Notes: The idea is to return "X - tiny" under the user
				296	\| precision and rounding modes. To avoid unnecessary
				297	\| inefficiency, we stay away from denormalized numbers the
				298	\| best we can. For \|X\| >= 2^(-16312), the straightforward
				299	\| 8.2 generates the inexact exception as the case warrants.
				300	\|
				301	\| Step 9. Calculate exp(X)-1, \|X\| < 1/4, by a polynomial
				302	\| p = X + XX(B1 + X(B2 + ... + XB12))
				303	\| Notes: a) In order to reduce memory access, the coefficients are
				304	\| made as "short" as possible: B1 (which is 1/2), B9 to B12
				305	\| are single precision; B3 to B8 are double precision; and
				306	\| B2 is double extended.
				307	\| b) Even with the restriction above,
				308	\| \|p - (exp(X)-1)\| < \|X\| 2^(-70.6)
				309	\| for all \|X\| <= 0.251.
				310	\| Note that 0.251 is slightly bigger than 1/4.
				311	\| c) To fully preserve accuracy, the polynomial is computed
				312	\| as X + ( SB1 + Q ) where S = XX and
				313	\| Q = XS(B2 + X(B3 + ... + XB12))
				314	\| d) To fully utilize the pipeline, Q is separated into
				315	\| two independent pieces of roughly equal complexity
				316	\| Q = [ XS(B2 + S(B4 + ... + SB12)) ] +
				317	\| [ SS(B3 + S(B5 + ... + SB11)) ]
				318	\|
				319	\| Step 10. Calculate exp(X)-1 for \|X\| >= 70 log 2.
				320	\| 10.1 If X >= 70log2 , exp(X) - 1 = exp(X) for all practical
				321	\| purposes. Therefore, go to Step 1 of setox.
				322	\| 10.2 If X <= -70log2, exp(X) - 1 = -1 for all practical purposes.
				323	\| ans := -1
				324	\| Restore user FPCR
				325	\| Return ans := ans + 2^(-126). Exit.
				326	\| Notes: 10.2 will always create an inexact and return -1 + tiny
				327	\| in the user rounding precision and mode.
				328	\|
				329	\|
				330
				331	\| Copyright (C) Motorola, Inc. 1990
				332	\| All Rights Reserved
				333	\|
Matt Waddel	e00d82d	2006-02-11 17:55:48 -0800	[diff] [blame]	334	\| For details on the license for this file, please see the
				335	\| file, README, in this same directory.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	336
				337	\|setox idnt 2,1 \| Motorola 040 Floating Point Software Package
				338
				339	\|section 8
				340
				341	#include "fpsp.h"
				342
				343	L2: .long 0x3FDC0000,0x82E30865,0x4361C4C6,0x00000000
				344
				345	EXPA3: .long 0x3FA55555,0x55554431
				346	EXPA2: .long 0x3FC55555,0x55554018
				347
				348	HUGE: .long 0x7FFE0000,0xFFFFFFFF,0xFFFFFFFF,0x00000000
				349	TINY: .long 0x00010000,0xFFFFFFFF,0xFFFFFFFF,0x00000000
				350
				351	EM1A4: .long 0x3F811111,0x11174385
				352	EM1A3: .long 0x3FA55555,0x55554F5A
				353
				354	EM1A2: .long 0x3FC55555,0x55555555,0x00000000,0x00000000
				355
				356	EM1B8: .long 0x3EC71DE3,0xA5774682
				357	EM1B7: .long 0x3EFA01A0,0x19D7CB68
				358
				359	EM1B6: .long 0x3F2A01A0,0x1A019DF3
				360	EM1B5: .long 0x3F56C16C,0x16C170E2
				361
				362	EM1B4: .long 0x3F811111,0x11111111
				363	EM1B3: .long 0x3FA55555,0x55555555
				364
				365	EM1B2: .long 0x3FFC0000,0xAAAAAAAA,0xAAAAAAAB
				366	.long 0x00000000
				367
				368	TWO140: .long 0x48B00000,0x00000000
				369	TWON140: .long 0x37300000,0x00000000
				370
				371	EXPTBL:
				372	.long 0x3FFF0000,0x80000000,0x00000000,0x00000000
				373	.long 0x3FFF0000,0x8164D1F3,0xBC030774,0x9F841A9B
				374	.long 0x3FFF0000,0x82CD8698,0xAC2BA1D8,0x9FC1D5B9
				375	.long 0x3FFF0000,0x843A28C3,0xACDE4048,0xA0728369
				376	.long 0x3FFF0000,0x85AAC367,0xCC487B14,0x1FC5C95C
				377	.long 0x3FFF0000,0x871F6196,0x9E8D1010,0x1EE85C9F
				378	.long 0x3FFF0000,0x88980E80,0x92DA8528,0x9FA20729
				379	.long 0x3FFF0000,0x8A14D575,0x496EFD9C,0xA07BF9AF
				380	.long 0x3FFF0000,0x8B95C1E3,0xEA8BD6E8,0xA0020DCF
				381	.long 0x3FFF0000,0x8D1ADF5B,0x7E5BA9E4,0x205A63DA
				382	.long 0x3FFF0000,0x8EA4398B,0x45CD53C0,0x1EB70051
				383	.long 0x3FFF0000,0x9031DC43,0x1466B1DC,0x1F6EB029
				384	.long 0x3FFF0000,0x91C3D373,0xAB11C338,0xA0781494
				385	.long 0x3FFF0000,0x935A2B2F,0x13E6E92C,0x9EB319B0
				386	.long 0x3FFF0000,0x94F4EFA8,0xFEF70960,0x2017457D
				387	.long 0x3FFF0000,0x96942D37,0x20185A00,0x1F11D537
				388	.long 0x3FFF0000,0x9837F051,0x8DB8A970,0x9FB952DD
				389	.long 0x3FFF0000,0x99E04593,0x20B7FA64,0x1FE43087
				390	.long 0x3FFF0000,0x9B8D39B9,0xD54E5538,0x1FA2A818
				391	.long 0x3FFF0000,0x9D3ED9A7,0x2CFFB750,0x1FDE494D
				392	.long 0x3FFF0000,0x9EF53260,0x91A111AC,0x20504890
				393	.long 0x3FFF0000,0xA0B0510F,0xB9714FC4,0xA073691C
				394	.long 0x3FFF0000,0xA2704303,0x0C496818,0x1F9B7A05
				395	.long 0x3FFF0000,0xA43515AE,0x09E680A0,0xA0797126
				396	.long 0x3FFF0000,0xA5FED6A9,0xB15138EC,0xA071A140
				397	.long 0x3FFF0000,0xA7CD93B4,0xE9653568,0x204F62DA
				398	.long 0x3FFF0000,0xA9A15AB4,0xEA7C0EF8,0x1F283C4A
				399	.long 0x3FFF0000,0xAB7A39B5,0xA93ED338,0x9F9A7FDC
				400	.long 0x3FFF0000,0xAD583EEA,0x42A14AC8,0xA05B3FAC
				401	.long 0x3FFF0000,0xAF3B78AD,0x690A4374,0x1FDF2610
				402	.long 0x3FFF0000,0xB123F581,0xD2AC2590,0x9F705F90
				403	.long 0x3FFF0000,0xB311C412,0xA9112488,0x201F678A
				404	.long 0x3FFF0000,0xB504F333,0xF9DE6484,0x1F32FB13
				405	.long 0x3FFF0000,0xB6FD91E3,0x28D17790,0x20038B30
				406	.long 0x3FFF0000,0xB8FBAF47,0x62FB9EE8,0x200DC3CC
				407	.long 0x3FFF0000,0xBAFF5AB2,0x133E45FC,0x9F8B2AE6
				408	.long 0x3FFF0000,0xBD08A39F,0x580C36C0,0xA02BBF70
				409	.long 0x3FFF0000,0xBF1799B6,0x7A731084,0xA00BF518
				410	.long 0x3FFF0000,0xC12C4CCA,0x66709458,0xA041DD41
				411	.long 0x3FFF0000,0xC346CCDA,0x24976408,0x9FDF137B
				412	.long 0x3FFF0000,0xC5672A11,0x5506DADC,0x201F1568
				413	.long 0x3FFF0000,0xC78D74C8,0xABB9B15C,0x1FC13A2E
				414	.long 0x3FFF0000,0xC9B9BD86,0x6E2F27A4,0xA03F8F03
				415	.long 0x3FFF0000,0xCBEC14FE,0xF2727C5C,0x1FF4907D
				416	.long 0x3FFF0000,0xCE248C15,0x1F8480E4,0x9E6E53E4
				417	.long 0x3FFF0000,0xD06333DA,0xEF2B2594,0x1FD6D45C
				418	.long 0x3FFF0000,0xD2A81D91,0xF12AE45C,0xA076EDB9
				419	.long 0x3FFF0000,0xD4F35AAB,0xCFEDFA20,0x9FA6DE21
				420	.long 0x3FFF0000,0xD744FCCA,0xD69D6AF4,0x1EE69A2F
				421	.long 0x3FFF0000,0xD99D15C2,0x78AFD7B4,0x207F439F
				422	.long 0x3FFF0000,0xDBFBB797,0xDAF23754,0x201EC207
				423	.long 0x3FFF0000,0xDE60F482,0x5E0E9124,0x9E8BE175
				424	.long 0x3FFF0000,0xE0CCDEEC,0x2A94E110,0x20032C4B
				425	.long 0x3FFF0000,0xE33F8972,0xBE8A5A50,0x2004DFF5
				426	.long 0x3FFF0000,0xE5B906E7,0x7C8348A8,0x1E72F47A
				427	.long 0x3FFF0000,0xE8396A50,0x3C4BDC68,0x1F722F22
				428	.long 0x3FFF0000,0xEAC0C6E7,0xDD243930,0xA017E945
				429	.long 0x3FFF0000,0xED4F301E,0xD9942B84,0x1F401A5B
				430	.long 0x3FFF0000,0xEFE4B99B,0xDCDAF5CC,0x9FB9A9E3
				431	.long 0x3FFF0000,0xF281773C,0x59FFB138,0x20744C05
				432	.long 0x3FFF0000,0xF5257D15,0x2486CC2C,0x1F773A19
				433	.long 0x3FFF0000,0xF7D0DF73,0x0AD13BB8,0x1FFE90D5
				434	.long 0x3FFF0000,0xFA83B2DB,0x722A033C,0xA041ED22
				435	.long 0x3FFF0000,0xFD3E0C0C,0xF486C174,0x1F853F3A
				436
				437	.set ADJFLAG,L_SCR2
				438	.set SCALE,FP_SCR1
				439	.set ADJSCALE,FP_SCR2
				440	.set SC,FP_SCR3
				441	.set ONEBYSC,FP_SCR4
				442
				443	\| xref t_frcinx
				444	\|xref t_extdnrm
				445	\|xref t_unfl
				446	\|xref t_ovfl
				447
				448	.global setoxd
				449	setoxd:
				450	\|--entry point for EXP(X), X is denormalized
				451	movel (%a0),%d0
				452	andil #0x80000000,%d0
				453	oril #0x00800000,%d0 \| ...sign(X)*2^(-126)
				454	movel %d0,-(%sp)
				455	fmoves #0x3F800000,%fp0
				456	fmovel %d1,%fpcr
				457	fadds (%sp)+,%fp0
				458	bra t_frcinx
				459
				460	.global setox
				461	setox:
				462	\|--entry point for EXP(X), here X is finite, non-zero, and not NaN's
				463
				464	\|--Step 1.
				465	movel (%a0),%d0 \| ...load part of input X
				466	andil #0x7FFF0000,%d0 \| ...biased expo. of X
				467	cmpil #0x3FBE0000,%d0 \| ...2^(-65)
				468	bges EXPC1 \| ...normal case
				469	bra EXPSM
				470
				471	EXPC1:
				472	\|--The case \|X\| >= 2^(-65)
				473	movew 4(%a0),%d0 \| ...expo. and partial sig. of \|X\|
				474	cmpil #0x400CB167,%d0 \| ...16380 log2 trunc. 16 bits
				475	blts EXPMAIN \| ...normal case
				476	bra EXPBIG
				477
				478	EXPMAIN:
				479	\|--Step 2.
				480	\|--This is the normal branch: 2^(-65) <= \|X\| < 16380 log2.
				481	fmovex (%a0),%fp0 \| ...load input from (a0)
				482
				483	fmovex %fp0,%fp1
				484	fmuls #0x42B8AA3B,%fp0 \| ...64/log2 * X
				485	fmovemx %fp2-%fp2/%fp3,-(%a7) \| ...save fp2
				486	movel #0,ADJFLAG(%a6)
				487	fmovel %fp0,%d0 \| ...N = int( X * 64/log2 )
				488	lea EXPTBL,%a1
				489	fmovel %d0,%fp0 \| ...convert to floating-format
				490
				491	movel %d0,L_SCR1(%a6) \| ...save N temporarily
				492	andil #0x3F,%d0 \| ...D0 is J = N mod 64
				493	lsll #4,%d0
				494	addal %d0,%a1 \| ...address of 2^(J/64)
				495	movel L_SCR1(%a6),%d0
				496	asrl #6,%d0 \| ...D0 is M
				497	addiw #0x3FFF,%d0 \| ...biased expo. of 2^(M)
				498	movew L2,L_SCR1(%a6) \| ...prefetch L2, no need in CB
				499
				500	EXPCONT1:
				501	\|--Step 3.
				502	\|--fp1,fp2 saved on the stack. fp0 is N, fp1 is X,
				503	\|--a0 points to 2^(J/64), D0 is biased expo. of 2^(M)
				504	fmovex %fp0,%fp2
				505	fmuls #0xBC317218,%fp0 \| ...N * L1, L1 = lead(-log2/64)
				506	fmulx L2,%fp2 \| ...N * L2, L1+L2 = -log2/64
				507	faddx %fp1,%fp0 \| ...X + N*L1
				508	faddx %fp2,%fp0 \| ...fp0 is R, reduced arg.
				509	\| MOVE.W #$3FA5,EXPA3 ...load EXPA3 in cache
				510
				511	\|--Step 4.
				512	\|--WE NOW COMPUTE EXP(R)-1 BY A POLYNOMIAL
				513	\|-- R + RR(A1 + R(A2 + R(A3 + R(A4 + RA5))))
				514	\|--TO FULLY UTILIZE THE PIPELINE, WE COMPUTE S = R*R
				515	\|--[R+RS(A2+SA4)] + [S(A1+S(A3+SA5))]
				516
				517	fmovex %fp0,%fp1
				518	fmulx %fp1,%fp1 \| ...fp1 IS S = R*R
				519
				520	fmoves #0x3AB60B70,%fp2 \| ...fp2 IS A5
				521	\| MOVE.W #0,2(%a1) ...load 2^(J/64) in cache
				522
				523	fmulx %fp1,%fp2 \| ...fp2 IS S*A5
				524	fmovex %fp1,%fp3
				525	fmuls #0x3C088895,%fp3 \| ...fp3 IS S*A4
				526
				527	faddd EXPA3,%fp2 \| ...fp2 IS A3+S*A5
				528	faddd EXPA2,%fp3 \| ...fp3 IS A2+S*A4
				529
				530	fmulx %fp1,%fp2 \| ...fp2 IS S(A3+SA5)
				531	movew %d0,SCALE(%a6) \| ...SCALE is 2^(M) in extended
				532	clrw SCALE+2(%a6)
				533	movel #0x80000000,SCALE+4(%a6)
				534	clrl SCALE+8(%a6)
				535
				536	fmulx %fp1,%fp3 \| ...fp3 IS S(A2+SA4)
				537
				538	fadds #0x3F000000,%fp2 \| ...fp2 IS A1+S(A3+SA5)
				539	fmulx %fp0,%fp3 \| ...fp3 IS RS(A2+S*A4)
				540
				541	fmulx %fp1,%fp2 \| ...fp2 IS S(A1+S(A3+S*A5))
				542	faddx %fp3,%fp0 \| ...fp0 IS R+RS(A2+S*A4),
				543	\| ...fp3 released
				544
				545	fmovex (%a1)+,%fp1 \| ...fp1 is lead. pt. of 2^(J/64)
				546	faddx %fp2,%fp0 \| ...fp0 is EXP(R) - 1
				547	\| ...fp2 released
				548
				549	\|--Step 5
				550	\|--final reconstruction process
				551	\|--EXP(X) = 2^M * ( 2^(J/64) + 2^(J/64)*(EXP(R)-1) )
				552
				553	fmulx %fp1,%fp0 \| ...2^(J/64)*(Exp(R)-1)
				554	fmovemx (%a7)+,%fp2-%fp2/%fp3 \| ...fp2 restored
				555	fadds (%a1),%fp0 \| ...accurate 2^(J/64)
				556
				557	faddx %fp1,%fp0 \| ...2^(J/64) + 2^(J/64)*...
				558	movel ADJFLAG(%a6),%d0
				559
				560	\|--Step 6
				561	tstl %d0
				562	beqs NORMAL
				563	ADJUST:
				564	fmulx ADJSCALE(%a6),%fp0
				565	NORMAL:
				566	fmovel %d1,%FPCR \| ...restore user FPCR
				567	fmulx SCALE(%a6),%fp0 \| ...multiply 2^(M)
				568	bra t_frcinx
				569
				570	EXPSM:
				571	\|--Step 7
				572	fmovemx (%a0),%fp0-%fp0 \| ...in case X is denormalized
				573	fmovel %d1,%FPCR
				574	fadds #0x3F800000,%fp0 \| ...1+X in user mode
				575	bra t_frcinx
				576
				577	EXPBIG:
				578	\|--Step 8
				579	cmpil #0x400CB27C,%d0 \| ...16480 log2
				580	bgts EXP2BIG
				581	\|--Steps 8.2 -- 8.6
				582	fmovex (%a0),%fp0 \| ...load input from (a0)
				583
				584	fmovex %fp0,%fp1
				585	fmuls #0x42B8AA3B,%fp0 \| ...64/log2 * X
				586	fmovemx %fp2-%fp2/%fp3,-(%a7) \| ...save fp2
				587	movel #1,ADJFLAG(%a6)
				588	fmovel %fp0,%d0 \| ...N = int( X * 64/log2 )
				589	lea EXPTBL,%a1
				590	fmovel %d0,%fp0 \| ...convert to floating-format
				591	movel %d0,L_SCR1(%a6) \| ...save N temporarily
				592	andil #0x3F,%d0 \| ...D0 is J = N mod 64
				593	lsll #4,%d0
				594	addal %d0,%a1 \| ...address of 2^(J/64)
				595	movel L_SCR1(%a6),%d0
				596	asrl #6,%d0 \| ...D0 is K
				597	movel %d0,L_SCR1(%a6) \| ...save K temporarily
				598	asrl #1,%d0 \| ...D0 is M1
				599	subl %d0,L_SCR1(%a6) \| ...a1 is M
				600	addiw #0x3FFF,%d0 \| ...biased expo. of 2^(M1)
				601	movew %d0,ADJSCALE(%a6) \| ...ADJSCALE := 2^(M1)
				602	clrw ADJSCALE+2(%a6)
				603	movel #0x80000000,ADJSCALE+4(%a6)
				604	clrl ADJSCALE+8(%a6)
				605	movel L_SCR1(%a6),%d0 \| ...D0 is M
				606	addiw #0x3FFF,%d0 \| ...biased expo. of 2^(M)
				607	bra EXPCONT1 \| ...go back to Step 3
				608
				609	EXP2BIG:
				610	\|--Step 9
				611	fmovel %d1,%FPCR
				612	movel (%a0),%d0
				613	bclrb #sign_bit,(%a0) \| ...setox always returns positive
				614	cmpil #0,%d0
				615	blt t_unfl
				616	bra t_ovfl
				617
				618	.global setoxm1d
				619	setoxm1d:
				620	\|--entry point for EXPM1(X), here X is denormalized
				621	\|--Step 0.
				622	bra t_extdnrm
				623
				624
				625	.global setoxm1
				626	setoxm1:
				627	\|--entry point for EXPM1(X), here X is finite, non-zero, non-NaN
				628
				629	\|--Step 1.
				630	\|--Step 1.1
				631	movel (%a0),%d0 \| ...load part of input X
				632	andil #0x7FFF0000,%d0 \| ...biased expo. of X
				633	cmpil #0x3FFD0000,%d0 \| ...1/4
				634	bges EM1CON1 \| ...\|X\| >= 1/4
				635	bra EM1SM
				636
				637	EM1CON1:
				638	\|--Step 1.3
				639	\|--The case \|X\| >= 1/4
				640	movew 4(%a0),%d0 \| ...expo. and partial sig. of \|X\|
				641	cmpil #0x4004C215,%d0 \| ...70log2 rounded up to 16 bits
				642	bles EM1MAIN \| ...1/4 <= \|X\| <= 70log2
				643	bra EM1BIG
				644
				645	EM1MAIN:
				646	\|--Step 2.
				647	\|--This is the case: 1/4 <= \|X\| <= 70 log2.
				648	fmovex (%a0),%fp0 \| ...load input from (a0)
				649
				650	fmovex %fp0,%fp1
				651	fmuls #0x42B8AA3B,%fp0 \| ...64/log2 * X
				652	fmovemx %fp2-%fp2/%fp3,-(%a7) \| ...save fp2
				653	\| MOVE.W #$3F81,EM1A4 ...prefetch in CB mode
				654	fmovel %fp0,%d0 \| ...N = int( X * 64/log2 )
				655	lea EXPTBL,%a1
				656	fmovel %d0,%fp0 \| ...convert to floating-format
				657
				658	movel %d0,L_SCR1(%a6) \| ...save N temporarily
				659	andil #0x3F,%d0 \| ...D0 is J = N mod 64
				660	lsll #4,%d0
				661	addal %d0,%a1 \| ...address of 2^(J/64)
				662	movel L_SCR1(%a6),%d0
				663	asrl #6,%d0 \| ...D0 is M
				664	movel %d0,L_SCR1(%a6) \| ...save a copy of M
				665	\| MOVE.W #$3FDC,L2 ...prefetch L2 in CB mode
				666
				667	\|--Step 3.
				668	\|--fp1,fp2 saved on the stack. fp0 is N, fp1 is X,
				669	\|--a0 points to 2^(J/64), D0 and a1 both contain M
				670	fmovex %fp0,%fp2
				671	fmuls #0xBC317218,%fp0 \| ...N * L1, L1 = lead(-log2/64)
				672	fmulx L2,%fp2 \| ...N * L2, L1+L2 = -log2/64
				673	faddx %fp1,%fp0 \| ...X + N*L1
				674	faddx %fp2,%fp0 \| ...fp0 is R, reduced arg.
				675	\| MOVE.W #$3FC5,EM1A2 ...load EM1A2 in cache
				676	addiw #0x3FFF,%d0 \| ...D0 is biased expo. of 2^M
				677
				678	\|--Step 4.
				679	\|--WE NOW COMPUTE EXP(R)-1 BY A POLYNOMIAL
				680	\|-- R + RR(A1 + R(A2 + R(A3 + R(A4 + R(A5 + R*A6)))))
				681	\|--TO FULLY UTILIZE THE PIPELINE, WE COMPUTE S = R*R
				682	\|--[RS(A2+S(A4+SA6))] + [R+S(A1+S(A3+S*A5))]
				683
				684	fmovex %fp0,%fp1
				685	fmulx %fp1,%fp1 \| ...fp1 IS S = R*R
				686
				687	fmoves #0x3950097B,%fp2 \| ...fp2 IS a6
				688	\| MOVE.W #0,2(%a1) ...load 2^(J/64) in cache
				689
				690	fmulx %fp1,%fp2 \| ...fp2 IS S*A6
				691	fmovex %fp1,%fp3
				692	fmuls #0x3AB60B6A,%fp3 \| ...fp3 IS S*A5
				693
				694	faddd EM1A4,%fp2 \| ...fp2 IS A4+S*A6
				695	faddd EM1A3,%fp3 \| ...fp3 IS A3+S*A5
				696	movew %d0,SC(%a6) \| ...SC is 2^(M) in extended
				697	clrw SC+2(%a6)
				698	movel #0x80000000,SC+4(%a6)
				699	clrl SC+8(%a6)
				700
				701	fmulx %fp1,%fp2 \| ...fp2 IS S(A4+SA6)
				702	movel L_SCR1(%a6),%d0 \| ...D0 is M
				703	negw %d0 \| ...D0 is -M
				704	fmulx %fp1,%fp3 \| ...fp3 IS S(A3+SA5)
				705	addiw #0x3FFF,%d0 \| ...biased expo. of 2^(-M)
				706	faddd EM1A2,%fp2 \| ...fp2 IS A2+S(A4+SA6)
				707	fadds #0x3F000000,%fp3 \| ...fp3 IS A1+S(A3+SA5)
				708
				709	fmulx %fp1,%fp2 \| ...fp2 IS S(A2+S(A4+S*A6))
				710	oriw #0x8000,%d0 \| ...signed/expo. of -2^(-M)
				711	movew %d0,ONEBYSC(%a6) \| ...OnebySc is -2^(-M)
				712	clrw ONEBYSC+2(%a6)
				713	movel #0x80000000,ONEBYSC+4(%a6)
				714	clrl ONEBYSC+8(%a6)
				715	fmulx %fp3,%fp1 \| ...fp1 IS S(A1+S(A3+S*A5))
				716	\| ...fp3 released
				717
				718	fmulx %fp0,%fp2 \| ...fp2 IS RS(A2+S(A4+SA6))
				719	faddx %fp1,%fp0 \| ...fp0 IS R+S(A1+S(A3+S*A5))
				720	\| ...fp1 released
				721
				722	faddx %fp2,%fp0 \| ...fp0 IS EXP(R)-1
				723	\| ...fp2 released
				724	fmovemx (%a7)+,%fp2-%fp2/%fp3 \| ...fp2 restored
				725
				726	\|--Step 5
				727	\|--Compute 2^(J/64)*p
				728
				729	fmulx (%a1),%fp0 \| ...2^(J/64)*(Exp(R)-1)
				730
				731	\|--Step 6
				732	\|--Step 6.1
				733	movel L_SCR1(%a6),%d0 \| ...retrieve M
				734	cmpil #63,%d0
				735	bles MLE63
				736	\|--Step 6.2 M >= 64
				737	fmoves 12(%a1),%fp1 \| ...fp1 is t
				738	faddx ONEBYSC(%a6),%fp1 \| ...fp1 is t+OnebySc
				739	faddx %fp1,%fp0 \| ...p+(t+OnebySc), fp1 released
				740	faddx (%a1),%fp0 \| ...T+(p+(t+OnebySc))
				741	bras EM1SCALE
				742	MLE63:
				743	\|--Step 6.3 M <= 63
				744	cmpil #-3,%d0
				745	bges MGEN3
				746	MLTN3:
				747	\|--Step 6.4 M <= -4
				748	fadds 12(%a1),%fp0 \| ...p+t
				749	faddx (%a1),%fp0 \| ...T+(p+t)
				750	faddx ONEBYSC(%a6),%fp0 \| ...OnebySc + (T+(p+t))
				751	bras EM1SCALE
				752	MGEN3:
				753	\|--Step 6.5 -3 <= M <= 63
				754	fmovex (%a1)+,%fp1 \| ...fp1 is T
				755	fadds (%a1),%fp0 \| ...fp0 is p+t
				756	faddx ONEBYSC(%a6),%fp1 \| ...fp1 is T+OnebySc
				757	faddx %fp1,%fp0 \| ...(T+OnebySc)+(p+t)
				758
				759	EM1SCALE:
				760	\|--Step 6.6
				761	fmovel %d1,%FPCR
				762	fmulx SC(%a6),%fp0
				763
				764	bra t_frcinx
				765
				766	EM1SM:
				767	\|--Step 7 \|X\| < 1/4.
				768	cmpil #0x3FBE0000,%d0 \| ...2^(-65)
				769	bges EM1POLY
				770
				771	EM1TINY:
				772	\|--Step 8 \|X\| < 2^(-65)
				773	cmpil #0x00330000,%d0 \| ...2^(-16312)
				774	blts EM12TINY
				775	\|--Step 8.2
				776	movel #0x80010000,SC(%a6) \| ...SC is -2^(-16382)
				777	movel #0x80000000,SC+4(%a6)
				778	clrl SC+8(%a6)
				779	fmovex (%a0),%fp0
				780	fmovel %d1,%FPCR
				781	faddx SC(%a6),%fp0
				782
				783	bra t_frcinx
				784
				785	EM12TINY:
				786	\|--Step 8.3
				787	fmovex (%a0),%fp0
				788	fmuld TWO140,%fp0
				789	movel #0x80010000,SC(%a6)
				790	movel #0x80000000,SC+4(%a6)
				791	clrl SC+8(%a6)
				792	faddx SC(%a6),%fp0
				793	fmovel %d1,%FPCR
				794	fmuld TWON140,%fp0
				795
				796	bra t_frcinx
				797
				798	EM1POLY:
				799	\|--Step 9 exp(X)-1 by a simple polynomial
				800	fmovex (%a0),%fp0 \| ...fp0 is X
				801	fmulx %fp0,%fp0 \| ...fp0 is S := X*X
				802	fmovemx %fp2-%fp2/%fp3,-(%a7) \| ...save fp2
				803	fmoves #0x2F30CAA8,%fp1 \| ...fp1 is B12
				804	fmulx %fp0,%fp1 \| ...fp1 is S*B12
				805	fmoves #0x310F8290,%fp2 \| ...fp2 is B11
				806	fadds #0x32D73220,%fp1 \| ...fp1 is B10+S*B12
				807
				808	fmulx %fp0,%fp2 \| ...fp2 is S*B11
				809	fmulx %fp0,%fp1 \| ...fp1 is S*(B10 + ...
				810
				811	fadds #0x3493F281,%fp2 \| ...fp2 is B9+S*...
				812	faddd EM1B8,%fp1 \| ...fp1 is B8+S*...
				813
				814	fmulx %fp0,%fp2 \| ...fp2 is S*(B9+...
				815	fmulx %fp0,%fp1 \| ...fp1 is S*(B8+...
				816
				817	faddd EM1B7,%fp2 \| ...fp2 is B7+S*...
				818	faddd EM1B6,%fp1 \| ...fp1 is B6+S*...
				819
				820	fmulx %fp0,%fp2 \| ...fp2 is S*(B7+...
				821	fmulx %fp0,%fp1 \| ...fp1 is S*(B6+...
				822
				823	faddd EM1B5,%fp2 \| ...fp2 is B5+S*...
				824	faddd EM1B4,%fp1 \| ...fp1 is B4+S*...
				825
				826	fmulx %fp0,%fp2 \| ...fp2 is S*(B5+...
				827	fmulx %fp0,%fp1 \| ...fp1 is S*(B4+...
				828
				829	faddd EM1B3,%fp2 \| ...fp2 is B3+S*...
				830	faddx EM1B2,%fp1 \| ...fp1 is B2+S*...
				831
				832	fmulx %fp0,%fp2 \| ...fp2 is S*(B3+...
				833	fmulx %fp0,%fp1 \| ...fp1 is S*(B2+...
				834
				835	fmulx %fp0,%fp2 \| ...fp2 is SS(B3+...)
				836	fmulx (%a0),%fp1 \| ...fp1 is XS(B2...
				837
				838	fmuls #0x3F000000,%fp0 \| ...fp0 is S*B1
				839	faddx %fp2,%fp1 \| ...fp1 is Q
				840	\| ...fp2 released
				841
				842	fmovemx (%a7)+,%fp2-%fp2/%fp3 \| ...fp2 restored
				843
				844	faddx %fp1,%fp0 \| ...fp0 is S*B1+Q
				845	\| ...fp1 released
				846
				847	fmovel %d1,%FPCR
				848	faddx (%a0),%fp0
				849
				850	bra t_frcinx
				851
				852	EM1BIG:
				853	\|--Step 10 \|X\| > 70 log2
				854	movel (%a0),%d0
				855	cmpil #0,%d0
				856	bgt EXPC1
				857	\|--Step 10.2
				858	fmoves #0xBF800000,%fp0 \| ...fp0 is -1
				859	fmovel %d1,%FPCR
				860	fadds #0x00800000,%fp0 \| ...-1 + 2^(-126)
				861
				862	bra t_frcinx
				863
				864	\|end