blob: e038dd3be9b3c63e019a5be3f77de94f25f4c949 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2===============================================================================
3
4This C source file is part of the SoftFloat IEC/IEEE Floating-point
5Arithmetic Package, Release 2.
6
7Written by John R. Hauser. This work was made possible in part by the
8International Computer Science Institute, located at Suite 600, 1947 Center
9Street, Berkeley, California 94704. Funding was partially provided by the
10National Science Foundation under grant MIP-9311980. The original version
11of this code was written as part of a project to build a fixed-point vector
12processor in collaboration with the University of California at Berkeley,
13overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14is available through the web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
15arithmetic/softfloat.html'.
16
17THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
18has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
19TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
20PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
21AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
22
23Derivative works are acceptable, even for commercial purposes, so long as
24(1) they include prominent notice that the work is derivative, and (2) they
25include prominent notice akin to these three paragraphs for those parts of
26this code that are retained.
27
28===============================================================================
29*/
30
Nicolas Pitrec1241c4c2005-06-23 21:56:46 +010031#include <asm/div64.h>
32
Linus Torvalds1da177e2005-04-16 15:20:36 -070033#include "fpa11.h"
34//#include "milieu.h"
35//#include "softfloat.h"
36
37/*
38-------------------------------------------------------------------------------
39Floating-point rounding mode, extended double-precision rounding precision,
40and exception flags.
41-------------------------------------------------------------------------------
42*/
43int8 float_rounding_mode = float_round_nearest_even;
44int8 floatx80_rounding_precision = 80;
45int8 float_exception_flags;
46
47/*
48-------------------------------------------------------------------------------
49Primitive arithmetic functions, including multi-word arithmetic, and
50division and square root approximations. (Can be specialized to target if
51desired.)
52-------------------------------------------------------------------------------
53*/
54#include "softfloat-macros"
55
56/*
57-------------------------------------------------------------------------------
58Functions and definitions to determine: (1) whether tininess for underflow
59is detected before or after rounding by default, (2) what (if anything)
60happens when exceptions are raised, (3) how signaling NaNs are distinguished
61from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
62are propagated from function inputs to output. These details are target-
63specific.
64-------------------------------------------------------------------------------
65*/
66#include "softfloat-specialize"
67
68/*
69-------------------------------------------------------------------------------
70Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
71and 7, and returns the properly rounded 32-bit integer corresponding to the
72input. If `zSign' is nonzero, the input is negated before being converted
73to an integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point
74input is simply rounded to an integer, with the inexact exception raised if
75the input cannot be represented exactly as an integer. If the fixed-point
76input is too large, however, the invalid exception is raised and the largest
77positive or negative integer is returned.
78-------------------------------------------------------------------------------
79*/
80static int32 roundAndPackInt32( flag zSign, bits64 absZ )
81{
82 int8 roundingMode;
83 flag roundNearestEven;
84 int8 roundIncrement, roundBits;
85 int32 z;
86
87 roundingMode = float_rounding_mode;
88 roundNearestEven = ( roundingMode == float_round_nearest_even );
89 roundIncrement = 0x40;
90 if ( ! roundNearestEven ) {
91 if ( roundingMode == float_round_to_zero ) {
92 roundIncrement = 0;
93 }
94 else {
95 roundIncrement = 0x7F;
96 if ( zSign ) {
97 if ( roundingMode == float_round_up ) roundIncrement = 0;
98 }
99 else {
100 if ( roundingMode == float_round_down ) roundIncrement = 0;
101 }
102 }
103 }
104 roundBits = absZ & 0x7F;
105 absZ = ( absZ + roundIncrement )>>7;
106 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
107 z = absZ;
108 if ( zSign ) z = - z;
109 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
110 float_exception_flags |= float_flag_invalid;
111 return zSign ? 0x80000000 : 0x7FFFFFFF;
112 }
113 if ( roundBits ) float_exception_flags |= float_flag_inexact;
114 return z;
115
116}
117
118/*
119-------------------------------------------------------------------------------
120Returns the fraction bits of the single-precision floating-point value `a'.
121-------------------------------------------------------------------------------
122*/
123INLINE bits32 extractFloat32Frac( float32 a )
124{
125
126 return a & 0x007FFFFF;
127
128}
129
130/*
131-------------------------------------------------------------------------------
132Returns the exponent bits of the single-precision floating-point value `a'.
133-------------------------------------------------------------------------------
134*/
135INLINE int16 extractFloat32Exp( float32 a )
136{
137
138 return ( a>>23 ) & 0xFF;
139
140}
141
142/*
143-------------------------------------------------------------------------------
144Returns the sign bit of the single-precision floating-point value `a'.
145-------------------------------------------------------------------------------
146*/
147#if 0 /* in softfloat.h */
148INLINE flag extractFloat32Sign( float32 a )
149{
150
151 return a>>31;
152
153}
154#endif
155
156/*
157-------------------------------------------------------------------------------
158Normalizes the subnormal single-precision floating-point value represented
159by the denormalized significand `aSig'. The normalized exponent and
160significand are stored at the locations pointed to by `zExpPtr' and
161`zSigPtr', respectively.
162-------------------------------------------------------------------------------
163*/
164static void
165 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
166{
167 int8 shiftCount;
168
169 shiftCount = countLeadingZeros32( aSig ) - 8;
170 *zSigPtr = aSig<<shiftCount;
171 *zExpPtr = 1 - shiftCount;
172
173}
174
175/*
176-------------------------------------------------------------------------------
177Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
178single-precision floating-point value, returning the result. After being
179shifted into the proper positions, the three fields are simply added
180together to form the result. This means that any integer portion of `zSig'
181will be added into the exponent. Since a properly normalized significand
182will have an integer portion equal to 1, the `zExp' input should be 1 less
183than the desired result exponent whenever `zSig' is a complete, normalized
184significand.
185-------------------------------------------------------------------------------
186*/
187INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
188{
189#if 0
190 float32 f;
191 __asm__("@ packFloat32 \n\
192 mov %0, %1, asl #31 \n\
193 orr %0, %2, asl #23 \n\
194 orr %0, %3"
195 : /* no outputs */
196 : "g" (f), "g" (zSign), "g" (zExp), "g" (zSig)
197 : "cc");
198 return f;
199#else
200 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
201#endif
202}
203
204/*
205-------------------------------------------------------------------------------
206Takes an abstract floating-point value having sign `zSign', exponent `zExp',
207and significand `zSig', and returns the proper single-precision floating-
208point value corresponding to the abstract input. Ordinarily, the abstract
209value is simply rounded and packed into the single-precision format, with
210the inexact exception raised if the abstract input cannot be represented
211exactly. If the abstract value is too large, however, the overflow and
212inexact exceptions are raised and an infinity or maximal finite value is
213returned. If the abstract value is too small, the input value is rounded to
214a subnormal number, and the underflow and inexact exceptions are raised if
215the abstract input cannot be represented exactly as a subnormal single-
216precision floating-point number.
217 The input significand `zSig' has its binary point between bits 30
218and 29, which is 7 bits to the left of the usual location. This shifted
219significand must be normalized or smaller. If `zSig' is not normalized,
220`zExp' must be 0; in that case, the result returned is a subnormal number,
221and it must not require rounding. In the usual case that `zSig' is
222normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
223The handling of underflow and overflow follows the IEC/IEEE Standard for
224Binary Floating-point Arithmetic.
225-------------------------------------------------------------------------------
226*/
227static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
228{
229 int8 roundingMode;
230 flag roundNearestEven;
231 int8 roundIncrement, roundBits;
232 flag isTiny;
233
234 roundingMode = float_rounding_mode;
235 roundNearestEven = ( roundingMode == float_round_nearest_even );
236 roundIncrement = 0x40;
237 if ( ! roundNearestEven ) {
238 if ( roundingMode == float_round_to_zero ) {
239 roundIncrement = 0;
240 }
241 else {
242 roundIncrement = 0x7F;
243 if ( zSign ) {
244 if ( roundingMode == float_round_up ) roundIncrement = 0;
245 }
246 else {
247 if ( roundingMode == float_round_down ) roundIncrement = 0;
248 }
249 }
250 }
251 roundBits = zSig & 0x7F;
252 if ( 0xFD <= (bits16) zExp ) {
253 if ( ( 0xFD < zExp )
254 || ( ( zExp == 0xFD )
255 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
256 ) {
257 float_raise( float_flag_overflow | float_flag_inexact );
258 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
259 }
260 if ( zExp < 0 ) {
261 isTiny =
262 ( float_detect_tininess == float_tininess_before_rounding )
263 || ( zExp < -1 )
264 || ( zSig + roundIncrement < 0x80000000 );
265 shift32RightJamming( zSig, - zExp, &zSig );
266 zExp = 0;
267 roundBits = zSig & 0x7F;
268 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
269 }
270 }
271 if ( roundBits ) float_exception_flags |= float_flag_inexact;
272 zSig = ( zSig + roundIncrement )>>7;
273 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
274 if ( zSig == 0 ) zExp = 0;
275 return packFloat32( zSign, zExp, zSig );
276
277}
278
279/*
280-------------------------------------------------------------------------------
281Takes an abstract floating-point value having sign `zSign', exponent `zExp',
282and significand `zSig', and returns the proper single-precision floating-
283point value corresponding to the abstract input. This routine is just like
284`roundAndPackFloat32' except that `zSig' does not have to be normalized in
285any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
286point exponent.
287-------------------------------------------------------------------------------
288*/
289static float32
290 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
291{
292 int8 shiftCount;
293
294 shiftCount = countLeadingZeros32( zSig ) - 1;
295 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
296
297}
298
299/*
300-------------------------------------------------------------------------------
301Returns the fraction bits of the double-precision floating-point value `a'.
302-------------------------------------------------------------------------------
303*/
304INLINE bits64 extractFloat64Frac( float64 a )
305{
306
307 return a & LIT64( 0x000FFFFFFFFFFFFF );
308
309}
310
311/*
312-------------------------------------------------------------------------------
313Returns the exponent bits of the double-precision floating-point value `a'.
314-------------------------------------------------------------------------------
315*/
316INLINE int16 extractFloat64Exp( float64 a )
317{
318
319 return ( a>>52 ) & 0x7FF;
320
321}
322
323/*
324-------------------------------------------------------------------------------
325Returns the sign bit of the double-precision floating-point value `a'.
326-------------------------------------------------------------------------------
327*/
328#if 0 /* in softfloat.h */
329INLINE flag extractFloat64Sign( float64 a )
330{
331
332 return a>>63;
333
334}
335#endif
336
337/*
338-------------------------------------------------------------------------------
339Normalizes the subnormal double-precision floating-point value represented
340by the denormalized significand `aSig'. The normalized exponent and
341significand are stored at the locations pointed to by `zExpPtr' and
342`zSigPtr', respectively.
343-------------------------------------------------------------------------------
344*/
345static void
346 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
347{
348 int8 shiftCount;
349
350 shiftCount = countLeadingZeros64( aSig ) - 11;
351 *zSigPtr = aSig<<shiftCount;
352 *zExpPtr = 1 - shiftCount;
353
354}
355
356/*
357-------------------------------------------------------------------------------
358Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
359double-precision floating-point value, returning the result. After being
360shifted into the proper positions, the three fields are simply added
361together to form the result. This means that any integer portion of `zSig'
362will be added into the exponent. Since a properly normalized significand
363will have an integer portion equal to 1, the `zExp' input should be 1 less
364than the desired result exponent whenever `zSig' is a complete, normalized
365significand.
366-------------------------------------------------------------------------------
367*/
368INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
369{
370
371 return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
372
373}
374
375/*
376-------------------------------------------------------------------------------
377Takes an abstract floating-point value having sign `zSign', exponent `zExp',
378and significand `zSig', and returns the proper double-precision floating-
379point value corresponding to the abstract input. Ordinarily, the abstract
380value is simply rounded and packed into the double-precision format, with
381the inexact exception raised if the abstract input cannot be represented
382exactly. If the abstract value is too large, however, the overflow and
383inexact exceptions are raised and an infinity or maximal finite value is
384returned. If the abstract value is too small, the input value is rounded to
385a subnormal number, and the underflow and inexact exceptions are raised if
386the abstract input cannot be represented exactly as a subnormal double-
387precision floating-point number.
388 The input significand `zSig' has its binary point between bits 62
389and 61, which is 10 bits to the left of the usual location. This shifted
390significand must be normalized or smaller. If `zSig' is not normalized,
391`zExp' must be 0; in that case, the result returned is a subnormal number,
392and it must not require rounding. In the usual case that `zSig' is
393normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
394The handling of underflow and overflow follows the IEC/IEEE Standard for
395Binary Floating-point Arithmetic.
396-------------------------------------------------------------------------------
397*/
398static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
399{
400 int8 roundingMode;
401 flag roundNearestEven;
402 int16 roundIncrement, roundBits;
403 flag isTiny;
404
405 roundingMode = float_rounding_mode;
406 roundNearestEven = ( roundingMode == float_round_nearest_even );
407 roundIncrement = 0x200;
408 if ( ! roundNearestEven ) {
409 if ( roundingMode == float_round_to_zero ) {
410 roundIncrement = 0;
411 }
412 else {
413 roundIncrement = 0x3FF;
414 if ( zSign ) {
415 if ( roundingMode == float_round_up ) roundIncrement = 0;
416 }
417 else {
418 if ( roundingMode == float_round_down ) roundIncrement = 0;
419 }
420 }
421 }
422 roundBits = zSig & 0x3FF;
423 if ( 0x7FD <= (bits16) zExp ) {
424 if ( ( 0x7FD < zExp )
425 || ( ( zExp == 0x7FD )
426 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
427 ) {
428 //register int lr = __builtin_return_address(0);
429 //printk("roundAndPackFloat64 called from 0x%08x\n",lr);
430 float_raise( float_flag_overflow | float_flag_inexact );
431 return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
432 }
433 if ( zExp < 0 ) {
434 isTiny =
435 ( float_detect_tininess == float_tininess_before_rounding )
436 || ( zExp < -1 )
437 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
438 shift64RightJamming( zSig, - zExp, &zSig );
439 zExp = 0;
440 roundBits = zSig & 0x3FF;
441 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
442 }
443 }
444 if ( roundBits ) float_exception_flags |= float_flag_inexact;
445 zSig = ( zSig + roundIncrement )>>10;
446 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
447 if ( zSig == 0 ) zExp = 0;
448 return packFloat64( zSign, zExp, zSig );
449
450}
451
452/*
453-------------------------------------------------------------------------------
454Takes an abstract floating-point value having sign `zSign', exponent `zExp',
455and significand `zSig', and returns the proper double-precision floating-
456point value corresponding to the abstract input. This routine is just like
457`roundAndPackFloat64' except that `zSig' does not have to be normalized in
458any way. In all cases, `zExp' must be 1 less than the ``true'' floating-
459point exponent.
460-------------------------------------------------------------------------------
461*/
462static float64
463 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
464{
465 int8 shiftCount;
466
467 shiftCount = countLeadingZeros64( zSig ) - 1;
468 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
469
470}
471
472#ifdef FLOATX80
473
474/*
475-------------------------------------------------------------------------------
476Returns the fraction bits of the extended double-precision floating-point
477value `a'.
478-------------------------------------------------------------------------------
479*/
480INLINE bits64 extractFloatx80Frac( floatx80 a )
481{
482
483 return a.low;
484
485}
486
487/*
488-------------------------------------------------------------------------------
489Returns the exponent bits of the extended double-precision floating-point
490value `a'.
491-------------------------------------------------------------------------------
492*/
493INLINE int32 extractFloatx80Exp( floatx80 a )
494{
495
496 return a.high & 0x7FFF;
497
498}
499
500/*
501-------------------------------------------------------------------------------
502Returns the sign bit of the extended double-precision floating-point value
503`a'.
504-------------------------------------------------------------------------------
505*/
506INLINE flag extractFloatx80Sign( floatx80 a )
507{
508
509 return a.high>>15;
510
511}
512
513/*
514-------------------------------------------------------------------------------
515Normalizes the subnormal extended double-precision floating-point value
516represented by the denormalized significand `aSig'. The normalized exponent
517and significand are stored at the locations pointed to by `zExpPtr' and
518`zSigPtr', respectively.
519-------------------------------------------------------------------------------
520*/
521static void
522 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
523{
524 int8 shiftCount;
525
526 shiftCount = countLeadingZeros64( aSig );
527 *zSigPtr = aSig<<shiftCount;
528 *zExpPtr = 1 - shiftCount;
529
530}
531
532/*
533-------------------------------------------------------------------------------
534Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
535extended double-precision floating-point value, returning the result.
536-------------------------------------------------------------------------------
537*/
538INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
539{
540 floatx80 z;
541
542 z.low = zSig;
543 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
544 return z;
545
546}
547
548/*
549-------------------------------------------------------------------------------
550Takes an abstract floating-point value having sign `zSign', exponent `zExp',
551and extended significand formed by the concatenation of `zSig0' and `zSig1',
552and returns the proper extended double-precision floating-point value
553corresponding to the abstract input. Ordinarily, the abstract value is
554rounded and packed into the extended double-precision format, with the
555inexact exception raised if the abstract input cannot be represented
556exactly. If the abstract value is too large, however, the overflow and
557inexact exceptions are raised and an infinity or maximal finite value is
558returned. If the abstract value is too small, the input value is rounded to
559a subnormal number, and the underflow and inexact exceptions are raised if
560the abstract input cannot be represented exactly as a subnormal extended
561double-precision floating-point number.
562 If `roundingPrecision' is 32 or 64, the result is rounded to the same
563number of bits as single or double precision, respectively. Otherwise, the
564result is rounded to the full precision of the extended double-precision
565format.
566 The input significand must be normalized or smaller. If the input
567significand is not normalized, `zExp' must be 0; in that case, the result
568returned is a subnormal number, and it must not require rounding. The
569handling of underflow and overflow follows the IEC/IEEE Standard for Binary
570Floating-point Arithmetic.
571-------------------------------------------------------------------------------
572*/
573static floatx80
574 roundAndPackFloatx80(
575 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
576 )
577{
578 int8 roundingMode;
579 flag roundNearestEven, increment, isTiny;
580 int64 roundIncrement, roundMask, roundBits;
581
582 roundingMode = float_rounding_mode;
583 roundNearestEven = ( roundingMode == float_round_nearest_even );
584 if ( roundingPrecision == 80 ) goto precision80;
585 if ( roundingPrecision == 64 ) {
586 roundIncrement = LIT64( 0x0000000000000400 );
587 roundMask = LIT64( 0x00000000000007FF );
588 }
589 else if ( roundingPrecision == 32 ) {
590 roundIncrement = LIT64( 0x0000008000000000 );
591 roundMask = LIT64( 0x000000FFFFFFFFFF );
592 }
593 else {
594 goto precision80;
595 }
596 zSig0 |= ( zSig1 != 0 );
597 if ( ! roundNearestEven ) {
598 if ( roundingMode == float_round_to_zero ) {
599 roundIncrement = 0;
600 }
601 else {
602 roundIncrement = roundMask;
603 if ( zSign ) {
604 if ( roundingMode == float_round_up ) roundIncrement = 0;
605 }
606 else {
607 if ( roundingMode == float_round_down ) roundIncrement = 0;
608 }
609 }
610 }
611 roundBits = zSig0 & roundMask;
612 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
613 if ( ( 0x7FFE < zExp )
614 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
615 ) {
616 goto overflow;
617 }
618 if ( zExp <= 0 ) {
619 isTiny =
620 ( float_detect_tininess == float_tininess_before_rounding )
621 || ( zExp < 0 )
622 || ( zSig0 <= zSig0 + roundIncrement );
623 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
624 zExp = 0;
625 roundBits = zSig0 & roundMask;
626 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
627 if ( roundBits ) float_exception_flags |= float_flag_inexact;
628 zSig0 += roundIncrement;
629 if ( (sbits64) zSig0 < 0 ) zExp = 1;
630 roundIncrement = roundMask + 1;
631 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
632 roundMask |= roundIncrement;
633 }
634 zSig0 &= ~ roundMask;
635 return packFloatx80( zSign, zExp, zSig0 );
636 }
637 }
638 if ( roundBits ) float_exception_flags |= float_flag_inexact;
639 zSig0 += roundIncrement;
640 if ( zSig0 < roundIncrement ) {
641 ++zExp;
642 zSig0 = LIT64( 0x8000000000000000 );
643 }
644 roundIncrement = roundMask + 1;
645 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
646 roundMask |= roundIncrement;
647 }
648 zSig0 &= ~ roundMask;
649 if ( zSig0 == 0 ) zExp = 0;
650 return packFloatx80( zSign, zExp, zSig0 );
651 precision80:
652 increment = ( (sbits64) zSig1 < 0 );
653 if ( ! roundNearestEven ) {
654 if ( roundingMode == float_round_to_zero ) {
655 increment = 0;
656 }
657 else {
658 if ( zSign ) {
659 increment = ( roundingMode == float_round_down ) && zSig1;
660 }
661 else {
662 increment = ( roundingMode == float_round_up ) && zSig1;
663 }
664 }
665 }
666 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
667 if ( ( 0x7FFE < zExp )
668 || ( ( zExp == 0x7FFE )
669 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
670 && increment
671 )
672 ) {
673 roundMask = 0;
674 overflow:
675 float_raise( float_flag_overflow | float_flag_inexact );
676 if ( ( roundingMode == float_round_to_zero )
677 || ( zSign && ( roundingMode == float_round_up ) )
678 || ( ! zSign && ( roundingMode == float_round_down ) )
679 ) {
680 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
681 }
682 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
683 }
684 if ( zExp <= 0 ) {
685 isTiny =
686 ( float_detect_tininess == float_tininess_before_rounding )
687 || ( zExp < 0 )
688 || ! increment
689 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
690 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
691 zExp = 0;
692 if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
693 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
694 if ( roundNearestEven ) {
695 increment = ( (sbits64) zSig1 < 0 );
696 }
697 else {
698 if ( zSign ) {
699 increment = ( roundingMode == float_round_down ) && zSig1;
700 }
701 else {
702 increment = ( roundingMode == float_round_up ) && zSig1;
703 }
704 }
705 if ( increment ) {
706 ++zSig0;
707 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
708 if ( (sbits64) zSig0 < 0 ) zExp = 1;
709 }
710 return packFloatx80( zSign, zExp, zSig0 );
711 }
712 }
713 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
714 if ( increment ) {
715 ++zSig0;
716 if ( zSig0 == 0 ) {
717 ++zExp;
718 zSig0 = LIT64( 0x8000000000000000 );
719 }
720 else {
721 zSig0 &= ~ ( ( zSig1 + zSig1 == 0 ) & roundNearestEven );
722 }
723 }
724 else {
725 if ( zSig0 == 0 ) zExp = 0;
726 }
727
728 return packFloatx80( zSign, zExp, zSig0 );
729}
730
731/*
732-------------------------------------------------------------------------------
733Takes an abstract floating-point value having sign `zSign', exponent
734`zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
735and returns the proper extended double-precision floating-point value
736corresponding to the abstract input. This routine is just like
737`roundAndPackFloatx80' except that the input significand does not have to be
738normalized.
739-------------------------------------------------------------------------------
740*/
741static floatx80
742 normalizeRoundAndPackFloatx80(
743 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
744 )
745{
746 int8 shiftCount;
747
748 if ( zSig0 == 0 ) {
749 zSig0 = zSig1;
750 zSig1 = 0;
751 zExp -= 64;
752 }
753 shiftCount = countLeadingZeros64( zSig0 );
754 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
755 zExp -= shiftCount;
756 return
757 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
758
759}
760
761#endif
762
763/*
764-------------------------------------------------------------------------------
765Returns the result of converting the 32-bit two's complement integer `a' to
766the single-precision floating-point format. The conversion is performed
767according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
768-------------------------------------------------------------------------------
769*/
770float32 int32_to_float32( int32 a )
771{
772 flag zSign;
773
774 if ( a == 0 ) return 0;
775 if ( a == 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
776 zSign = ( a < 0 );
777 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
778
779}
780
781/*
782-------------------------------------------------------------------------------
783Returns the result of converting the 32-bit two's complement integer `a' to
784the double-precision floating-point format. The conversion is performed
785according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
786-------------------------------------------------------------------------------
787*/
788float64 int32_to_float64( int32 a )
789{
790 flag aSign;
791 uint32 absA;
792 int8 shiftCount;
793 bits64 zSig;
794
795 if ( a == 0 ) return 0;
796 aSign = ( a < 0 );
797 absA = aSign ? - a : a;
798 shiftCount = countLeadingZeros32( absA ) + 21;
799 zSig = absA;
800 return packFloat64( aSign, 0x432 - shiftCount, zSig<<shiftCount );
801
802}
803
804#ifdef FLOATX80
805
806/*
807-------------------------------------------------------------------------------
808Returns the result of converting the 32-bit two's complement integer `a'
809to the extended double-precision floating-point format. The conversion
810is performed according to the IEC/IEEE Standard for Binary Floating-point
811Arithmetic.
812-------------------------------------------------------------------------------
813*/
814floatx80 int32_to_floatx80( int32 a )
815{
816 flag zSign;
817 uint32 absA;
818 int8 shiftCount;
819 bits64 zSig;
820
821 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
822 zSign = ( a < 0 );
823 absA = zSign ? - a : a;
824 shiftCount = countLeadingZeros32( absA ) + 32;
825 zSig = absA;
826 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
827
828}
829
830#endif
831
832/*
833-------------------------------------------------------------------------------
834Returns the result of converting the single-precision floating-point value
835`a' to the 32-bit two's complement integer format. The conversion is
836performed according to the IEC/IEEE Standard for Binary Floating-point
837Arithmetic---which means in particular that the conversion is rounded
838according to the current rounding mode. If `a' is a NaN, the largest
839positive integer is returned. Otherwise, if the conversion overflows, the
840largest integer with the same sign as `a' is returned.
841-------------------------------------------------------------------------------
842*/
843int32 float32_to_int32( float32 a )
844{
845 flag aSign;
846 int16 aExp, shiftCount;
847 bits32 aSig;
848 bits64 zSig;
849
850 aSig = extractFloat32Frac( a );
851 aExp = extractFloat32Exp( a );
852 aSign = extractFloat32Sign( a );
853 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
854 if ( aExp ) aSig |= 0x00800000;
855 shiftCount = 0xAF - aExp;
856 zSig = aSig;
857 zSig <<= 32;
858 if ( 0 < shiftCount ) shift64RightJamming( zSig, shiftCount, &zSig );
859 return roundAndPackInt32( aSign, zSig );
860
861}
862
863/*
864-------------------------------------------------------------------------------
865Returns the result of converting the single-precision floating-point value
866`a' to the 32-bit two's complement integer format. The conversion is
867performed according to the IEC/IEEE Standard for Binary Floating-point
868Arithmetic, except that the conversion is always rounded toward zero. If
869`a' is a NaN, the largest positive integer is returned. Otherwise, if the
870conversion overflows, the largest integer with the same sign as `a' is
871returned.
872-------------------------------------------------------------------------------
873*/
874int32 float32_to_int32_round_to_zero( float32 a )
875{
876 flag aSign;
877 int16 aExp, shiftCount;
878 bits32 aSig;
879 int32 z;
880
881 aSig = extractFloat32Frac( a );
882 aExp = extractFloat32Exp( a );
883 aSign = extractFloat32Sign( a );
884 shiftCount = aExp - 0x9E;
885 if ( 0 <= shiftCount ) {
886 if ( a == 0xCF000000 ) return 0x80000000;
887 float_raise( float_flag_invalid );
888 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
889 return 0x80000000;
890 }
891 else if ( aExp <= 0x7E ) {
892 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
893 return 0;
894 }
895 aSig = ( aSig | 0x00800000 )<<8;
896 z = aSig>>( - shiftCount );
897 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
898 float_exception_flags |= float_flag_inexact;
899 }
900 return aSign ? - z : z;
901
902}
903
904/*
905-------------------------------------------------------------------------------
906Returns the result of converting the single-precision floating-point value
907`a' to the double-precision floating-point format. The conversion is
908performed according to the IEC/IEEE Standard for Binary Floating-point
909Arithmetic.
910-------------------------------------------------------------------------------
911*/
912float64 float32_to_float64( float32 a )
913{
914 flag aSign;
915 int16 aExp;
916 bits32 aSig;
917
918 aSig = extractFloat32Frac( a );
919 aExp = extractFloat32Exp( a );
920 aSign = extractFloat32Sign( a );
921 if ( aExp == 0xFF ) {
922 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
923 return packFloat64( aSign, 0x7FF, 0 );
924 }
925 if ( aExp == 0 ) {
926 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
927 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
928 --aExp;
929 }
930 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
931
932}
933
934#ifdef FLOATX80
935
936/*
937-------------------------------------------------------------------------------
938Returns the result of converting the single-precision floating-point value
939`a' to the extended double-precision floating-point format. The conversion
940is performed according to the IEC/IEEE Standard for Binary Floating-point
941Arithmetic.
942-------------------------------------------------------------------------------
943*/
944floatx80 float32_to_floatx80( float32 a )
945{
946 flag aSign;
947 int16 aExp;
948 bits32 aSig;
949
950 aSig = extractFloat32Frac( a );
951 aExp = extractFloat32Exp( a );
952 aSign = extractFloat32Sign( a );
953 if ( aExp == 0xFF ) {
954 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
955 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
956 }
957 if ( aExp == 0 ) {
958 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
959 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
960 }
961 aSig |= 0x00800000;
962 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
963
964}
965
966#endif
967
968/*
969-------------------------------------------------------------------------------
970Rounds the single-precision floating-point value `a' to an integer, and
971returns the result as a single-precision floating-point value. The
972operation is performed according to the IEC/IEEE Standard for Binary
973Floating-point Arithmetic.
974-------------------------------------------------------------------------------
975*/
976float32 float32_round_to_int( float32 a )
977{
978 flag aSign;
979 int16 aExp;
980 bits32 lastBitMask, roundBitsMask;
981 int8 roundingMode;
982 float32 z;
983
984 aExp = extractFloat32Exp( a );
985 if ( 0x96 <= aExp ) {
986 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
987 return propagateFloat32NaN( a, a );
988 }
989 return a;
990 }
991 if ( aExp <= 0x7E ) {
992 if ( (bits32) ( a<<1 ) == 0 ) return a;
993 float_exception_flags |= float_flag_inexact;
994 aSign = extractFloat32Sign( a );
995 switch ( float_rounding_mode ) {
996 case float_round_nearest_even:
997 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
998 return packFloat32( aSign, 0x7F, 0 );
999 }
1000 break;
1001 case float_round_down:
1002 return aSign ? 0xBF800000 : 0;
1003 case float_round_up:
1004 return aSign ? 0x80000000 : 0x3F800000;
1005 }
1006 return packFloat32( aSign, 0, 0 );
1007 }
1008 lastBitMask = 1;
1009 lastBitMask <<= 0x96 - aExp;
1010 roundBitsMask = lastBitMask - 1;
1011 z = a;
1012 roundingMode = float_rounding_mode;
1013 if ( roundingMode == float_round_nearest_even ) {
1014 z += lastBitMask>>1;
1015 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1016 }
1017 else if ( roundingMode != float_round_to_zero ) {
1018 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1019 z += roundBitsMask;
1020 }
1021 }
1022 z &= ~ roundBitsMask;
1023 if ( z != a ) float_exception_flags |= float_flag_inexact;
1024 return z;
1025
1026}
1027
1028/*
1029-------------------------------------------------------------------------------
1030Returns the result of adding the absolute values of the single-precision
1031floating-point values `a' and `b'. If `zSign' is true, the sum is negated
1032before being returned. `zSign' is ignored if the result is a NaN. The
1033addition is performed according to the IEC/IEEE Standard for Binary
1034Floating-point Arithmetic.
1035-------------------------------------------------------------------------------
1036*/
1037static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1038{
1039 int16 aExp, bExp, zExp;
1040 bits32 aSig, bSig, zSig;
1041 int16 expDiff;
1042
1043 aSig = extractFloat32Frac( a );
1044 aExp = extractFloat32Exp( a );
1045 bSig = extractFloat32Frac( b );
1046 bExp = extractFloat32Exp( b );
1047 expDiff = aExp - bExp;
1048 aSig <<= 6;
1049 bSig <<= 6;
1050 if ( 0 < expDiff ) {
1051 if ( aExp == 0xFF ) {
1052 if ( aSig ) return propagateFloat32NaN( a, b );
1053 return a;
1054 }
1055 if ( bExp == 0 ) {
1056 --expDiff;
1057 }
1058 else {
1059 bSig |= 0x20000000;
1060 }
1061 shift32RightJamming( bSig, expDiff, &bSig );
1062 zExp = aExp;
1063 }
1064 else if ( expDiff < 0 ) {
1065 if ( bExp == 0xFF ) {
1066 if ( bSig ) return propagateFloat32NaN( a, b );
1067 return packFloat32( zSign, 0xFF, 0 );
1068 }
1069 if ( aExp == 0 ) {
1070 ++expDiff;
1071 }
1072 else {
1073 aSig |= 0x20000000;
1074 }
1075 shift32RightJamming( aSig, - expDiff, &aSig );
1076 zExp = bExp;
1077 }
1078 else {
1079 if ( aExp == 0xFF ) {
1080 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1081 return a;
1082 }
1083 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1084 zSig = 0x40000000 + aSig + bSig;
1085 zExp = aExp;
1086 goto roundAndPack;
1087 }
1088 aSig |= 0x20000000;
1089 zSig = ( aSig + bSig )<<1;
1090 --zExp;
1091 if ( (sbits32) zSig < 0 ) {
1092 zSig = aSig + bSig;
1093 ++zExp;
1094 }
1095 roundAndPack:
1096 return roundAndPackFloat32( zSign, zExp, zSig );
1097
1098}
1099
1100/*
1101-------------------------------------------------------------------------------
1102Returns the result of subtracting the absolute values of the single-
1103precision floating-point values `a' and `b'. If `zSign' is true, the
1104difference is negated before being returned. `zSign' is ignored if the
1105result is a NaN. The subtraction is performed according to the IEC/IEEE
1106Standard for Binary Floating-point Arithmetic.
1107-------------------------------------------------------------------------------
1108*/
1109static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1110{
1111 int16 aExp, bExp, zExp;
1112 bits32 aSig, bSig, zSig;
1113 int16 expDiff;
1114
1115 aSig = extractFloat32Frac( a );
1116 aExp = extractFloat32Exp( a );
1117 bSig = extractFloat32Frac( b );
1118 bExp = extractFloat32Exp( b );
1119 expDiff = aExp - bExp;
1120 aSig <<= 7;
1121 bSig <<= 7;
1122 if ( 0 < expDiff ) goto aExpBigger;
1123 if ( expDiff < 0 ) goto bExpBigger;
1124 if ( aExp == 0xFF ) {
1125 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1126 float_raise( float_flag_invalid );
1127 return float32_default_nan;
1128 }
1129 if ( aExp == 0 ) {
1130 aExp = 1;
1131 bExp = 1;
1132 }
1133 if ( bSig < aSig ) goto aBigger;
1134 if ( aSig < bSig ) goto bBigger;
1135 return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1136 bExpBigger:
1137 if ( bExp == 0xFF ) {
1138 if ( bSig ) return propagateFloat32NaN( a, b );
1139 return packFloat32( zSign ^ 1, 0xFF, 0 );
1140 }
1141 if ( aExp == 0 ) {
1142 ++expDiff;
1143 }
1144 else {
1145 aSig |= 0x40000000;
1146 }
1147 shift32RightJamming( aSig, - expDiff, &aSig );
1148 bSig |= 0x40000000;
1149 bBigger:
1150 zSig = bSig - aSig;
1151 zExp = bExp;
1152 zSign ^= 1;
1153 goto normalizeRoundAndPack;
1154 aExpBigger:
1155 if ( aExp == 0xFF ) {
1156 if ( aSig ) return propagateFloat32NaN( a, b );
1157 return a;
1158 }
1159 if ( bExp == 0 ) {
1160 --expDiff;
1161 }
1162 else {
1163 bSig |= 0x40000000;
1164 }
1165 shift32RightJamming( bSig, expDiff, &bSig );
1166 aSig |= 0x40000000;
1167 aBigger:
1168 zSig = aSig - bSig;
1169 zExp = aExp;
1170 normalizeRoundAndPack:
1171 --zExp;
1172 return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1173
1174}
1175
1176/*
1177-------------------------------------------------------------------------------
1178Returns the result of adding the single-precision floating-point values `a'
1179and `b'. The operation is performed according to the IEC/IEEE Standard for
1180Binary Floating-point Arithmetic.
1181-------------------------------------------------------------------------------
1182*/
1183float32 float32_add( float32 a, float32 b )
1184{
1185 flag aSign, bSign;
1186
1187 aSign = extractFloat32Sign( a );
1188 bSign = extractFloat32Sign( b );
1189 if ( aSign == bSign ) {
1190 return addFloat32Sigs( a, b, aSign );
1191 }
1192 else {
1193 return subFloat32Sigs( a, b, aSign );
1194 }
1195
1196}
1197
1198/*
1199-------------------------------------------------------------------------------
1200Returns the result of subtracting the single-precision floating-point values
1201`a' and `b'. The operation is performed according to the IEC/IEEE Standard
1202for Binary Floating-point Arithmetic.
1203-------------------------------------------------------------------------------
1204*/
1205float32 float32_sub( float32 a, float32 b )
1206{
1207 flag aSign, bSign;
1208
1209 aSign = extractFloat32Sign( a );
1210 bSign = extractFloat32Sign( b );
1211 if ( aSign == bSign ) {
1212 return subFloat32Sigs( a, b, aSign );
1213 }
1214 else {
1215 return addFloat32Sigs( a, b, aSign );
1216 }
1217
1218}
1219
1220/*
1221-------------------------------------------------------------------------------
1222Returns the result of multiplying the single-precision floating-point values
1223`a' and `b'. The operation is performed according to the IEC/IEEE Standard
1224for Binary Floating-point Arithmetic.
1225-------------------------------------------------------------------------------
1226*/
1227float32 float32_mul( float32 a, float32 b )
1228{
1229 flag aSign, bSign, zSign;
1230 int16 aExp, bExp, zExp;
1231 bits32 aSig, bSig;
1232 bits64 zSig64;
1233 bits32 zSig;
1234
1235 aSig = extractFloat32Frac( a );
1236 aExp = extractFloat32Exp( a );
1237 aSign = extractFloat32Sign( a );
1238 bSig = extractFloat32Frac( b );
1239 bExp = extractFloat32Exp( b );
1240 bSign = extractFloat32Sign( b );
1241 zSign = aSign ^ bSign;
1242 if ( aExp == 0xFF ) {
1243 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1244 return propagateFloat32NaN( a, b );
1245 }
1246 if ( ( bExp | bSig ) == 0 ) {
1247 float_raise( float_flag_invalid );
1248 return float32_default_nan;
1249 }
1250 return packFloat32( zSign, 0xFF, 0 );
1251 }
1252 if ( bExp == 0xFF ) {
1253 if ( bSig ) return propagateFloat32NaN( a, b );
1254 if ( ( aExp | aSig ) == 0 ) {
1255 float_raise( float_flag_invalid );
1256 return float32_default_nan;
1257 }
1258 return packFloat32( zSign, 0xFF, 0 );
1259 }
1260 if ( aExp == 0 ) {
1261 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1262 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1263 }
1264 if ( bExp == 0 ) {
1265 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1266 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1267 }
1268 zExp = aExp + bExp - 0x7F;
1269 aSig = ( aSig | 0x00800000 )<<7;
1270 bSig = ( bSig | 0x00800000 )<<8;
1271 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1272 zSig = zSig64;
1273 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1274 zSig <<= 1;
1275 --zExp;
1276 }
1277 return roundAndPackFloat32( zSign, zExp, zSig );
1278
1279}
1280
1281/*
1282-------------------------------------------------------------------------------
1283Returns the result of dividing the single-precision floating-point value `a'
1284by the corresponding value `b'. The operation is performed according to the
1285IEC/IEEE Standard for Binary Floating-point Arithmetic.
1286-------------------------------------------------------------------------------
1287*/
1288float32 float32_div( float32 a, float32 b )
1289{
1290 flag aSign, bSign, zSign;
1291 int16 aExp, bExp, zExp;
1292 bits32 aSig, bSig, zSig;
1293
1294 aSig = extractFloat32Frac( a );
1295 aExp = extractFloat32Exp( a );
1296 aSign = extractFloat32Sign( a );
1297 bSig = extractFloat32Frac( b );
1298 bExp = extractFloat32Exp( b );
1299 bSign = extractFloat32Sign( b );
1300 zSign = aSign ^ bSign;
1301 if ( aExp == 0xFF ) {
1302 if ( aSig ) return propagateFloat32NaN( a, b );
1303 if ( bExp == 0xFF ) {
1304 if ( bSig ) return propagateFloat32NaN( a, b );
1305 float_raise( float_flag_invalid );
1306 return float32_default_nan;
1307 }
1308 return packFloat32( zSign, 0xFF, 0 );
1309 }
1310 if ( bExp == 0xFF ) {
1311 if ( bSig ) return propagateFloat32NaN( a, b );
1312 return packFloat32( zSign, 0, 0 );
1313 }
1314 if ( bExp == 0 ) {
1315 if ( bSig == 0 ) {
1316 if ( ( aExp | aSig ) == 0 ) {
1317 float_raise( float_flag_invalid );
1318 return float32_default_nan;
1319 }
1320 float_raise( float_flag_divbyzero );
1321 return packFloat32( zSign, 0xFF, 0 );
1322 }
1323 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1324 }
1325 if ( aExp == 0 ) {
1326 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1327 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1328 }
1329 zExp = aExp - bExp + 0x7D;
1330 aSig = ( aSig | 0x00800000 )<<7;
1331 bSig = ( bSig | 0x00800000 )<<8;
1332 if ( bSig <= ( aSig + aSig ) ) {
1333 aSig >>= 1;
1334 ++zExp;
1335 }
Nicolas Pitrec1241c4c2005-06-23 21:56:46 +01001336 {
1337 bits64 tmp = ( (bits64) aSig )<<32;
1338 do_div( tmp, bSig );
1339 zSig = tmp;
1340 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001341 if ( ( zSig & 0x3F ) == 0 ) {
1342 zSig |= ( ( (bits64) bSig ) * zSig != ( (bits64) aSig )<<32 );
1343 }
1344 return roundAndPackFloat32( zSign, zExp, zSig );
1345
1346}
1347
1348/*
1349-------------------------------------------------------------------------------
1350Returns the remainder of the single-precision floating-point value `a'
1351with respect to the corresponding value `b'. The operation is performed
1352according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1353-------------------------------------------------------------------------------
1354*/
1355float32 float32_rem( float32 a, float32 b )
1356{
1357 flag aSign, bSign, zSign;
1358 int16 aExp, bExp, expDiff;
1359 bits32 aSig, bSig;
1360 bits32 q;
1361 bits64 aSig64, bSig64, q64;
1362 bits32 alternateASig;
1363 sbits32 sigMean;
1364
1365 aSig = extractFloat32Frac( a );
1366 aExp = extractFloat32Exp( a );
1367 aSign = extractFloat32Sign( a );
1368 bSig = extractFloat32Frac( b );
1369 bExp = extractFloat32Exp( b );
1370 bSign = extractFloat32Sign( b );
1371 if ( aExp == 0xFF ) {
1372 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1373 return propagateFloat32NaN( a, b );
1374 }
1375 float_raise( float_flag_invalid );
1376 return float32_default_nan;
1377 }
1378 if ( bExp == 0xFF ) {
1379 if ( bSig ) return propagateFloat32NaN( a, b );
1380 return a;
1381 }
1382 if ( bExp == 0 ) {
1383 if ( bSig == 0 ) {
1384 float_raise( float_flag_invalid );
1385 return float32_default_nan;
1386 }
1387 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1388 }
1389 if ( aExp == 0 ) {
1390 if ( aSig == 0 ) return a;
1391 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1392 }
1393 expDiff = aExp - bExp;
1394 aSig |= 0x00800000;
1395 bSig |= 0x00800000;
1396 if ( expDiff < 32 ) {
1397 aSig <<= 8;
1398 bSig <<= 8;
1399 if ( expDiff < 0 ) {
1400 if ( expDiff < -1 ) return a;
1401 aSig >>= 1;
1402 }
1403 q = ( bSig <= aSig );
1404 if ( q ) aSig -= bSig;
1405 if ( 0 < expDiff ) {
Nicolas Pitrec1241c4c2005-06-23 21:56:46 +01001406 bits64 tmp = ( (bits64) aSig )<<32;
1407 do_div( tmp, bSig );
1408 q = tmp;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001409 q >>= 32 - expDiff;
1410 bSig >>= 2;
1411 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1412 }
1413 else {
1414 aSig >>= 2;
1415 bSig >>= 2;
1416 }
1417 }
1418 else {
1419 if ( bSig <= aSig ) aSig -= bSig;
1420 aSig64 = ( (bits64) aSig )<<40;
1421 bSig64 = ( (bits64) bSig )<<40;
1422 expDiff -= 64;
1423 while ( 0 < expDiff ) {
1424 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1425 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1426 aSig64 = - ( ( bSig * q64 )<<38 );
1427 expDiff -= 62;
1428 }
1429 expDiff += 64;
1430 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1431 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1432 q = q64>>( 64 - expDiff );
1433 bSig <<= 6;
1434 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1435 }
1436 do {
1437 alternateASig = aSig;
1438 ++q;
1439 aSig -= bSig;
1440 } while ( 0 <= (sbits32) aSig );
1441 sigMean = aSig + alternateASig;
1442 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1443 aSig = alternateASig;
1444 }
1445 zSign = ( (sbits32) aSig < 0 );
1446 if ( zSign ) aSig = - aSig;
1447 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
1448
1449}
1450
1451/*
1452-------------------------------------------------------------------------------
1453Returns the square root of the single-precision floating-point value `a'.
1454The operation is performed according to the IEC/IEEE Standard for Binary
1455Floating-point Arithmetic.
1456-------------------------------------------------------------------------------
1457*/
1458float32 float32_sqrt( float32 a )
1459{
1460 flag aSign;
1461 int16 aExp, zExp;
1462 bits32 aSig, zSig;
1463 bits64 rem, term;
1464
1465 aSig = extractFloat32Frac( a );
1466 aExp = extractFloat32Exp( a );
1467 aSign = extractFloat32Sign( a );
1468 if ( aExp == 0xFF ) {
1469 if ( aSig ) return propagateFloat32NaN( a, 0 );
1470 if ( ! aSign ) return a;
1471 float_raise( float_flag_invalid );
1472 return float32_default_nan;
1473 }
1474 if ( aSign ) {
1475 if ( ( aExp | aSig ) == 0 ) return a;
1476 float_raise( float_flag_invalid );
1477 return float32_default_nan;
1478 }
1479 if ( aExp == 0 ) {
1480 if ( aSig == 0 ) return 0;
1481 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1482 }
1483 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1484 aSig = ( aSig | 0x00800000 )<<8;
1485 zSig = estimateSqrt32( aExp, aSig ) + 2;
1486 if ( ( zSig & 0x7F ) <= 5 ) {
1487 if ( zSig < 2 ) {
1488 zSig = 0xFFFFFFFF;
1489 }
1490 else {
1491 aSig >>= aExp & 1;
1492 term = ( (bits64) zSig ) * zSig;
1493 rem = ( ( (bits64) aSig )<<32 ) - term;
1494 while ( (sbits64) rem < 0 ) {
1495 --zSig;
1496 rem += ( ( (bits64) zSig )<<1 ) | 1;
1497 }
1498 zSig |= ( rem != 0 );
1499 }
1500 }
1501 shift32RightJamming( zSig, 1, &zSig );
1502 return roundAndPackFloat32( 0, zExp, zSig );
1503
1504}
1505
1506/*
1507-------------------------------------------------------------------------------
1508Returns 1 if the single-precision floating-point value `a' is equal to the
1509corresponding value `b', and 0 otherwise. The comparison is performed
1510according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1511-------------------------------------------------------------------------------
1512*/
1513flag float32_eq( float32 a, float32 b )
1514{
1515
1516 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1517 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1518 ) {
1519 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1520 float_raise( float_flag_invalid );
1521 }
1522 return 0;
1523 }
1524 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1525
1526}
1527
1528/*
1529-------------------------------------------------------------------------------
1530Returns 1 if the single-precision floating-point value `a' is less than or
1531equal to the corresponding value `b', and 0 otherwise. The comparison is
1532performed according to the IEC/IEEE Standard for Binary Floating-point
1533Arithmetic.
1534-------------------------------------------------------------------------------
1535*/
1536flag float32_le( float32 a, float32 b )
1537{
1538 flag aSign, bSign;
1539
1540 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1541 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1542 ) {
1543 float_raise( float_flag_invalid );
1544 return 0;
1545 }
1546 aSign = extractFloat32Sign( a );
1547 bSign = extractFloat32Sign( b );
1548 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1549 return ( a == b ) || ( aSign ^ ( a < b ) );
1550
1551}
1552
1553/*
1554-------------------------------------------------------------------------------
1555Returns 1 if the single-precision floating-point value `a' is less than
1556the corresponding value `b', and 0 otherwise. The comparison is performed
1557according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1558-------------------------------------------------------------------------------
1559*/
1560flag float32_lt( float32 a, float32 b )
1561{
1562 flag aSign, bSign;
1563
1564 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1565 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1566 ) {
1567 float_raise( float_flag_invalid );
1568 return 0;
1569 }
1570 aSign = extractFloat32Sign( a );
1571 bSign = extractFloat32Sign( b );
1572 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1573 return ( a != b ) && ( aSign ^ ( a < b ) );
1574
1575}
1576
1577/*
1578-------------------------------------------------------------------------------
1579Returns 1 if the single-precision floating-point value `a' is equal to the
1580corresponding value `b', and 0 otherwise. The invalid exception is raised
1581if either operand is a NaN. Otherwise, the comparison is performed
1582according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
1583-------------------------------------------------------------------------------
1584*/
1585flag float32_eq_signaling( float32 a, float32 b )
1586{
1587
1588 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1589 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1590 ) {
1591 float_raise( float_flag_invalid );
1592 return 0;
1593 }
1594 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1595
1596}
1597
1598/*
1599-------------------------------------------------------------------------------
1600Returns 1 if the single-precision floating-point value `a' is less than or
1601equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
1602cause an exception. Otherwise, the comparison is performed according to the
1603IEC/IEEE Standard for Binary Floating-point Arithmetic.
1604-------------------------------------------------------------------------------
1605*/
1606flag float32_le_quiet( float32 a, float32 b )
1607{
1608 flag aSign, bSign;
1609 //int16 aExp, bExp;
1610
1611 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1612 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1613 ) {
1614 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1615 float_raise( float_flag_invalid );
1616 }
1617 return 0;
1618 }
1619 aSign = extractFloat32Sign( a );
1620 bSign = extractFloat32Sign( b );
1621 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1622 return ( a == b ) || ( aSign ^ ( a < b ) );
1623
1624}
1625
1626/*
1627-------------------------------------------------------------------------------
1628Returns 1 if the single-precision floating-point value `a' is less than
1629the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
1630exception. Otherwise, the comparison is performed according to the IEC/IEEE
1631Standard for Binary Floating-point Arithmetic.
1632-------------------------------------------------------------------------------
1633*/
1634flag float32_lt_quiet( float32 a, float32 b )
1635{
1636 flag aSign, bSign;
1637
1638 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1639 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1640 ) {
1641 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1642 float_raise( float_flag_invalid );
1643 }
1644 return 0;
1645 }
1646 aSign = extractFloat32Sign( a );
1647 bSign = extractFloat32Sign( b );
1648 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1649 return ( a != b ) && ( aSign ^ ( a < b ) );
1650
1651}
1652
1653/*
1654-------------------------------------------------------------------------------
1655Returns the result of converting the double-precision floating-point value
1656`a' to the 32-bit two's complement integer format. The conversion is
1657performed according to the IEC/IEEE Standard for Binary Floating-point
1658Arithmetic---which means in particular that the conversion is rounded
1659according to the current rounding mode. If `a' is a NaN, the largest
1660positive integer is returned. Otherwise, if the conversion overflows, the
1661largest integer with the same sign as `a' is returned.
1662-------------------------------------------------------------------------------
1663*/
1664int32 float64_to_int32( float64 a )
1665{
1666 flag aSign;
1667 int16 aExp, shiftCount;
1668 bits64 aSig;
1669
1670 aSig = extractFloat64Frac( a );
1671 aExp = extractFloat64Exp( a );
1672 aSign = extractFloat64Sign( a );
1673 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1674 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1675 shiftCount = 0x42C - aExp;
1676 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1677 return roundAndPackInt32( aSign, aSig );
1678
1679}
1680
1681/*
1682-------------------------------------------------------------------------------
1683Returns the result of converting the double-precision floating-point value
1684`a' to the 32-bit two's complement integer format. The conversion is
1685performed according to the IEC/IEEE Standard for Binary Floating-point
1686Arithmetic, except that the conversion is always rounded toward zero. If
1687`a' is a NaN, the largest positive integer is returned. Otherwise, if the
1688conversion overflows, the largest integer with the same sign as `a' is
1689returned.
1690-------------------------------------------------------------------------------
1691*/
1692int32 float64_to_int32_round_to_zero( float64 a )
1693{
1694 flag aSign;
1695 int16 aExp, shiftCount;
1696 bits64 aSig, savedASig;
1697 int32 z;
1698
1699 aSig = extractFloat64Frac( a );
1700 aExp = extractFloat64Exp( a );
1701 aSign = extractFloat64Sign( a );
1702 shiftCount = 0x433 - aExp;
1703 if ( shiftCount < 21 ) {
1704 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1705 goto invalid;
1706 }
1707 else if ( 52 < shiftCount ) {
1708 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
1709 return 0;
1710 }
1711 aSig |= LIT64( 0x0010000000000000 );
1712 savedASig = aSig;
1713 aSig >>= shiftCount;
1714 z = aSig;
1715 if ( aSign ) z = - z;
1716 if ( ( z < 0 ) ^ aSign ) {
1717 invalid:
1718 float_exception_flags |= float_flag_invalid;
1719 return aSign ? 0x80000000 : 0x7FFFFFFF;
1720 }
1721 if ( ( aSig<<shiftCount ) != savedASig ) {
1722 float_exception_flags |= float_flag_inexact;
1723 }
1724 return z;
1725
1726}
1727
1728/*
1729-------------------------------------------------------------------------------
1730Returns the result of converting the double-precision floating-point value
1731`a' to the 32-bit two's complement unsigned integer format. The conversion
1732is performed according to the IEC/IEEE Standard for Binary Floating-point
1733Arithmetic---which means in particular that the conversion is rounded
1734according to the current rounding mode. If `a' is a NaN, the largest
1735positive integer is returned. Otherwise, if the conversion overflows, the
1736largest positive integer is returned.
1737-------------------------------------------------------------------------------
1738*/
1739int32 float64_to_uint32( float64 a )
1740{
1741 flag aSign;
1742 int16 aExp, shiftCount;
1743 bits64 aSig;
1744
1745 aSig = extractFloat64Frac( a );
1746 aExp = extractFloat64Exp( a );
1747 aSign = 0; //extractFloat64Sign( a );
1748 //if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1749 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
1750 shiftCount = 0x42C - aExp;
1751 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
1752 return roundAndPackInt32( aSign, aSig );
1753}
1754
1755/*
1756-------------------------------------------------------------------------------
1757Returns the result of converting the double-precision floating-point value
1758`a' to the 32-bit two's complement integer format. The conversion is
1759performed according to the IEC/IEEE Standard for Binary Floating-point
1760Arithmetic, except that the conversion is always rounded toward zero. If
1761`a' is a NaN, the largest positive integer is returned. Otherwise, if the
1762conversion overflows, the largest positive integer is returned.
1763-------------------------------------------------------------------------------
1764*/
1765int32 float64_to_uint32_round_to_zero( float64 a )
1766{
1767 flag aSign;
1768 int16 aExp, shiftCount;
1769 bits64 aSig, savedASig;
1770 int32 z;
1771
1772 aSig = extractFloat64Frac( a );
1773 aExp = extractFloat64Exp( a );
1774 aSign = extractFloat64Sign( a );
1775 shiftCount = 0x433 - aExp;
1776 if ( shiftCount < 21 ) {
1777 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
1778 goto invalid;
1779 }
1780 else if ( 52 < shiftCount ) {
1781 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
1782 return 0;
1783 }
1784 aSig |= LIT64( 0x0010000000000000 );
1785 savedASig = aSig;
1786 aSig >>= shiftCount;
1787 z = aSig;
1788 if ( aSign ) z = - z;
1789 if ( ( z < 0 ) ^ aSign ) {
1790 invalid:
1791 float_exception_flags |= float_flag_invalid;
1792 return aSign ? 0x80000000 : 0x7FFFFFFF;
1793 }
1794 if ( ( aSig<<shiftCount ) != savedASig ) {
1795 float_exception_flags |= float_flag_inexact;
1796 }
1797 return z;
1798}
1799
1800/*
1801-------------------------------------------------------------------------------
1802Returns the result of converting the double-precision floating-point value
1803`a' to the single-precision floating-point format. The conversion is
1804performed according to the IEC/IEEE Standard for Binary Floating-point
1805Arithmetic.
1806-------------------------------------------------------------------------------
1807*/
1808float32 float64_to_float32( float64 a )
1809{
1810 flag aSign;
1811 int16 aExp;
1812 bits64 aSig;
1813 bits32 zSig;
1814
1815 aSig = extractFloat64Frac( a );
1816 aExp = extractFloat64Exp( a );
1817 aSign = extractFloat64Sign( a );
1818 if ( aExp == 0x7FF ) {
1819 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
1820 return packFloat32( aSign, 0xFF, 0 );
1821 }
1822 shift64RightJamming( aSig, 22, &aSig );
1823 zSig = aSig;
1824 if ( aExp || zSig ) {
1825 zSig |= 0x40000000;
1826 aExp -= 0x381;
1827 }
1828 return roundAndPackFloat32( aSign, aExp, zSig );
1829
1830}
1831
1832#ifdef FLOATX80
1833
1834/*
1835-------------------------------------------------------------------------------
1836Returns the result of converting the double-precision floating-point value
1837`a' to the extended double-precision floating-point format. The conversion
1838is performed according to the IEC/IEEE Standard for Binary Floating-point
1839Arithmetic.
1840-------------------------------------------------------------------------------
1841*/
1842floatx80 float64_to_floatx80( float64 a )
1843{
1844 flag aSign;
1845 int16 aExp;
1846 bits64 aSig;
1847
1848 aSig = extractFloat64Frac( a );
1849 aExp = extractFloat64Exp( a );
1850 aSign = extractFloat64Sign( a );
1851 if ( aExp == 0x7FF ) {
1852 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
1853 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1854 }
1855 if ( aExp == 0 ) {
1856 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1857 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
1858 }
1859 return
1860 packFloatx80(
1861 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
1862
1863}
1864
1865#endif
1866
1867/*
1868-------------------------------------------------------------------------------
1869Rounds the double-precision floating-point value `a' to an integer, and
1870returns the result as a double-precision floating-point value. The
1871operation is performed according to the IEC/IEEE Standard for Binary
1872Floating-point Arithmetic.
1873-------------------------------------------------------------------------------
1874*/
1875float64 float64_round_to_int( float64 a )
1876{
1877 flag aSign;
1878 int16 aExp;
1879 bits64 lastBitMask, roundBitsMask;
1880 int8 roundingMode;
1881 float64 z;
1882
1883 aExp = extractFloat64Exp( a );
1884 if ( 0x433 <= aExp ) {
1885 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
1886 return propagateFloat64NaN( a, a );
1887 }
1888 return a;
1889 }
1890 if ( aExp <= 0x3FE ) {
1891 if ( (bits64) ( a<<1 ) == 0 ) return a;
1892 float_exception_flags |= float_flag_inexact;
1893 aSign = extractFloat64Sign( a );
1894 switch ( float_rounding_mode ) {
1895 case float_round_nearest_even:
1896 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
1897 return packFloat64( aSign, 0x3FF, 0 );
1898 }
1899 break;
1900 case float_round_down:
1901 return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
1902 case float_round_up:
1903 return
1904 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
1905 }
1906 return packFloat64( aSign, 0, 0 );
1907 }
1908 lastBitMask = 1;
1909 lastBitMask <<= 0x433 - aExp;
1910 roundBitsMask = lastBitMask - 1;
1911 z = a;
1912 roundingMode = float_rounding_mode;
1913 if ( roundingMode == float_round_nearest_even ) {
1914 z += lastBitMask>>1;
1915 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1916 }
1917 else if ( roundingMode != float_round_to_zero ) {
1918 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1919 z += roundBitsMask;
1920 }
1921 }
1922 z &= ~ roundBitsMask;
1923 if ( z != a ) float_exception_flags |= float_flag_inexact;
1924 return z;
1925
1926}
1927
1928/*
1929-------------------------------------------------------------------------------
1930Returns the result of adding the absolute values of the double-precision
1931floating-point values `a' and `b'. If `zSign' is true, the sum is negated
1932before being returned. `zSign' is ignored if the result is a NaN. The
1933addition is performed according to the IEC/IEEE Standard for Binary
1934Floating-point Arithmetic.
1935-------------------------------------------------------------------------------
1936*/
1937static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
1938{
1939 int16 aExp, bExp, zExp;
1940 bits64 aSig, bSig, zSig;
1941 int16 expDiff;
1942
1943 aSig = extractFloat64Frac( a );
1944 aExp = extractFloat64Exp( a );
1945 bSig = extractFloat64Frac( b );
1946 bExp = extractFloat64Exp( b );
1947 expDiff = aExp - bExp;
1948 aSig <<= 9;
1949 bSig <<= 9;
1950 if ( 0 < expDiff ) {
1951 if ( aExp == 0x7FF ) {
1952 if ( aSig ) return propagateFloat64NaN( a, b );
1953 return a;
1954 }
1955 if ( bExp == 0 ) {
1956 --expDiff;
1957 }
1958 else {
1959 bSig |= LIT64( 0x2000000000000000 );
1960 }
1961 shift64RightJamming( bSig, expDiff, &bSig );
1962 zExp = aExp;
1963 }
1964 else if ( expDiff < 0 ) {
1965 if ( bExp == 0x7FF ) {
1966 if ( bSig ) return propagateFloat64NaN( a, b );
1967 return packFloat64( zSign, 0x7FF, 0 );
1968 }
1969 if ( aExp == 0 ) {
1970 ++expDiff;
1971 }
1972 else {
1973 aSig |= LIT64( 0x2000000000000000 );
1974 }
1975 shift64RightJamming( aSig, - expDiff, &aSig );
1976 zExp = bExp;
1977 }
1978 else {
1979 if ( aExp == 0x7FF ) {
1980 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
1981 return a;
1982 }
1983 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
1984 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
1985 zExp = aExp;
1986 goto roundAndPack;
1987 }
1988 aSig |= LIT64( 0x2000000000000000 );
1989 zSig = ( aSig + bSig )<<1;
1990 --zExp;
1991 if ( (sbits64) zSig < 0 ) {
1992 zSig = aSig + bSig;
1993 ++zExp;
1994 }
1995 roundAndPack:
1996 return roundAndPackFloat64( zSign, zExp, zSig );
1997
1998}
1999
2000/*
2001-------------------------------------------------------------------------------
2002Returns the result of subtracting the absolute values of the double-
2003precision floating-point values `a' and `b'. If `zSign' is true, the
2004difference is negated before being returned. `zSign' is ignored if the
2005result is a NaN. The subtraction is performed according to the IEC/IEEE
2006Standard for Binary Floating-point Arithmetic.
2007-------------------------------------------------------------------------------
2008*/
2009static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2010{
2011 int16 aExp, bExp, zExp;
2012 bits64 aSig, bSig, zSig;
2013 int16 expDiff;
2014
2015 aSig = extractFloat64Frac( a );
2016 aExp = extractFloat64Exp( a );
2017 bSig = extractFloat64Frac( b );
2018 bExp = extractFloat64Exp( b );
2019 expDiff = aExp - bExp;
2020 aSig <<= 10;
2021 bSig <<= 10;
2022 if ( 0 < expDiff ) goto aExpBigger;
2023 if ( expDiff < 0 ) goto bExpBigger;
2024 if ( aExp == 0x7FF ) {
2025 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2026 float_raise( float_flag_invalid );
2027 return float64_default_nan;
2028 }
2029 if ( aExp == 0 ) {
2030 aExp = 1;
2031 bExp = 1;
2032 }
2033 if ( bSig < aSig ) goto aBigger;
2034 if ( aSig < bSig ) goto bBigger;
2035 return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2036 bExpBigger:
2037 if ( bExp == 0x7FF ) {
2038 if ( bSig ) return propagateFloat64NaN( a, b );
2039 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2040 }
2041 if ( aExp == 0 ) {
2042 ++expDiff;
2043 }
2044 else {
2045 aSig |= LIT64( 0x4000000000000000 );
2046 }
2047 shift64RightJamming( aSig, - expDiff, &aSig );
2048 bSig |= LIT64( 0x4000000000000000 );
2049 bBigger:
2050 zSig = bSig - aSig;
2051 zExp = bExp;
2052 zSign ^= 1;
2053 goto normalizeRoundAndPack;
2054 aExpBigger:
2055 if ( aExp == 0x7FF ) {
2056 if ( aSig ) return propagateFloat64NaN( a, b );
2057 return a;
2058 }
2059 if ( bExp == 0 ) {
2060 --expDiff;
2061 }
2062 else {
2063 bSig |= LIT64( 0x4000000000000000 );
2064 }
2065 shift64RightJamming( bSig, expDiff, &bSig );
2066 aSig |= LIT64( 0x4000000000000000 );
2067 aBigger:
2068 zSig = aSig - bSig;
2069 zExp = aExp;
2070 normalizeRoundAndPack:
2071 --zExp;
2072 return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2073
2074}
2075
2076/*
2077-------------------------------------------------------------------------------
2078Returns the result of adding the double-precision floating-point values `a'
2079and `b'. The operation is performed according to the IEC/IEEE Standard for
2080Binary Floating-point Arithmetic.
2081-------------------------------------------------------------------------------
2082*/
2083float64 float64_add( float64 a, float64 b )
2084{
2085 flag aSign, bSign;
2086
2087 aSign = extractFloat64Sign( a );
2088 bSign = extractFloat64Sign( b );
2089 if ( aSign == bSign ) {
2090 return addFloat64Sigs( a, b, aSign );
2091 }
2092 else {
2093 return subFloat64Sigs( a, b, aSign );
2094 }
2095
2096}
2097
2098/*
2099-------------------------------------------------------------------------------
2100Returns the result of subtracting the double-precision floating-point values
2101`a' and `b'. The operation is performed according to the IEC/IEEE Standard
2102for Binary Floating-point Arithmetic.
2103-------------------------------------------------------------------------------
2104*/
2105float64 float64_sub( float64 a, float64 b )
2106{
2107 flag aSign, bSign;
2108
2109 aSign = extractFloat64Sign( a );
2110 bSign = extractFloat64Sign( b );
2111 if ( aSign == bSign ) {
2112 return subFloat64Sigs( a, b, aSign );
2113 }
2114 else {
2115 return addFloat64Sigs( a, b, aSign );
2116 }
2117
2118}
2119
2120/*
2121-------------------------------------------------------------------------------
2122Returns the result of multiplying the double-precision floating-point values
2123`a' and `b'. The operation is performed according to the IEC/IEEE Standard
2124for Binary Floating-point Arithmetic.
2125-------------------------------------------------------------------------------
2126*/
2127float64 float64_mul( float64 a, float64 b )
2128{
2129 flag aSign, bSign, zSign;
2130 int16 aExp, bExp, zExp;
2131 bits64 aSig, bSig, zSig0, zSig1;
2132
2133 aSig = extractFloat64Frac( a );
2134 aExp = extractFloat64Exp( a );
2135 aSign = extractFloat64Sign( a );
2136 bSig = extractFloat64Frac( b );
2137 bExp = extractFloat64Exp( b );
2138 bSign = extractFloat64Sign( b );
2139 zSign = aSign ^ bSign;
2140 if ( aExp == 0x7FF ) {
2141 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2142 return propagateFloat64NaN( a, b );
2143 }
2144 if ( ( bExp | bSig ) == 0 ) {
2145 float_raise( float_flag_invalid );
2146 return float64_default_nan;
2147 }
2148 return packFloat64( zSign, 0x7FF, 0 );
2149 }
2150 if ( bExp == 0x7FF ) {
2151 if ( bSig ) return propagateFloat64NaN( a, b );
2152 if ( ( aExp | aSig ) == 0 ) {
2153 float_raise( float_flag_invalid );
2154 return float64_default_nan;
2155 }
2156 return packFloat64( zSign, 0x7FF, 0 );
2157 }
2158 if ( aExp == 0 ) {
2159 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2160 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2161 }
2162 if ( bExp == 0 ) {
2163 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2164 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2165 }
2166 zExp = aExp + bExp - 0x3FF;
2167 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2168 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2169 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2170 zSig0 |= ( zSig1 != 0 );
2171 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2172 zSig0 <<= 1;
2173 --zExp;
2174 }
2175 return roundAndPackFloat64( zSign, zExp, zSig0 );
2176
2177}
2178
2179/*
2180-------------------------------------------------------------------------------
2181Returns the result of dividing the double-precision floating-point value `a'
2182by the corresponding value `b'. The operation is performed according to
2183the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2184-------------------------------------------------------------------------------
2185*/
2186float64 float64_div( float64 a, float64 b )
2187{
2188 flag aSign, bSign, zSign;
2189 int16 aExp, bExp, zExp;
2190 bits64 aSig, bSig, zSig;
2191 bits64 rem0, rem1;
2192 bits64 term0, term1;
2193
2194 aSig = extractFloat64Frac( a );
2195 aExp = extractFloat64Exp( a );
2196 aSign = extractFloat64Sign( a );
2197 bSig = extractFloat64Frac( b );
2198 bExp = extractFloat64Exp( b );
2199 bSign = extractFloat64Sign( b );
2200 zSign = aSign ^ bSign;
2201 if ( aExp == 0x7FF ) {
2202 if ( aSig ) return propagateFloat64NaN( a, b );
2203 if ( bExp == 0x7FF ) {
2204 if ( bSig ) return propagateFloat64NaN( a, b );
2205 float_raise( float_flag_invalid );
2206 return float64_default_nan;
2207 }
2208 return packFloat64( zSign, 0x7FF, 0 );
2209 }
2210 if ( bExp == 0x7FF ) {
2211 if ( bSig ) return propagateFloat64NaN( a, b );
2212 return packFloat64( zSign, 0, 0 );
2213 }
2214 if ( bExp == 0 ) {
2215 if ( bSig == 0 ) {
2216 if ( ( aExp | aSig ) == 0 ) {
2217 float_raise( float_flag_invalid );
2218 return float64_default_nan;
2219 }
2220 float_raise( float_flag_divbyzero );
2221 return packFloat64( zSign, 0x7FF, 0 );
2222 }
2223 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2224 }
2225 if ( aExp == 0 ) {
2226 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2227 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2228 }
2229 zExp = aExp - bExp + 0x3FD;
2230 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2231 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2232 if ( bSig <= ( aSig + aSig ) ) {
2233 aSig >>= 1;
2234 ++zExp;
2235 }
2236 zSig = estimateDiv128To64( aSig, 0, bSig );
2237 if ( ( zSig & 0x1FF ) <= 2 ) {
2238 mul64To128( bSig, zSig, &term0, &term1 );
2239 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2240 while ( (sbits64) rem0 < 0 ) {
2241 --zSig;
2242 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2243 }
2244 zSig |= ( rem1 != 0 );
2245 }
2246 return roundAndPackFloat64( zSign, zExp, zSig );
2247
2248}
2249
2250/*
2251-------------------------------------------------------------------------------
2252Returns the remainder of the double-precision floating-point value `a'
2253with respect to the corresponding value `b'. The operation is performed
2254according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2255-------------------------------------------------------------------------------
2256*/
2257float64 float64_rem( float64 a, float64 b )
2258{
2259 flag aSign, bSign, zSign;
2260 int16 aExp, bExp, expDiff;
2261 bits64 aSig, bSig;
2262 bits64 q, alternateASig;
2263 sbits64 sigMean;
2264
2265 aSig = extractFloat64Frac( a );
2266 aExp = extractFloat64Exp( a );
2267 aSign = extractFloat64Sign( a );
2268 bSig = extractFloat64Frac( b );
2269 bExp = extractFloat64Exp( b );
2270 bSign = extractFloat64Sign( b );
2271 if ( aExp == 0x7FF ) {
2272 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2273 return propagateFloat64NaN( a, b );
2274 }
2275 float_raise( float_flag_invalid );
2276 return float64_default_nan;
2277 }
2278 if ( bExp == 0x7FF ) {
2279 if ( bSig ) return propagateFloat64NaN( a, b );
2280 return a;
2281 }
2282 if ( bExp == 0 ) {
2283 if ( bSig == 0 ) {
2284 float_raise( float_flag_invalid );
2285 return float64_default_nan;
2286 }
2287 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2288 }
2289 if ( aExp == 0 ) {
2290 if ( aSig == 0 ) return a;
2291 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2292 }
2293 expDiff = aExp - bExp;
2294 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2295 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2296 if ( expDiff < 0 ) {
2297 if ( expDiff < -1 ) return a;
2298 aSig >>= 1;
2299 }
2300 q = ( bSig <= aSig );
2301 if ( q ) aSig -= bSig;
2302 expDiff -= 64;
2303 while ( 0 < expDiff ) {
2304 q = estimateDiv128To64( aSig, 0, bSig );
2305 q = ( 2 < q ) ? q - 2 : 0;
2306 aSig = - ( ( bSig>>2 ) * q );
2307 expDiff -= 62;
2308 }
2309 expDiff += 64;
2310 if ( 0 < expDiff ) {
2311 q = estimateDiv128To64( aSig, 0, bSig );
2312 q = ( 2 < q ) ? q - 2 : 0;
2313 q >>= 64 - expDiff;
2314 bSig >>= 2;
2315 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2316 }
2317 else {
2318 aSig >>= 2;
2319 bSig >>= 2;
2320 }
2321 do {
2322 alternateASig = aSig;
2323 ++q;
2324 aSig -= bSig;
2325 } while ( 0 <= (sbits64) aSig );
2326 sigMean = aSig + alternateASig;
2327 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2328 aSig = alternateASig;
2329 }
2330 zSign = ( (sbits64) aSig < 0 );
2331 if ( zSign ) aSig = - aSig;
2332 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
2333
2334}
2335
2336/*
2337-------------------------------------------------------------------------------
2338Returns the square root of the double-precision floating-point value `a'.
2339The operation is performed according to the IEC/IEEE Standard for Binary
2340Floating-point Arithmetic.
2341-------------------------------------------------------------------------------
2342*/
2343float64 float64_sqrt( float64 a )
2344{
2345 flag aSign;
2346 int16 aExp, zExp;
2347 bits64 aSig, zSig;
2348 bits64 rem0, rem1, term0, term1; //, shiftedRem;
2349 //float64 z;
2350
2351 aSig = extractFloat64Frac( a );
2352 aExp = extractFloat64Exp( a );
2353 aSign = extractFloat64Sign( a );
2354 if ( aExp == 0x7FF ) {
2355 if ( aSig ) return propagateFloat64NaN( a, a );
2356 if ( ! aSign ) return a;
2357 float_raise( float_flag_invalid );
2358 return float64_default_nan;
2359 }
2360 if ( aSign ) {
2361 if ( ( aExp | aSig ) == 0 ) return a;
2362 float_raise( float_flag_invalid );
2363 return float64_default_nan;
2364 }
2365 if ( aExp == 0 ) {
2366 if ( aSig == 0 ) return 0;
2367 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2368 }
2369 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2370 aSig |= LIT64( 0x0010000000000000 );
2371 zSig = estimateSqrt32( aExp, aSig>>21 );
2372 zSig <<= 31;
2373 aSig <<= 9 - ( aExp & 1 );
2374 zSig = estimateDiv128To64( aSig, 0, zSig ) + zSig + 2;
2375 if ( ( zSig & 0x3FF ) <= 5 ) {
2376 if ( zSig < 2 ) {
2377 zSig = LIT64( 0xFFFFFFFFFFFFFFFF );
2378 }
2379 else {
2380 aSig <<= 2;
2381 mul64To128( zSig, zSig, &term0, &term1 );
2382 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2383 while ( (sbits64) rem0 < 0 ) {
2384 --zSig;
2385 shortShift128Left( 0, zSig, 1, &term0, &term1 );
2386 term1 |= 1;
2387 add128( rem0, rem1, term0, term1, &rem0, &rem1 );
2388 }
2389 zSig |= ( ( rem0 | rem1 ) != 0 );
2390 }
2391 }
2392 shift64RightJamming( zSig, 1, &zSig );
2393 return roundAndPackFloat64( 0, zExp, zSig );
2394
2395}
2396
2397/*
2398-------------------------------------------------------------------------------
2399Returns 1 if the double-precision floating-point value `a' is equal to the
2400corresponding value `b', and 0 otherwise. The comparison is performed
2401according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2402-------------------------------------------------------------------------------
2403*/
2404flag float64_eq( float64 a, float64 b )
2405{
2406
2407 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2408 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2409 ) {
2410 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2411 float_raise( float_flag_invalid );
2412 }
2413 return 0;
2414 }
2415 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2416
2417}
2418
2419/*
2420-------------------------------------------------------------------------------
2421Returns 1 if the double-precision floating-point value `a' is less than or
2422equal to the corresponding value `b', and 0 otherwise. The comparison is
2423performed according to the IEC/IEEE Standard for Binary Floating-point
2424Arithmetic.
2425-------------------------------------------------------------------------------
2426*/
2427flag float64_le( float64 a, float64 b )
2428{
2429 flag aSign, bSign;
2430
2431 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2432 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2433 ) {
2434 float_raise( float_flag_invalid );
2435 return 0;
2436 }
2437 aSign = extractFloat64Sign( a );
2438 bSign = extractFloat64Sign( b );
2439 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2440 return ( a == b ) || ( aSign ^ ( a < b ) );
2441
2442}
2443
2444/*
2445-------------------------------------------------------------------------------
2446Returns 1 if the double-precision floating-point value `a' is less than
2447the corresponding value `b', and 0 otherwise. The comparison is performed
2448according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2449-------------------------------------------------------------------------------
2450*/
2451flag float64_lt( float64 a, float64 b )
2452{
2453 flag aSign, bSign;
2454
2455 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2456 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2457 ) {
2458 float_raise( float_flag_invalid );
2459 return 0;
2460 }
2461 aSign = extractFloat64Sign( a );
2462 bSign = extractFloat64Sign( b );
2463 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2464 return ( a != b ) && ( aSign ^ ( a < b ) );
2465
2466}
2467
2468/*
2469-------------------------------------------------------------------------------
2470Returns 1 if the double-precision floating-point value `a' is equal to the
2471corresponding value `b', and 0 otherwise. The invalid exception is raised
2472if either operand is a NaN. Otherwise, the comparison is performed
2473according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2474-------------------------------------------------------------------------------
2475*/
2476flag float64_eq_signaling( float64 a, float64 b )
2477{
2478
2479 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2480 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2481 ) {
2482 float_raise( float_flag_invalid );
2483 return 0;
2484 }
2485 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2486
2487}
2488
2489/*
2490-------------------------------------------------------------------------------
2491Returns 1 if the double-precision floating-point value `a' is less than or
2492equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2493cause an exception. Otherwise, the comparison is performed according to the
2494IEC/IEEE Standard for Binary Floating-point Arithmetic.
2495-------------------------------------------------------------------------------
2496*/
2497flag float64_le_quiet( float64 a, float64 b )
2498{
2499 flag aSign, bSign;
2500 //int16 aExp, bExp;
2501
2502 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2503 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2504 ) {
2505 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2506 float_raise( float_flag_invalid );
2507 }
2508 return 0;
2509 }
2510 aSign = extractFloat64Sign( a );
2511 bSign = extractFloat64Sign( b );
2512 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2513 return ( a == b ) || ( aSign ^ ( a < b ) );
2514
2515}
2516
2517/*
2518-------------------------------------------------------------------------------
2519Returns 1 if the double-precision floating-point value `a' is less than
2520the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2521exception. Otherwise, the comparison is performed according to the IEC/IEEE
2522Standard for Binary Floating-point Arithmetic.
2523-------------------------------------------------------------------------------
2524*/
2525flag float64_lt_quiet( float64 a, float64 b )
2526{
2527 flag aSign, bSign;
2528
2529 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2530 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2531 ) {
2532 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2533 float_raise( float_flag_invalid );
2534 }
2535 return 0;
2536 }
2537 aSign = extractFloat64Sign( a );
2538 bSign = extractFloat64Sign( b );
2539 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2540 return ( a != b ) && ( aSign ^ ( a < b ) );
2541
2542}
2543
2544#ifdef FLOATX80
2545
2546/*
2547-------------------------------------------------------------------------------
2548Returns the result of converting the extended double-precision floating-
2549point value `a' to the 32-bit two's complement integer format. The
2550conversion is performed according to the IEC/IEEE Standard for Binary
2551Floating-point Arithmetic---which means in particular that the conversion
2552is rounded according to the current rounding mode. If `a' is a NaN, the
2553largest positive integer is returned. Otherwise, if the conversion
2554overflows, the largest integer with the same sign as `a' is returned.
2555-------------------------------------------------------------------------------
2556*/
2557int32 floatx80_to_int32( floatx80 a )
2558{
2559 flag aSign;
2560 int32 aExp, shiftCount;
2561 bits64 aSig;
2562
2563 aSig = extractFloatx80Frac( a );
2564 aExp = extractFloatx80Exp( a );
2565 aSign = extractFloatx80Sign( a );
2566 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2567 shiftCount = 0x4037 - aExp;
2568 if ( shiftCount <= 0 ) shiftCount = 1;
2569 shift64RightJamming( aSig, shiftCount, &aSig );
2570 return roundAndPackInt32( aSign, aSig );
2571
2572}
2573
2574/*
2575-------------------------------------------------------------------------------
2576Returns the result of converting the extended double-precision floating-
2577point value `a' to the 32-bit two's complement integer format. The
2578conversion is performed according to the IEC/IEEE Standard for Binary
2579Floating-point Arithmetic, except that the conversion is always rounded
2580toward zero. If `a' is a NaN, the largest positive integer is returned.
2581Otherwise, if the conversion overflows, the largest integer with the same
2582sign as `a' is returned.
2583-------------------------------------------------------------------------------
2584*/
2585int32 floatx80_to_int32_round_to_zero( floatx80 a )
2586{
2587 flag aSign;
2588 int32 aExp, shiftCount;
2589 bits64 aSig, savedASig;
2590 int32 z;
2591
2592 aSig = extractFloatx80Frac( a );
2593 aExp = extractFloatx80Exp( a );
2594 aSign = extractFloatx80Sign( a );
2595 shiftCount = 0x403E - aExp;
2596 if ( shiftCount < 32 ) {
2597 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
2598 goto invalid;
2599 }
2600 else if ( 63 < shiftCount ) {
2601 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2602 return 0;
2603 }
2604 savedASig = aSig;
2605 aSig >>= shiftCount;
2606 z = aSig;
2607 if ( aSign ) z = - z;
2608 if ( ( z < 0 ) ^ aSign ) {
2609 invalid:
2610 float_exception_flags |= float_flag_invalid;
2611 return aSign ? 0x80000000 : 0x7FFFFFFF;
2612 }
2613 if ( ( aSig<<shiftCount ) != savedASig ) {
2614 float_exception_flags |= float_flag_inexact;
2615 }
2616 return z;
2617
2618}
2619
2620/*
2621-------------------------------------------------------------------------------
2622Returns the result of converting the extended double-precision floating-
2623point value `a' to the single-precision floating-point format. The
2624conversion is performed according to the IEC/IEEE Standard for Binary
2625Floating-point Arithmetic.
2626-------------------------------------------------------------------------------
2627*/
2628float32 floatx80_to_float32( floatx80 a )
2629{
2630 flag aSign;
2631 int32 aExp;
2632 bits64 aSig;
2633
2634 aSig = extractFloatx80Frac( a );
2635 aExp = extractFloatx80Exp( a );
2636 aSign = extractFloatx80Sign( a );
2637 if ( aExp == 0x7FFF ) {
2638 if ( (bits64) ( aSig<<1 ) ) {
2639 return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
2640 }
2641 return packFloat32( aSign, 0xFF, 0 );
2642 }
2643 shift64RightJamming( aSig, 33, &aSig );
2644 if ( aExp || aSig ) aExp -= 0x3F81;
2645 return roundAndPackFloat32( aSign, aExp, aSig );
2646
2647}
2648
2649/*
2650-------------------------------------------------------------------------------
2651Returns the result of converting the extended double-precision floating-
2652point value `a' to the double-precision floating-point format. The
2653conversion is performed according to the IEC/IEEE Standard for Binary
2654Floating-point Arithmetic.
2655-------------------------------------------------------------------------------
2656*/
2657float64 floatx80_to_float64( floatx80 a )
2658{
2659 flag aSign;
2660 int32 aExp;
2661 bits64 aSig, zSig;
2662
2663 aSig = extractFloatx80Frac( a );
2664 aExp = extractFloatx80Exp( a );
2665 aSign = extractFloatx80Sign( a );
2666 if ( aExp == 0x7FFF ) {
2667 if ( (bits64) ( aSig<<1 ) ) {
2668 return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
2669 }
2670 return packFloat64( aSign, 0x7FF, 0 );
2671 }
2672 shift64RightJamming( aSig, 1, &zSig );
2673 if ( aExp || aSig ) aExp -= 0x3C01;
2674 return roundAndPackFloat64( aSign, aExp, zSig );
2675
2676}
2677
2678/*
2679-------------------------------------------------------------------------------
2680Rounds the extended double-precision floating-point value `a' to an integer,
2681and returns the result as an extended quadruple-precision floating-point
2682value. The operation is performed according to the IEC/IEEE Standard for
2683Binary Floating-point Arithmetic.
2684-------------------------------------------------------------------------------
2685*/
2686floatx80 floatx80_round_to_int( floatx80 a )
2687{
2688 flag aSign;
2689 int32 aExp;
2690 bits64 lastBitMask, roundBitsMask;
2691 int8 roundingMode;
2692 floatx80 z;
2693
2694 aExp = extractFloatx80Exp( a );
2695 if ( 0x403E <= aExp ) {
2696 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
2697 return propagateFloatx80NaN( a, a );
2698 }
2699 return a;
2700 }
2701 if ( aExp <= 0x3FFE ) {
2702 if ( ( aExp == 0 )
2703 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
2704 return a;
2705 }
2706 float_exception_flags |= float_flag_inexact;
2707 aSign = extractFloatx80Sign( a );
2708 switch ( float_rounding_mode ) {
2709 case float_round_nearest_even:
2710 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
2711 ) {
2712 return
2713 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
2714 }
2715 break;
2716 case float_round_down:
2717 return
2718 aSign ?
2719 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
2720 : packFloatx80( 0, 0, 0 );
2721 case float_round_up:
2722 return
2723 aSign ? packFloatx80( 1, 0, 0 )
2724 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
2725 }
2726 return packFloatx80( aSign, 0, 0 );
2727 }
2728 lastBitMask = 1;
2729 lastBitMask <<= 0x403E - aExp;
2730 roundBitsMask = lastBitMask - 1;
2731 z = a;
2732 roundingMode = float_rounding_mode;
2733 if ( roundingMode == float_round_nearest_even ) {
2734 z.low += lastBitMask>>1;
2735 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
2736 }
2737 else if ( roundingMode != float_round_to_zero ) {
2738 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2739 z.low += roundBitsMask;
2740 }
2741 }
2742 z.low &= ~ roundBitsMask;
2743 if ( z.low == 0 ) {
2744 ++z.high;
2745 z.low = LIT64( 0x8000000000000000 );
2746 }
2747 if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
2748 return z;
2749
2750}
2751
2752/*
2753-------------------------------------------------------------------------------
2754Returns the result of adding the absolute values of the extended double-
2755precision floating-point values `a' and `b'. If `zSign' is true, the sum is
2756negated before being returned. `zSign' is ignored if the result is a NaN.
2757The addition is performed according to the IEC/IEEE Standard for Binary
2758Floating-point Arithmetic.
2759-------------------------------------------------------------------------------
2760*/
2761static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
2762{
2763 int32 aExp, bExp, zExp;
2764 bits64 aSig, bSig, zSig0, zSig1;
2765 int32 expDiff;
2766
2767 aSig = extractFloatx80Frac( a );
2768 aExp = extractFloatx80Exp( a );
2769 bSig = extractFloatx80Frac( b );
2770 bExp = extractFloatx80Exp( b );
2771 expDiff = aExp - bExp;
2772 if ( 0 < expDiff ) {
2773 if ( aExp == 0x7FFF ) {
2774 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2775 return a;
2776 }
2777 if ( bExp == 0 ) --expDiff;
2778 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2779 zExp = aExp;
2780 }
2781 else if ( expDiff < 0 ) {
2782 if ( bExp == 0x7FFF ) {
2783 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2784 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2785 }
2786 if ( aExp == 0 ) ++expDiff;
2787 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2788 zExp = bExp;
2789 }
2790 else {
2791 if ( aExp == 0x7FFF ) {
2792 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2793 return propagateFloatx80NaN( a, b );
2794 }
2795 return a;
2796 }
2797 zSig1 = 0;
2798 zSig0 = aSig + bSig;
2799 if ( aExp == 0 ) {
2800 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
2801 goto roundAndPack;
2802 }
2803 zExp = aExp;
2804 goto shiftRight1;
2805 }
2806
2807 zSig0 = aSig + bSig;
2808
2809 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
2810 shiftRight1:
2811 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
2812 zSig0 |= LIT64( 0x8000000000000000 );
2813 ++zExp;
2814 roundAndPack:
2815 return
2816 roundAndPackFloatx80(
2817 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
2818
2819}
2820
2821/*
2822-------------------------------------------------------------------------------
2823Returns the result of subtracting the absolute values of the extended
2824double-precision floating-point values `a' and `b'. If `zSign' is true,
2825the difference is negated before being returned. `zSign' is ignored if the
2826result is a NaN. The subtraction is performed according to the IEC/IEEE
2827Standard for Binary Floating-point Arithmetic.
2828-------------------------------------------------------------------------------
2829*/
2830static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
2831{
2832 int32 aExp, bExp, zExp;
2833 bits64 aSig, bSig, zSig0, zSig1;
2834 int32 expDiff;
2835 floatx80 z;
2836
2837 aSig = extractFloatx80Frac( a );
2838 aExp = extractFloatx80Exp( a );
2839 bSig = extractFloatx80Frac( b );
2840 bExp = extractFloatx80Exp( b );
2841 expDiff = aExp - bExp;
2842 if ( 0 < expDiff ) goto aExpBigger;
2843 if ( expDiff < 0 ) goto bExpBigger;
2844 if ( aExp == 0x7FFF ) {
2845 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
2846 return propagateFloatx80NaN( a, b );
2847 }
2848 float_raise( float_flag_invalid );
2849 z.low = floatx80_default_nan_low;
2850 z.high = floatx80_default_nan_high;
2851 return z;
2852 }
2853 if ( aExp == 0 ) {
2854 aExp = 1;
2855 bExp = 1;
2856 }
2857 zSig1 = 0;
2858 if ( bSig < aSig ) goto aBigger;
2859 if ( aSig < bSig ) goto bBigger;
2860 return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
2861 bExpBigger:
2862 if ( bExp == 0x7FFF ) {
2863 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2864 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
2865 }
2866 if ( aExp == 0 ) ++expDiff;
2867 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
2868 bBigger:
2869 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
2870 zExp = bExp;
2871 zSign ^= 1;
2872 goto normalizeRoundAndPack;
2873 aExpBigger:
2874 if ( aExp == 0x7FFF ) {
2875 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
2876 return a;
2877 }
2878 if ( bExp == 0 ) --expDiff;
2879 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
2880 aBigger:
2881 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
2882 zExp = aExp;
2883 normalizeRoundAndPack:
2884 return
2885 normalizeRoundAndPackFloatx80(
2886 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
2887
2888}
2889
2890/*
2891-------------------------------------------------------------------------------
2892Returns the result of adding the extended double-precision floating-point
2893values `a' and `b'. The operation is performed according to the IEC/IEEE
2894Standard for Binary Floating-point Arithmetic.
2895-------------------------------------------------------------------------------
2896*/
2897floatx80 floatx80_add( floatx80 a, floatx80 b )
2898{
2899 flag aSign, bSign;
2900
2901 aSign = extractFloatx80Sign( a );
2902 bSign = extractFloatx80Sign( b );
2903 if ( aSign == bSign ) {
2904 return addFloatx80Sigs( a, b, aSign );
2905 }
2906 else {
2907 return subFloatx80Sigs( a, b, aSign );
2908 }
2909
2910}
2911
2912/*
2913-------------------------------------------------------------------------------
2914Returns the result of subtracting the extended double-precision floating-
2915point values `a' and `b'. The operation is performed according to the
2916IEC/IEEE Standard for Binary Floating-point Arithmetic.
2917-------------------------------------------------------------------------------
2918*/
2919floatx80 floatx80_sub( floatx80 a, floatx80 b )
2920{
2921 flag aSign, bSign;
2922
2923 aSign = extractFloatx80Sign( a );
2924 bSign = extractFloatx80Sign( b );
2925 if ( aSign == bSign ) {
2926 return subFloatx80Sigs( a, b, aSign );
2927 }
2928 else {
2929 return addFloatx80Sigs( a, b, aSign );
2930 }
2931
2932}
2933
2934/*
2935-------------------------------------------------------------------------------
2936Returns the result of multiplying the extended double-precision floating-
2937point values `a' and `b'. The operation is performed according to the
2938IEC/IEEE Standard for Binary Floating-point Arithmetic.
2939-------------------------------------------------------------------------------
2940*/
2941floatx80 floatx80_mul( floatx80 a, floatx80 b )
2942{
2943 flag aSign, bSign, zSign;
2944 int32 aExp, bExp, zExp;
2945 bits64 aSig, bSig, zSig0, zSig1;
2946 floatx80 z;
2947
2948 aSig = extractFloatx80Frac( a );
2949 aExp = extractFloatx80Exp( a );
2950 aSign = extractFloatx80Sign( a );
2951 bSig = extractFloatx80Frac( b );
2952 bExp = extractFloatx80Exp( b );
2953 bSign = extractFloatx80Sign( b );
2954 zSign = aSign ^ bSign;
2955 if ( aExp == 0x7FFF ) {
2956 if ( (bits64) ( aSig<<1 )
2957 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
2958 return propagateFloatx80NaN( a, b );
2959 }
2960 if ( ( bExp | bSig ) == 0 ) goto invalid;
2961 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2962 }
2963 if ( bExp == 0x7FFF ) {
2964 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
2965 if ( ( aExp | aSig ) == 0 ) {
2966 invalid:
2967 float_raise( float_flag_invalid );
2968 z.low = floatx80_default_nan_low;
2969 z.high = floatx80_default_nan_high;
2970 return z;
2971 }
2972 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2973 }
2974 if ( aExp == 0 ) {
2975 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
2976 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
2977 }
2978 if ( bExp == 0 ) {
2979 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
2980 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
2981 }
2982 zExp = aExp + bExp - 0x3FFE;
2983 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2984 if ( 0 < (sbits64) zSig0 ) {
2985 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
2986 --zExp;
2987 }
2988 return
2989 roundAndPackFloatx80(
2990 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
2991
2992}
2993
2994/*
2995-------------------------------------------------------------------------------
2996Returns the result of dividing the extended double-precision floating-point
2997value `a' by the corresponding value `b'. The operation is performed
2998according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
2999-------------------------------------------------------------------------------
3000*/
3001floatx80 floatx80_div( floatx80 a, floatx80 b )
3002{
3003 flag aSign, bSign, zSign;
3004 int32 aExp, bExp, zExp;
3005 bits64 aSig, bSig, zSig0, zSig1;
3006 bits64 rem0, rem1, rem2, term0, term1, term2;
3007 floatx80 z;
3008
3009 aSig = extractFloatx80Frac( a );
3010 aExp = extractFloatx80Exp( a );
3011 aSign = extractFloatx80Sign( a );
3012 bSig = extractFloatx80Frac( b );
3013 bExp = extractFloatx80Exp( b );
3014 bSign = extractFloatx80Sign( b );
3015 zSign = aSign ^ bSign;
3016 if ( aExp == 0x7FFF ) {
3017 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3018 if ( bExp == 0x7FFF ) {
3019 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3020 goto invalid;
3021 }
3022 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3023 }
3024 if ( bExp == 0x7FFF ) {
3025 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3026 return packFloatx80( zSign, 0, 0 );
3027 }
3028 if ( bExp == 0 ) {
3029 if ( bSig == 0 ) {
3030 if ( ( aExp | aSig ) == 0 ) {
3031 invalid:
3032 float_raise( float_flag_invalid );
3033 z.low = floatx80_default_nan_low;
3034 z.high = floatx80_default_nan_high;
3035 return z;
3036 }
3037 float_raise( float_flag_divbyzero );
3038 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3039 }
3040 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3041 }
3042 if ( aExp == 0 ) {
3043 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3044 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3045 }
3046 zExp = aExp - bExp + 0x3FFE;
3047 rem1 = 0;
3048 if ( bSig <= aSig ) {
3049 shift128Right( aSig, 0, 1, &aSig, &rem1 );
3050 ++zExp;
3051 }
3052 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3053 mul64To128( bSig, zSig0, &term0, &term1 );
3054 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3055 while ( (sbits64) rem0 < 0 ) {
3056 --zSig0;
3057 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3058 }
3059 zSig1 = estimateDiv128To64( rem1, 0, bSig );
3060 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3061 mul64To128( bSig, zSig1, &term1, &term2 );
3062 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3063 while ( (sbits64) rem1 < 0 ) {
3064 --zSig1;
3065 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3066 }
3067 zSig1 |= ( ( rem1 | rem2 ) != 0 );
3068 }
3069 return
3070 roundAndPackFloatx80(
3071 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3072
3073}
3074
3075/*
3076-------------------------------------------------------------------------------
3077Returns the remainder of the extended double-precision floating-point value
3078`a' with respect to the corresponding value `b'. The operation is performed
3079according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3080-------------------------------------------------------------------------------
3081*/
3082floatx80 floatx80_rem( floatx80 a, floatx80 b )
3083{
3084 flag aSign, bSign, zSign;
3085 int32 aExp, bExp, expDiff;
3086 bits64 aSig0, aSig1, bSig;
3087 bits64 q, term0, term1, alternateASig0, alternateASig1;
3088 floatx80 z;
3089
3090 aSig0 = extractFloatx80Frac( a );
3091 aExp = extractFloatx80Exp( a );
3092 aSign = extractFloatx80Sign( a );
3093 bSig = extractFloatx80Frac( b );
3094 bExp = extractFloatx80Exp( b );
3095 bSign = extractFloatx80Sign( b );
3096 if ( aExp == 0x7FFF ) {
3097 if ( (bits64) ( aSig0<<1 )
3098 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3099 return propagateFloatx80NaN( a, b );
3100 }
3101 goto invalid;
3102 }
3103 if ( bExp == 0x7FFF ) {
3104 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3105 return a;
3106 }
3107 if ( bExp == 0 ) {
3108 if ( bSig == 0 ) {
3109 invalid:
3110 float_raise( float_flag_invalid );
3111 z.low = floatx80_default_nan_low;
3112 z.high = floatx80_default_nan_high;
3113 return z;
3114 }
3115 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3116 }
3117 if ( aExp == 0 ) {
3118 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3119 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3120 }
3121 bSig |= LIT64( 0x8000000000000000 );
3122 zSign = aSign;
3123 expDiff = aExp - bExp;
3124 aSig1 = 0;
3125 if ( expDiff < 0 ) {
3126 if ( expDiff < -1 ) return a;
3127 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3128 expDiff = 0;
3129 }
3130 q = ( bSig <= aSig0 );
3131 if ( q ) aSig0 -= bSig;
3132 expDiff -= 64;
3133 while ( 0 < expDiff ) {
3134 q = estimateDiv128To64( aSig0, aSig1, bSig );
3135 q = ( 2 < q ) ? q - 2 : 0;
3136 mul64To128( bSig, q, &term0, &term1 );
3137 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3138 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3139 expDiff -= 62;
3140 }
3141 expDiff += 64;
3142 if ( 0 < expDiff ) {
3143 q = estimateDiv128To64( aSig0, aSig1, bSig );
3144 q = ( 2 < q ) ? q - 2 : 0;
3145 q >>= 64 - expDiff;
3146 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3147 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3148 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3149 while ( le128( term0, term1, aSig0, aSig1 ) ) {
3150 ++q;
3151 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3152 }
3153 }
3154 else {
3155 term1 = 0;
3156 term0 = bSig;
3157 }
3158 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3159 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3160 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3161 && ( q & 1 ) )
3162 ) {
3163 aSig0 = alternateASig0;
3164 aSig1 = alternateASig1;
3165 zSign = ! zSign;
3166 }
3167 return
3168 normalizeRoundAndPackFloatx80(
3169 80, zSign, bExp + expDiff, aSig0, aSig1 );
3170
3171}
3172
3173/*
3174-------------------------------------------------------------------------------
3175Returns the square root of the extended double-precision floating-point
3176value `a'. The operation is performed according to the IEC/IEEE Standard
3177for Binary Floating-point Arithmetic.
3178-------------------------------------------------------------------------------
3179*/
3180floatx80 floatx80_sqrt( floatx80 a )
3181{
3182 flag aSign;
3183 int32 aExp, zExp;
3184 bits64 aSig0, aSig1, zSig0, zSig1;
3185 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3186 bits64 shiftedRem0, shiftedRem1;
3187 floatx80 z;
3188
3189 aSig0 = extractFloatx80Frac( a );
3190 aExp = extractFloatx80Exp( a );
3191 aSign = extractFloatx80Sign( a );
3192 if ( aExp == 0x7FFF ) {
3193 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3194 if ( ! aSign ) return a;
3195 goto invalid;
3196 }
3197 if ( aSign ) {
3198 if ( ( aExp | aSig0 ) == 0 ) return a;
3199 invalid:
3200 float_raise( float_flag_invalid );
3201 z.low = floatx80_default_nan_low;
3202 z.high = floatx80_default_nan_high;
3203 return z;
3204 }
3205 if ( aExp == 0 ) {
3206 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3207 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3208 }
3209 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3210 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3211 zSig0 <<= 31;
3212 aSig1 = 0;
3213 shift128Right( aSig0, 0, ( aExp & 1 ) + 2, &aSig0, &aSig1 );
3214 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0 ) + zSig0 + 4;
3215 if ( 0 <= (sbits64) zSig0 ) zSig0 = LIT64( 0xFFFFFFFFFFFFFFFF );
3216 shortShift128Left( aSig0, aSig1, 2, &aSig0, &aSig1 );
3217 mul64To128( zSig0, zSig0, &term0, &term1 );
3218 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3219 while ( (sbits64) rem0 < 0 ) {
3220 --zSig0;
3221 shortShift128Left( 0, zSig0, 1, &term0, &term1 );
3222 term1 |= 1;
3223 add128( rem0, rem1, term0, term1, &rem0, &rem1 );
3224 }
3225 shortShift128Left( rem0, rem1, 63, &shiftedRem0, &shiftedRem1 );
3226 zSig1 = estimateDiv128To64( shiftedRem0, shiftedRem1, zSig0 );
3227 if ( (bits64) ( zSig1<<1 ) <= 10 ) {
3228 if ( zSig1 == 0 ) zSig1 = 1;
3229 mul64To128( zSig0, zSig1, &term1, &term2 );
3230 shortShift128Left( term1, term2, 1, &term1, &term2 );
3231 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3232 mul64To128( zSig1, zSig1, &term2, &term3 );
3233 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3234 while ( (sbits64) rem1 < 0 ) {
3235 --zSig1;
3236 shortShift192Left( 0, zSig0, zSig1, 1, &term1, &term2, &term3 );
3237 term3 |= 1;
3238 add192(
3239 rem1, rem2, rem3, term1, term2, term3, &rem1, &rem2, &rem3 );
3240 }
3241 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3242 }
3243 return
3244 roundAndPackFloatx80(
3245 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
3246
3247}
3248
3249/*
3250-------------------------------------------------------------------------------
3251Returns 1 if the extended double-precision floating-point value `a' is
3252equal to the corresponding value `b', and 0 otherwise. The comparison is
3253performed according to the IEC/IEEE Standard for Binary Floating-point
3254Arithmetic.
3255-------------------------------------------------------------------------------
3256*/
3257flag floatx80_eq( floatx80 a, floatx80 b )
3258{
3259
3260 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3261 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3262 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3263 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3264 ) {
3265 if ( floatx80_is_signaling_nan( a )
3266 || floatx80_is_signaling_nan( b ) ) {
3267 float_raise( float_flag_invalid );
3268 }
3269 return 0;
3270 }
3271 return
3272 ( a.low == b.low )
3273 && ( ( a.high == b.high )
3274 || ( ( a.low == 0 )
3275 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3276 );
3277
3278}
3279
3280/*
3281-------------------------------------------------------------------------------
3282Returns 1 if the extended double-precision floating-point value `a' is
3283less than or equal to the corresponding value `b', and 0 otherwise. The
3284comparison is performed according to the IEC/IEEE Standard for Binary
3285Floating-point Arithmetic.
3286-------------------------------------------------------------------------------
3287*/
3288flag floatx80_le( floatx80 a, floatx80 b )
3289{
3290 flag aSign, bSign;
3291
3292 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3293 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3294 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3295 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3296 ) {
3297 float_raise( float_flag_invalid );
3298 return 0;
3299 }
3300 aSign = extractFloatx80Sign( a );
3301 bSign = extractFloatx80Sign( b );
3302 if ( aSign != bSign ) {
3303 return
3304 aSign
3305 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3306 == 0 );
3307 }
3308 return
3309 aSign ? le128( b.high, b.low, a.high, a.low )
3310 : le128( a.high, a.low, b.high, b.low );
3311
3312}
3313
3314/*
3315-------------------------------------------------------------------------------
3316Returns 1 if the extended double-precision floating-point value `a' is
3317less than the corresponding value `b', and 0 otherwise. The comparison
3318is performed according to the IEC/IEEE Standard for Binary Floating-point
3319Arithmetic.
3320-------------------------------------------------------------------------------
3321*/
3322flag floatx80_lt( floatx80 a, floatx80 b )
3323{
3324 flag aSign, bSign;
3325
3326 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3327 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3328 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3329 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3330 ) {
3331 float_raise( float_flag_invalid );
3332 return 0;
3333 }
3334 aSign = extractFloatx80Sign( a );
3335 bSign = extractFloatx80Sign( b );
3336 if ( aSign != bSign ) {
3337 return
3338 aSign
3339 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3340 != 0 );
3341 }
3342 return
3343 aSign ? lt128( b.high, b.low, a.high, a.low )
3344 : lt128( a.high, a.low, b.high, b.low );
3345
3346}
3347
3348/*
3349-------------------------------------------------------------------------------
3350Returns 1 if the extended double-precision floating-point value `a' is equal
3351to the corresponding value `b', and 0 otherwise. The invalid exception is
3352raised if either operand is a NaN. Otherwise, the comparison is performed
3353according to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3354-------------------------------------------------------------------------------
3355*/
3356flag floatx80_eq_signaling( floatx80 a, floatx80 b )
3357{
3358
3359 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3360 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3361 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3362 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3363 ) {
3364 float_raise( float_flag_invalid );
3365 return 0;
3366 }
3367 return
3368 ( a.low == b.low )
3369 && ( ( a.high == b.high )
3370 || ( ( a.low == 0 )
3371 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3372 );
3373
3374}
3375
3376/*
3377-------------------------------------------------------------------------------
3378Returns 1 if the extended double-precision floating-point value `a' is less
3379than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
3380do not cause an exception. Otherwise, the comparison is performed according
3381to the IEC/IEEE Standard for Binary Floating-point Arithmetic.
3382-------------------------------------------------------------------------------
3383*/
3384flag floatx80_le_quiet( floatx80 a, floatx80 b )
3385{
3386 flag aSign, bSign;
3387
3388 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3389 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3390 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3391 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3392 ) {
3393 if ( floatx80_is_signaling_nan( a )
3394 || floatx80_is_signaling_nan( b ) ) {
3395 float_raise( float_flag_invalid );
3396 }
3397 return 0;
3398 }
3399 aSign = extractFloatx80Sign( a );
3400 bSign = extractFloatx80Sign( b );
3401 if ( aSign != bSign ) {
3402 return
3403 aSign
3404 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3405 == 0 );
3406 }
3407 return
3408 aSign ? le128( b.high, b.low, a.high, a.low )
3409 : le128( a.high, a.low, b.high, b.low );
3410
3411}
3412
3413/*
3414-------------------------------------------------------------------------------
3415Returns 1 if the extended double-precision floating-point value `a' is less
3416than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
3417an exception. Otherwise, the comparison is performed according to the
3418IEC/IEEE Standard for Binary Floating-point Arithmetic.
3419-------------------------------------------------------------------------------
3420*/
3421flag floatx80_lt_quiet( floatx80 a, floatx80 b )
3422{
3423 flag aSign, bSign;
3424
3425 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3426 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3427 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3428 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3429 ) {
3430 if ( floatx80_is_signaling_nan( a )
3431 || floatx80_is_signaling_nan( b ) ) {
3432 float_raise( float_flag_invalid );
3433 }
3434 return 0;
3435 }
3436 aSign = extractFloatx80Sign( a );
3437 bSign = extractFloatx80Sign( b );
3438 if ( aSign != bSign ) {
3439 return
3440 aSign
3441 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3442 != 0 );
3443 }
3444 return
3445 aSign ? lt128( b.high, b.low, a.high, a.low )
3446 : lt128( a.high, a.low, b.high, b.low );
3447
3448}
3449
3450#endif
3451