sewardj | cc3d219 | 2013-03-27 11:37:33 +0000 | [diff] [blame] | 1 | |
| 2 | /*---------------------------------------------------------------*/ |
| 3 | /*--- begin host_generic_maddf.c ---*/ |
| 4 | /*---------------------------------------------------------------*/ |
| 5 | |
| 6 | /* |
| 7 | Compute x * y + z as ternary operation. |
| 8 | Copyright (C) 2010-2013 Free Software Foundation, Inc. |
| 9 | This file is part of the GNU C Library. |
| 10 | Contributed by Jakub Jelinek <jakub@redhat.com>, 2010. |
| 11 | |
| 12 | The GNU C Library is free software; you can redistribute it and/or |
| 13 | modify it under the terms of the GNU Lesser General Public |
| 14 | License as published by the Free Software Foundation; either |
| 15 | version 2.1 of the License, or (at your option) any later version. |
| 16 | |
| 17 | The GNU C Library is distributed in the hope that it will be useful, |
| 18 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 20 | Lesser General Public License for more details. |
| 21 | |
| 22 | You should have received a copy of the GNU Lesser General Public |
| 23 | License along with the GNU C Library; if not, see |
| 24 | <http://www.gnu.org/licenses/>. |
| 25 | */ |
| 26 | |
| 27 | /* Generic helper functions for doing FMA, i.e. compute x * y + z |
| 28 | as ternary operation. |
| 29 | These are purely back-end entities and cannot be seen/referenced |
| 30 | from IR. */ |
| 31 | |
| 32 | #include "libvex_basictypes.h" |
| 33 | #include "host_generic_maddf.h" |
| 34 | #include "main_util.h" |
| 35 | |
| 36 | /* This implementation relies on Double being more than twice as |
| 37 | precise as Float and uses rounding to odd in order to avoid problems |
| 38 | with double rounding. |
| 39 | See a paper by Boldo and Melquiond: |
| 40 | http://www.lri.fr/~melquion/doc/08-tc.pdf */ |
| 41 | |
/* Force X to be stored to memory now, so the compiler cannot sink the
   computation of X below a later read of the FP status flags (the "m"
   constraint makes the asm a consumer of X's memory image). */
#define FORCE_EVAL(X) __asm __volatile__ ("" : : "m" (X))

#if defined(__x86_64__) && defined(__SSE2_MATH__)
/* The saved FP environment is simply a copy of the MXCSR register. */
# define ENV_TYPE unsigned int
/* Save current rounding mode into ENV, hold exceptions, set rounding
   mode to rounding toward zero.
   MXCSR layout used here: bits 0..5 are the exception flags (inexact
   is bit 5), bits 7..12 the exception mask bits, bits 13..14 the
   rounding control.  ORing 0x7f80 sets all six mask bits (exceptions
   held, i.e. no traps) and forces RC = 11b = round-toward-zero;
   ANDing ~0x3f clears the six sticky exception flags. */
# define ROUNDTOZERO(env) \
   do { \
      unsigned int mxcsr; \
      __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr)); \
      (env) = mxcsr; \
      mxcsr = (mxcsr | 0x7f80) & ~0x3f; \
      __asm __volatile__ ("ldmxcsr %0" : : "m" (mxcsr));\
   } while (0)
/* Restore exceptions from ENV, return if inexact exception has been raised
   since ROUNDTOZERO.
   Evaluates to 1 iff MXCSR bit 5 (inexact) is set.  The written-back
   MXCSR is ENV's control bits ORed with the flags raised meanwhile
   (mask 0x3d keeps flag bits 0,2,3,4,5; NOTE(review): bit 1, the
   denormal flag, is deliberately(?) dropped — matches the glibc
   original, confirm before changing). */
# define RESET_TESTINEXACT(env) \
   ({ \
      unsigned int mxcsr, ret; \
      __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr)); \
      ret = (mxcsr >> 5) & 1; \
      mxcsr = (mxcsr & 0x3d) | (env); \
      __asm __volatile__ ("ldmxcsr %0" : : "m" (mxcsr));\
      ret; \
   })
/* Return if inexact exception has been raised since ROUNDTOZERO.
   Non-destructive: reads MXCSR bit 5 without altering the FP state. */
# define TESTINEXACT() \
   ({ \
      unsigned int mxcsr; \
      __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr)); \
      (mxcsr >> 5) & 1; \
   })
#endif /* defined(__x86_64__) && defined(__SSE2_MATH__) */
| 75 | |
/* Number of mantissa digits of a Double, counting the implicit
   leading 1 bit. */
#define DBL_MANT_DIG 53
/* Exponent bias of the IEEE754 double format (1023). */
#define IEEE754_DOUBLE_BIAS 0x3ff

/* Overlay giving direct read/write access to the sign, biased
   exponent and mantissa fields of a Double.  exponent == 0x7ff
   means Inf/NaN; exponent == 0 means zero/subnormal. */
union vg_ieee754_double {
   Double d;

   /* This is the IEEE 754 double-precision format. */
   struct {
#ifdef VKI_BIG_ENDIAN
      unsigned int negative:1;
      unsigned int exponent:11;
      unsigned int mantissa0:20;   /* high 20 mantissa bits */
      unsigned int mantissa1:32;   /* low 32 mantissa bits */
#else
      unsigned int mantissa1:32;   /* low 32 mantissa bits */
      unsigned int mantissa0:20;   /* high 20 mantissa bits */
      unsigned int exponent:11;
      unsigned int negative:1;
#endif
   } ieee;
};
| 97 | |
/* Compute *res = (*argX * *argY) + *argZ for Floats with a single
   final rounding, i.e. a correctly-rounded fused multiply-add.
   Works because Double has more than twice Float's precision: the
   product of two Floats is exact in a Double, and the addition is
   done with round-to-odd so the concluding Double->Float conversion
   cannot suffer double rounding (Boldo/Melquiond construction). */
void VEX_REGPARM(3)
h_generic_calc_MAddF32 ( /*OUT*/Float* res,
                         Float* argX, Float* argY, Float* argZ )
{
#ifndef ENV_TYPE
   /* Lame fallback implementation: two roundings, so not a true FMA. */
   *res = *argX * *argY + *argZ;
#else
   ENV_TYPE env;
   /* Multiplication is always exact (24+24 significand bits fit in 53). */
   Double temp = (Double) *argX * (Double) *argY;
   union vg_ieee754_double u;

   ROUNDTOZERO (env);

   /* Perform addition with round to odd: truncate, then set the low
      mantissa bit below if the truncation was inexact. */
   u.d = temp + (Double) *argZ;
   /* Ensure the addition is not scheduled after fetestexcept call. */
   FORCE_EVAL (u.d);

   /* Reset rounding mode and test for inexact simultaneously. */
   int j = RESET_TESTINEXACT (env);

   /* Round-to-odd step: force the LSB to 1 when the truncated sum was
      inexact, except for Inf/NaN (exponent 0x7ff must stay intact). */
   if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
      u.ieee.mantissa1 |= j;

   /* And finally truncation with round to nearest. */
   *res = (Float) u.d;
#endif
}
| 128 | |
| 129 | |
/* Compute *res = (*argX * *argY) + *argZ for Doubles with a single
   rounding (true fused multiply-add).  With no wider type available,
   the product is split exactly via Dekker's algorithm and the sum via
   Knuth's two-sum; the low-order parts are combined with round-to-odd
   to avoid double rounding.  Operands whose exponents are near
   overflow/underflow are first rescaled by 2^-53 or 2^106 and the
   result compensated afterwards; 'adjust' records which compensation
   applies (+1: scale result up by 2^53, -1: scale down by 2^106). */
void VEX_REGPARM(3)
h_generic_calc_MAddF64 ( /*OUT*/Double* res,
                         Double* argX, Double* argY, Double* argZ )
{
#ifndef ENV_TYPE
   /* Lame fallback implementation: two roundings, so not a true FMA. */
   *res = *argX * *argY + *argZ;
#else
   Double x = *argX, y = *argY, z = *argZ;
   union vg_ieee754_double u, v, w;
   int adjust = 0;
   u.d = x;
   v.d = y;
   w.d = z;
   /* Slow path: some operand is Inf/NaN, or exponents are so extreme
      that the main algorithm could overflow or lose low bits. */
   if (UNLIKELY (u.ieee.exponent + v.ieee.exponent
                 >= 0x7ff + IEEE754_DOUBLE_BIAS - DBL_MANT_DIG)
       || UNLIKELY (u.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
       || UNLIKELY (v.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
       || UNLIKELY (w.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
       || UNLIKELY (u.ieee.exponent + v.ieee.exponent
                    <= IEEE754_DOUBLE_BIAS + DBL_MANT_DIG)) {
      /* If z is Inf, but x and y are finite, the result should be
         z rather than NaN. */
      if (w.ieee.exponent == 0x7ff
          && u.ieee.exponent != 0x7ff
          && v.ieee.exponent != 0x7ff) {
         *res = (z + x) + y;
         return;
      }
      /* If x or y or z is Inf/NaN, or if fma will certainly overflow,
         or if x * y is less than half of DBL_DENORM_MIN,
         compute as x * y + z. */
      if (u.ieee.exponent == 0x7ff
          || v.ieee.exponent == 0x7ff
          || w.ieee.exponent == 0x7ff
          || u.ieee.exponent + v.ieee.exponent > 0x7ff + IEEE754_DOUBLE_BIAS
          || u.ieee.exponent + v.ieee.exponent
             < IEEE754_DOUBLE_BIAS - DBL_MANT_DIG - 2) {
         *res = x * y + z;
         return;
      }
      if (u.ieee.exponent + v.ieee.exponent
          >= 0x7ff + IEEE754_DOUBLE_BIAS - DBL_MANT_DIG) {
         /* Compute 1p-53 times smaller result and multiply
            at the end. */
         if (u.ieee.exponent > v.ieee.exponent)
            u.ieee.exponent -= DBL_MANT_DIG;
         else
            v.ieee.exponent -= DBL_MANT_DIG;
         /* If x + y exponent is very large and z exponent is very small,
            it doesn't matter if we don't adjust it. */
         if (w.ieee.exponent > DBL_MANT_DIG)
            w.ieee.exponent -= DBL_MANT_DIG;
         adjust = 1;
      } else if (w.ieee.exponent >= 0x7ff - DBL_MANT_DIG) {
         /* Similarly.
            If z exponent is very large and x and y exponents are
            very small, it doesn't matter if we don't adjust it. */
         if (u.ieee.exponent > v.ieee.exponent) {
            if (u.ieee.exponent > DBL_MANT_DIG)
               u.ieee.exponent -= DBL_MANT_DIG;
         } else if (v.ieee.exponent > DBL_MANT_DIG)
            v.ieee.exponent -= DBL_MANT_DIG;
         w.ieee.exponent -= DBL_MANT_DIG;
         adjust = 1;
      } else if (u.ieee.exponent >= 0x7ff - DBL_MANT_DIG) {
         /* x near overflow: move a factor of 2^53 from x into y.
            If y's exponent field is 0 (zero/subnormal) it cannot be
            bumped directly, so scale with a multiply instead. */
         u.ieee.exponent -= DBL_MANT_DIG;
         if (v.ieee.exponent)
            v.ieee.exponent += DBL_MANT_DIG;
         else
            v.d *= 0x1p53;
      } else if (v.ieee.exponent >= 0x7ff - DBL_MANT_DIG) {
         /* Mirror image: y near overflow, shift the factor into x. */
         v.ieee.exponent -= DBL_MANT_DIG;
         if (u.ieee.exponent)
            u.ieee.exponent += DBL_MANT_DIG;
         else
            u.d *= 0x1p53;
      } else /* if (u.ieee.exponent + v.ieee.exponent
                   <= IEEE754_DOUBLE_BIAS + DBL_MANT_DIG) */ {
         /* x * y is tiny: scale the product up by 2^106 (and z too,
            when z is small enough to matter) to keep the intermediate
            sums out of the subnormal range. */
         if (u.ieee.exponent > v.ieee.exponent)
            u.ieee.exponent += 2 * DBL_MANT_DIG;
         else
            v.ieee.exponent += 2 * DBL_MANT_DIG;
         if (w.ieee.exponent <= 4 * DBL_MANT_DIG + 4) {
            if (w.ieee.exponent)
               w.ieee.exponent += 2 * DBL_MANT_DIG;
            else
               w.d *= 0x1p106;
            adjust = -1;
         }
         /* Otherwise x * y should just affect inexact
            and nothing else. */
      }
      x = u.d;
      y = v.d;
      z = w.d;
   }
   /* Multiplication m1 + m2 = x * y using Dekker's algorithm:
      split x and y into high/low halves, then m1 + m2 == x * y
      exactly. */
# define C ((1 << (DBL_MANT_DIG + 1) / 2) + 1)
   Double x1 = x * C;
   Double y1 = y * C;
   Double m1 = x * y;
   x1 = (x - x1) + x1;   /* high half of x */
   y1 = (y - y1) + y1;   /* high half of y */
   Double x2 = x - x1;   /* low half: x == x1 + x2 exactly */
   Double y2 = y - y1;   /* low half: y == y1 + y2 exactly */
   Double m2 = (((x1 * y1 - m1) + x1 * y2) + x2 * y1) + x2 * y2;
# undef C

   /* Addition a1 + a2 = z + m1 using Knuth's algorithm
      (two-sum: a1 + a2 == z + m1 exactly). */
   Double a1 = z + m1;
   Double t1 = a1 - z;
   Double t2 = a1 - t1;
   t1 = m1 - t1;
   t2 = z - t2;
   Double a2 = t1 + t2;

   ENV_TYPE env;
   ROUNDTOZERO (env);

   /* Perform m2 + a2 addition with round to odd (LSB forced below if
      the truncated sum turns out inexact). */
   u.d = a2 + m2;

   if (UNLIKELY (adjust < 0)) {
      if ((u.ieee.mantissa1 & 1) == 0)
         u.ieee.mantissa1 |= TESTINEXACT ();
      /* Pre-compute the 2^106-scaled result while still rounding
         toward zero; used in the adjust < 0 branch below. */
      v.d = a1 + u.d;
      /* Ensure the addition is not scheduled after fetestexcept call. */
      FORCE_EVAL (v.d);
   }

   /* Reset rounding mode and test for inexact simultaneously. */
   int j = RESET_TESTINEXACT (env) != 0;

   if (LIKELY (adjust == 0)) {
      /* Round-to-odd: set LSB if inexact, except on Inf/NaN. */
      if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
         u.ieee.mantissa1 |= j;
      /* Result is a1 + u.d. */
      *res = a1 + u.d;
   } else if (LIKELY (adjust > 0)) {
      if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
         u.ieee.mantissa1 |= j;
      /* Result is a1 + u.d, scaled up. */
      *res = (a1 + u.d) * 0x1p53;
   } else {
      /* adjust < 0: result must be scaled DOWN by 2^106; the scaling
         itself can round into the subnormal range, so double rounding
         has to be headed off case by case. */
      /* If a1 + u.d is exact, the only rounding happens during
         scaling down. */
      if (j == 0) {
         *res = v.d * 0x1p-106;
         return;
      }
      /* If result rounded to zero is not subnormal, no double
         rounding will occur. */
      if (v.ieee.exponent > 106) {
         *res = (a1 + u.d) * 0x1p-106;
         return;
      }
      /* If v.d * 0x1p-106 with round to zero is a subnormal above
         or equal to DBL_MIN / 2, then v.d * 0x1p-106 shifts mantissa
         down just by 1 bit, which means v.ieee.mantissa1 |= j would
         change the round bit, not sticky or guard bit.
         v.d * 0x1p-106 never normalizes by shifting up,
         so round bit plus sticky bit should be already enough
         for proper rounding. */
      if (v.ieee.exponent == 106) {
         /* v.ieee.mantissa1 & 2 is LSB bit of the result before rounding,
            v.ieee.mantissa1 & 1 is the round bit and j is our sticky
            bit. In round-to-nearest 001 rounds down like 00,
            011 rounds up, even though 01 rounds down (thus we need
            to adjust), 101 rounds down like 10 and 111 rounds up
            like 11. */
         if ((v.ieee.mantissa1 & 3) == 1) {
            v.d *= 0x1p-106;
            /* Nudge by one ulp of the subnormal range (2^-1074)
               toward the rounded-up value. */
            if (v.ieee.negative)
               *res = v.d - 0x1p-1074;
            else
               *res = v.d + 0x1p-1074;
         } else
            *res = v.d * 0x1p-106;
         return;
      }
      /* Deep subnormal: fold the sticky bit in before scaling. */
      v.ieee.mantissa1 |= j;
      *res = v.d * 0x1p-106;
      return;
   }
#endif
}
| 317 | |
| 318 | /*---------------------------------------------------------------*/ |
| 319 | /*--- end host_generic_maddf.c --*/ |
| 320 | /*---------------------------------------------------------------*/ |