arcsin/arccos/arctan precision fix

The dEQP tests for arccos/arcsin/arctan were failing due to
the precision of the arcsin/arccos/arctan approximation being
too low. It is possible to significantly improve this precision
with a few more terms, which is done here.

All arccos/arcsin/arctan dEQP tests pass.

Change-Id: I612551b48bdab03b81b2e72ddb8cd7f8502231c5
Reviewed-on: https://swiftshader-review.googlesource.com/13568
Tested-by: Alexis Hétu <sugoi@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/src/Shader/ShaderCore.cpp b/src/Shader/ShaderCore.cpp
index 017d0f8..ec159fd 100644
--- a/src/Shader/ShaderCore.cpp
+++ b/src/Shader/ShaderCore.cpp
@@ -339,51 +339,95 @@
 
 	Float4 arcsin(RValue<Float4> x, bool pp)
 	{
-		// x*(pi/2-sqrt(1-x*x)*pi/5)
-		return x * (Float4(1.57079632e+0f) - Sqrt(Float4(1.0f) - x*x) * Float4(6.28318531e-1f));
+		if(false) // Simpler implementation fails even lowp precision tests
+		{
+			// x*(pi/2-sqrt(1-x*x)*pi/5)
+			return x * (Float4(1.57079632e+0f) - Sqrt(Float4(1.0f) - x*x) * Float4(6.28318531e-1f));
+		}
+		else
+		{
+			// From 4.4.45, page 81 of the Handbook of Mathematical Functions, by Milton Abramowitz and Irene Stegun
+			const Float4 half_pi(1.57079632f);
+			const Float4 a0(1.5707288f);
+			const Float4 a1(-0.2121144f);
+			const Float4 a2(0.0742610f);
+			const Float4 a3(-0.0187293f);
+			Float4 absx = Abs(x);
+			return As<Float4>(As<Int4>(half_pi - Sqrt(Float4(1.0f) - absx) * (a0 + absx * (a1 + absx * (a2 + absx * a3)))) ^
+			       (As<Int4>(x) & Int4(0x80000000)));
+		}
+	}
+
+	// Approximation of atan in [0..1]
+	Float4 arctan_01(Float4 x, bool pp)
+	{
+		if(pp)
+		{
+			return x * (Float4(-0.27f) * x + Float4(1.05539816f));
+		}
+		else
+		{
+			// From 4.4.49, page 81 of the Handbook of Mathematical Functions, by Milton Abramowitz and Irene Stegun
+			const Float4 a2(-0.3333314528f);
+			const Float4 a4(0.1999355085f);
+			const Float4 a6(-0.1420889944f);
+			const Float4 a8(0.1065626393f);
+			const Float4 a10(-0.0752896400f);
+			const Float4 a12(0.0429096138f);
+			const Float4 a14(-0.0161657367f);
+			const Float4 a16(0.0028662257f);
+			Float4 x2 = x * x;
+			return (x + x * (x2 * (a2 + x2 * (a4 + x2 * (a6 + x2 * (a8 + x2 * (a10 + x2 * (a12 + x2 * (a14 + x2 * a16)))))))));
+		}
 	}
 
 	Float4 arctan(RValue<Float4> x, bool pp)
 	{
-		Int4 O = CmpNLT(Abs(x), Float4(1.0f));
-		Float4 y = As<Float4>((O & As<Int4>(Float4(1.0f) / x)) | (~O & As<Int4>(x)));   // FIXME: Vector select
+		Float4 absx = Abs(x);
+		Int4 O = CmpNLT(absx, Float4(1.0f));
+		Float4 y = As<Float4>((O & As<Int4>(Float4(1.0f) / absx)) | (~O & As<Int4>(absx))); // FIXME: Vector select
 
-		// Approximation of atan in [-1..1]
-		Float4 theta = y * (Float4(-0.27f) * Abs(y) + Float4(1.05539816f));
-
-		// +/-pi/2 depending on sign of x
-		Float4 sgnPi_2 = As<Float4>(As<Int4>(Float4(1.57079632e+0f)) ^ (As<Int4>(x) & Int4(0x80000000)));
-
-		theta = As<Float4>((O & As<Int4>(sgnPi_2 - theta)) | (~O & As<Int4>(theta)));   // FIXME: Vector select
-
-		return theta;
+		const Float4 half_pi(1.57079632f);
+		Float4 theta = arctan_01(y, pp);
+		return As<Float4>(((O & As<Int4>(half_pi - theta)) | (~O & As<Int4>(theta))) ^ // FIXME: Vector select
+		       (As<Int4>(x) & Int4(0x80000000)));
 	}
 
 	Float4 arctan(RValue<Float4> y, RValue<Float4> x, bool pp)
 	{
+		const Float4 pi(3.14159265f);            // pi
+		const Float4 minus_pi(-3.14159265f);     // -pi
+		const Float4 half_pi(1.57079632f);       // pi/2
+		const Float4 quarter_pi(7.85398163e-1f); // pi/4
+
 		// Rotate to upper semicircle when in lower semicircle
 		Int4 S = CmpLT(y, Float4(0.0f));
-		Float4 theta = As<Float4>(S & As<Int4>(Float4(-3.14159265e+0f)));   // -pi
+		Float4 theta = As<Float4>(S & As<Int4>(minus_pi));
 		Float4 x0 = As<Float4>((As<Int4>(y) & Int4(0x80000000)) ^ As<Int4>(x));
 		Float4 y0 = Abs(y);
 
 		// Rotate to right quadrant when in left quadrant
-		Int4 Q = CmpLT(x0, Float4(0.0f));
-		theta += As<Float4>(Q & As<Int4>(Float4(1.57079632e+0f)));   // pi/2
-		Float4 x1 = As<Float4>((Q & As<Int4>(y0)) | (~Q & As<Int4>(x0)));    // FIXME: Vector select
-		Float4 y1 = As<Float4>((Q & As<Int4>(-x0)) | (~Q & As<Int4>(y0)));   // FIXME: Vector select
+		Int4 non_zero_y = CmpNEQ(y0, Float4(0.0f));
+		Int4 Q = CmpLT(x0, Float4(0.0f)) & non_zero_y;
+		theta += As<Float4>(Q & As<Int4>(half_pi));
+		Float4 x1 = As<Float4>((Q & As<Int4>(y0)) | (~Q & As<Int4>(x0)));  // FIXME: Vector select
+		Float4 y1 = As<Float4>((Q & As<Int4>(-x0)) | (~Q & As<Int4>(y0))); // FIXME: Vector select
 
-		// Rotate to first octant when in second octant
-		Int4 O = CmpNLT(y1, x1);
-		theta += As<Float4>(O & As<Int4>(Float4(7.85398163e-1f)));   // pi/4
-		Float4 x2 = As<Float4>((O & As<Int4>(Float4(7.07106781e-1f) * x1 + Float4(7.07106781e-1f) * y1)) | (~O & As<Int4>(x1)));   // sqrt(2)/2   // FIXME: Vector select
-		Float4 y2 = As<Float4>((O & As<Int4>(Float4(7.07106781e-1f) * y1 - Float4(7.07106781e-1f) * x1)) | (~O & As<Int4>(y1)));   // FIXME: Vector select
+		// Mirror to first octant when in second octant
+		Int4 O = CmpNLT(y1, x1) & non_zero_y;
+		Float4 x2 = As<Float4>((O & As<Int4>(y1)) | (~O & As<Int4>(x1))); // FIXME: Vector select
+		Float4 y2 = As<Float4>((O & As<Int4>(x1)) | (~O & As<Int4>(y1))); // FIXME: Vector select
 
 		// Approximation of atan in [0..1]
-		Float4 y_x = y2 / x2;
-		theta += y_x * (Float4(-0.27f) * y_x + Float4(1.05539816f));
+		Int4 zero_x = CmpEQ(x2, Float4(0.0f));
+		Int4 inf_y = IsInf(y2); // Since x2 >= y2, this means x2 == y2 == inf, so we use 45 degrees or pi/4
+		Float4 atan2_theta = arctan_01(y2 / x2, pp);
+		theta += As<Float4>((~zero_x & ~inf_y & non_zero_y & ((O & As<Int4>(half_pi - atan2_theta)) | (~O & (As<Int4>(atan2_theta))))) | // FIXME: Vector select
+		                    (inf_y & As<Int4>(quarter_pi)));
 
-		return theta;
+		// Recover loss of precision for tiny theta angles
+		Int4 precision_loss = S & Q & O & ~inf_y; // This combination results in (-pi + half_pi + half_pi - atan2_theta) which is equivalent to -atan2_theta
+		return As<Float4>((precision_loss & As<Int4>(-atan2_theta)) | (~precision_loss & As<Int4>(theta))); // FIXME: Vector select
 	}
 
 	Float4 sineh(RValue<Float4> x, bool pp)