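Use the LLVM fabs intrinsic.

Define fabs() for float and for 2-, 3- and 4-element float vectors in
math.ll on top of llvm.fabs.*, replacing the FN_FUNC_FN(fabs) expansion in
rs_cl.c with plain extern declarations. Also mark the existing sqrt and pow
wrappers nounwind readnone alwaysinline.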

Change-Id: I7e593ec2306305bc510c0a18ebc697b18c5992dc
diff --git a/lib/Renderscript/runtime/math.ll b/lib/Renderscript/runtime/math.ll
index 4ea2b10..dd4dc4b 100644
--- a/lib/Renderscript/runtime/math.ll
+++ b/lib/Renderscript/runtime/math.ll
@@ -3,14 +3,35 @@
 
 declare float @llvm.sqrt.f32(float)
 declare float @llvm.pow.f32(float, float)
+declare float @llvm.fabs.f32(float) ; fabs intrinsics (scalar, then 2/3/4-wide float vectors) called by the _Z4fabs* wrappers in this file
+declare <2 x float> @llvm.fabs.v2f32(<2 x float>)
+declare <3 x float> @llvm.fabs.v3f32(<3 x float>)
+declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
 
-define float @_Z4sqrtf(float %v) {
+define float @_Z4sqrtf(float %v) nounwind readnone alwaysinline { ; returns llvm.sqrt.f32(%v) unchanged
   %1 = tail call float @llvm.sqrt.f32(float %v)
   ret float %1
 }
 
-define float @_Z3powf(float %v1, float %v2) {
+define float @_Z3powf(float %v1, float %v2) nounwind readnone alwaysinline { ; returns llvm.pow.f32(%v1, %v2) unchanged
   %1 = tail call float @llvm.pow.f32(float  %v1, float %v2)
   ret float %1
 }
 
+define float @_Z4fabsf(float %v) nounwind readnone alwaysinline { ; scalar float fabs: forwards %v to llvm.fabs.f32
+  %1 = tail call float @llvm.fabs.f32(float %v)
+  ret float %1
+}
+define <2 x float> @_Z4fabsDv2_f(<2 x float> %v) nounwind readnone alwaysinline { ; <2 x float> fabs: forwards %v to llvm.fabs.v2f32
+  %1 = tail call <2 x float> @llvm.fabs.v2f32(<2 x float> %v)
+  ret <2 x float> %1
+}
+define <3 x float> @_Z4fabsDv3_f(<3 x float> %v) nounwind readnone alwaysinline { ; <3 x float> fabs: forwards %v to llvm.fabs.v3f32
+  %1 = tail call <3 x float> @llvm.fabs.v3f32(<3 x float> %v)
+  ret <3 x float> %1
+}
+define <4 x float> @_Z4fabsDv4_f(<4 x float> %v) nounwind readnone alwaysinline { ; <4 x float> fabs: forwards %v to llvm.fabs.v4f32
+  %1 = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> %v)
+  ret <4 x float> %1
+}
+
diff --git a/lib/Renderscript/runtime/rs_cl.c b/lib/Renderscript/runtime/rs_cl.c
index b6c2b6a..858161d 100644
--- a/lib/Renderscript/runtime/rs_cl.c
+++ b/lib/Renderscript/runtime/rs_cl.c
@@ -404,8 +404,10 @@
 extern float __attribute__((overloadable)) expm1(float);
 FN_FUNC_FN(expm1)
 
-extern float __attribute__((overloadable)) fabs(float);
-FN_FUNC_FN(fabs)
+extern float __attribute__((overloadable)) fabs(float v); /* declarations only, no FN_FUNC_FN expansion; NOTE(review): presumably resolved by the _Z4fabs* definitions in runtime/math.ll - confirm */
+extern float2 __attribute__((overloadable)) fabs(float2 v);
+extern float3 __attribute__((overloadable)) fabs(float3 v);
+extern float4 __attribute__((overloadable)) fabs(float4 v);
 
 extern float __attribute__((overloadable)) fdim(float, float);
 FN_FUNC_FN_FN(fdim)