Merge "Fix incorrect dependence on the system-built liblog."
diff --git a/api/rs_core_math.spec b/api/rs_core_math.spec
index d83b0d9..dcfefe2 100644
--- a/api/rs_core_math.spec
+++ b/api/rs_core_math.spec
@@ -1294,7 +1294,7 @@
  Return the approximate reciprocal of a value.
 version: 17
 # TODO enable once precision is improved
-test: noverify
+#test: noverify
 end:
 
 start:
diff --git a/driver/runtime/arch/generic.c b/driver/runtime/arch/generic.c
index 79dca97..58dbb28 100644
--- a/driver/runtime/arch/generic.c
+++ b/driver/runtime/arch/generic.c
@@ -773,68 +773,19 @@
  * half_RECIP
  */
 
-extern float __attribute__((overloadable)) half_recip(float v) {
-    // FIXME:  actual algorithm for generic approximate reciprocal
-    return 1.f / v;
-}
-
 extern float2 __attribute__((overloadable)) half_recip(float2 v) {
-    float2 r;
-    r.x = half_recip(r.x);
-    r.y = half_recip(r.y);
-    return r;
+    return ((float2) 1.f) / v;  // divides by the argument v; the removed lines read the uninitialized local r instead
 }
 
 extern float3 __attribute__((overloadable)) half_recip(float3 v) {
-    float3 r;
-    r.x = half_recip(r.x);
-    r.y = half_recip(r.y);
-    r.z = half_recip(r.z);
-    return r;
+    return ((float3) 1.f) / v;  // divides by the argument v; the removed lines read the uninitialized local r instead
 }
 
 extern float4 __attribute__((overloadable)) half_recip(float4 v) {
-    float4 r;
-    r.x = half_recip(r.x);
-    r.y = half_recip(r.y);
-    r.z = half_recip(r.z);
-    r.w = half_recip(r.w);
-    return r;
+    return ((float4) 1.f) / v;  // divides by the argument v; the removed lines read the uninitialized local r instead
 }
 
 
-/*
- * half_SQRT
- */
-
-extern float __attribute__((overloadable)) half_sqrt(float v) {
-    return sqrt(v);
-}
-
-extern float2 __attribute__((overloadable)) half_sqrt(float2 v) {
-    float2 r;
-    r.x = half_sqrt(v.x);
-    r.y = half_sqrt(v.y);
-    return r;
-}
-
-extern float3 __attribute__((overloadable)) half_sqrt(float3 v) {
-    float3 r;
-    r.x = half_sqrt(v.x);
-    r.y = half_sqrt(v.y);
-    r.z = half_sqrt(v.z);
-    return r;
-}
-
-extern float4 __attribute__((overloadable)) half_sqrt(float4 v) {
-    float4 r;
-    r.x = half_sqrt(v.x);
-    r.y = half_sqrt(v.y);
-    r.z = half_sqrt(v.z);
-    r.w = half_sqrt(v.w);
-    return r;
-}
-
 
 /*
  * half_rsqrt
diff --git a/driver/runtime/arch/neon.ll b/driver/runtime/arch/neon.ll
index 66b253f..090ddbc 100644
--- a/driver/runtime/arch/neon.ll
+++ b/driver/runtime/arch/neon.ll
@@ -41,6 +41,12 @@
 declare <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float>) nounwind readnone
 declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone
 
+declare <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone
+
+declare <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;;;;;;;;                HELPERS                 ;;;;;;;;;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -876,60 +882,29 @@
 ;;;;;;;;;              half_RECIP              ;;;;;;;;;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-define float @_Z10half_recipf(float %v) {
-  %1 = insertelement <2 x float> undef, float %v, i32 0
-  %2 = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %1) nounwind readnone
-  %3 = extractelement <2 x float> %2, i32 0
-  ret float %3
-}
-
 define <2 x float> @_Z10half_recipDv2_f(<2 x float> %v) nounwind readnone {
   %1 = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %v) nounwind readnone
-  ret <2 x float> %1
-}
-
-define <3 x float> @_Z10half_recipDv3_f(<3 x float> %v) nounwind readnone {
-  %1 = shufflevector <3 x float> %v, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %2 = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %1) nounwind readnone
-  %3 = shufflevector <4 x float> %2, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  ret <3 x float> %3
+  %2 = tail call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> %1, <2 x float> %v) nounwind readnone ; step factor from the vrecpe estimate %1 and the input; ARM documents VRECPS as 2 - a*b (TODO confirm)
+  %3 = fmul <2 x float> %1, %2 ; estimate after the first vrecps step
+  %4 = tail call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> %3, <2 x float> %v) nounwind readnone ; second step factor, computed from the refined estimate %3
+  %5 = fmul <2 x float> %4, %3 ; estimate after the second vrecps step; this is the value returned
+  ret <2 x float> %5
 }
 
 define <4 x float> @_Z10half_recipDv4_f(<4 x float> %v) nounwind readnone {
   %1 = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %v) nounwind readnone
-  ret <4 x float> %1
+  %2 = tail call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %1, <4 x float> %v) nounwind readnone ; step factor from the vrecpe estimate %1 and the input; ARM documents VRECPS as 2 - a*b (TODO confirm)
+  %3 = fmul <4 x float> %1, %2 ; estimate after the first vrecps step
+  %4 = tail call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %3, <4 x float> %v) nounwind readnone ; second step factor, computed from the refined estimate %3
+  %5 = fmul <4 x float> %4, %3 ; estimate after the second vrecps step; this is the value returned
+  ret <4 x float> %5
 }
 
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;;;;;;;              half_SQRT               ;;;;;;;;;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-define float @_Z9half_sqrtf(float %v) {
-  %1 = insertelement <2 x float> undef, float %v, i32 0
-  %2 = tail call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %1) nounwind readnone
-  %3 = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %2) nounwind readnone
-  %4 = extractelement <2 x float> %3, i32 0
-  ret float %4
-}
-
-define <2 x float> @_Z9half_sqrtDv2_f(<2 x float> %v) nounwind readnone {
-  %1 = tail call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %v) nounwind readnone
-  %2 = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %1) nounwind readnone
-  ret <2 x float> %2
-}
-
-define <3 x float> @_Z9half_sqrtDv3_f(<3 x float> %v) nounwind readnone {
-define <3 x float> @_Z9half_sqrtDv3_f(<3 x float> %v) nounwind readnone {
+define <3 x float> @_Z10half_recipDv3_f(<3 x float> %v) nounwind readnone {
   %1 = shufflevector <3 x float> %v, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %2 = tail call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %1) nounwind readnone
-  %3 = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %2) nounwind readnone
-  %4 = shufflevector <4 x float> %3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  ret <3 x float> %4
-}
-
-define <4 x float> @_Z9half_sqrtDv4_f(<4 x float> %v) nounwind readnone {
-  %1 = tail call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %v) nounwind readnone
-  %2 = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %1) nounwind readnone
-  ret <4 x float> %2
+  %2 = tail call <4 x float> @_Z10half_recipDv4_f(<4 x float> %1) nounwind readnone ; delegates to the 4-wide half_recip; lane 3 of %1 is taken from the undef operand
+  %3 = shufflevector <4 x float> %2, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> ; keeps lanes 0-2 only, discarding the padding lane
+  ret <3 x float> %3
 }
 
 
diff --git a/driver/runtime/rs_cl.c b/driver/runtime/rs_cl.c
index d9b8f3c..3637785 100644
--- a/driver/runtime/rs_cl.c
+++ b/driver/runtime/rs_cl.c
@@ -997,7 +997,10 @@
     return l == 0.0f ? v : v / l;
 }
 
-extern float __attribute__((overloadable)) half_sqrt(float);
+extern float __attribute__((overloadable)) half_sqrt(float v) {
+    return sqrt(v);  // scalar half_sqrt simply forwards to sqrt()
+}
+FN_FUNC_FN(half_sqrt)  // macro defined outside this hunk; presumably generates the float2/3/4 overloads from the scalar one - TODO confirm
 
 extern float __attribute__((overloadable)) fast_length(float v) {
     return fabs(v);
@@ -1053,7 +1056,9 @@
     return (rlength == rlength) ? v * rlength : v;
 }
 
-extern float __attribute__((overloadable)) half_recip(float);
+extern float __attribute__((overloadable)) half_recip(float v) {
+    return 1.f / v;  // scalar half_recip is a plain float division; this definition replaces the bare declaration removed above
+}
 
 /*
 extern float __attribute__((overloadable)) approx_atan(float x) {