Merge "Fix incorrect dependence on the system-built liblog."
diff --git a/api/rs_core_math.spec b/api/rs_core_math.spec
index d83b0d9..dcfefe2 100644
--- a/api/rs_core_math.spec
+++ b/api/rs_core_math.spec
@@ -1294,7 +1294,7 @@
Return the approximate reciprocal of a value.
version: 17
# TODO enable once precision is improved
-test: noverify
+#test: noverify
end:
start:
diff --git a/driver/runtime/arch/generic.c b/driver/runtime/arch/generic.c
index 79dca97..58dbb28 100644
--- a/driver/runtime/arch/generic.c
+++ b/driver/runtime/arch/generic.c
@@ -773,68 +773,19 @@
* half_RECIP
*/
-extern float __attribute__((overloadable)) half_recip(float v) {
- // FIXME: actual algorithm for generic approximate reciprocal
- return 1.f / v;
-}
-
extern float2 __attribute__((overloadable)) half_recip(float2 v) {
- float2 r;
- r.x = half_recip(r.x);
- r.y = half_recip(r.y);
- return r;
+ return ((float2) 1.f) / v;
}
extern float3 __attribute__((overloadable)) half_recip(float3 v) {
- float3 r;
- r.x = half_recip(r.x);
- r.y = half_recip(r.y);
- r.z = half_recip(r.z);
- return r;
+ return ((float3) 1.f) / v;
}
extern float4 __attribute__((overloadable)) half_recip(float4 v) {
- float4 r;
- r.x = half_recip(r.x);
- r.y = half_recip(r.y);
- r.z = half_recip(r.z);
- r.w = half_recip(r.w);
- return r;
+ return ((float4) 1.f) / v;
}
-/*
- * half_SQRT
- */
-
-extern float __attribute__((overloadable)) half_sqrt(float v) {
- return sqrt(v);
-}
-
-extern float2 __attribute__((overloadable)) half_sqrt(float2 v) {
- float2 r;
- r.x = half_sqrt(v.x);
- r.y = half_sqrt(v.y);
- return r;
-}
-
-extern float3 __attribute__((overloadable)) half_sqrt(float3 v) {
- float3 r;
- r.x = half_sqrt(v.x);
- r.y = half_sqrt(v.y);
- r.z = half_sqrt(v.z);
- return r;
-}
-
-extern float4 __attribute__((overloadable)) half_sqrt(float4 v) {
- float4 r;
- r.x = half_sqrt(v.x);
- r.y = half_sqrt(v.y);
- r.z = half_sqrt(v.z);
- r.w = half_sqrt(v.w);
- return r;
-}
-
/*
* half_rsqrt
diff --git a/driver/runtime/arch/neon.ll b/driver/runtime/arch/neon.ll
index 66b253f..090ddbc 100644
--- a/driver/runtime/arch/neon.ll
+++ b/driver/runtime/arch/neon.ll
@@ -41,6 +41,12 @@
declare <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float>) nounwind readnone
declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone
+declare <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone
+
+declare <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone
+
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;; HELPERS ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -876,60 +882,29 @@
;;;;;;;;; half_RECIP ;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-define float @_Z10half_recipf(float %v) {
- %1 = insertelement <2 x float> undef, float %v, i32 0
- %2 = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %1) nounwind readnone
- %3 = extractelement <2 x float> %2, i32 0
- ret float %3
-}
-
define <2 x float> @_Z10half_recipDv2_f(<2 x float> %v) nounwind readnone {
%1 = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %v) nounwind readnone
- ret <2 x float> %1
-}
-
-define <3 x float> @_Z10half_recipDv3_f(<3 x float> %v) nounwind readnone {
- %1 = shufflevector <3 x float> %v, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %2 = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %1) nounwind readnone
- %3 = shufflevector <4 x float> %2, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
- ret <3 x float> %3
+ %2 = tail call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> %1, <2 x float> %v) nounwind readnone
+ %3 = fmul <2 x float> %1, %2
+ %4 = tail call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> %3, <2 x float> %v) nounwind readnone
+ %5 = fmul <2 x float> %4, %3
+ ret <2 x float> %5
}
define <4 x float> @_Z10half_recipDv4_f(<4 x float> %v) nounwind readnone {
%1 = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %v) nounwind readnone
- ret <4 x float> %1
+ %2 = tail call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %1, <4 x float> %v) nounwind readnone
+ %3 = fmul <4 x float> %1, %2
+ %4 = tail call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %3, <4 x float> %v) nounwind readnone
+ %5 = fmul <4 x float> %4, %3
+ ret <4 x float> %5
}
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;;;;;;;; half_SQRT ;;;;;;;;;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-define float @_Z9half_sqrtf(float %v) {
- %1 = insertelement <2 x float> undef, float %v, i32 0
- %2 = tail call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %1) nounwind readnone
- %3 = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %2) nounwind readnone
- %4 = extractelement <2 x float> %3, i32 0
- ret float %4
-}
-
-define <2 x float> @_Z9half_sqrtDv2_f(<2 x float> %v) nounwind readnone {
- %1 = tail call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %v) nounwind readnone
- %2 = tail call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %1) nounwind readnone
- ret <2 x float> %2
-}
-
-define <3 x float> @_Z9half_sqrtDv3_f(<3 x float> %v) nounwind readnone {
+define <3 x float> @_Z10half_recipDv3_f(<3 x float> %v) nounwind readnone {
%1 = shufflevector <3 x float> %v, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %2 = tail call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %1) nounwind readnone
- %3 = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %2) nounwind readnone
- %4 = shufflevector <4 x float> %3, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
- ret <3 x float> %4
-}
-
-define <4 x float> @_Z9half_sqrtDv4_f(<4 x float> %v) nounwind readnone {
- %1 = tail call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %v) nounwind readnone
- %2 = tail call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %1) nounwind readnone
- ret <4 x float> %2
+ %2 = tail call <4 x float> @_Z10half_recipDv4_f(<4 x float> %1) nounwind readnone
+ %3 = shufflevector <4 x float> %2, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+ ret <3 x float> %3
}
diff --git a/driver/runtime/rs_cl.c b/driver/runtime/rs_cl.c
index d9b8f3c..3637785 100644
--- a/driver/runtime/rs_cl.c
+++ b/driver/runtime/rs_cl.c
@@ -997,7 +997,10 @@
return l == 0.0f ? v : v / l;
}
-extern float __attribute__((overloadable)) half_sqrt(float);
+extern float __attribute__((overloadable)) half_sqrt(float v) {
+ return sqrt(v);
+}
+FN_FUNC_FN(half_sqrt)
extern float __attribute__((overloadable)) fast_length(float v) {
return fabs(v);
@@ -1053,7 +1056,9 @@
return (rlength == rlength) ? v * rlength : v;
}
-extern float __attribute__((overloadable)) half_recip(float);
+extern float __attribute__((overloadable)) half_recip(float v) {
+ return 1.f / v;
+}
/*
extern float __attribute__((overloadable)) approx_atan(float x) {