Merge "remove fabs LLVM intrinsic"
diff --git a/lib/ExecutionEngine/Android.mk b/lib/ExecutionEngine/Android.mk
index d5a4573..1666f87 100644
--- a/lib/ExecutionEngine/Android.mk
+++ b/lib/ExecutionEngine/Android.mk
@@ -48,6 +48,10 @@
LOCAL_SRC_FILES := $(libbcc_executionengine_SRC_FILES)
+ifeq ($(strip $(TARGET_CPU_VARIANT)),cortex-a15)
+LOCAL_CFLAGS += -DHAS_HW_DIV
+endif
+
include $(LIBBCC_DEVICE_BUILD_MK)
include $(LIBBCC_GEN_CONFIG_MK)
include $(LLVM_DEVICE_BUILD_MK)
diff --git a/lib/Renderscript/runtime/matrix.ll b/lib/Renderscript/runtime/matrix.ll
index e559d99..c56405d 100644
--- a/lib/Renderscript/runtime/matrix.ll
+++ b/lib/Renderscript/runtime/matrix.ll
@@ -25,13 +25,13 @@
%px = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 0
%px2 = bitcast float* %px to <4 x float>*
- %xm = load <4 x float>* %px2
+ %xm = load <4 x float>* %px2, align 4
%py = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 3
%py2 = bitcast float* %py to <4 x float>*
- %ym = load <4 x float>* %py2
+ %ym = load <4 x float>* %py2, align 4
%pz = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 6
%pz2 = bitcast float* %pz to <3 x float>*
- %zm2 = load <3 x float>* %pz2
+ %zm2 = load <3 x float>* %pz2, align 4
%zm = shufflevector <3 x float> %zm2, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%a1 = fmul <4 x float> %x, %xm
@@ -56,10 +56,10 @@
%px = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 0
%px2 = bitcast float* %px to <4 x float>*
- %xm = load <4 x float>* %px2
+ %xm = load <4 x float>* %px2, align 4
%py = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 3
%py2 = bitcast float* %py to <4 x float>*
- %ym = load <4 x float>* %py2
+ %ym = load <4 x float>* %py2, align 4
%a1 = fmul <4 x float> %x, %xm
%a2 = fmul <4 x float> %y, %ym
@@ -85,16 +85,16 @@
%px = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
%px2 = bitcast float* %px to <4 x float>*
- %xm = load <4 x float>* %px2
+ %xm = load <4 x float>* %px2, align 4
%py = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
%py2 = bitcast float* %py to <4 x float>*
- %ym = load <4 x float>* %py2
+ %ym = load <4 x float>* %py2, align 4
%pz = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 8
%pz2 = bitcast float* %pz to <4 x float>*
- %zm = load <4 x float>* %pz2
+ %zm = load <4 x float>* %pz2, align 4
%pw = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
%pw2 = bitcast float* %pw to <4 x float>*
- %wm = load <4 x float>* %pw2
+ %wm = load <4 x float>* %pw2, align 4
%a1 = fmul <4 x float> %x, %xm
%a2 = fmul <4 x float> %y, %ym
@@ -121,16 +121,16 @@
%px = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
%px2 = bitcast float* %px to <4 x float>*
- %xm = load <4 x float>* %px2
+ %xm = load <4 x float>* %px2, align 4
%py = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
%py2 = bitcast float* %py to <4 x float>*
- %ym = load <4 x float>* %py2
+ %ym = load <4 x float>* %py2, align 4
%pz = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 8
%pz2 = bitcast float* %pz to <4 x float>*
- %zm = load <4 x float>* %pz2
+ %zm = load <4 x float>* %pz2, align 4
%pw = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
%pw2 = bitcast float* %pw to <4 x float>*
- %wm = load <4 x float>* %pw2
+ %wm = load <4 x float>* %pw2, align 4
%a1 = fmul <4 x float> %x, %xm
%a2 = fadd <4 x float> %wm, %a1
@@ -154,13 +154,13 @@
%px = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
%px2 = bitcast float* %px to <4 x float>*
- %xm = load <4 x float>* %px2
+ %xm = load <4 x float>* %px2, align 4
%py = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
%py2 = bitcast float* %py to <4 x float>*
- %ym = load <4 x float>* %py2
+ %ym = load <4 x float>* %py2, align 4
%pw = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
%pw2 = bitcast float* %pw to <4 x float>*
- %wm = load <4 x float>* %pw2
+ %wm = load <4 x float>* %pw2, align 4
%a1 = fmul <4 x float> %x, %xm
%a2 = fadd <4 x float> %wm, %a1
diff --git a/lib/Renderscript/runtime/rs_cl.c b/lib/Renderscript/runtime/rs_cl.c
index 5d11a93..d94426f 100644
--- a/lib/Renderscript/runtime/rs_cl.c
+++ b/lib/Renderscript/runtime/rs_cl.c
@@ -4,6 +4,10 @@
extern float3 __attribute__((overloadable)) convert_float3(int3 c);
extern float4 __attribute__((overloadable)) convert_float4(int4 c);
+extern int2 __attribute__((overloadable)) convert_int2(float2 c);
+extern int3 __attribute__((overloadable)) convert_int3(float3 c);
+extern int4 __attribute__((overloadable)) convert_int4(float4 c);
+
// Float ops, 6.11.2
#define FN_FUNC_FN(fnc) \
@@ -424,12 +428,6 @@
extern float __attribute__((overloadable)) fmod(float, float);
FN_FUNC_FN_FN(fmod)
-extern float __attribute__((overloadable)) fract(float v) {
- int i = (int)floor(v);
- return fmin(v - i, 0x1.fffffep-1f);
-}
-FN_FUNC_FN(fract)
-
extern float __attribute__((overloadable)) fract(float v, float *iptr) {
int i = (int)floor(v);
if (iptr) {
@@ -957,6 +955,111 @@
FN_FUNC_FN(approx_atan)
*/
+typedef union /* overlays a float and an int32_t on the same storage */
+{
+ float fv; /* the value viewed as a float */
+ int32_t iv; /* the same bytes viewed as a signed 32-bit integer */
+} ieee_float_shape_type;
+
+/* Get a 32 bit int from a float: d is stored through the union's float member and i is read back from its integer member, so i receives d's bit pattern rather than a numerically converted value. */
+
+#define GET_FLOAT_WORD(i,d) \
+do { \
+ ieee_float_shape_type gf_u; \
+ gf_u.fv = (d); \
+ (i) = gf_u.iv; \
+} while (0)
+
+/* Set a float from a 32 bit int: i is stored through the union's integer member and d is read back from its float member, so d takes the bit pattern held in i rather than a numerically converted value. */
+
+#define SET_FLOAT_WORD(d,i) \
+do { \
+ ieee_float_shape_type sf_u; \
+ sf_u.iv = (i); \
+ (d) = sf_u.fv; \
+} while (0)
+
+
+
+// Fast approximate 2^v: a power of two built from exponent bits, times a short series for the remainder. Valid -125 to 125
+extern float __attribute__((overloadable)) native_exp2(float v) {
+ int32_t iv = (int)v; // truncates toward zero
+ int32_t x = iv + (iv >> 31); // ~floor(v): adds -1 when iv < 0 (assumes >> on a negative int is arithmetic); not a true floor, e.g. v = -2.0 gives x = -3
+ float r = (v - x); // remainder left after removing the integer part x
+ // Write x + 127 into bits 23 and up of the float word; assuming IEEE-754 single precision this makes fo == 2^x.
+ float fo;
+ SET_FLOAT_WORD(fo, (x + 127) << 23);
+ // adj below is the series 1 + r + r^2/2 + r^3/6 + r^4/24 evaluated at the rescaled r, approximating 2^(v - x).
+ r *= 0.694f; // ~ ln(2) = log(2) / log(e)
+ float r2 = r*r;
+ float adj = 1.f + r + (r2 * 0.5f) + (r2*r * 0.166666f) + (r2*r2 * 0.0416666f);
+ return fo * adj;
+}
+
+extern float2 __attribute__((overloadable)) native_exp2(float2 v) { // approximate 2^v for each of the two lanes
+ int2 iv = convert_int2(v); // presumably truncates toward zero like an (int) cast - TODO confirm against convert_int2
+ int2 x = iv + (iv >> (int2)31);// ~floor(v) per lane: adds -1 where iv < 0 (assumes an arithmetic shift)
+ float2 r = (v - convert_float2(x)); // remainder left after removing the integer part x
+ // Bias x by 127 and shift it to bit 23 and up of each 32-bit lane.
+ x += 127;
+
+ float2 fo = (float2)(x << (int2)23); // NOTE(review): yields 2^x only if this vector cast reinterprets the bits instead of converting values - confirm
+ // adj below is the series 1 + r + r^2/2 + r^3/6 + r^4/24 evaluated at the rescaled r.
+ r *= 0.694f; // ~ ln(2) = log(2) / log(e)
+ float2 r2 = r*r;
+ float2 adj = 1.f + r + (r2 * 0.5f) + (r2*r * 0.166666f) + (r2*r2 * 0.0416666f);
+ return fo * adj;
+}
+
+extern float4 __attribute__((overloadable)) native_exp2(float4 v) { // approximate 2^v for each of the four lanes
+ int4 iv = convert_int4(v); // presumably truncates toward zero like an (int) cast - TODO confirm against convert_int4
+ int4 x = iv + (iv >> (int4)31);// ~floor(v) per lane: adds -1 where iv < 0 (assumes an arithmetic shift)
+ float4 r = (v - convert_float4(x)); // remainder left after removing the integer part x
+ // Bias x by 127 and shift it to bit 23 and up of each 32-bit lane.
+ x += 127;
+
+ float4 fo = (float4)(x << (int4)23); // NOTE(review): yields 2^x only if this vector cast reinterprets the bits instead of converting values - confirm
+ // adj below is the series 1 + r + r^2/2 + r^3/6 + r^4/24 evaluated at the rescaled r.
+ r *= 0.694f; // ~ ln(2) = log(2) / log(e)
+ float4 r2 = r*r;
+ float4 adj = 1.f + r + (r2 * 0.5f) + (r2*r * 0.166666f) + (r2*r2 * 0.0416666f);
+ return fo * adj;
+}
+
+extern float3 __attribute__((overloadable)) native_exp2(float3 v) { // 3-lane form: widens to float4 and reuses that overload
+ float4 t = 1.f; // every lane starts at 1.f; the fourth lane stays 1.f as padding
+ t.xyz = v; // first three lanes take the caller's values
+ return native_exp2(t).xyz; // compute on four lanes, return only the first three
+}
+
+// native_exp: e^v computed as native_exp2(v * log2(e)), with 1.442695041f ~ log2(e) = 1 / ln(2); each overload forwards to native_exp2 with the same argument type.
+extern float __attribute__((overloadable)) native_exp(float v) {
+ return native_exp2(v * 1.442695041f);
+}
+extern float2 __attribute__((overloadable)) native_exp(float2 v) {
+ return native_exp2(v * 1.442695041f);
+}
+extern float3 __attribute__((overloadable)) native_exp(float3 v) {
+ return native_exp2(v * 1.442695041f);
+}
+extern float4 __attribute__((overloadable)) native_exp(float4 v) {
+ return native_exp2(v * 1.442695041f);
+}
+
+extern float __attribute__((overloadable)) native_exp10(float v) { // 10^v computed as native_exp2(v * log2(10))
+ return native_exp2(v * 3.321928095f); // 3.321928095f ~ log2(10)
+}
+extern float2 __attribute__((overloadable)) native_exp10(float2 v) { // float2 form, same scaling
+ return native_exp2(v * 3.321928095f);
+}
+extern float3 __attribute__((overloadable)) native_exp10(float3 v) { // float3 form, same scaling
+ return native_exp2(v * 3.321928095f);
+}
+extern float4 __attribute__((overloadable)) native_exp10(float4 v) { // float4 form, same scaling
+ return native_exp2(v * 3.321928095f);
+}
+
+
#undef FN_FUNC_FN
#undef IN_FUNC_FN
#undef FN_FUNC_FN_FN
diff --git a/runtime/lib/divsi3.c b/runtime/lib/divsi3.c
index de1700a..3ab82f0 100644
--- a/runtime/lib/divsi3.c
+++ b/runtime/lib/divsi3.c
@@ -12,7 +12,10 @@
* ===----------------------------------------------------------------------===
*/
-#if !defined(__GNUC__) || __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 8) // gcc >= 4.8 implements this in libgcc
+#if !defined(__GNUC__) || __GNUC__ < 4 || \
+ (__GNUC__ == 4 && __GNUC_MINOR__ < 8 && !defined(HAS_HW_DIV))
+// gcc >= 4.8 implements this in libgcc
+// gcc 4.7 also implements this in libgcc with -mcpu=cortex-a15
#include "int_lib.h"
su_int __udivsi3(su_int n, su_int d);