Merge "remove fabs LLVM intrinsic"
diff --git a/lib/ExecutionEngine/Android.mk b/lib/ExecutionEngine/Android.mk
index d5a4573..1666f87 100644
--- a/lib/ExecutionEngine/Android.mk
+++ b/lib/ExecutionEngine/Android.mk
@@ -48,6 +48,10 @@
 
 LOCAL_SRC_FILES := $(libbcc_executionengine_SRC_FILES)
 
+ifeq ($(strip $(TARGET_CPU_VARIANT)),cortex-a15)
+LOCAL_CFLAGS += -DHAS_HW_DIV
+endif
+
 include $(LIBBCC_DEVICE_BUILD_MK)
 include $(LIBBCC_GEN_CONFIG_MK)
 include $(LLVM_DEVICE_BUILD_MK)
diff --git a/lib/Renderscript/runtime/matrix.ll b/lib/Renderscript/runtime/matrix.ll
index e559d99..c56405d 100644
--- a/lib/Renderscript/runtime/matrix.ll
+++ b/lib/Renderscript/runtime/matrix.ll
@@ -25,13 +25,13 @@
 
   %px = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 0
   %px2 = bitcast float* %px to <4 x float>*
-  %xm = load <4 x float>* %px2
+  %xm = load <4 x float>* %px2, align 4
   %py = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 3
   %py2 = bitcast float* %py to <4 x float>*
-  %ym = load <4 x float>* %py2
+  %ym = load <4 x float>* %py2, align 4
   %pz = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 6
   %pz2 = bitcast float* %pz to <3 x float>*
-  %zm2 = load <3 x float>* %pz2
+  %zm2 = load <3 x float>* %pz2, align 4
   %zm = shufflevector <3 x float> %zm2, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 
   %a1 = fmul <4 x float> %x, %xm
@@ -56,10 +56,10 @@
 
   %px = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 0
   %px2 = bitcast float* %px to <4 x float>*
-  %xm = load <4 x float>* %px2
+  %xm = load <4 x float>* %px2, align 4
   %py = getelementptr inbounds %struct.rs_matrix3x3* %m, i32 0, i32 0, i32 3
   %py2 = bitcast float* %py to <4 x float>*
-  %ym = load <4 x float>* %py2
+  %ym = load <4 x float>* %py2, align 4
 
   %a1 = fmul <4 x float> %x, %xm
   %a2 = fmul <4 x float> %y, %ym
@@ -85,16 +85,16 @@
 
   %px = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
   %px2 = bitcast float* %px to <4 x float>*
-  %xm = load <4 x float>* %px2
+  %xm = load <4 x float>* %px2, align 4
   %py = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
   %py2 = bitcast float* %py to <4 x float>*
-  %ym = load <4 x float>* %py2
+  %ym = load <4 x float>* %py2, align 4
   %pz = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 8
   %pz2 = bitcast float* %pz to <4 x float>*
-  %zm = load <4 x float>* %pz2
+  %zm = load <4 x float>* %pz2, align 4
   %pw = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
   %pw2 = bitcast float* %pw to <4 x float>*
-  %wm = load <4 x float>* %pw2
+  %wm = load <4 x float>* %pw2, align 4
 
   %a1 = fmul <4 x float> %x, %xm
   %a2 = fmul <4 x float> %y, %ym
@@ -121,16 +121,16 @@
 
   %px = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
   %px2 = bitcast float* %px to <4 x float>*
-  %xm = load <4 x float>* %px2
+  %xm = load <4 x float>* %px2, align 4
   %py = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
   %py2 = bitcast float* %py to <4 x float>*
-  %ym = load <4 x float>* %py2
+  %ym = load <4 x float>* %py2, align 4
   %pz = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 8
   %pz2 = bitcast float* %pz to <4 x float>*
-  %zm = load <4 x float>* %pz2
+  %zm = load <4 x float>* %pz2, align 4
   %pw = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
   %pw2 = bitcast float* %pw to <4 x float>*
-  %wm = load <4 x float>* %pw2
+  %wm = load <4 x float>* %pw2, align 4
 
   %a1 = fmul <4 x float> %x, %xm
   %a2 = fadd <4 x float> %wm, %a1
@@ -154,13 +154,13 @@
 
   %px = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 0
   %px2 = bitcast float* %px to <4 x float>*
-  %xm = load <4 x float>* %px2
+  %xm = load <4 x float>* %px2, align 4
   %py = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 4
   %py2 = bitcast float* %py to <4 x float>*
-  %ym = load <4 x float>* %py2
+  %ym = load <4 x float>* %py2, align 4
   %pw = getelementptr inbounds %struct.rs_matrix4x4* %m, i32 0, i32 0, i32 12
   %pw2 = bitcast float* %pw to <4 x float>*
-  %wm = load <4 x float>* %pw2
+  %wm = load <4 x float>* %pw2, align 4
 
   %a1 = fmul <4 x float> %x, %xm
   %a2 = fadd <4 x float> %wm, %a1
diff --git a/lib/Renderscript/runtime/rs_cl.c b/lib/Renderscript/runtime/rs_cl.c
index 5d11a93..d94426f 100644
--- a/lib/Renderscript/runtime/rs_cl.c
+++ b/lib/Renderscript/runtime/rs_cl.c
@@ -4,6 +4,10 @@
 extern float3 __attribute__((overloadable)) convert_float3(int3 c);
 extern float4 __attribute__((overloadable)) convert_float4(int4 c);
 
+extern int2 __attribute__((overloadable)) convert_int2(float2 c);
+extern int3 __attribute__((overloadable)) convert_int3(float3 c);
+extern int4 __attribute__((overloadable)) convert_int4(float4 c);
+
 // Float ops, 6.11.2
 
 #define FN_FUNC_FN(fnc)                                         \
@@ -424,12 +428,6 @@
 extern float __attribute__((overloadable)) fmod(float, float);
 FN_FUNC_FN_FN(fmod)
 
-extern float __attribute__((overloadable)) fract(float v) {
-    int i = (int)floor(v);
-    return fmin(v - i, 0x1.fffffep-1f);
-}
-FN_FUNC_FN(fract)
-
 extern float __attribute__((overloadable)) fract(float v, float *iptr) {
     int i = (int)floor(v);
     if (iptr) {
@@ -957,6 +955,111 @@
 FN_FUNC_FN(approx_atan)
 */
 
+typedef union
+{
+  float fv;
+  int32_t iv;
+} ieee_float_shape_type;
+
+/* Get a 32 bit int from a float.  */
+
+#define GET_FLOAT_WORD(i,d)                 \
+do {                                \
+  ieee_float_shape_type gf_u;                   \
+  gf_u.fv = (d);                     \
+  (i) = gf_u.iv;                      \
+} while (0)
+
+/* Set a float from a 32 bit int.  */
+
+#define SET_FLOAT_WORD(d,i)                 \
+do {                                \
+  ieee_float_shape_type sf_u;                   \
+  sf_u.iv = (i);                      \
+  (d) = sf_u.fv;                     \
+} while (0)
+
+
+
+// Valid -125 to 125
+extern float __attribute__((overloadable)) native_exp2(float v) {
+    int32_t iv = (int)v;
+    int32_t x = iv + (iv >> 31); // ~floor(v)
+    float r = (v - x);
+
+    float fo;
+    SET_FLOAT_WORD(fo, (x + 127) << 23);
+
+    r *= 0.694f; // ~ log(e) / log(2)
+    float r2 = r*r;
+    float adj = 1.f + r + (r2 * 0.5f) + (r2*r * 0.166666f) + (r2*r2 * 0.0416666f);
+    return fo * adj;
+}
+
+extern float2 __attribute__((overloadable)) native_exp2(float2 v) {
+    int2 iv = convert_int2(v);
+    int2 x = iv + (iv >> (int2)31);//floor(v);
+    float2 r = (v - convert_float2(x));
+
+    x += 127;
+
+    float2 fo = (float2)(x << (int2)23);
+
+    r *= 0.694f; // ~ log(e) / log(2)
+    float2 r2 = r*r;
+    float2 adj = 1.f + r + (r2 * 0.5f) + (r2*r * 0.166666f) + (r2*r2 * 0.0416666f);
+    return fo * adj;
+}
+
+extern float4 __attribute__((overloadable)) native_exp2(float4 v) {
+    int4 iv = convert_int4(v);
+    int4 x = iv + (iv >> (int4)31);//floor(v);
+    float4 r = (v - convert_float4(x));
+
+    x += 127;
+
+    float4 fo = (float4)(x << (int4)23);
+
+    r *= 0.694f; // ~ log(e) / log(2)
+    float4 r2 = r*r;
+    float4 adj = 1.f + r + (r2 * 0.5f) + (r2*r * 0.166666f) + (r2*r2 * 0.0416666f);
+    return fo * adj;
+}
+
+extern float3 __attribute__((overloadable)) native_exp2(float3 v) {
+    float4 t = 1.f;
+    t.xyz = v;
+    return native_exp2(t).xyz;
+}
+
+
+extern float __attribute__((overloadable)) native_exp(float v) {
+    return native_exp2(v * 1.442695041f);
+}
+extern float2 __attribute__((overloadable)) native_exp(float2 v) {
+    return native_exp2(v * 1.442695041f);
+}
+extern float3 __attribute__((overloadable)) native_exp(float3 v) {
+    return native_exp2(v * 1.442695041f);
+}
+extern float4 __attribute__((overloadable)) native_exp(float4 v) {
+    return native_exp2(v * 1.442695041f);
+}
+
+extern float __attribute__((overloadable)) native_exp10(float v) {
+    return native_exp2(v * 3.321928095f);
+}
+extern float2 __attribute__((overloadable)) native_exp10(float2 v) {
+    return native_exp2(v * 3.321928095f);
+}
+extern float3 __attribute__((overloadable)) native_exp10(float3 v) {
+    return native_exp2(v * 3.321928095f);
+}
+extern float4 __attribute__((overloadable)) native_exp10(float4 v) {
+    return native_exp2(v * 3.321928095f);
+}
+
+
 #undef FN_FUNC_FN
 #undef IN_FUNC_FN
 #undef FN_FUNC_FN_FN
diff --git a/runtime/lib/divsi3.c b/runtime/lib/divsi3.c
index de1700a..3ab82f0 100644
--- a/runtime/lib/divsi3.c
+++ b/runtime/lib/divsi3.c
@@ -12,7 +12,10 @@
  * ===----------------------------------------------------------------------===
  */
 
-#if !defined(__GNUC__) || __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 8) // gcc >= 4.8 implements this in libgcc
+#if !defined(__GNUC__) || __GNUC__ < 4 || \
+    (__GNUC__ == 4 && __GNUC_MINOR__ < 8 && !defined(HAS_HW_DIV))
+// gcc >= 4.8 implements this in libgcc
+// gcc 4.7 also implements this in libgcc with -mcpu=cortex-a15
 #include "int_lib.h"
 
 su_int __udivsi3(su_int n, su_int d);