DO NOT MERGE: Refactor the libbcc runtime for x86 platform

This patch refactors the libbcc runtime code to support the x86 platform.
It removes the redundant x86 code and adds the missing functions to
libclcore_x86.bc.

It resolves the RenderScript failures on the x86 platform.

Bug: 9961583

Change-Id: Ieed97e90c7c7691185a88dc425a2dd8c68aeb806

Signed-off-by: Yong Chen <yong.a.chen@intel.com>
diff --git a/lib/Renderscript/runtime/Android.mk b/lib/Renderscript/runtime/Android.mk
index 08cefb6..0fe2440 100755
--- a/lib/Renderscript/runtime/Android.mk
+++ b/lib/Renderscript/runtime/Android.mk
@@ -33,30 +33,23 @@
 clcore_files := \
     $(clcore_base_files) \
     math.ll \
-    arch/generic.c \
-    arch/sqrt.c \
-    arch/dot_length.c
+    arch/generic.c
 
 clcore_neon_files := \
     $(clcore_base_files) \
     math.ll \
-    arch/neon.ll \
-    arch/sqrt.c \
-    arch/dot_length.c
+    arch/neon.ll
 
 ifeq ($(ARCH_X86_HAVE_SSE2), true)
     clcore_x86_files := \
     $(clcore_base_files) \
-    arch/x86_generic.c \
-    arch/x86_clamp.ll \
-    arch/x86_math.ll
+    arch/generic.c \
+    arch/x86_sse2.ll
 
+    # FIXME: Even without SSE3, better code could still be generated using PSHUFD.
+    # So far, however, there is no device that supports only SSE2.
     ifeq ($(ARCH_X86_HAVE_SSE3), true)
-        clcore_x86_files += arch/x86_dot_length.ll
-    else
-        # FIXME: without SSE3, it is still able to get better code through PSHUFD. But,
-        # so far, there is no such device with SSE2 only.
-        clcore_x86_files += arch/dot_length.c
+        clcore_x86_files += arch/x86_sse3.ll
     endif
 endif
 
diff --git a/lib/Renderscript/runtime/arch/dot_length.c b/lib/Renderscript/runtime/arch/dot_length.c
deleted file mode 100644
index 94c99b6..0000000
--- a/lib/Renderscript/runtime/arch/dot_length.c
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (C) 2012 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "rs_types.rsh"
-
-extern float __attribute__((overloadable)) dot(float lhs, float rhs) {
-    return lhs * rhs;
-}
-extern float __attribute__((overloadable)) dot(float2 lhs, float2 rhs) {
-    return lhs.x*rhs.x + lhs.y*rhs.y;
-}
-extern float __attribute__((overloadable)) dot(float3 lhs, float3 rhs) {
-    return lhs.x*rhs.x + lhs.y*rhs.y + lhs.z*rhs.z;
-}
-extern float __attribute__((overloadable)) dot(float4 lhs, float4 rhs) {
-    return lhs.x*rhs.x + lhs.y*rhs.y + lhs.z*rhs.z + lhs.w*rhs.w;
-}
-
-extern float __attribute__((overloadable)) fabs(float);
-extern float __attribute__((overloadable)) sqrt(float);
-
-extern float __attribute__((overloadable)) length(float v) {
-    return fabs(v);
-}
-extern float __attribute__((overloadable)) length(float2 v) {
-    return sqrt(v.x*v.x + v.y*v.y);
-}
-extern float __attribute__((overloadable)) length(float3 v) {
-    return sqrt(v.x*v.x + v.y*v.y + v.z*v.z);
-}
-extern float __attribute__((overloadable)) length(float4 v) {
-    return sqrt(v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w);
-}
-
diff --git a/lib/Renderscript/runtime/arch/generic.c b/lib/Renderscript/runtime/arch/generic.c
index 3724e22..986c71e 100644
--- a/lib/Renderscript/runtime/arch/generic.c
+++ b/lib/Renderscript/runtime/arch/generic.c
@@ -27,6 +27,8 @@
 /*
  * CLAMP
  */
+#if !defined(ARCH_X86_HAVE_SSE2) && !defined(ARCH_X86_HAVE_SSE3)
+
 extern float __attribute__((overloadable)) clamp(float amount, float low, float high) {
     return amount < low ? low : (amount > high ? high : amount);
 }
@@ -79,6 +81,17 @@
     return r;
 }
 
+#else // ARCH_X86_HAVE_SSE2 or ARCH_X86_HAVE_SSE3 is defined
+// Declarations only; the definitions presumably come from arch/x86_sse2.ll -- TODO confirm all seven overloads exist there.
+extern float __attribute__((overloadable)) clamp(float amount, float low, float high);
+extern float2 __attribute__((overloadable)) clamp(float2 amount, float2 low, float2 high);
+extern float3 __attribute__((overloadable)) clamp(float3 amount, float3 low, float3 high);
+extern float4 __attribute__((overloadable)) clamp(float4 amount, float4 low, float4 high);
+extern float2 __attribute__((overloadable)) clamp(float2 amount, float low, float high);
+extern float3 __attribute__((overloadable)) clamp(float3 amount, float low, float high);
+extern float4 __attribute__((overloadable)) clamp(float4 amount, float low, float high);
+
+#endif // !defined(ARCH_X86_HAVE_SSE2) && !defined(ARCH_X86_HAVE_SSE3)
 
 /*
  * FMAX
@@ -933,4 +946,3 @@
     uchar4 c = {color.x, color.y, color.z, color.w};
     return c;
 }
-
diff --git a/lib/Renderscript/runtime/arch/sqrt.c b/lib/Renderscript/runtime/arch/sqrt.c
deleted file mode 100755
index f1dac5f..0000000
--- a/lib/Renderscript/runtime/arch/sqrt.c
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (C) 2012 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-#include "rs_types.rsh"
-
-#define FN_FUNC_FN(fnc)                                         \
-extern float2 __attribute__((overloadable)) fnc(float2 v) { \
-    float2 r;                                                   \
-    r.x = fnc(v.x);                                             \
-    r.y = fnc(v.y);                                             \
-    return r;                                                   \
-}                                                               \
-extern float3 __attribute__((overloadable)) fnc(float3 v) { \
-    float3 r;                                                   \
-    r.x = fnc(v.x);                                             \
-    r.y = fnc(v.y);                                             \
-    r.z = fnc(v.z);                                             \
-    return r;                                                   \
-}                                                               \
-extern float4 __attribute__((overloadable)) fnc(float4 v) { \
-    float4 r;                                                   \
-    r.x = fnc(v.x);                                             \
-    r.y = fnc(v.y);                                             \
-    r.z = fnc(v.z);                                             \
-    r.w = fnc(v.w);                                             \
-    return r;                                                   \
-}
-
-extern float __attribute__((overloadable)) sqrt(float);
-
-FN_FUNC_FN(sqrt)
diff --git a/lib/Renderscript/runtime/arch/x86_generic.c b/lib/Renderscript/runtime/arch/x86_generic.c
deleted file mode 100644
index c46c54a..0000000
--- a/lib/Renderscript/runtime/arch/x86_generic.c
+++ /dev/null
@@ -1,786 +0,0 @@
-/*
- * Copyright (C) 2012 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-#include "rs_types.rsh"
-
-extern short __attribute__((overloadable, always_inline)) rsClamp(short amount, short low, short high);
-extern float4 __attribute__((overloadable)) clamp(float4 amount, float4 low, float4 high);
-extern uchar4 __attribute__((overloadable)) convert_uchar4(short4);
-extern float __attribute__((overloadable)) sqrt(float);
-
-/*
- * FMAX
- */
-
-extern float __attribute__((overloadable)) fmax(float v1, float v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern float2 __attribute__((overloadable)) fmax(float2 v1, float2 v2) {
-    float2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern float3 __attribute__((overloadable)) fmax(float3 v1, float3 v2) {
-    float3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern float4 __attribute__((overloadable)) fmax(float4 v1, float4 v2) {
-    float4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern float2 __attribute__((overloadable)) fmax(float2 v1, float v2) {
-    float2 r;
-    r.x = v1.x > v2 ? v1.x : v2;
-    r.y = v1.y > v2 ? v1.y : v2;
-    return r;
-}
-
-extern float3 __attribute__((overloadable)) fmax(float3 v1, float v2) {
-    float3 r;
-    r.x = v1.x > v2 ? v1.x : v2;
-    r.y = v1.y > v2 ? v1.y : v2;
-    r.z = v1.z > v2 ? v1.z : v2;
-    return r;
-}
-
-extern float4 __attribute__((overloadable)) fmax(float4 v1, float v2) {
-    float4 r;
-    r.x = v1.x > v2 ? v1.x : v2;
-    r.y = v1.y > v2 ? v1.y : v2;
-    r.z = v1.z > v2 ? v1.z : v2;
-    r.w = v1.w > v2 ? v1.w : v2;
-    return r;
-}
-
-extern float __attribute__((overloadable)) fmin(float v1, float v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-
-/*
- * FMIN
- */
-extern float2 __attribute__((overloadable)) fmin(float2 v1, float2 v2) {
-    float2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern float3 __attribute__((overloadable)) fmin(float3 v1, float3 v2) {
-    float3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern float4 __attribute__((overloadable)) fmin(float4 v1, float4 v2) {
-    float4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern float2 __attribute__((overloadable)) fmin(float2 v1, float v2) {
-    float2 r;
-    r.x = v1.x < v2 ? v1.x : v2;
-    r.y = v1.y < v2 ? v1.y : v2;
-    return r;
-}
-
-extern float3 __attribute__((overloadable)) fmin(float3 v1, float v2) {
-    float3 r;
-    r.x = v1.x < v2 ? v1.x : v2;
-    r.y = v1.y < v2 ? v1.y : v2;
-    r.z = v1.z < v2 ? v1.z : v2;
-    return r;
-}
-
-extern float4 __attribute__((overloadable)) fmin(float4 v1, float v2) {
-    float4 r;
-    r.x = v1.x < v2 ? v1.x : v2;
-    r.y = v1.y < v2 ? v1.y : v2;
-    r.z = v1.z < v2 ? v1.z : v2;
-    r.w = v1.w < v2 ? v1.w : v2;
-    return r;
-}
-
-
-/*
- * MAX
- */
-
-extern char __attribute__((overloadable)) max(char v1, char v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern char2 __attribute__((overloadable)) max(char2 v1, char2 v2) {
-    char2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern char3 __attribute__((overloadable)) max(char3 v1, char3 v2) {
-    char3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern char4 __attribute__((overloadable)) max(char4 v1, char4 v2) {
-    char4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern short __attribute__((overloadable)) max(short v1, short v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern short2 __attribute__((overloadable)) max(short2 v1, short2 v2) {
-    short2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern short3 __attribute__((overloadable)) max(short3 v1, short3 v2) {
-    short3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern short4 __attribute__((overloadable)) max(short4 v1, short4 v2) {
-    short4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern int __attribute__((overloadable)) max(int v1, int v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern int2 __attribute__((overloadable)) max(int2 v1, int2 v2) {
-    int2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern int3 __attribute__((overloadable)) max(int3 v1, int3 v2) {
-    int3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern int4 __attribute__((overloadable)) max(int4 v1, int4 v2) {
-    int4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern int64_t __attribute__((overloadable)) max(int64_t v1, int64_t v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern long2 __attribute__((overloadable)) max(long2 v1, long2 v2) {
-    long2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern long3 __attribute__((overloadable)) max(long3 v1, long3 v2) {
-    long3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern long4 __attribute__((overloadable)) max(long4 v1, long4 v2) {
-    long4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern uchar __attribute__((overloadable)) max(uchar v1, uchar v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern uchar2 __attribute__((overloadable)) max(uchar2 v1, uchar2 v2) {
-    uchar2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern uchar3 __attribute__((overloadable)) max(uchar3 v1, uchar3 v2) {
-    uchar3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern uchar4 __attribute__((overloadable)) max(uchar4 v1, uchar4 v2) {
-    uchar4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern ushort __attribute__((overloadable)) max(ushort v1, ushort v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern ushort2 __attribute__((overloadable)) max(ushort2 v1, ushort2 v2) {
-    ushort2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern ushort3 __attribute__((overloadable)) max(ushort3 v1, ushort3 v2) {
-    ushort3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern ushort4 __attribute__((overloadable)) max(ushort4 v1, ushort4 v2) {
-    ushort4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern uint __attribute__((overloadable)) max(uint v1, uint v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern uint2 __attribute__((overloadable)) max(uint2 v1, uint2 v2) {
-    uint2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern uint3 __attribute__((overloadable)) max(uint3 v1, uint3 v2) {
-    uint3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern uint4 __attribute__((overloadable)) max(uint4 v1, uint4 v2) {
-    uint4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern ulong __attribute__((overloadable)) max(ulong v1, ulong v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern ulong2 __attribute__((overloadable)) max(ulong2 v1, ulong2 v2) {
-    ulong2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern ulong3 __attribute__((overloadable)) max(ulong3 v1, ulong3 v2) {
-    ulong3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern ulong4 __attribute__((overloadable)) max(ulong4 v1, ulong4 v2) {
-    ulong4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern float __attribute__((overloadable)) max(float v1, float v2) {
-    return fmax(v1, v2);
-}
-
-extern float2 __attribute__((overloadable)) max(float2 v1, float2 v2) {
-    return fmax(v1, v2);
-}
-
-extern float2 __attribute__((overloadable)) max(float2 v1, float v2) {
-    return fmax(v1, v2);
-}
-
-extern float3 __attribute__((overloadable)) max(float3 v1, float3 v2) {
-    return fmax(v1, v2);
-}
-
-extern float3 __attribute__((overloadable)) max(float3 v1, float v2) {
-    return fmax(v1, v2);
-}
-
-extern float4 __attribute__((overloadable)) max(float4 v1, float4 v2) {
-    return fmax(v1, v2);
-}
-
-extern float4 __attribute__((overloadable)) max(float4 v1, float v2) {
-    return fmax(v1, v2);
-}
-
-
-/*
- * MIN
- */
-
-extern int8_t __attribute__((overloadable)) min(int8_t v1, int8_t v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-extern char2 __attribute__((overloadable)) min(char2 v1, char2 v2) {
-    char2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern char3 __attribute__((overloadable)) min(char3 v1, char3 v2) {
-    char3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern char4 __attribute__((overloadable)) min(char4 v1, char4 v2) {
-    char4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern int16_t __attribute__((overloadable)) min(int16_t v1, int16_t v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-extern short2 __attribute__((overloadable)) min(short2 v1, short2 v2) {
-    short2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern short3 __attribute__((overloadable)) min(short3 v1, short3 v2) {
-    short3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern short4 __attribute__((overloadable)) min(short4 v1, short4 v2) {
-    short4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern int32_t __attribute__((overloadable)) min(int32_t v1, int32_t v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-extern int2 __attribute__((overloadable)) min(int2 v1, int2 v2) {
-    int2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern int3 __attribute__((overloadable)) min(int3 v1, int3 v2) {
-    int3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern int4 __attribute__((overloadable)) min(int4 v1, int4 v2) {
-    int4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern int64_t __attribute__((overloadable)) min(int64_t v1, int64_t v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-extern long2 __attribute__((overloadable)) min(long2 v1, long2 v2) {
-    long2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern long3 __attribute__((overloadable)) min(long3 v1, long3 v2) {
-    long3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern long4 __attribute__((overloadable)) min(long4 v1, long4 v2) {
-    long4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern uchar __attribute__((overloadable)) min(uchar v1, uchar v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-extern uchar2 __attribute__((overloadable)) min(uchar2 v1, uchar2 v2) {
-    uchar2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern uchar3 __attribute__((overloadable)) min(uchar3 v1, uchar3 v2) {
-    uchar3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern uchar4 __attribute__((overloadable)) min(uchar4 v1, uchar4 v2) {
-    uchar4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern ushort __attribute__((overloadable)) min(ushort v1, ushort v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-extern ushort2 __attribute__((overloadable)) min(ushort2 v1, ushort2 v2) {
-    ushort2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern ushort3 __attribute__((overloadable)) min(ushort3 v1, ushort3 v2) {
-    ushort3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern ushort4 __attribute__((overloadable)) min(ushort4 v1, ushort4 v2) {
-    ushort4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern uint __attribute__((overloadable)) min(uint v1, uint v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-extern uint2 __attribute__((overloadable)) min(uint2 v1, uint2 v2) {
-    uint2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern uint3 __attribute__((overloadable)) min(uint3 v1, uint3 v2) {
-    uint3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern uint4 __attribute__((overloadable)) min(uint4 v1, uint4 v2) {
-    uint4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern ulong __attribute__((overloadable)) min(ulong v1, ulong v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-extern ulong2 __attribute__((overloadable)) min(ulong2 v1, ulong2 v2) {
-    ulong2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern ulong3 __attribute__((overloadable)) min(ulong3 v1, ulong3 v2) {
-    ulong3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern ulong4 __attribute__((overloadable)) min(ulong4 v1, ulong4 v2) {
-    ulong4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern float __attribute__((overloadable)) min(float v1, float v2) {
-    return fmin(v1, v2);
-}
-
-extern float2 __attribute__((overloadable)) min(float2 v1, float2 v2) {
-    return fmin(v1, v2);
-}
-
-extern float2 __attribute__((overloadable)) min(float2 v1, float v2) {
-    return fmin(v1, v2);
-}
-
-extern float3 __attribute__((overloadable)) min(float3 v1, float3 v2) {
-    return fmin(v1, v2);
-}
-
-extern float3 __attribute__((overloadable)) min(float3 v1, float v2) {
-    return fmin(v1, v2);
-}
-
-extern float4 __attribute__((overloadable)) min(float4 v1, float4 v2) {
-    return fmin(v1, v2);
-}
-
-extern float4 __attribute__((overloadable)) min(float4 v1, float v2) {
-    return fmin(v1, v2);
-}
-
-
-/*
- * YUV
- */
-
-extern uchar4 __attribute__((overloadable)) rsYuvToRGBA_uchar4(uchar y, uchar u, uchar v) {
-    short Y = ((short)y) - 16;
-    short U = ((short)u) - 128;
-    short V = ((short)v) - 128;
-
-    short4 p;
-    p.r = (Y * 298 + V * 409 + 128) >> 8;
-    p.g = (Y * 298 - U * 100 - V * 208 + 128) >> 8;
-    p.b = (Y * 298 + U * 516 + 128) >> 8;
-    p.a = 255;
-    p.r = rsClamp(p.r, (short)0, (short)255);
-    p.g = rsClamp(p.g, (short)0, (short)255);
-    p.b = rsClamp(p.b, (short)0, (short)255);
-
-    return convert_uchar4(p);
-}
-
-static float4 yuv_U_values = {0.f, -0.392f * 0.003921569f, +2.02 * 0.003921569f, 0.f};
-static float4 yuv_V_values = {1.603f * 0.003921569f, -0.815f * 0.003921569f, 0.f, 0.f};
-
-extern float4 __attribute__((overloadable)) rsYuvToRGBA_float4(uchar y, uchar u, uchar v) {
-    float4 color = (float)y * 0.003921569f;
-    float4 fU = ((float)u) - 128.f;
-    float4 fV = ((float)v) - 128.f;
-
-    color += fU * yuv_U_values;
-    color += fV * yuv_V_values;
-    color = clamp(color, 0.f, 1.f);
-    return color;
-}
-
-
-/*
- * half_RECIP
- */
-
-extern float __attribute__((overloadable)) half_recip(float v) {
-    // FIXME:  actual algorithm for generic approximate reciprocal
-    return 1.f / v;
-}
-
-extern float2 __attribute__((overloadable)) half_recip(float2 v) {
-    float2 r;
-    r.x = half_recip(r.x);
-    r.y = half_recip(r.y);
-    return r;
-}
-
-extern float3 __attribute__((overloadable)) half_recip(float3 v) {
-    float3 r;
-    r.x = half_recip(r.x);
-    r.y = half_recip(r.y);
-    r.z = half_recip(r.z);
-    return r;
-}
-
-extern float4 __attribute__((overloadable)) half_recip(float4 v) {
-    float4 r;
-    r.x = half_recip(r.x);
-    r.y = half_recip(r.y);
-    r.z = half_recip(r.z);
-    r.w = half_recip(r.w);
-    return r;
-}
-
-
-/*
- * half_SQRT
- */
-
-extern float __attribute__((overloadable)) half_sqrt(float v) {
-    return sqrt(v);
-}
-
-extern float2 __attribute__((overloadable)) half_sqrt(float2 v) {
-    float2 r;
-    r.x = half_sqrt(v.x);
-    r.y = half_sqrt(v.y);
-    return r;
-}
-
-extern float3 __attribute__((overloadable)) half_sqrt(float3 v) {
-    float3 r;
-    r.x = half_sqrt(v.x);
-    r.y = half_sqrt(v.y);
-    r.z = half_sqrt(v.z);
-    return r;
-}
-
-extern float4 __attribute__((overloadable)) half_sqrt(float4 v) {
-    float4 r;
-    r.x = half_sqrt(v.x);
-    r.y = half_sqrt(v.y);
-    r.z = half_sqrt(v.z);
-    r.w = half_sqrt(v.w);
-    return r;
-}
-
-
-/*
- * half_rsqrt
- */
-
-extern float __attribute__((overloadable)) half_rsqrt(float v) {
-    return 1.f / sqrt(v);
-}
-
-extern float2 __attribute__((overloadable)) half_rsqrt(float2 v) {
-    float2 r;
-    r.x = half_rsqrt(v.x);
-    r.y = half_rsqrt(v.y);
-    return r;
-}
-
-extern float3 __attribute__((overloadable)) half_rsqrt(float3 v) {
-    float3 r;
-    r.x = half_rsqrt(v.x);
-    r.y = half_rsqrt(v.y);
-    r.z = half_rsqrt(v.z);
-    return r;
-}
-
-extern float4 __attribute__((overloadable)) half_rsqrt(float4 v) {
-    float4 r;
-    r.x = half_rsqrt(v.x);
-    r.y = half_rsqrt(v.y);
-    r.z = half_rsqrt(v.z);
-    r.w = half_rsqrt(v.w);
-    return r;
-}
-
diff --git a/lib/Renderscript/runtime/arch/x86_math.ll b/lib/Renderscript/runtime/arch/x86_math.ll
deleted file mode 100755
index 60add80..0000000
--- a/lib/Renderscript/runtime/arch/x86_math.ll
+++ /dev/null
@@ -1,40 +0,0 @@
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
-target triple = "i386-unknown-linux-gnu"
-
-declare float @llvm.sqrt.f32(float) nounwind readnone
-declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) nounwind readnone
-declare <3 x float> @llvm.sqrt.v3f32(<3 x float>) nounwind readnone
-declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) nounwind readnone
-declare float @llvm.exp.f32(float) nounwind readonly
-declare float @llvm.pow.f32(float, float) nounwind readonly
-
-define float @_Z4sqrtf(float %in) nounwind readnone alwaysinline {
-  %1 = tail call float @llvm.sqrt.f32(float %in) nounwind readnone
-  ret float %1
-}
-
-define <2 x float> @_Z4sqrtDv2_f(<2 x float> %in) nounwind readnone alwaysinline {
-  %1 = tail call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) nounwind readnone
-  ret <2 x float> %1
-}
-
-define <3 x float> @_Z4sqrtDv3_f(<3 x float> %in) nounwind readnone alwaysinline {
-  %1 = tail call <3 x float> @llvm.sqrt.v3f32(<3 x float> %in) nounwind readnone
-  ret <3 x float> %1
-}
-
-define <4 x float> @_Z4sqrtDv4_f(<4 x float> %in) nounwind readnone alwaysinline {
-  %1 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) nounwind readnone
-  ret <4 x float> %1
-}
-
-define float @_Z3expf(float %in) nounwind readnone {
-  %1 = tail call float @llvm.exp.f32(float %in) nounwind readnone
-  ret float %1
-}
-
-define float @_Z3powff(float %v1, float %v2) nounwind readnone {
-  %1 = tail call float @llvm.pow.f32(float %v1, float %v2) nounwind readnone
-  ret float %1
-}
-
diff --git a/lib/Renderscript/runtime/arch/x86_clamp.ll b/lib/Renderscript/runtime/arch/x86_sse2.ll
old mode 100755
new mode 100644
similarity index 79%
rename from lib/Renderscript/runtime/arch/x86_clamp.ll
rename to lib/Renderscript/runtime/arch/x86_sse2.ll
index 422e9f6..e4fb035
--- a/lib/Renderscript/runtime/arch/x86_clamp.ll
+++ b/lib/Renderscript/runtime/arch/x86_sse2.ll
@@ -6,6 +6,14 @@
 declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>)
 declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>)
 
+declare float @llvm.sqrt.f32(float) nounwind readnone
+declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) nounwind readnone
+declare <3 x float> @llvm.sqrt.v3f32(<3 x float>) nounwind readnone
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) nounwind readnone
+
+declare float @llvm.exp.f32(float) nounwind readonly
+declare float @llvm.pow.f32(float, float) nounwind readonly
+
 define <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %in, <4 x float> %low, <4 x float> %high) nounwind readnone alwaysinline {
   %1 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %in, <4 x float> %high) nounwind readnone
   %2 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %1, <4 x float> %low) nounwind readnone
@@ -72,3 +80,23 @@
   %5 = tail call <2 x float> @_Z5clampDv2_fS_S_(<2 x float> %in, <2 x float> %2, <2 x float> %4) nounwind readnone
   ret <2 x float> %5
 }
+
+define float @_Z4sqrtf(float %in) nounwind readnone alwaysinline {
+  %1 = tail call float @llvm.sqrt.f32(float %in) nounwind readnone
+  ret float %1
+}
+
+define <2 x float> @_Z4sqrtDv2_f(<2 x float> %in) nounwind readnone alwaysinline {
+  %1 = tail call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) nounwind readnone
+  ret <2 x float> %1
+}
+
+define <3 x float> @_Z4sqrtDv3_f(<3 x float> %in) nounwind readnone alwaysinline {
+  %1 = tail call <3 x float> @llvm.sqrt.v3f32(<3 x float> %in) nounwind readnone
+  ret <3 x float> %1
+}
+
+define <4 x float> @_Z4sqrtDv4_f(<4 x float> %in) nounwind readnone alwaysinline {
+  %1 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) nounwind readnone
+  ret <4 x float> %1
+}
diff --git a/lib/Renderscript/runtime/arch/x86_dot_length.ll b/lib/Renderscript/runtime/arch/x86_sse3.ll
similarity index 99%
rename from lib/Renderscript/runtime/arch/x86_dot_length.ll
rename to lib/Renderscript/runtime/arch/x86_sse3.ll
index 21f2f3e..5c96daa 100644
--- a/lib/Renderscript/runtime/arch/x86_dot_length.ll
+++ b/lib/Renderscript/runtime/arch/x86_sse3.ll
@@ -72,4 +72,8 @@
 define float @_Z6lengthf(float %in) nounwind readnone alwaysinline {
-  ret float %in
+  ; length() of a scalar is |in|, matching the C fallback (fabs) in rs_cl.c:
+  ; clear the IEEE-754 sign bit instead of returning the input unchanged.
+  %1 = bitcast float %in to i32
+  %2 = and i32 %1, 2147483647
+  %3 = bitcast i32 %2 to float
+  ret float %3
 }
-
diff --git a/lib/Renderscript/runtime/build_bc_lib.mk b/lib/Renderscript/runtime/build_bc_lib.mk
index 1d20b7a..a704c08 100644
--- a/lib/Renderscript/runtime/build_bc_lib.mk
+++ b/lib/Renderscript/runtime/build_bc_lib.mk
@@ -36,10 +36,17 @@
              $(bc_translated_clang_cc1_cflags)
 
 ifeq ($(rs_debug_runtime),1)
-bc_cflags += -DRS_DEBUG_RUNTIME
+    bc_cflags += -DRS_DEBUG_RUNTIME
 endif
 rs_debug_runtime:=
 
+ifeq ($(ARCH_X86_HAVE_SSE2), true)
+    bc_cflags += -DARCH_X86_HAVE_SSE2
+endif
+ifeq ($(ARCH_X86_HAVE_SSE3), true)
+    bc_cflags += -DARCH_X86_HAVE_SSE3
+endif
+
 c_sources := $(filter %.c,$(LOCAL_SRC_FILES))
 ll_sources := $(filter %.ll,$(LOCAL_SRC_FILES))
 
diff --git a/lib/Renderscript/runtime/rs_cl.c b/lib/Renderscript/runtime/rs_cl.c
old mode 100755
new mode 100644
index b7f9158..7e8a574
--- a/lib/Renderscript/runtime/rs_cl.c
+++ b/lib/Renderscript/runtime/rs_cl.c
@@ -591,6 +591,11 @@
 extern float __attribute__((overloadable)) rsqrt(float v) {
     return 1.f / sqrt(v);
 }
+
+#if !defined(ARCH_X86_HAVE_SSE2) && !defined(ARCH_X86_HAVE_SSE3)
+FN_FUNC_FN(sqrt)
+#endif // !defined(ARCH_X86_HAVE_SSE2) && !defined(ARCH_X86_HAVE_SSE3)
+
 FN_FUNC_FN(rsqrt)
 
 extern float __attribute__((overloadable)) sin(float);
@@ -897,11 +902,43 @@
     return r;
 }
 
+#if !defined(ARCH_X86_HAVE_SSE3)
+// Plain C versions of dot() and length(), compiled only when ARCH_X86_HAVE_SSE3 is not defined.
+extern float __attribute__((overloadable)) dot(float lhs, float rhs) {
+    return lhs * rhs;
+}
+extern float __attribute__((overloadable)) dot(float2 lhs, float2 rhs) {
+    return lhs.x*rhs.x + lhs.y*rhs.y;
+}
+extern float __attribute__((overloadable)) dot(float3 lhs, float3 rhs) {
+    return lhs.x*rhs.x + lhs.y*rhs.y + lhs.z*rhs.z;
+}
+extern float __attribute__((overloadable)) dot(float4 lhs, float4 rhs) {
+    return lhs.x*rhs.x + lhs.y*rhs.y + lhs.z*rhs.z + lhs.w*rhs.w;
+}
+
+extern float __attribute__((overloadable)) length(float v) {
+    return fabs(v);  // scalar case: the length is the absolute value
+}
+extern float __attribute__((overloadable)) length(float2 v) {
+    return sqrt(v.x*v.x + v.y*v.y);  // square root of the sum of squared components
+}
+extern float __attribute__((overloadable)) length(float3 v) {
+    return sqrt(v.x*v.x + v.y*v.y + v.z*v.z);
+}
+extern float __attribute__((overloadable)) length(float4 v) {
+    return sqrt(v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w);
+}
+
+#else // ARCH_X86_HAVE_SSE3 is defined
+// Only declarations follow; the definitions presumably come from arch/x86_sse3.ll -- TODO confirm.
 extern float __attribute__((overloadable)) length(float v);
 extern float __attribute__((overloadable)) length(float2 v);
 extern float __attribute__((overloadable)) length(float3 v);
 extern float __attribute__((overloadable)) length(float4 v);
 
+#endif // !defined(ARCH_X86_HAVE_SSE3)
+
 extern float __attribute__((overloadable)) distance(float lhs, float rhs) {
     return length(lhs - rhs);
 }
diff --git a/lib/Renderscript/runtime/rs_sample.c b/lib/Renderscript/runtime/rs_sample.c
index 8bc6966..e7a1e6f 100644
--- a/lib/Renderscript/runtime/rs_sample.c
+++ b/lib/Renderscript/runtime/rs_sample.c
@@ -421,7 +421,7 @@
 }
 
 static float4 __attribute__((overloadable))
-        sample_LOD_LinearPixel(const Allocation_t *alloc, const Type_t *type,
+        sample_LOD_LinearPixel(const Allocation_t *alloc,
                                rs_data_kind dk, rs_data_type dt,
                                rs_sampler s,
                                float uv, uint32_t lod) {