Refactor the libbcc runtime for the x86 platform

This patch is merged/rebased from AOSP, where it was initially submitted to
frameworks/compile/libbcc by Jun Tian <jun.j.tian@intel.com>. All conflicts
have been resolved.

This patch refactors the libbcc runtime code to support the x86 platform.
It removes the redundant x86 code and adds the missing functions to
libclcore_x86.bc.
It resolves the RenderScript failures on the x86 platform.

Bug: 9961583
Change-Id: I2c8be0f710960ee5e0614721f5edfbaf028c67e1
diff --git a/driver/runtime/Android.mk b/driver/runtime/Android.mk
index d2ced77..80fbb1c 100755
--- a/driver/runtime/Android.mk
+++ b/driver/runtime/Android.mk
@@ -34,31 +34,24 @@
 clcore_files := \
     $(clcore_base_files) \
     math.ll \
-    arch/generic.c \
-    arch/sqrt.c \
-    arch/dot_length.c
+    arch/generic.c
 
 clcore_neon_files := \
     $(clcore_base_files) \
     math.ll \
     arch/neon.ll \
-    arch/sqrt.c \
-    arch/dot_length.c \
     arch/clamp.c
 
 ifeq ($(ARCH_X86_HAVE_SSE2), true)
     clcore_x86_files := \
     $(clcore_base_files) \
-    arch/x86_generic.c \
-    arch/x86_clamp.ll \
-    arch/x86_math.ll
+    arch/generic.c \
+    arch/x86_sse2.ll
 
+    # FIXME: Even without SSE3, better code could still be generated using PSHUFD.
+    # However, so far there is no device that supports SSE2 only.
     ifeq ($(ARCH_X86_HAVE_SSE3), true)
-        clcore_x86_files += arch/x86_dot_length.ll
-    else
-        # FIXME: without SSE3, it is still able to get better code through PSHUFD. But,
-        # so far, there is no such device with SSE2 only.
-        clcore_x86_files += arch/dot_length.c
+        clcore_x86_files += arch/x86_sse3.ll
     endif
 endif
 
diff --git a/driver/runtime/arch/dot_length.c b/driver/runtime/arch/dot_length.c
deleted file mode 100644
index 94c99b6..0000000
--- a/driver/runtime/arch/dot_length.c
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (C) 2012 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "rs_types.rsh"
-
-extern float __attribute__((overloadable)) dot(float lhs, float rhs) {
-    return lhs * rhs;
-}
-extern float __attribute__((overloadable)) dot(float2 lhs, float2 rhs) {
-    return lhs.x*rhs.x + lhs.y*rhs.y;
-}
-extern float __attribute__((overloadable)) dot(float3 lhs, float3 rhs) {
-    return lhs.x*rhs.x + lhs.y*rhs.y + lhs.z*rhs.z;
-}
-extern float __attribute__((overloadable)) dot(float4 lhs, float4 rhs) {
-    return lhs.x*rhs.x + lhs.y*rhs.y + lhs.z*rhs.z + lhs.w*rhs.w;
-}
-
-extern float __attribute__((overloadable)) fabs(float);
-extern float __attribute__((overloadable)) sqrt(float);
-
-extern float __attribute__((overloadable)) length(float v) {
-    return fabs(v);
-}
-extern float __attribute__((overloadable)) length(float2 v) {
-    return sqrt(v.x*v.x + v.y*v.y);
-}
-extern float __attribute__((overloadable)) length(float3 v) {
-    return sqrt(v.x*v.x + v.y*v.y + v.z*v.z);
-}
-extern float __attribute__((overloadable)) length(float4 v) {
-    return sqrt(v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w);
-}
-
diff --git a/driver/runtime/arch/generic.c b/driver/runtime/arch/generic.c
index da83c2a..50722b1 100644
--- a/driver/runtime/arch/generic.c
+++ b/driver/runtime/arch/generic.c
@@ -79,7 +79,22 @@
     return r;                                                                       \
 }
 
+#if !defined(ARCH_X86_HAVE_SSE2) && !defined(ARCH_X86_HAVE_SSE3)
+
 _CLAMP(float);
+
+#else
+
+extern float __attribute__((overloadable)) clamp(float amount, float low, float high);
+extern float2 __attribute__((overloadable)) clamp(float2 amount, float2 low, float2 high);
+extern float3 __attribute__((overloadable)) clamp(float3 amount, float3 low, float3 high);
+extern float4 __attribute__((overloadable)) clamp(float4 amount, float4 low, float4 high);
+extern float2 __attribute__((overloadable)) clamp(float2 amount, float low, float high);
+extern float3 __attribute__((overloadable)) clamp(float3 amount, float low, float high);
+extern float4 __attribute__((overloadable)) clamp(float4 amount, float low, float high);
+
+#endif // !defined(ARCH_X86_HAVE_SSE2) && !defined(ARCH_X86_HAVE_SSE3)
+
 _CLAMP(double);
 _CLAMP(char);
 _CLAMP(uchar);
@@ -945,4 +960,3 @@
     uchar4 c = {color.x, color.y, color.z, color.w};
     return c;
 }
-
diff --git a/driver/runtime/arch/sqrt.c b/driver/runtime/arch/sqrt.c
deleted file mode 100755
index f1dac5f..0000000
--- a/driver/runtime/arch/sqrt.c
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (C) 2012 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-#include "rs_types.rsh"
-
-#define FN_FUNC_FN(fnc)                                         \
-extern float2 __attribute__((overloadable)) fnc(float2 v) { \
-    float2 r;                                                   \
-    r.x = fnc(v.x);                                             \
-    r.y = fnc(v.y);                                             \
-    return r;                                                   \
-}                                                               \
-extern float3 __attribute__((overloadable)) fnc(float3 v) { \
-    float3 r;                                                   \
-    r.x = fnc(v.x);                                             \
-    r.y = fnc(v.y);                                             \
-    r.z = fnc(v.z);                                             \
-    return r;                                                   \
-}                                                               \
-extern float4 __attribute__((overloadable)) fnc(float4 v) { \
-    float4 r;                                                   \
-    r.x = fnc(v.x);                                             \
-    r.y = fnc(v.y);                                             \
-    r.z = fnc(v.z);                                             \
-    r.w = fnc(v.w);                                             \
-    return r;                                                   \
-}
-
-extern float __attribute__((overloadable)) sqrt(float);
-
-FN_FUNC_FN(sqrt)
diff --git a/driver/runtime/arch/x86_generic.c b/driver/runtime/arch/x86_generic.c
deleted file mode 100644
index c46c54a..0000000
--- a/driver/runtime/arch/x86_generic.c
+++ /dev/null
@@ -1,786 +0,0 @@
-/*
- * Copyright (C) 2012 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-#include "rs_types.rsh"
-
-extern short __attribute__((overloadable, always_inline)) rsClamp(short amount, short low, short high);
-extern float4 __attribute__((overloadable)) clamp(float4 amount, float4 low, float4 high);
-extern uchar4 __attribute__((overloadable)) convert_uchar4(short4);
-extern float __attribute__((overloadable)) sqrt(float);
-
-/*
- * FMAX
- */
-
-extern float __attribute__((overloadable)) fmax(float v1, float v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern float2 __attribute__((overloadable)) fmax(float2 v1, float2 v2) {
-    float2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern float3 __attribute__((overloadable)) fmax(float3 v1, float3 v2) {
-    float3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern float4 __attribute__((overloadable)) fmax(float4 v1, float4 v2) {
-    float4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern float2 __attribute__((overloadable)) fmax(float2 v1, float v2) {
-    float2 r;
-    r.x = v1.x > v2 ? v1.x : v2;
-    r.y = v1.y > v2 ? v1.y : v2;
-    return r;
-}
-
-extern float3 __attribute__((overloadable)) fmax(float3 v1, float v2) {
-    float3 r;
-    r.x = v1.x > v2 ? v1.x : v2;
-    r.y = v1.y > v2 ? v1.y : v2;
-    r.z = v1.z > v2 ? v1.z : v2;
-    return r;
-}
-
-extern float4 __attribute__((overloadable)) fmax(float4 v1, float v2) {
-    float4 r;
-    r.x = v1.x > v2 ? v1.x : v2;
-    r.y = v1.y > v2 ? v1.y : v2;
-    r.z = v1.z > v2 ? v1.z : v2;
-    r.w = v1.w > v2 ? v1.w : v2;
-    return r;
-}
-
-extern float __attribute__((overloadable)) fmin(float v1, float v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-
-/*
- * FMIN
- */
-extern float2 __attribute__((overloadable)) fmin(float2 v1, float2 v2) {
-    float2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern float3 __attribute__((overloadable)) fmin(float3 v1, float3 v2) {
-    float3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern float4 __attribute__((overloadable)) fmin(float4 v1, float4 v2) {
-    float4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern float2 __attribute__((overloadable)) fmin(float2 v1, float v2) {
-    float2 r;
-    r.x = v1.x < v2 ? v1.x : v2;
-    r.y = v1.y < v2 ? v1.y : v2;
-    return r;
-}
-
-extern float3 __attribute__((overloadable)) fmin(float3 v1, float v2) {
-    float3 r;
-    r.x = v1.x < v2 ? v1.x : v2;
-    r.y = v1.y < v2 ? v1.y : v2;
-    r.z = v1.z < v2 ? v1.z : v2;
-    return r;
-}
-
-extern float4 __attribute__((overloadable)) fmin(float4 v1, float v2) {
-    float4 r;
-    r.x = v1.x < v2 ? v1.x : v2;
-    r.y = v1.y < v2 ? v1.y : v2;
-    r.z = v1.z < v2 ? v1.z : v2;
-    r.w = v1.w < v2 ? v1.w : v2;
-    return r;
-}
-
-
-/*
- * MAX
- */
-
-extern char __attribute__((overloadable)) max(char v1, char v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern char2 __attribute__((overloadable)) max(char2 v1, char2 v2) {
-    char2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern char3 __attribute__((overloadable)) max(char3 v1, char3 v2) {
-    char3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern char4 __attribute__((overloadable)) max(char4 v1, char4 v2) {
-    char4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern short __attribute__((overloadable)) max(short v1, short v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern short2 __attribute__((overloadable)) max(short2 v1, short2 v2) {
-    short2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern short3 __attribute__((overloadable)) max(short3 v1, short3 v2) {
-    short3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern short4 __attribute__((overloadable)) max(short4 v1, short4 v2) {
-    short4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern int __attribute__((overloadable)) max(int v1, int v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern int2 __attribute__((overloadable)) max(int2 v1, int2 v2) {
-    int2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern int3 __attribute__((overloadable)) max(int3 v1, int3 v2) {
-    int3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern int4 __attribute__((overloadable)) max(int4 v1, int4 v2) {
-    int4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern int64_t __attribute__((overloadable)) max(int64_t v1, int64_t v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern long2 __attribute__((overloadable)) max(long2 v1, long2 v2) {
-    long2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern long3 __attribute__((overloadable)) max(long3 v1, long3 v2) {
-    long3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern long4 __attribute__((overloadable)) max(long4 v1, long4 v2) {
-    long4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern uchar __attribute__((overloadable)) max(uchar v1, uchar v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern uchar2 __attribute__((overloadable)) max(uchar2 v1, uchar2 v2) {
-    uchar2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern uchar3 __attribute__((overloadable)) max(uchar3 v1, uchar3 v2) {
-    uchar3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern uchar4 __attribute__((overloadable)) max(uchar4 v1, uchar4 v2) {
-    uchar4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern ushort __attribute__((overloadable)) max(ushort v1, ushort v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern ushort2 __attribute__((overloadable)) max(ushort2 v1, ushort2 v2) {
-    ushort2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern ushort3 __attribute__((overloadable)) max(ushort3 v1, ushort3 v2) {
-    ushort3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern ushort4 __attribute__((overloadable)) max(ushort4 v1, ushort4 v2) {
-    ushort4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern uint __attribute__((overloadable)) max(uint v1, uint v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern uint2 __attribute__((overloadable)) max(uint2 v1, uint2 v2) {
-    uint2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern uint3 __attribute__((overloadable)) max(uint3 v1, uint3 v2) {
-    uint3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern uint4 __attribute__((overloadable)) max(uint4 v1, uint4 v2) {
-    uint4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern ulong __attribute__((overloadable)) max(ulong v1, ulong v2) {
-    return v1 > v2 ? v1 : v2;
-}
-
-extern ulong2 __attribute__((overloadable)) max(ulong2 v1, ulong2 v2) {
-    ulong2 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern ulong3 __attribute__((overloadable)) max(ulong3 v1, ulong3 v2) {
-    ulong3 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern ulong4 __attribute__((overloadable)) max(ulong4 v1, ulong4 v2) {
-    ulong4 r;
-    r.x = v1.x > v2.x ? v1.x : v2.x;
-    r.y = v1.y > v2.y ? v1.y : v2.y;
-    r.z = v1.z > v2.z ? v1.z : v2.z;
-    r.w = v1.w > v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern float __attribute__((overloadable)) max(float v1, float v2) {
-    return fmax(v1, v2);
-}
-
-extern float2 __attribute__((overloadable)) max(float2 v1, float2 v2) {
-    return fmax(v1, v2);
-}
-
-extern float2 __attribute__((overloadable)) max(float2 v1, float v2) {
-    return fmax(v1, v2);
-}
-
-extern float3 __attribute__((overloadable)) max(float3 v1, float3 v2) {
-    return fmax(v1, v2);
-}
-
-extern float3 __attribute__((overloadable)) max(float3 v1, float v2) {
-    return fmax(v1, v2);
-}
-
-extern float4 __attribute__((overloadable)) max(float4 v1, float4 v2) {
-    return fmax(v1, v2);
-}
-
-extern float4 __attribute__((overloadable)) max(float4 v1, float v2) {
-    return fmax(v1, v2);
-}
-
-
-/*
- * MIN
- */
-
-extern int8_t __attribute__((overloadable)) min(int8_t v1, int8_t v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-extern char2 __attribute__((overloadable)) min(char2 v1, char2 v2) {
-    char2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern char3 __attribute__((overloadable)) min(char3 v1, char3 v2) {
-    char3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern char4 __attribute__((overloadable)) min(char4 v1, char4 v2) {
-    char4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern int16_t __attribute__((overloadable)) min(int16_t v1, int16_t v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-extern short2 __attribute__((overloadable)) min(short2 v1, short2 v2) {
-    short2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern short3 __attribute__((overloadable)) min(short3 v1, short3 v2) {
-    short3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern short4 __attribute__((overloadable)) min(short4 v1, short4 v2) {
-    short4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern int32_t __attribute__((overloadable)) min(int32_t v1, int32_t v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-extern int2 __attribute__((overloadable)) min(int2 v1, int2 v2) {
-    int2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern int3 __attribute__((overloadable)) min(int3 v1, int3 v2) {
-    int3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern int4 __attribute__((overloadable)) min(int4 v1, int4 v2) {
-    int4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern int64_t __attribute__((overloadable)) min(int64_t v1, int64_t v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-extern long2 __attribute__((overloadable)) min(long2 v1, long2 v2) {
-    long2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern long3 __attribute__((overloadable)) min(long3 v1, long3 v2) {
-    long3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern long4 __attribute__((overloadable)) min(long4 v1, long4 v2) {
-    long4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern uchar __attribute__((overloadable)) min(uchar v1, uchar v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-extern uchar2 __attribute__((overloadable)) min(uchar2 v1, uchar2 v2) {
-    uchar2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern uchar3 __attribute__((overloadable)) min(uchar3 v1, uchar3 v2) {
-    uchar3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern uchar4 __attribute__((overloadable)) min(uchar4 v1, uchar4 v2) {
-    uchar4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern ushort __attribute__((overloadable)) min(ushort v1, ushort v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-extern ushort2 __attribute__((overloadable)) min(ushort2 v1, ushort2 v2) {
-    ushort2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern ushort3 __attribute__((overloadable)) min(ushort3 v1, ushort3 v2) {
-    ushort3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern ushort4 __attribute__((overloadable)) min(ushort4 v1, ushort4 v2) {
-    ushort4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern uint __attribute__((overloadable)) min(uint v1, uint v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-extern uint2 __attribute__((overloadable)) min(uint2 v1, uint2 v2) {
-    uint2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern uint3 __attribute__((overloadable)) min(uint3 v1, uint3 v2) {
-    uint3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern uint4 __attribute__((overloadable)) min(uint4 v1, uint4 v2) {
-    uint4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern ulong __attribute__((overloadable)) min(ulong v1, ulong v2) {
-    return v1 < v2 ? v1 : v2;
-}
-
-extern ulong2 __attribute__((overloadable)) min(ulong2 v1, ulong2 v2) {
-    ulong2 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    return r;
-}
-
-extern ulong3 __attribute__((overloadable)) min(ulong3 v1, ulong3 v2) {
-    ulong3 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    return r;
-}
-
-extern ulong4 __attribute__((overloadable)) min(ulong4 v1, ulong4 v2) {
-    ulong4 r;
-    r.x = v1.x < v2.x ? v1.x : v2.x;
-    r.y = v1.y < v2.y ? v1.y : v2.y;
-    r.z = v1.z < v2.z ? v1.z : v2.z;
-    r.w = v1.w < v2.w ? v1.w : v2.w;
-    return r;
-}
-
-extern float __attribute__((overloadable)) min(float v1, float v2) {
-    return fmin(v1, v2);
-}
-
-extern float2 __attribute__((overloadable)) min(float2 v1, float2 v2) {
-    return fmin(v1, v2);
-}
-
-extern float2 __attribute__((overloadable)) min(float2 v1, float v2) {
-    return fmin(v1, v2);
-}
-
-extern float3 __attribute__((overloadable)) min(float3 v1, float3 v2) {
-    return fmin(v1, v2);
-}
-
-extern float3 __attribute__((overloadable)) min(float3 v1, float v2) {
-    return fmin(v1, v2);
-}
-
-extern float4 __attribute__((overloadable)) min(float4 v1, float4 v2) {
-    return fmin(v1, v2);
-}
-
-extern float4 __attribute__((overloadable)) min(float4 v1, float v2) {
-    return fmin(v1, v2);
-}
-
-
-/*
- * YUV
- */
-
-extern uchar4 __attribute__((overloadable)) rsYuvToRGBA_uchar4(uchar y, uchar u, uchar v) {
-    short Y = ((short)y) - 16;
-    short U = ((short)u) - 128;
-    short V = ((short)v) - 128;
-
-    short4 p;
-    p.r = (Y * 298 + V * 409 + 128) >> 8;
-    p.g = (Y * 298 - U * 100 - V * 208 + 128) >> 8;
-    p.b = (Y * 298 + U * 516 + 128) >> 8;
-    p.a = 255;
-    p.r = rsClamp(p.r, (short)0, (short)255);
-    p.g = rsClamp(p.g, (short)0, (short)255);
-    p.b = rsClamp(p.b, (short)0, (short)255);
-
-    return convert_uchar4(p);
-}
-
-static float4 yuv_U_values = {0.f, -0.392f * 0.003921569f, +2.02 * 0.003921569f, 0.f};
-static float4 yuv_V_values = {1.603f * 0.003921569f, -0.815f * 0.003921569f, 0.f, 0.f};
-
-extern float4 __attribute__((overloadable)) rsYuvToRGBA_float4(uchar y, uchar u, uchar v) {
-    float4 color = (float)y * 0.003921569f;
-    float4 fU = ((float)u) - 128.f;
-    float4 fV = ((float)v) - 128.f;
-
-    color += fU * yuv_U_values;
-    color += fV * yuv_V_values;
-    color = clamp(color, 0.f, 1.f);
-    return color;
-}
-
-
-/*
- * half_RECIP
- */
-
-extern float __attribute__((overloadable)) half_recip(float v) {
-    // FIXME:  actual algorithm for generic approximate reciprocal
-    return 1.f / v;
-}
-
-extern float2 __attribute__((overloadable)) half_recip(float2 v) {
-    float2 r;
-    r.x = half_recip(r.x);
-    r.y = half_recip(r.y);
-    return r;
-}
-
-extern float3 __attribute__((overloadable)) half_recip(float3 v) {
-    float3 r;
-    r.x = half_recip(r.x);
-    r.y = half_recip(r.y);
-    r.z = half_recip(r.z);
-    return r;
-}
-
-extern float4 __attribute__((overloadable)) half_recip(float4 v) {
-    float4 r;
-    r.x = half_recip(r.x);
-    r.y = half_recip(r.y);
-    r.z = half_recip(r.z);
-    r.w = half_recip(r.w);
-    return r;
-}
-
-
-/*
- * half_SQRT
- */
-
-extern float __attribute__((overloadable)) half_sqrt(float v) {
-    return sqrt(v);
-}
-
-extern float2 __attribute__((overloadable)) half_sqrt(float2 v) {
-    float2 r;
-    r.x = half_sqrt(v.x);
-    r.y = half_sqrt(v.y);
-    return r;
-}
-
-extern float3 __attribute__((overloadable)) half_sqrt(float3 v) {
-    float3 r;
-    r.x = half_sqrt(v.x);
-    r.y = half_sqrt(v.y);
-    r.z = half_sqrt(v.z);
-    return r;
-}
-
-extern float4 __attribute__((overloadable)) half_sqrt(float4 v) {
-    float4 r;
-    r.x = half_sqrt(v.x);
-    r.y = half_sqrt(v.y);
-    r.z = half_sqrt(v.z);
-    r.w = half_sqrt(v.w);
-    return r;
-}
-
-
-/*
- * half_rsqrt
- */
-
-extern float __attribute__((overloadable)) half_rsqrt(float v) {
-    return 1.f / sqrt(v);
-}
-
-extern float2 __attribute__((overloadable)) half_rsqrt(float2 v) {
-    float2 r;
-    r.x = half_rsqrt(v.x);
-    r.y = half_rsqrt(v.y);
-    return r;
-}
-
-extern float3 __attribute__((overloadable)) half_rsqrt(float3 v) {
-    float3 r;
-    r.x = half_rsqrt(v.x);
-    r.y = half_rsqrt(v.y);
-    r.z = half_rsqrt(v.z);
-    return r;
-}
-
-extern float4 __attribute__((overloadable)) half_rsqrt(float4 v) {
-    float4 r;
-    r.x = half_rsqrt(v.x);
-    r.y = half_rsqrt(v.y);
-    r.z = half_rsqrt(v.z);
-    r.w = half_rsqrt(v.w);
-    return r;
-}
-
diff --git a/driver/runtime/arch/x86_math.ll b/driver/runtime/arch/x86_math.ll
deleted file mode 100755
index 60add80..0000000
--- a/driver/runtime/arch/x86_math.ll
+++ /dev/null
@@ -1,40 +0,0 @@
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
-target triple = "i386-unknown-linux-gnu"
-
-declare float @llvm.sqrt.f32(float) nounwind readnone
-declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) nounwind readnone
-declare <3 x float> @llvm.sqrt.v3f32(<3 x float>) nounwind readnone
-declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) nounwind readnone
-declare float @llvm.exp.f32(float) nounwind readonly
-declare float @llvm.pow.f32(float, float) nounwind readonly
-
-define float @_Z4sqrtf(float %in) nounwind readnone alwaysinline {
-  %1 = tail call float @llvm.sqrt.f32(float %in) nounwind readnone
-  ret float %1
-}
-
-define <2 x float> @_Z4sqrtDv2_f(<2 x float> %in) nounwind readnone alwaysinline {
-  %1 = tail call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) nounwind readnone
-  ret <2 x float> %1
-}
-
-define <3 x float> @_Z4sqrtDv3_f(<3 x float> %in) nounwind readnone alwaysinline {
-  %1 = tail call <3 x float> @llvm.sqrt.v3f32(<3 x float> %in) nounwind readnone
-  ret <3 x float> %1
-}
-
-define <4 x float> @_Z4sqrtDv4_f(<4 x float> %in) nounwind readnone alwaysinline {
-  %1 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) nounwind readnone
-  ret <4 x float> %1
-}
-
-define float @_Z3expf(float %in) nounwind readnone {
-  %1 = tail call float @llvm.exp.f32(float %in) nounwind readnone
-  ret float %1
-}
-
-define float @_Z3powff(float %v1, float %v2) nounwind readnone {
-  %1 = tail call float @llvm.pow.f32(float %v1, float %v2) nounwind readnone
-  ret float %1
-}
-
diff --git a/driver/runtime/arch/x86_clamp.ll b/driver/runtime/arch/x86_sse2.ll
old mode 100755
new mode 100644
similarity index 79%
rename from driver/runtime/arch/x86_clamp.ll
rename to driver/runtime/arch/x86_sse2.ll
index 422e9f6..e4fb035
--- a/driver/runtime/arch/x86_clamp.ll
+++ b/driver/runtime/arch/x86_sse2.ll
@@ -6,6 +6,14 @@
 declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>)
 declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>)
 
+declare float @llvm.sqrt.f32(float) nounwind readnone
+declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) nounwind readnone
+declare <3 x float> @llvm.sqrt.v3f32(<3 x float>) nounwind readnone
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) nounwind readnone
+
+declare float @llvm.exp.f32(float) nounwind readonly
+declare float @llvm.pow.f32(float, float) nounwind readonly
+
 define <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %in, <4 x float> %low, <4 x float> %high) nounwind readnone alwaysinline {
   %1 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %in, <4 x float> %high) nounwind readnone
   %2 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %1, <4 x float> %low) nounwind readnone
@@ -72,3 +80,23 @@
   %5 = tail call <2 x float> @_Z5clampDv2_fS_S_(<2 x float> %in, <2 x float> %2, <2 x float> %4) nounwind readnone
   ret <2 x float> %5
 }
+
+define float @_Z4sqrtf(float %in) nounwind readnone alwaysinline {
+  %1 = tail call float @llvm.sqrt.f32(float %in) nounwind readnone
+  ret float %1
+}
+
+define <2 x float> @_Z4sqrtDv2_f(<2 x float> %in) nounwind readnone alwaysinline {
+  %1 = tail call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) nounwind readnone
+  ret <2 x float> %1
+}
+
+define <3 x float> @_Z4sqrtDv3_f(<3 x float> %in) nounwind readnone alwaysinline {
+  %1 = tail call <3 x float> @llvm.sqrt.v3f32(<3 x float> %in) nounwind readnone
+  ret <3 x float> %1
+}
+
+define <4 x float> @_Z4sqrtDv4_f(<4 x float> %in) nounwind readnone alwaysinline {
+  %1 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) nounwind readnone
+  ret <4 x float> %1
+}
diff --git a/driver/runtime/arch/x86_dot_length.ll b/driver/runtime/arch/x86_sse3.ll
similarity index 99%
rename from driver/runtime/arch/x86_dot_length.ll
rename to driver/runtime/arch/x86_sse3.ll
index 21f2f3e..5c96daa 100644
--- a/driver/runtime/arch/x86_dot_length.ll
+++ b/driver/runtime/arch/x86_sse3.ll
@@ -72,4 +72,3 @@
 define float @_Z6lengthf(float %in) nounwind readnone alwaysinline {
   ret float %in
 }
-
diff --git a/driver/runtime/build_bc_lib.mk b/driver/runtime/build_bc_lib.mk
index 0344983..ab2c17b 100644
--- a/driver/runtime/build_bc_lib.mk
+++ b/driver/runtime/build_bc_lib.mk
@@ -37,10 +37,17 @@
              $(bc_translated_clang_cc1_cflags)
 
 ifeq ($(rs_debug_runtime),1)
-bc_cflags += -DRS_DEBUG_RUNTIME
+    bc_cflags += -DRS_DEBUG_RUNTIME
 endif
 rs_debug_runtime:=
 
+ifeq ($(ARCH_X86_HAVE_SSE2), true)
+    bc_cflags += -DARCH_X86_HAVE_SSE2
+endif
+ifeq ($(ARCH_X86_HAVE_SSE3), true)
+    bc_cflags += -DARCH_X86_HAVE_SSE3
+endif
+
 c_sources := $(filter %.c,$(LOCAL_SRC_FILES))
 ll_sources := $(filter %.ll,$(LOCAL_SRC_FILES))
 
diff --git a/driver/runtime/rs_cl.c b/driver/runtime/rs_cl.c
index b7f9158..7e8a574 100755
--- a/driver/runtime/rs_cl.c
+++ b/driver/runtime/rs_cl.c
@@ -591,6 +591,11 @@
 extern float __attribute__((overloadable)) rsqrt(float v) {
     return 1.f / sqrt(v);
 }
+
+#if !defined(ARCH_X86_HAVE_SSE2) && !defined(ARCH_X86_HAVE_SSE3)
+FN_FUNC_FN(sqrt)
+#endif // !defined(ARCH_X86_HAVE_SSE2) && !defined(ARCH_X86_HAVE_SSE3)
+
 FN_FUNC_FN(rsqrt)
 
 extern float __attribute__((overloadable)) sin(float);
@@ -897,11 +902,43 @@
     return r;
 }
 
+#if !defined(ARCH_X86_HAVE_SSE3)
+
+extern float __attribute__((overloadable)) dot(float lhs, float rhs) {
+    return lhs * rhs;
+}
+extern float __attribute__((overloadable)) dot(float2 lhs, float2 rhs) {
+    return lhs.x*rhs.x + lhs.y*rhs.y;
+}
+extern float __attribute__((overloadable)) dot(float3 lhs, float3 rhs) {
+    return lhs.x*rhs.x + lhs.y*rhs.y + lhs.z*rhs.z;
+}
+extern float __attribute__((overloadable)) dot(float4 lhs, float4 rhs) {
+    return lhs.x*rhs.x + lhs.y*rhs.y + lhs.z*rhs.z + lhs.w*rhs.w;
+}
+
+extern float __attribute__((overloadable)) length(float v) {
+    return fabs(v);
+}
+extern float __attribute__((overloadable)) length(float2 v) {
+    return sqrt(v.x*v.x + v.y*v.y);
+}
+extern float __attribute__((overloadable)) length(float3 v) {
+    return sqrt(v.x*v.x + v.y*v.y + v.z*v.z);
+}
+extern float __attribute__((overloadable)) length(float4 v) {
+    return sqrt(v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w);
+}
+
+#else
+
 extern float __attribute__((overloadable)) length(float v);
 extern float __attribute__((overloadable)) length(float2 v);
 extern float __attribute__((overloadable)) length(float3 v);
 extern float __attribute__((overloadable)) length(float4 v);
 
+#endif
+
 extern float __attribute__((overloadable)) distance(float lhs, float rhs) {
     return length(lhs - rhs);
 }