Implement optimized clamp.

Provide both a generic C implementation file (clamp.c) and an optimized
NEON path (neon/clamp.ll).  The Makefile will need a switch to build both.

Change-Id: I24cc80de03260a9053eff7dc8e64dc7fe03a92ac
diff --git a/lib/ScriptCRT/Android.mk b/lib/ScriptCRT/Android.mk
index fc26062..d439181 100644
--- a/lib/ScriptCRT/Android.mk
+++ b/lib/ScriptCRT/Android.mk
@@ -32,6 +32,7 @@
 
 # C source files for the library
 clcore_c_files := \
+    clamp.c \
     rs_allocation.c \
     rs_cl.c \
     rs_core.c \
@@ -43,7 +44,6 @@
 
 # Hand-written bitcode for the library
 clcore_ll_files := \
-    clamp.ll \
     convert.ll \
     matrix.ll \
     pixel_packing.ll
diff --git a/lib/ScriptCRT/clamp.c b/lib/ScriptCRT/clamp.c
new file mode 100644
index 0000000..c7e2c39
--- /dev/null
+++ b/lib/ScriptCRT/clamp.c
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "rs_types.rsh"
+
+extern float __attribute__((overloadable)) clamp(float amount, float low, float high) {
+    return amount < low ? low : (amount > high ? high : amount);
+}
+
+extern float2 __attribute__((overloadable)) clamp(float2 amount, float2 low, float2 high) {
+    float2 r;
+    r.x = amount.x < low.x ? low.x : (amount.x > high.x ? high.x : amount.x);
+    r.y = amount.y < low.y ? low.y : (amount.y > high.y ? high.y : amount.y);
+    return r;
+}
+
+extern float3 __attribute__((overloadable)) clamp(float3 amount, float3 low, float3 high) {
+    float3 r;
+    r.x = amount.x < low.x ? low.x : (amount.x > high.x ? high.x : amount.x);
+    r.y = amount.y < low.y ? low.y : (amount.y > high.y ? high.y : amount.y);
+    r.z = amount.z < low.z ? low.z : (amount.z > high.z ? high.z : amount.z);
+    return r;
+}
+
+extern float4 __attribute__((overloadable)) clamp(float4 amount, float4 low, float4 high) {
+    float4 r;
+    r.x = amount.x < low.x ? low.x : (amount.x > high.x ? high.x : amount.x);
+    r.y = amount.y < low.y ? low.y : (amount.y > high.y ? high.y : amount.y);
+    r.z = amount.z < low.z ? low.z : (amount.z > high.z ? high.z : amount.z);
+    r.w = amount.w < low.w ? low.w : (amount.w > high.w ? high.w : amount.w);
+    return r;
+}
+
+extern float2 __attribute__((overloadable)) clamp(float2 amount, float low, float high) {
+    float2 r;
+    r.x = amount.x < low ? low : (amount.x > high ? high : amount.x);
+    r.y = amount.y < low ? low : (amount.y > high ? high : amount.y);
+    return r;
+}
+
+extern float3 __attribute__((overloadable)) clamp(float3 amount, float low, float high) {
+    float3 r;
+    r.x = amount.x < low ? low : (amount.x > high ? high : amount.x);
+    r.y = amount.y < low ? low : (amount.y > high ? high : amount.y);
+    r.z = amount.z < low ? low : (amount.z > high ? high : amount.z);
+    return r;
+}
+
+extern float4 __attribute__((overloadable)) clamp(float4 amount, float low, float high) {
+    float4 r;
+    r.x = amount.x < low ? low : (amount.x > high ? high : amount.x);
+    r.y = amount.y < low ? low : (amount.y > high ? high : amount.y);
+    r.z = amount.z < low ? low : (amount.z > high ? high : amount.z);
+    r.w = amount.w < low ? low : (amount.w > high ? high : amount.w);
+    return r;
+}
+
+
diff --git a/lib/ScriptCRT/clamp.ll b/lib/ScriptCRT/clamp.ll
deleted file mode 100644
index f4d58ec..0000000
--- a/lib/ScriptCRT/clamp.ll
+++ /dev/null
@@ -1,16 +0,0 @@
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
-target triple = "armv7-none-linux-gnueabi"
-
-define i32 @_Z7rsClampjjj(i32 %amount, i32 %low, i32 %high) nounwind readnone alwaysinline {
-  %1 = icmp ult i32 %amount, %low
-  br i1 %1, label %5, label %2
-
-; <label>:2                                       ; preds = %0
-  %3 = icmp ugt i32 %amount, %high
-  %4 = select i1 %3, i32 %high, i32 %amount
-  br label %5
-
-; <label>:5                                       ; preds = %2, %0
-  %6 = phi i32 [ %4, %2 ], [ %low, %0 ]
-  ret i32 %6
-}
diff --git a/lib/ScriptCRT/neon/clamp.ll b/lib/ScriptCRT/neon/clamp.ll
new file mode 100644
index 0000000..4bcbdaa
--- /dev/null
+++ b/lib/ScriptCRT/neon/clamp.ll
@@ -0,0 +1,81 @@
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
+target triple = "armv7-none-linux-gnueabi"
+
+define internal <4 x float> @smear_4f(float %in) nounwind readnone alwaysinline {
+  %1 = insertelement <4 x float> undef, float %in, i32 0
+  %2 = insertelement <4 x float> %1, float %in, i32 1
+  %3 = insertelement <4 x float> %2, float %in, i32 2
+  %4 = insertelement <4 x float> %3, float %in, i32 3
+  ret <4 x float> %4
+}
+
+define internal <2 x float> @smear_2f(float %in) nounwind readnone alwaysinline {
+  %1 = insertelement <2 x float> undef, float %in, i32 0
+  %2 = insertelement <2 x float> %1, float %in, i32 1
+  ret <2 x float> %2
+}
+
+declare <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone
+
+define <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %value, <4 x float> %low, <4 x float> %high) nounwind readonly {
+  %1 = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %value, <4 x float> %high) nounwind readnone
+  %2 = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %1, <4 x float> %low) nounwind readnone
+  ret <4 x float> %2
+}
+
+define <4 x float> @_Z5clampDv4_fff(<4 x float> %value, float %low, float %high) nounwind readonly {
+  %_high = tail call <4 x float> @smear_4f(float %high) nounwind readnone
+  %_low = tail call <4 x float> @smear_4f(float %low) nounwind readnone
+  %out = tail call <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %value, <4 x float> %_low, <4 x float> %_high) nounwind readonly
+  ret <4 x float> %out
+}
+
+define <3 x float> @_Z5clampDv3_fS_S_(<3 x float> %value, <3 x float> %low, <3 x float> %high) nounwind readonly {
+  %_value = shufflevector <3 x float> %value, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %_low = shufflevector <3 x float> %low, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %_high = shufflevector <3 x float> %high, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %a = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %_value, <4 x float> %_high) nounwind readnone
+  %b = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %a, <4 x float> %_low) nounwind readnone
+  %c = shufflevector <4 x float> %b, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  ret <3 x float> %c
+}
+
+define <3 x float> @_Z5clampDv3_fff(<3 x float> %value, float %low, float %high) nounwind readonly {
+  %_value = shufflevector <3 x float> %value, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %_high = tail call <4 x float> @smear_4f(float %high) nounwind readnone
+  %_low = tail call <4 x float> @smear_4f(float %low) nounwind readnone
+  %a = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %_value, <4 x float> %_high) nounwind readnone
+  %b = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %a, <4 x float> %_low) nounwind readnone
+  %c = shufflevector <4 x float> %b, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  ret <3 x float> %c
+}
+
+
+define <2 x float> @_Z5clampDv2_fS_S_(<2 x float> %value, <2 x float> %low, <2 x float> %high) nounwind readonly {
+  %1 = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %value, <2 x float> %high) nounwind readnone
+  %2 = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %1, <2 x float> %low) nounwind readnone
+  ret <2 x float> %2
+}
+
+define <2 x float> @_Z5clampDv2_fff(<2 x float> %value, float %low, float %high) nounwind readonly {
+  %_high = tail call <2 x float> @smear_2f(float %high) nounwind readnone
+  %_low = tail call <2 x float> @smear_2f(float %low) nounwind readnone
+  %a = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %value, <2 x float> %_high) nounwind readnone
+  %b = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %a, <2 x float> %_low) nounwind readnone
+  ret <2 x float> %b
+}
+
+
+define float @_Z5clampfff(float %value, float %low, float %high) nounwind readonly {
+  %_value = tail call <2 x float> @smear_2f(float %value) nounwind readnone
+  %_low = tail call <2 x float> @smear_2f(float %low) nounwind readnone
+  %_high = tail call <2 x float> @smear_2f(float %high) nounwind readnone
+  %a = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %_value, <2 x float> %_high) nounwind readnone
+  %b = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %a, <2 x float> %_low) nounwind readnone
+  %c = extractelement <2 x float> %b, i32 0
+  ret float %c
+}
+
diff --git a/lib/ScriptCRT/rs_cl.c b/lib/ScriptCRT/rs_cl.c
index 77a535b..1225725 100644
--- a/lib/ScriptCRT/rs_cl.c
+++ b/lib/ScriptCRT/rs_cl.c
@@ -682,51 +682,6 @@
 
 // 6.11.4
 
-extern float __attribute__((overloadable)) clamp(float amount, float low, float high) {
-    return amount < low ? low : (amount > high ? high : amount);
-}
-extern float2 __attribute__((overloadable)) clamp(float2 amount, float2 low, float2 high) {
-    float2 r;
-    r.x = amount.x < low.x ? low.x : (amount.x > high.x ? high.x : amount.x);
-    r.y = amount.y < low.y ? low.y : (amount.y > high.y ? high.y : amount.y);
-    return r;
-}
-extern float3 __attribute__((overloadable)) clamp(float3 amount, float3 low, float3 high) {
-    float3 r;
-    r.x = amount.x < low.x ? low.x : (amount.x > high.x ? high.x : amount.x);
-    r.y = amount.y < low.y ? low.y : (amount.y > high.y ? high.y : amount.y);
-    r.z = amount.z < low.z ? low.z : (amount.z > high.z ? high.z : amount.z);
-    return r;
-}
-extern float4 __attribute__((overloadable)) clamp(float4 amount, float4 low, float4 high) {
-    float4 r;
-    r.x = amount.x < low.x ? low.x : (amount.x > high.x ? high.x : amount.x);
-    r.y = amount.y < low.y ? low.y : (amount.y > high.y ? high.y : amount.y);
-    r.z = amount.z < low.z ? low.z : (amount.z > high.z ? high.z : amount.z);
-    r.w = amount.w < low.w ? low.w : (amount.w > high.w ? high.w : amount.w);
-    return r;
-}
-extern float2 __attribute__((overloadable)) clamp(float2 amount, float low, float high) {
-    float2 r;
-    r.x = amount.x < low ? low : (amount.x > high ? high : amount.x);
-    r.y = amount.y < low ? low : (amount.y > high ? high : amount.y);
-    return r;
-}
-extern float3 __attribute__((overloadable)) clamp(float3 amount, float low, float high) {
-    float3 r;
-    r.x = amount.x < low ? low : (amount.x > high ? high : amount.x);
-    r.y = amount.y < low ? low : (amount.y > high ? high : amount.y);
-    r.z = amount.z < low ? low : (amount.z > high ? high : amount.z);
-    return r;
-}
-extern float4 __attribute__((overloadable)) clamp(float4 amount, float low, float high) {
-    float4 r;
-    r.x = amount.x < low ? low : (amount.x > high ? high : amount.x);
-    r.y = amount.y < low ? low : (amount.y > high ? high : amount.y);
-    r.z = amount.z < low ? low : (amount.z > high ? high : amount.z);
-    r.w = amount.w < low ? low : (amount.w > high ? high : amount.w);
-    return r;
-}
 
 extern float __attribute__((overloadable)) degrees(float radians) {
     return radians * (180.f / M_PI);
diff --git a/lib/ScriptCRT/rs_core.c b/lib/ScriptCRT/rs_core.c
index c9bf3be..f655c05 100644
--- a/lib/ScriptCRT/rs_core.c
+++ b/lib/ScriptCRT/rs_core.c
@@ -190,9 +190,9 @@
 // int ops
 /////////////////////////////////////////////////////
 
-/*extern uint __attribute__((overloadable, always_inline)) rsClamp(uint amount, uint low, uint high) {
+extern uint __attribute__((overloadable, always_inline)) rsClamp(uint amount, uint low, uint high) {
     return amount < low ? low : (amount > high ? high : amount);
-}*/
+}
 extern int __attribute__((overloadable, always_inline)) rsClamp(int amount, int low, int high) {
     return amount < low ? low : (amount > high ? high : amount);
 }