Reconcile with jb-release

Change-Id: I004954e32fede7bc02333dcfa29982117be6bbea
diff --git a/bcinfo/MetadataExtractor.cpp b/bcinfo/MetadataExtractor.cpp
index 3be7e35..e4a2573 100644
--- a/bcinfo/MetadataExtractor.cpp
+++ b/bcinfo/MetadataExtractor.cpp
@@ -20,6 +20,7 @@
 
 #define LOG_TAG "bcinfo"
 #include <cutils/log.h>
+#include <cutils/properties.h>
 
 #include "llvm/ADT/OwningPtr.h"
 #include "llvm/Bitcode/ReaderWriter.h"
@@ -221,6 +222,7 @@
   // Check to see if we have any FP precision-related pragmas.
   std::string Relaxed("rs_fp_relaxed");
   std::string Imprecise("rs_fp_imprecise");
+  std::string Full("rs_fp_full");
   bool RelaxedPragmaSeen = false;
   bool ImprecisePragmaSeen = false;
 
@@ -246,6 +248,26 @@
     mRSFloatPrecision = RS_FP_Relaxed;
   }
 
+  // Provide an override for precision via adb shell setprop:
+  // adb shell setprop debug.rs.precision rs_fp_full
+  // adb shell setprop debug.rs.precision rs_fp_relaxed
+  // adb shell setprop debug.rs.precision rs_fp_imprecise
+  char PrecisionPropBuf[PROPERTY_VALUE_MAX];
+  const std::string PrecisionPropName("debug.rs.precision");
+  property_get(PrecisionPropName.c_str(), PrecisionPropBuf, "");
+  if (PrecisionPropBuf[0]) {
+    if (!Relaxed.compare(PrecisionPropBuf)) {
+      ALOGE("Switching to RS FP relaxed mode via setprop");
+      mRSFloatPrecision = RS_FP_Relaxed;
+    } else if (!Imprecise.compare(PrecisionPropBuf)) {
+      ALOGE("Switching to RS FP imprecise mode via setprop");
+      mRSFloatPrecision = RS_FP_Imprecise;
+    } else if (!Full.compare(PrecisionPropBuf)) {
+      ALOGE("Switching to RS FP full mode via setprop");
+      mRSFloatPrecision = RS_FP_Full;
+    }  // any other value leaves mRSFloatPrecision untouched
+  }
+
   return;
 }
 
diff --git a/bcinfo/tools/Android.mk b/bcinfo/tools/Android.mk
index 0d99f71..f7cf34b 100644
--- a/bcinfo/tools/Android.mk
+++ b/bcinfo/tools/Android.mk
@@ -16,6 +16,8 @@
 
 LOCAL_PATH := $(call my-dir)
 
+LLVM_ROOT_PATH := external/llvm
+
 # Executable for host
 # ========================================================
 include $(CLEAR_VARS)
@@ -28,6 +30,12 @@
 LOCAL_SHARED_LIBRARIES := \
   libbcinfo
 
+LOCAL_STATIC_LIBRARIES := \
+  libLLVMBitReader \
+  libLLVMBitWriter \
+  libLLVMCore \
+  libLLVMSupport
+
 LOCAL_CFLAGS += -D__HOST__
 
 LOCAL_C_INCLUDES := \
@@ -37,5 +45,6 @@
 
 LOCAL_LDLIBS = -ldl
 
+include $(LLVM_ROOT_PATH)/llvm-host-build.mk
 include $(BUILD_HOST_EXECUTABLE)
 
diff --git a/bcinfo/tools/main.cpp b/bcinfo/tools/main.cpp
index 1360c68..2d1e449 100644
--- a/bcinfo/tools/main.cpp
+++ b/bcinfo/tools/main.cpp
@@ -18,6 +18,16 @@
 #include <bcinfo/BitcodeWrapper.h>
 #include <bcinfo/MetadataExtractor.h>
 
+#include <llvm/ADT/OwningPtr.h>
+#include <llvm/ADT/StringRef.h>
+#include <llvm/Assembly/AssemblyAnnotationWriter.h>
+#include <llvm/Bitcode/ReaderWriter.h>
+#include <llvm/LLVMContext.h>
+#include <llvm/Module.h>
+#include <llvm/Support/ManagedStatic.h>
+#include <llvm/Support/MemoryBuffer.h>
+#include <llvm/Support/ToolOutputFile.h>
+
 #include <ctype.h>
 #include <dlfcn.h>
 #include <stdarg.h>
@@ -33,12 +43,14 @@
 
 #include <unistd.h>
 
+#include <string>
 #include <vector>
 
 // This file corresponds to the standalone bcinfo tool. It prints a variety of
 // information about a supplied bitcode input file.
 
-const char* inFile = NULL;
+std::string inFile;
+std::string outFile;
 
 extern int opterr;
 extern int optind;
@@ -72,6 +84,13 @@
   }
 
   inFile = argv[optind];
+
+  int l = inFile.length();
+  if (l > 3 && inFile[l-3] == '.' && inFile[l-2] == 'b' && inFile[l-1] == 'c') {
+    outFile = std::string(inFile.begin(), inFile.end() - 3) + ".ll";
+  } else {
+    outFile = inFile + ".ll";
+  }
   return 1;
 }
 
@@ -141,26 +160,26 @@
 
 
 static size_t readBitcode(const char **bitcode) {
-  if (!inFile) {
+  if (!inFile.length()) {
     fprintf(stderr, "input file required\n");
-    return NULL;
+    return 0;
   }
 
   struct stat statInFile;
-  if (stat(inFile, &statInFile) < 0) {
+  if (stat(inFile.c_str(), &statInFile) < 0) {
     fprintf(stderr, "Unable to stat input file: %s\n", strerror(errno));
-    return NULL;
+    return 0;
   }
 
   if (!S_ISREG(statInFile.st_mode)) {
     fprintf(stderr, "Input file should be a regular file.\n");
-    return NULL;
+    return 0;
   }
 
-  FILE *in = fopen(inFile, "r");
+  FILE *in = fopen(inFile.c_str(), "r");
   if (!in) {
-    fprintf(stderr, "Could not open input file %s\n", inFile);
-    return NULL;
+    fprintf(stderr, "Could not open input file %s\n", inFile.c_str());
+    return 0;
   }
 
   size_t bitcodeSize = statInFile.st_size;
@@ -169,7 +188,7 @@
   size_t nread = fread((void*) *bitcode, 1, bitcodeSize, in);
 
   if (nread != bitcodeSize)
-      fprintf(stderr, "Could not read all of file %s\n", inFile);
+      fprintf(stderr, "Could not read all of file %s\n", inFile.c_str());
 
   fclose(in);
   return nread;
@@ -192,7 +211,6 @@
   }
 
   const char *bitcode = NULL;
-  const char *translatedBitcode = NULL;
   size_t bitcodeSize = readBitcode(&bitcode);
 
   unsigned int version = 0;
@@ -209,25 +227,64 @@
   printf("compilerVersion: %u\n", bcWrapper.getCompilerVersion());
   printf("optimizationLevel: %u\n\n", bcWrapper.getOptimizationLevel());
 
-  bcinfo::BitcodeTranslator *BT =
-      new bcinfo::BitcodeTranslator(bitcode, bitcodeSize, version);
+  llvm::OwningPtr<bcinfo::BitcodeTranslator> BT;
+  BT.reset(new bcinfo::BitcodeTranslator(bitcode, bitcodeSize, version));
   if (!BT->translate()) {
     fprintf(stderr, "failed to translate bitcode\n");
     return 3;
   }
 
-  bcinfo::MetadataExtractor *ME =
-      new bcinfo::MetadataExtractor(BT->getTranslatedBitcode(),
-                                    BT->getTranslatedBitcodeSize());
+  llvm::OwningPtr<bcinfo::MetadataExtractor> ME;
+  ME.reset(new bcinfo::MetadataExtractor(BT->getTranslatedBitcode(),
+                                         BT->getTranslatedBitcodeSize()));
   if (!ME->extract()) {
     fprintf(stderr, "failed to get metadata\n");
     return 4;
   }
 
-  dumpMetadata(ME);
+  dumpMetadata(ME.get());
 
-  delete ME;
-  delete BT;
+  const char *translatedBitcode = BT->getTranslatedBitcode();
+  size_t translatedBitcodeSize = BT->getTranslatedBitcodeSize();
+
+  llvm::LLVMContext &ctx = llvm::getGlobalContext();
+  llvm::llvm_shutdown_obj called_on_exit;
+
+  llvm::OwningPtr<llvm::MemoryBuffer> mem;
+
+  mem.reset(llvm::MemoryBuffer::getMemBuffer(
+      llvm::StringRef(translatedBitcode, translatedBitcodeSize),
+      inFile.c_str(), false));
+
+  llvm::OwningPtr<llvm::Module> module;
+  std::string errmsg;
+  module.reset(llvm::ParseBitcodeFile(mem.get(), ctx, &errmsg));
+  if (module.get() != 0 && module->MaterializeAllPermanently(&errmsg)) {
+    module.reset();
+  }
+
+  if (module.get() == 0) {
+    if (errmsg.size()) {
+      fprintf(stderr, "error: %s\n", errmsg.c_str());
+    } else {
+      fprintf(stderr, "error: failed to parse bitcode file\n");
+    }
+    return 5;
+  }
+
+  llvm::OwningPtr<llvm::tool_output_file> tof(
+      new llvm::tool_output_file(outFile.c_str(), errmsg,
+                                 llvm::raw_fd_ostream::F_Binary));
+  if (!errmsg.empty()) {
+    // The tool_output_file constructor reports a failure to open outFile
+    // through errmsg; bail out instead of printing into a broken stream.
+    fprintf(stderr, "error: %s\n", errmsg.c_str());
+    return 6;
+  }
+  llvm::OwningPtr<llvm::AssemblyAnnotationWriter> ann;
+  module->print(tof->os(), ann.get());
+
+  tof->keep();
 
   releaseBitcode(&bitcode);
 
diff --git a/lib/ExecutionEngine/Runtime.def b/lib/ExecutionEngine/Runtime.def
index e4a6875..69c2b16 100644
--- a/lib/ExecutionEngine/Runtime.def
+++ b/lib/ExecutionEngine/Runtime.def
@@ -117,9 +117,7 @@
 
 DEF_GENERIC_OR_VFP_RUNTIME(__divdf3)
 
-#if !defined(__i386__)
-    DEF_LLVM_RUNTIME(__divdi3)
-#endif
+DEF_LLVM_RUNTIME(__divdi3)
 DEF_LLVM_RUNTIME(__divsi3)
 
 #if !defined(ANDROID) /* no complex extension */
@@ -183,9 +181,7 @@
 DEF_VFP_RUNTIME(__ltdf2)
 DEF_VFP_RUNTIME(__ltsf2)
 
-#if !defined(__i386__)
-    DEF_LLVM_RUNTIME(__moddi3)
-#endif
+DEF_LLVM_RUNTIME(__moddi3)
 DEF_LLVM_RUNTIME(__modsi3)
 
 #ifndef ANDROID // no complex extension
@@ -235,14 +231,10 @@
 DEF_GENERIC_OR_VFP_RUNTIME(__truncdfsf2)
 
 DEF_LLVM_RUNTIME(__ucmpdi2)
-#if !defined(__i386__)
-    DEF_LLVM_RUNTIME(__udivdi3)
-#endif
+DEF_LLVM_RUNTIME(__udivdi3)
 DEF_LLVM_RUNTIME(__udivmoddi4)
 DEF_LLVM_RUNTIME(__udivsi3)
-#if !defined(__i386__)
-    DEF_LLVM_RUNTIME(__umoddi3)
-#endif
+DEF_LLVM_RUNTIME(__umoddi3)
 DEF_LLVM_RUNTIME(__umodsi3)
 
 DEF_GENERIC_OR_VFP_RUNTIME(__unorddf2)
diff --git a/lib/ExecutionEngine/RuntimeStub.h b/lib/ExecutionEngine/RuntimeStub.h
index 1ca678b..bea4a6c 100644
--- a/lib/ExecutionEngine/RuntimeStub.h
+++ b/lib/ExecutionEngine/RuntimeStub.h
@@ -25,9 +25,7 @@
 #ifndef ANDROID /* no complex.h */
 extern double _Complex __divdc3(double, double, double, double);
 #endif
-#if !defined(__i386__)
 extern di_int __divdi3(di_int, di_int);
-#endif
 extern si_int __divsi3(si_int, si_int);
 #ifndef ANDROID /* no complex.h */
 extern float _Complex __divsc3(float, float, float, float);
@@ -44,8 +42,8 @@
 extern float __floatdisf(di_int);
 extern double __floatundidf(du_int);
 extern float __floatundisf(du_int);
-extern di_int __moddi3(di_int, di_int);
 #endif
+extern di_int __moddi3(di_int, di_int);
 extern si_int __modsi3(si_int, si_int);
 #if !defined(__i386__) && !defined(__SSE2__)
 extern di_int __lshrdi3(di_int, si_int);
@@ -73,14 +71,10 @@
 extern di_int __subvdi3(di_int, di_int);
 extern si_int __subvsi3(si_int, si_int);
 extern si_int __ucmpdi2(du_int, du_int);
-#if !defined(__i386__)
 extern du_int __udivdi3(du_int, du_int);
-#endif
 extern su_int __udivsi3(su_int, su_int);
 extern du_int __udivmoddi4(du_int, du_int, du_int *);
-#if !defined(__i386__)
 extern du_int __umoddi3(du_int, du_int);
-#endif
 extern su_int __umodsi3(su_int, su_int);
 extern void __eprintf(char const *, char const *, char const *, char const *)
   __attribute__((visibility("hidden")));
diff --git a/lib/ExecutionEngine/Script.cpp b/lib/ExecutionEngine/Script.cpp
index 11ef0e1..0c3c7a5 100644
--- a/lib/ExecutionEngine/Script.cpp
+++ b/lib/ExecutionEngine/Script.cpp
@@ -48,6 +48,12 @@
   return strcmp(buf, "0") != 0;
 }
 
+bool isSetProp(const char *str) {
+  char value[PROPERTY_VALUE_MAX];
+  property_get(str, value, "");  // "" is the fallback handed to property_get
+  return *value != '\0';         // a non-empty value counts as "set"
+}
+
 } // namespace anonymous
 
 namespace bcc {
@@ -762,6 +768,11 @@
     return false;
   }
 
+  if (isSetProp("debug.rs.precision")) {
+    // If we have a floating point precision override, don't use the cache.
+    return false;
+  }
+
   if (mCacheDir.empty() || mCacheName.empty()) {
     // The application developer has not specified the cachePath, so
     // we don't know where to open the cache file.
diff --git a/lib/ScriptCRT/Android.mk b/lib/ScriptCRT/Android.mk
index f8b6d80..4227875 100644
--- a/lib/ScriptCRT/Android.mk
+++ b/lib/ScriptCRT/Android.mk
@@ -28,15 +28,16 @@
     rs_sampler.c \
     convert.ll \
     matrix.ll \
-    pixel_packing.ll
+    pixel_packing.ll \
+    math.ll
 
 clcore_files := \
     $(clcore_base_files) \
-    clamp.c
+    arch/generic.c
 
 clcore_neon_files := \
     $(clcore_base_files) \
-    neon/clamp.ll
+    arch/neon.ll
 
 ifeq "REL" "$(PLATFORM_VERSION_CODENAME)"
   RS_VERSION := $(PLATFORM_SDK_VERSION)
diff --git a/lib/ScriptCRT/arch/generic.c b/lib/ScriptCRT/arch/generic.c
new file mode 100644
index 0000000..97b7b2f
--- /dev/null
+++ b/lib/ScriptCRT/arch/generic.c
@@ -0,0 +1,702 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "rs_types.rsh"
+
+/*
+ * CLAMP
+ */
+
+extern float __attribute__((overloadable)) clamp(float amount, float low, float high) {
+    return amount < low ? low : (amount > high ? high : amount);
+}
+
+extern float2 __attribute__((overloadable)) clamp(float2 amount, float2 low, float2 high) {
+    float2 r;
+    r.x = amount.x < low.x ? low.x : (amount.x > high.x ? high.x : amount.x);
+    r.y = amount.y < low.y ? low.y : (amount.y > high.y ? high.y : amount.y);
+    return r;
+}
+
+extern float3 __attribute__((overloadable)) clamp(float3 amount, float3 low, float3 high) {
+    float3 r;
+    r.x = amount.x < low.x ? low.x : (amount.x > high.x ? high.x : amount.x);
+    r.y = amount.y < low.y ? low.y : (amount.y > high.y ? high.y : amount.y);
+    r.z = amount.z < low.z ? low.z : (amount.z > high.z ? high.z : amount.z);
+    return r;
+}
+
+extern float4 __attribute__((overloadable)) clamp(float4 amount, float4 low, float4 high) {
+    float4 r;
+    r.x = amount.x < low.x ? low.x : (amount.x > high.x ? high.x : amount.x);
+    r.y = amount.y < low.y ? low.y : (amount.y > high.y ? high.y : amount.y);
+    r.z = amount.z < low.z ? low.z : (amount.z > high.z ? high.z : amount.z);
+    r.w = amount.w < low.w ? low.w : (amount.w > high.w ? high.w : amount.w);
+    return r;
+}
+
+extern float2 __attribute__((overloadable)) clamp(float2 amount, float low, float high) {
+    float2 r;
+    r.x = amount.x < low ? low : (amount.x > high ? high : amount.x);
+    r.y = amount.y < low ? low : (amount.y > high ? high : amount.y);
+    return r;
+}
+
+extern float3 __attribute__((overloadable)) clamp(float3 amount, float low, float high) {
+    float3 r;
+    r.x = amount.x < low ? low : (amount.x > high ? high : amount.x);
+    r.y = amount.y < low ? low : (amount.y > high ? high : amount.y);
+    r.z = amount.z < low ? low : (amount.z > high ? high : amount.z);
+    return r;
+}
+
+extern float4 __attribute__((overloadable)) clamp(float4 amount, float low, float high) {
+    float4 r;
+    r.x = amount.x < low ? low : (amount.x > high ? high : amount.x);
+    r.y = amount.y < low ? low : (amount.y > high ? high : amount.y);
+    r.z = amount.z < low ? low : (amount.z > high ? high : amount.z);
+    r.w = amount.w < low ? low : (amount.w > high ? high : amount.w);
+    return r;
+}
+
+
+/*
+ * FMAX
+ */
+
+extern float __attribute__((overloadable)) fmax(float v1, float v2) {
+    return v1 > v2 ? v1 : v2;
+}
+
+extern float2 __attribute__((overloadable)) fmax(float2 v1, float2 v2) {
+    float2 r;
+    r.x = v1.x > v2.x ? v1.x : v2.x;
+    r.y = v1.y > v2.y ? v1.y : v2.y;
+    return r;
+}
+
+extern float3 __attribute__((overloadable)) fmax(float3 v1, float3 v2) {
+    float3 r;
+    r.x = v1.x > v2.x ? v1.x : v2.x;
+    r.y = v1.y > v2.y ? v1.y : v2.y;
+    r.z = v1.z > v2.z ? v1.z : v2.z;
+    return r;
+}
+
+extern float4 __attribute__((overloadable)) fmax(float4 v1, float4 v2) {
+    float4 r;
+    r.x = v1.x > v2.x ? v1.x : v2.x;
+    r.y = v1.y > v2.y ? v1.y : v2.y;
+    r.z = v1.z > v2.z ? v1.z : v2.z;
+    r.w = v1.w > v2.w ? v1.w : v2.w;
+    return r;
+}
+
+extern float2 __attribute__((overloadable)) fmax(float2 v1, float v2) {
+    float2 r;
+    r.x = v1.x > v2 ? v1.x : v2;
+    r.y = v1.y > v2 ? v1.y : v2;
+    return r;
+}
+
+extern float3 __attribute__((overloadable)) fmax(float3 v1, float v2) {
+    float3 r;
+    r.x = v1.x > v2 ? v1.x : v2;
+    r.y = v1.y > v2 ? v1.y : v2;
+    r.z = v1.z > v2 ? v1.z : v2;
+    return r;
+}
+
+extern float4 __attribute__((overloadable)) fmax(float4 v1, float v2) {
+    float4 r;
+    r.x = v1.x > v2 ? v1.x : v2;
+    r.y = v1.y > v2 ? v1.y : v2;
+    r.z = v1.z > v2 ? v1.z : v2;
+    r.w = v1.w > v2 ? v1.w : v2;
+    return r;
+}
+
+
+/*
+ * FMIN
+ */
+extern float __attribute__((overloadable)) fmin(float v1, float v2) {
+    return v1 < v2 ? v1 : v2;
+}
+
+extern float2 __attribute__((overloadable)) fmin(float2 v1, float2 v2) {
+    float2 r;
+    r.x = v1.x < v2.x ? v1.x : v2.x;
+    r.y = v1.y < v2.y ? v1.y : v2.y;
+    return r;
+}
+
+extern float3 __attribute__((overloadable)) fmin(float3 v1, float3 v2) {
+    float3 r;
+    r.x = v1.x < v2.x ? v1.x : v2.x;
+    r.y = v1.y < v2.y ? v1.y : v2.y;
+    r.z = v1.z < v2.z ? v1.z : v2.z;
+    return r;
+}
+
+extern float4 __attribute__((overloadable)) fmin(float4 v1, float4 v2) {
+    float4 r;
+    r.x = v1.x < v2.x ? v1.x : v2.x;
+    r.y = v1.y < v2.y ? v1.y : v2.y;
+    r.z = v1.z < v2.z ? v1.z : v2.z;
+    r.w = v1.w < v2.w ? v1.w : v2.w;
+    return r;
+}
+
+extern float2 __attribute__((overloadable)) fmin(float2 v1, float v2) {
+    float2 r;
+    r.x = v1.x < v2 ? v1.x : v2;
+    r.y = v1.y < v2 ? v1.y : v2;
+    return r;
+}
+
+extern float3 __attribute__((overloadable)) fmin(float3 v1, float v2) {
+    float3 r;
+    r.x = v1.x < v2 ? v1.x : v2;
+    r.y = v1.y < v2 ? v1.y : v2;
+    r.z = v1.z < v2 ? v1.z : v2;
+    return r;
+}
+
+extern float4 __attribute__((overloadable)) fmin(float4 v1, float v2) {
+    float4 r;
+    r.x = v1.x < v2 ? v1.x : v2;
+    r.y = v1.y < v2 ? v1.y : v2;
+    r.z = v1.z < v2 ? v1.z : v2;
+    r.w = v1.w < v2 ? v1.w : v2;
+    return r;
+}
+
+
+/*
+ * MAX
+ */
+
+extern char __attribute__((overloadable)) max(char v1, char v2) {
+    return v1 > v2 ? v1 : v2;
+}
+
+extern char2 __attribute__((overloadable)) max(char2 v1, char2 v2) {
+    char2 r;
+    r.x = v1.x > v2.x ? v1.x : v2.x;
+    r.y = v1.y > v2.y ? v1.y : v2.y;
+    return r;
+}
+
+extern char3 __attribute__((overloadable)) max(char3 v1, char3 v2) {
+    char3 r;
+    r.x = v1.x > v2.x ? v1.x : v2.x;
+    r.y = v1.y > v2.y ? v1.y : v2.y;
+    r.z = v1.z > v2.z ? v1.z : v2.z;
+    return r;
+}
+
+extern char4 __attribute__((overloadable)) max(char4 v1, char4 v2) {
+    char4 r;
+    r.x = v1.x > v2.x ? v1.x : v2.x;
+    r.y = v1.y > v2.y ? v1.y : v2.y;
+    r.z = v1.z > v2.z ? v1.z : v2.z;
+    r.w = v1.w > v2.w ? v1.w : v2.w;
+    return r;
+}
+
+extern short __attribute__((overloadable)) max(short v1, short v2) {
+    return v1 > v2 ? v1 : v2;
+}
+
+extern short2 __attribute__((overloadable)) max(short2 v1, short2 v2) {
+    short2 r;
+    r.x = v1.x > v2.x ? v1.x : v2.x;
+    r.y = v1.y > v2.y ? v1.y : v2.y;
+    return r;
+}
+
+extern short3 __attribute__((overloadable)) max(short3 v1, short3 v2) {
+    short3 r;
+    r.x = v1.x > v2.x ? v1.x : v2.x;
+    r.y = v1.y > v2.y ? v1.y : v2.y;
+    r.z = v1.z > v2.z ? v1.z : v2.z;
+    return r;
+}
+
+extern short4 __attribute__((overloadable)) max(short4 v1, short4 v2) {
+    short4 r;
+    r.x = v1.x > v2.x ? v1.x : v2.x;
+    r.y = v1.y > v2.y ? v1.y : v2.y;
+    r.z = v1.z > v2.z ? v1.z : v2.z;
+    r.w = v1.w > v2.w ? v1.w : v2.w;
+    return r;
+}
+
+extern int __attribute__((overloadable)) max(int v1, int v2) {
+    return v1 > v2 ? v1 : v2;
+}
+
+extern int2 __attribute__((overloadable)) max(int2 v1, int2 v2) {
+    int2 r;
+    r.x = v1.x > v2.x ? v1.x : v2.x;
+    r.y = v1.y > v2.y ? v1.y : v2.y;
+    return r;
+}
+
+extern int3 __attribute__((overloadable)) max(int3 v1, int3 v2) {
+    int3 r;
+    r.x = v1.x > v2.x ? v1.x : v2.x;
+    r.y = v1.y > v2.y ? v1.y : v2.y;
+    r.z = v1.z > v2.z ? v1.z : v2.z;
+    return r;
+}
+
+extern int4 __attribute__((overloadable)) max(int4 v1, int4 v2) {
+    int4 r;
+    r.x = v1.x > v2.x ? v1.x : v2.x;
+    r.y = v1.y > v2.y ? v1.y : v2.y;
+    r.z = v1.z > v2.z ? v1.z : v2.z;
+    r.w = v1.w > v2.w ? v1.w : v2.w;
+    return r;
+}
+
+extern int64_t __attribute__((overloadable)) max(int64_t v1, int64_t v2) {
+    return v1 > v2 ? v1 : v2;
+}
+
+extern long2 __attribute__((overloadable)) max(long2 v1, long2 v2) {
+    long2 r;
+    r.x = v1.x > v2.x ? v1.x : v2.x;
+    r.y = v1.y > v2.y ? v1.y : v2.y;
+    return r;
+}
+
+extern long3 __attribute__((overloadable)) max(long3 v1, long3 v2) {
+    long3 r;
+    r.x = v1.x > v2.x ? v1.x : v2.x;
+    r.y = v1.y > v2.y ? v1.y : v2.y;
+    r.z = v1.z > v2.z ? v1.z : v2.z;
+    return r;
+}
+
+extern long4 __attribute__((overloadable)) max(long4 v1, long4 v2) {
+    long4 r;
+    r.x = v1.x > v2.x ? v1.x : v2.x;
+    r.y = v1.y > v2.y ? v1.y : v2.y;
+    r.z = v1.z > v2.z ? v1.z : v2.z;
+    r.w = v1.w > v2.w ? v1.w : v2.w;
+    return r;
+}
+
+extern uchar __attribute__((overloadable)) max(uchar v1, uchar v2) {
+    return v1 > v2 ? v1 : v2;
+}
+
+extern uchar2 __attribute__((overloadable)) max(uchar2 v1, uchar2 v2) {
+    uchar2 r;
+    r.x = v1.x > v2.x ? v1.x : v2.x;
+    r.y = v1.y > v2.y ? v1.y : v2.y;
+    return r;
+}
+
+extern uchar3 __attribute__((overloadable)) max(uchar3 v1, uchar3 v2) {
+    uchar3 r;
+    r.x = v1.x > v2.x ? v1.x : v2.x;
+    r.y = v1.y > v2.y ? v1.y : v2.y;
+    r.z = v1.z > v2.z ? v1.z : v2.z;
+    return r;
+}
+
+extern uchar4 __attribute__((overloadable)) max(uchar4 v1, uchar4 v2) {
+    uchar4 r;
+    r.x = v1.x > v2.x ? v1.x : v2.x;
+    r.y = v1.y > v2.y ? v1.y : v2.y;
+    r.z = v1.z > v2.z ? v1.z : v2.z;
+    r.w = v1.w > v2.w ? v1.w : v2.w;
+    return r;
+}
+
+extern ushort __attribute__((overloadable)) max(ushort v1, ushort v2) {
+    return v1 > v2 ? v1 : v2;
+}
+
+extern ushort2 __attribute__((overloadable)) max(ushort2 v1, ushort2 v2) {
+    ushort2 r;
+    r.x = v1.x > v2.x ? v1.x : v2.x;
+    r.y = v1.y > v2.y ? v1.y : v2.y;
+    return r;
+}
+
+extern ushort3 __attribute__((overloadable)) max(ushort3 v1, ushort3 v2) {
+    ushort3 r;
+    r.x = v1.x > v2.x ? v1.x : v2.x;
+    r.y = v1.y > v2.y ? v1.y : v2.y;
+    r.z = v1.z > v2.z ? v1.z : v2.z;
+    return r;
+}
+
+extern ushort4 __attribute__((overloadable)) max(ushort4 v1, ushort4 v2) {
+    ushort4 r;
+    r.x = v1.x > v2.x ? v1.x : v2.x;
+    r.y = v1.y > v2.y ? v1.y : v2.y;
+    r.z = v1.z > v2.z ? v1.z : v2.z;
+    r.w = v1.w > v2.w ? v1.w : v2.w;
+    return r;
+}
+
+extern uint __attribute__((overloadable)) max(uint v1, uint v2) {
+    return v1 > v2 ? v1 : v2;
+}
+
+extern uint2 __attribute__((overloadable)) max(uint2 v1, uint2 v2) {
+    uint2 r;
+    r.x = v1.x > v2.x ? v1.x : v2.x;
+    r.y = v1.y > v2.y ? v1.y : v2.y;
+    return r;
+}
+
+extern uint3 __attribute__((overloadable)) max(uint3 v1, uint3 v2) {
+    uint3 r;
+    r.x = v1.x > v2.x ? v1.x : v2.x;
+    r.y = v1.y > v2.y ? v1.y : v2.y;
+    r.z = v1.z > v2.z ? v1.z : v2.z;
+    return r;
+}
+
+extern uint4 __attribute__((overloadable)) max(uint4 v1, uint4 v2) {
+    uint4 r;
+    r.x = v1.x > v2.x ? v1.x : v2.x;
+    r.y = v1.y > v2.y ? v1.y : v2.y;
+    r.z = v1.z > v2.z ? v1.z : v2.z;
+    r.w = v1.w > v2.w ? v1.w : v2.w;
+    return r;
+}
+
+extern ulong __attribute__((overloadable)) max(ulong v1, ulong v2) {
+    return v1 > v2 ? v1 : v2;
+}
+
+extern ulong2 __attribute__((overloadable)) max(ulong2 v1, ulong2 v2) {
+    ulong2 r;
+    r.x = v1.x > v2.x ? v1.x : v2.x;
+    r.y = v1.y > v2.y ? v1.y : v2.y;
+    return r;
+}
+
+extern ulong3 __attribute__((overloadable)) max(ulong3 v1, ulong3 v2) {
+    ulong3 r;
+    r.x = v1.x > v2.x ? v1.x : v2.x;
+    r.y = v1.y > v2.y ? v1.y : v2.y;
+    r.z = v1.z > v2.z ? v1.z : v2.z;
+    return r;
+}
+
+extern ulong4 __attribute__((overloadable)) max(ulong4 v1, ulong4 v2) {
+    ulong4 r;
+    r.x = v1.x > v2.x ? v1.x : v2.x;
+    r.y = v1.y > v2.y ? v1.y : v2.y;
+    r.z = v1.z > v2.z ? v1.z : v2.z;
+    r.w = v1.w > v2.w ? v1.w : v2.w;
+    return r;
+}
+
+extern float __attribute__((overloadable)) max(float v1, float v2) {
+    return fmax(v1, v2);
+}
+
+extern float2 __attribute__((overloadable)) max(float2 v1, float2 v2) {
+    return fmax(v1, v2);
+}
+
+extern float2 __attribute__((overloadable)) max(float2 v1, float v2) {
+    return fmax(v1, v2);
+}
+
+extern float3 __attribute__((overloadable)) max(float3 v1, float3 v2) {
+    return fmax(v1, v2);
+}
+
+extern float3 __attribute__((overloadable)) max(float3 v1, float v2) {
+    return fmax(v1, v2);
+}
+
+extern float4 __attribute__((overloadable)) max(float4 v1, float4 v2) {
+    return fmax(v1, v2);
+}
+
+extern float4 __attribute__((overloadable)) max(float4 v1, float v2) {
+    return fmax(v1, v2);
+}
+
+
+/*
+ * MIN
+ */
+
+extern int8_t __attribute__((overloadable)) min(int8_t v1, int8_t v2) {
+    return v1 < v2 ? v1 : v2;
+}
+
+extern char2 __attribute__((overloadable)) min(char2 v1, char2 v2) {
+    char2 r;
+    r.x = v1.x < v2.x ? v1.x : v2.x;
+    r.y = v1.y < v2.y ? v1.y : v2.y;
+    return r;
+}
+
+extern char3 __attribute__((overloadable)) min(char3 v1, char3 v2) {
+    char3 r;
+    r.x = v1.x < v2.x ? v1.x : v2.x;
+    r.y = v1.y < v2.y ? v1.y : v2.y;
+    r.z = v1.z < v2.z ? v1.z : v2.z;
+    return r;
+}
+
+extern char4 __attribute__((overloadable)) min(char4 v1, char4 v2) {
+    char4 r;
+    r.x = v1.x < v2.x ? v1.x : v2.x;
+    r.y = v1.y < v2.y ? v1.y : v2.y;
+    r.z = v1.z < v2.z ? v1.z : v2.z;
+    r.w = v1.w < v2.w ? v1.w : v2.w;
+    return r;
+}
+
+extern int16_t __attribute__((overloadable)) min(int16_t v1, int16_t v2) {
+    return v1 < v2 ? v1 : v2;
+}
+
+extern short2 __attribute__((overloadable)) min(short2 v1, short2 v2) {
+    short2 r;
+    r.x = v1.x < v2.x ? v1.x : v2.x;
+    r.y = v1.y < v2.y ? v1.y : v2.y;
+    return r;
+}
+
+extern short3 __attribute__((overloadable)) min(short3 v1, short3 v2) {
+    short3 r;
+    r.x = v1.x < v2.x ? v1.x : v2.x;
+    r.y = v1.y < v2.y ? v1.y : v2.y;
+    r.z = v1.z < v2.z ? v1.z : v2.z;
+    return r;
+}
+
+extern short4 __attribute__((overloadable)) min(short4 v1, short4 v2) {
+    short4 r;
+    r.x = v1.x < v2.x ? v1.x : v2.x;
+    r.y = v1.y < v2.y ? v1.y : v2.y;
+    r.z = v1.z < v2.z ? v1.z : v2.z;
+    r.w = v1.w < v2.w ? v1.w : v2.w;
+    return r;
+}
+
+extern int32_t __attribute__((overloadable)) min(int32_t v1, int32_t v2) {
+    return v1 < v2 ? v1 : v2;
+}
+
+extern int2 __attribute__((overloadable)) min(int2 v1, int2 v2) {
+    int2 r;
+    r.x = v1.x < v2.x ? v1.x : v2.x;
+    r.y = v1.y < v2.y ? v1.y : v2.y;
+    return r;
+}
+
+extern int3 __attribute__((overloadable)) min(int3 v1, int3 v2) {
+    int3 r;
+    r.x = v1.x < v2.x ? v1.x : v2.x;
+    r.y = v1.y < v2.y ? v1.y : v2.y;
+    r.z = v1.z < v2.z ? v1.z : v2.z;
+    return r;
+}
+
+extern int4 __attribute__((overloadable)) min(int4 v1, int4 v2) {
+    int4 r;
+    r.x = v1.x < v2.x ? v1.x : v2.x;
+    r.y = v1.y < v2.y ? v1.y : v2.y;
+    r.z = v1.z < v2.z ? v1.z : v2.z;
+    r.w = v1.w < v2.w ? v1.w : v2.w;
+    return r;
+}
+
+extern int64_t __attribute__((overloadable)) min(int64_t v1, int64_t v2) {
+    return v1 < v2 ? v1 : v2;
+}
+
+extern long2 __attribute__((overloadable)) min(long2 v1, long2 v2) {
+    long2 r;
+    r.x = v1.x < v2.x ? v1.x : v2.x;
+    r.y = v1.y < v2.y ? v1.y : v2.y;
+    return r;
+}
+
+extern long3 __attribute__((overloadable)) min(long3 v1, long3 v2) {
+    long3 r;
+    r.x = v1.x < v2.x ? v1.x : v2.x;
+    r.y = v1.y < v2.y ? v1.y : v2.y;
+    r.z = v1.z < v2.z ? v1.z : v2.z;
+    return r;
+}
+
+extern long4 __attribute__((overloadable)) min(long4 v1, long4 v2) {
+    long4 r;
+    r.x = v1.x < v2.x ? v1.x : v2.x;
+    r.y = v1.y < v2.y ? v1.y : v2.y;
+    r.z = v1.z < v2.z ? v1.z : v2.z;
+    r.w = v1.w < v2.w ? v1.w : v2.w;
+    return r;
+}
+
+extern uchar __attribute__((overloadable)) min(uchar v1, uchar v2) {
+    return v1 < v2 ? v1 : v2;
+}
+
+extern uchar2 __attribute__((overloadable)) min(uchar2 v1, uchar2 v2) {
+    uchar2 r;
+    r.x = v1.x < v2.x ? v1.x : v2.x;
+    r.y = v1.y < v2.y ? v1.y : v2.y;
+    return r;
+}
+
+extern uchar3 __attribute__((overloadable)) min(uchar3 v1, uchar3 v2) {
+    uchar3 r;
+    r.x = v1.x < v2.x ? v1.x : v2.x;
+    r.y = v1.y < v2.y ? v1.y : v2.y;
+    r.z = v1.z < v2.z ? v1.z : v2.z;
+    return r;
+}
+
+extern uchar4 __attribute__((overloadable)) min(uchar4 v1, uchar4 v2) {
+    uchar4 r;
+    r.x = v1.x < v2.x ? v1.x : v2.x;
+    r.y = v1.y < v2.y ? v1.y : v2.y;
+    r.z = v1.z < v2.z ? v1.z : v2.z;
+    r.w = v1.w < v2.w ? v1.w : v2.w;
+    return r;
+}
+
+extern ushort __attribute__((overloadable)) min(ushort v1, ushort v2) {
+    return v1 < v2 ? v1 : v2;
+}
+
+extern ushort2 __attribute__((overloadable)) min(ushort2 v1, ushort2 v2) {
+    ushort2 r;
+    r.x = v1.x < v2.x ? v1.x : v2.x;
+    r.y = v1.y < v2.y ? v1.y : v2.y;
+    return r;
+}
+
+extern ushort3 __attribute__((overloadable)) min(ushort3 v1, ushort3 v2) {
+    ushort3 r;
+    r.x = v1.x < v2.x ? v1.x : v2.x;
+    r.y = v1.y < v2.y ? v1.y : v2.y;
+    r.z = v1.z < v2.z ? v1.z : v2.z;
+    return r;
+}
+
+extern ushort4 __attribute__((overloadable)) min(ushort4 v1, ushort4 v2) {
+    ushort4 r;
+    r.x = v1.x < v2.x ? v1.x : v2.x;
+    r.y = v1.y < v2.y ? v1.y : v2.y;
+    r.z = v1.z < v2.z ? v1.z : v2.z;
+    r.w = v1.w < v2.w ? v1.w : v2.w;
+    return r;
+}
+
+extern uint __attribute__((overloadable)) min(uint v1, uint v2) {
+    return v1 < v2 ? v1 : v2;
+}
+
+extern uint2 __attribute__((overloadable)) min(uint2 v1, uint2 v2) {
+    uint2 r;
+    r.x = v1.x < v2.x ? v1.x : v2.x;
+    r.y = v1.y < v2.y ? v1.y : v2.y;
+    return r;
+}
+
+extern uint3 __attribute__((overloadable)) min(uint3 v1, uint3 v2) {
+    uint3 r;
+    r.x = v1.x < v2.x ? v1.x : v2.x;
+    r.y = v1.y < v2.y ? v1.y : v2.y;
+    r.z = v1.z < v2.z ? v1.z : v2.z;
+    return r;
+}
+
+extern uint4 __attribute__((overloadable)) min(uint4 v1, uint4 v2) {
+    uint4 r;
+    r.x = v1.x < v2.x ? v1.x : v2.x;
+    r.y = v1.y < v2.y ? v1.y : v2.y;
+    r.z = v1.z < v2.z ? v1.z : v2.z;
+    r.w = v1.w < v2.w ? v1.w : v2.w;
+    return r;
+}
+
+extern ulong __attribute__((overloadable)) min(ulong v1, ulong v2) {
+    return v1 < v2 ? v1 : v2;
+}
+
+extern ulong2 __attribute__((overloadable)) min(ulong2 v1, ulong2 v2) {
+    ulong2 r;
+    r.x = v1.x < v2.x ? v1.x : v2.x;
+    r.y = v1.y < v2.y ? v1.y : v2.y;
+    return r;
+}
+
+extern ulong3 __attribute__((overloadable)) min(ulong3 v1, ulong3 v2) {
+    ulong3 r;
+    r.x = v1.x < v2.x ? v1.x : v2.x;
+    r.y = v1.y < v2.y ? v1.y : v2.y;
+    r.z = v1.z < v2.z ? v1.z : v2.z;
+    return r;
+}
+
+extern ulong4 __attribute__((overloadable)) min(ulong4 v1, ulong4 v2) {
+    ulong4 r;
+    r.x = v1.x < v2.x ? v1.x : v2.x;
+    r.y = v1.y < v2.y ? v1.y : v2.y;
+    r.z = v1.z < v2.z ? v1.z : v2.z;
+    r.w = v1.w < v2.w ? v1.w : v2.w;
+    return r;
+}
+
+extern float __attribute__((overloadable)) min(float v1, float v2) {
+    return fmin(v1, v2);
+}
+
+extern float2 __attribute__((overloadable)) min(float2 v1, float2 v2) {
+    return fmin(v1, v2);
+}
+
+extern float2 __attribute__((overloadable)) min(float2 v1, float v2) {
+    return fmin(v1, v2);
+}
+
+extern float3 __attribute__((overloadable)) min(float3 v1, float3 v2) {
+    return fmin(v1, v2);
+}
+
+extern float3 __attribute__((overloadable)) min(float3 v1, float v2) {
+    return fmin(v1, v2);
+}
+
+extern float4 __attribute__((overloadable)) min(float4 v1, float4 v2) {
+    return fmin(v1, v2);
+}
+
+extern float4 __attribute__((overloadable)) min(float4 v1, float v2) {
+    return fmin(v1, v2);
+}
+
diff --git a/lib/ScriptCRT/arch/neon.ll b/lib/ScriptCRT/arch/neon.ll
new file mode 100644
index 0000000..5135446
--- /dev/null
+++ b/lib/ScriptCRT/arch/neon.ll
@@ -0,0 +1,673 @@
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
+target triple = "armv7-none-linux-gnueabi"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;               INTRINSICS               ;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+declare <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone  ; max, 2 float lanes
+declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone  ; max, 4 float lanes
+declare <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone  ; used below for signed i32 max, 2 lanes
+declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone  ; used below for signed i32 max, 4 lanes
+declare <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone  ; used below for unsigned i32 max, 2 lanes
+declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone  ; used below for unsigned i32 max, 4 lanes
+
+declare <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float>, <2 x float>) nounwind readnone  ; min, 2 float lanes
+declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone  ; min, 4 float lanes
+declare <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone  ; used below for signed i32 min, 2 lanes
+declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone  ; used below for signed i32 min, 4 lanes
+declare <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone  ; used below for unsigned i32 min, 2 lanes
+declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone  ; used below for unsigned i32 min, 4 lanes
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;                HELPERS                 ;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define internal <4 x float> @smear_4f(float %in) nounwind readnone alwaysinline {  ; copy %in into all four lanes
+  %splat0 = insertelement <4 x float> undef, float %in, i32 0
+  %splat1 = insertelement <4 x float> %splat0, float %in, i32 1
+  %splat2 = insertelement <4 x float> %splat1, float %in, i32 2
+  %splat3 = insertelement <4 x float> %splat2, float %in, i32 3
+  ret <4 x float> %splat3
+}
+
+define internal <2 x float> @smear_2f(float %in) nounwind readnone alwaysinline {  ; copy %in into both lanes
+  %splat0 = insertelement <2 x float> undef, float %in, i32 0
+  %splat1 = insertelement <2 x float> %splat0, float %in, i32 1
+  ret <2 x float> %splat1
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;                 CLAMP                  ;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %value, <4 x float> %low, <4 x float> %high) nounwind readonly {  ; 4-lane clamp: max(min(value, high), low)
+  %capped = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %value, <4 x float> %high) nounwind readnone
+  %clamped = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %capped, <4 x float> %low) nounwind readnone
+  ret <4 x float> %clamped
+}
+
+define <4 x float> @_Z5clampDv4_fff(<4 x float> %value, float %low, float %high) nounwind readonly {  ; scalar bounds are splatted, then the vector clamp is reused
+  %high4 = tail call <4 x float> @smear_4f(float %high) nounwind readnone
+  %low4 = tail call <4 x float> @smear_4f(float %low) nounwind readnone
+  %clamped = tail call <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %value, <4 x float> %low4, <4 x float> %high4) nounwind readonly
+  ret <4 x float> %clamped
+}
+
+define <3 x float> @_Z5clampDv3_fS_S_(<3 x float> %value, <3 x float> %low, <3 x float> %high) nounwind readonly {  ; 3-lane clamp done in 4 lanes; lane 3 is undef and dropped
+  %value4 = shufflevector <3 x float> %value, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %low4 = shufflevector <3 x float> %low, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %high4 = shufflevector <3 x float> %high, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %capped = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %value4, <4 x float> %high4) nounwind readnone
+  %clamped4 = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %capped, <4 x float> %low4) nounwind readnone
+  %clamped3 = shufflevector <4 x float> %clamped4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  ret <3 x float> %clamped3
+}
+
+define <3 x float> @_Z5clampDv3_fff(<3 x float> %value, float %low, float %high) nounwind readonly {  ; 3-lane clamp with scalar bounds, done in 4 lanes
+  %value4 = shufflevector <3 x float> %value, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %high4 = tail call <4 x float> @smear_4f(float %high) nounwind readnone
+  %low4 = tail call <4 x float> @smear_4f(float %low) nounwind readnone
+  %capped = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %value4, <4 x float> %high4) nounwind readnone
+  %clamped4 = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %capped, <4 x float> %low4) nounwind readnone
+  %clamped3 = shufflevector <4 x float> %clamped4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  ret <3 x float> %clamped3
+}
+
+define <2 x float> @_Z5clampDv2_fS_S_(<2 x float> %value, <2 x float> %low, <2 x float> %high) nounwind readonly {  ; 2-lane clamp: max(min(value, high), low)
+  %capped = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %value, <2 x float> %high) nounwind readnone
+  %clamped = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %capped, <2 x float> %low) nounwind readnone
+  ret <2 x float> %clamped
+}
+
+define <2 x float> @_Z5clampDv2_fff(<2 x float> %value, float %low, float %high) nounwind readonly {  ; 2-lane clamp with splatted scalar bounds
+  %high2 = tail call <2 x float> @smear_2f(float %high) nounwind readnone
+  %low2 = tail call <2 x float> @smear_2f(float %low) nounwind readnone
+  %capped = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %value, <2 x float> %high2) nounwind readnone
+  %clamped = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %capped, <2 x float> %low2) nounwind readnone
+  ret <2 x float> %clamped
+}
+
+
+define float @_Z5clampfff(float %value, float %low, float %high) nounwind readonly {  ; scalar clamp: splat to 2 lanes, clamp, return lane 0
+  %value2 = tail call <2 x float> @smear_2f(float %value) nounwind readnone
+  %low2 = tail call <2 x float> @smear_2f(float %low) nounwind readnone
+  %high2 = tail call <2 x float> @smear_2f(float %high) nounwind readnone
+  %capped = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %value2, <2 x float> %high2) nounwind readnone
+  %clamped2 = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %capped, <2 x float> %low2) nounwind readnone
+  %clamped = extractelement <2 x float> %clamped2, i32 0
+  ret float %clamped
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;                  FMAX                  ;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <4 x float> @_Z4fmaxDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readonly {  ; 4-lane float max
+  %larger = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %v1, <4 x float> %v2) nounwind readnone
+  ret <4 x float> %larger
+}
+
+define <4 x float> @_Z4fmaxDv4_ff(<4 x float> %v1, float %v2) nounwind readonly {  ; 4-lane float max against a splatted scalar
+  %v2_splat = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
+  %larger = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %v1, <4 x float> %v2_splat) nounwind readnone
+  ret <4 x float> %larger
+}
+
+define <3 x float> @_Z4fmaxDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readonly {  ; 3-lane float max done in 4 lanes; lane 3 is undef and dropped
+  %pad1 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %pad2 = shufflevector <3 x float> %v2, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %max4 = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %pad1, <4 x float> %pad2) nounwind readnone
+  %max3 = shufflevector <4 x float> %max4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  ret <3 x float> %max3
+}
+
+define <3 x float> @_Z4fmaxDv3_ff(<3 x float> %v1, float %v2) nounwind readonly {  ; 3-lane float max against a splatted scalar, done in 4 lanes
+  %pad1 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v2_splat = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
+  %max4 = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %pad1, <4 x float> %v2_splat) nounwind readnone
+  %max3 = shufflevector <4 x float> %max4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  ret <3 x float> %max3
+}
+
+define <2 x float> @_Z4fmaxDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readonly {  ; 2-lane float max
+  %larger = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %v1, <2 x float> %v2) nounwind readnone
+  ret <2 x float> %larger
+}
+
+define <2 x float> @_Z4fmaxDv2_ff(<2 x float> %v1, float %v2) nounwind readonly {  ; 2-lane float max against a splatted scalar
+  %v2_splat = tail call <2 x float> @smear_2f(float %v2) nounwind readnone
+  %larger = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %v1, <2 x float> %v2_splat) nounwind readnone
+  ret <2 x float> %larger
+}
+
+define float @_Z4fmaxff(float %v1, float %v2) nounwind readnone {  ; scalar float max; touches no memory, so readnone like @_Z4fminff
+  %1 = fcmp ogt float %v1, %v2  ; ordered compare: false if either input is NaN, in which case %v2 is returned
+  %2 = select i1 %1, float %v1, float %v2
+  ret float %2
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;                  FMIN                  ;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <4 x float> @_Z4fminDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readonly {  ; 4-lane float min
+  %smaller = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %v1, <4 x float> %v2) nounwind readnone
+  ret <4 x float> %smaller
+}
+
+define <4 x float> @_Z4fminDv4_ff(<4 x float> %v1, float %v2) nounwind readonly {  ; 4-lane float min against a splatted scalar
+  %v2_splat = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
+  %smaller = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %v1, <4 x float> %v2_splat) nounwind readnone
+  ret <4 x float> %smaller
+}
+
+define <3 x float> @_Z4fminDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readonly {  ; 3-lane float min done in 4 lanes; lane 3 is undef and dropped
+  %pad1 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %pad2 = shufflevector <3 x float> %v2, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %min4 = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %pad1, <4 x float> %pad2) nounwind readnone
+  %min3 = shufflevector <4 x float> %min4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  ret <3 x float> %min3
+}
+
+define <3 x float> @_Z4fminDv3_ff(<3 x float> %v1, float %v2) nounwind readonly {  ; 3-lane float min against a splatted scalar, done in 4 lanes
+  %pad1 = shufflevector <3 x float> %v1, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v2_splat = tail call <4 x float> @smear_4f(float %v2) nounwind readnone
+  %min4 = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %pad1, <4 x float> %v2_splat) nounwind readnone
+  %min3 = shufflevector <4 x float> %min4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  ret <3 x float> %min3
+}
+
+define <2 x float> @_Z4fminDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readonly {  ; 2-lane float min
+  %smaller = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %v1, <2 x float> %v2) nounwind readnone
+  ret <2 x float> %smaller
+}
+
+define <2 x float> @_Z4fminDv2_ff(<2 x float> %v1, float %v2) nounwind readonly {  ; 2-lane float min against a splatted scalar
+  %v2_splat = tail call <2 x float> @smear_2f(float %v2) nounwind readnone
+  %smaller = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %v1, <2 x float> %v2_splat) nounwind readnone
+  ret <2 x float> %smaller
+}
+
+define float @_Z4fminff(float %v1, float %v2) nounwind readnone {  ; scalar float min; %v2 is returned when the ordered compare is false
+  %v1_less = fcmp olt float %v1, %v2
+  %smaller = select i1 %v1_less, float %v1, float %v2
+  ret float %smaller
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;                  MAX                   ;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define signext i8 @_Z3maxcc(i8 signext %v1, i8 signext %v2) nounwind readnone {  ; scalar signed 8-bit max
+  %v1_greater = icmp sgt i8 %v1, %v2
+  %larger = select i1 %v1_greater, i8 %v1, i8 %v2
+  ret i8 %larger
+}
+
+define <2 x i8> @_Z3maxDv2_cS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {  ; signed 8-bit max, 2 lanes, computed in i32
+  %wide1 = sext <2 x i8> %v1 to <2 x i32>
+  %wide2 = sext <2 x i8> %v2 to <2 x i32>
+  %wide_max = tail call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %wide1, <2 x i32> %wide2) nounwind readnone
+  %narrow = trunc <2 x i32> %wide_max to <2 x i8>
+  ret <2 x i8> %narrow
+}
+
+define <3 x i8> @_Z3maxDv3_cS_(<3 x i8> %v1, <3 x i8> %v2) nounwind readnone {  ; signed 8-bit max, 3 lanes, via a 4-lane i32 op
+  %wide1 = sext <3 x i8> %v1 to <3 x i32>
+  %wide2 = sext <3 x i8> %v2 to <3 x i32>
+  %pad1 = shufflevector <3 x i32> %wide1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %pad2 = shufflevector <3 x i32> %wide2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %max4 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %pad1, <4 x i32> %pad2) nounwind readnone
+  %max3 = shufflevector <4 x i32> %max4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  %narrow = trunc <3 x i32> %max3 to <3 x i8>
+  ret <3 x i8> %narrow
+}
+
+define <4 x i8> @_Z3maxDv4_cS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {  ; signed 8-bit max, 4 lanes, computed in i32
+  %wide1 = sext <4 x i8> %v1 to <4 x i32>
+  %wide2 = sext <4 x i8> %v2 to <4 x i32>
+  %wide_max = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %wide1, <4 x i32> %wide2) nounwind readnone
+  %narrow = trunc <4 x i32> %wide_max to <4 x i8>
+  ret <4 x i8> %narrow
+}
+
+define signext i16 @_Z3maxss(i16 signext %v1, i16 signext %v2) nounwind readnone {  ; scalar signed 16-bit max
+  %v1_greater = icmp sgt i16 %v1, %v2
+  %larger = select i1 %v1_greater, i16 %v1, i16 %v2
+  ret i16 %larger
+}
+
+define <2 x i16> @_Z3maxDv2_sS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {  ; signed 16-bit max, 2 lanes, computed in i32
+  %wide1 = sext <2 x i16> %v1 to <2 x i32>
+  %wide2 = sext <2 x i16> %v2 to <2 x i32>
+  %wide_max = tail call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %wide1, <2 x i32> %wide2) nounwind readnone
+  %narrow = trunc <2 x i32> %wide_max to <2 x i16>
+  ret <2 x i16> %narrow
+}
+
+define <3 x i16> @_Z3maxDv3_sS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {  ; signed 16-bit max, 3 lanes, via a 4-lane i32 op
+  %wide1 = sext <3 x i16> %v1 to <3 x i32>
+  %wide2 = sext <3 x i16> %v2 to <3 x i32>
+  %pad1 = shufflevector <3 x i32> %wide1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %pad2 = shufflevector <3 x i32> %wide2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %max4 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %pad1, <4 x i32> %pad2) nounwind readnone
+  %max3 = shufflevector <4 x i32> %max4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  %narrow = trunc <3 x i32> %max3 to <3 x i16>
+  ret <3 x i16> %narrow
+}
+
+define <4 x i16> @_Z3maxDv4_sS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {  ; signed 16-bit max, 4 lanes, computed in i32
+  %wide1 = sext <4 x i16> %v1 to <4 x i32>
+  %wide2 = sext <4 x i16> %v2 to <4 x i32>
+  %wide_max = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %wide1, <4 x i32> %wide2) nounwind readnone
+  %narrow = trunc <4 x i32> %wide_max to <4 x i16>
+  ret <4 x i16> %narrow
+}
+
+define i32 @_Z3maxii(i32 %v1, i32 %v2) nounwind readnone {  ; scalar signed 32-bit max
+  %v1_greater = icmp sgt i32 %v1, %v2
+  %larger = select i1 %v1_greater, i32 %v1, i32 %v2
+  ret i32 %larger
+}
+
+define <2 x i32> @_Z3maxDv2_iS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {  ; signed 32-bit max, 2 lanes
+  %larger = tail call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
+  ret <2 x i32> %larger
+}
+
+define <3 x i32> @_Z3maxDv3_iS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {  ; signed 32-bit max, 3 lanes, via a 4-lane op
+  %pad1 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %pad2 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %max4 = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %pad1, <4 x i32> %pad2) nounwind readnone
+  %max3 = shufflevector <4 x i32> %max4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  ret <3 x i32> %max3
+}
+
+define <4 x i32> @_Z3maxDv4_iS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {  ; signed 32-bit max, 4 lanes
+  %larger = tail call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
+  ret <4 x i32> %larger
+}
+
+define i64 @_Z3maxxx(i64 %v1, i64 %v2) nounwind readnone {  ; scalar signed 64-bit max
+  %v1_greater = icmp sgt i64 %v1, %v2
+  %larger = select i1 %v1_greater, i64 %v1, i64 %v2
+  ret i64 %larger
+}
+
+; TODO:  long vector types
+
+define zeroext i8 @_Z3maxhh(i8 zeroext %v1, i8 zeroext %v2) nounwind readnone {  ; scalar unsigned 8-bit max
+  %v1_greater = icmp ugt i8 %v1, %v2
+  %larger = select i1 %v1_greater, i8 %v1, i8 %v2
+  ret i8 %larger
+}
+
+define <2 x i8> @_Z3maxDv2_hS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {  ; unsigned 8-bit max, 2 lanes, computed in i32
+  %wide1 = zext <2 x i8> %v1 to <2 x i32>
+  %wide2 = zext <2 x i8> %v2 to <2 x i32>
+  %wide_max = tail call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %wide1, <2 x i32> %wide2) nounwind readnone
+  %narrow = trunc <2 x i32> %wide_max to <2 x i8>
+  ret <2 x i8> %narrow
+}
+
+define <3 x i8> @_Z3maxDv3_hS_(<3 x i8> %v1, <3 x i8> %v2) nounwind readnone {  ; unsigned 8-bit max, 3 lanes, via a 4-lane i32 op
+  %wide1 = zext <3 x i8> %v1 to <3 x i32>
+  %wide2 = zext <3 x i8> %v2 to <3 x i32>
+  %pad1 = shufflevector <3 x i32> %wide1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %pad2 = shufflevector <3 x i32> %wide2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %max4 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %pad1, <4 x i32> %pad2) nounwind readnone
+  %max3 = shufflevector <4 x i32> %max4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  %narrow = trunc <3 x i32> %max3 to <3 x i8>
+  ret <3 x i8> %narrow
+}
+
+define <4 x i8> @_Z3maxDv4_hS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {  ; unsigned 8-bit max, 4 lanes, computed in i32
+  %wide1 = zext <4 x i8> %v1 to <4 x i32>
+  %wide2 = zext <4 x i8> %v2 to <4 x i32>
+  %wide_max = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %wide1, <4 x i32> %wide2) nounwind readnone
+  %narrow = trunc <4 x i32> %wide_max to <4 x i8>
+  ret <4 x i8> %narrow
+}
+
+define zeroext i16 @_Z3maxtt(i16 zeroext %v1, i16 zeroext %v2) nounwind readnone {  ; scalar unsigned 16-bit max
+  %v1_greater = icmp ugt i16 %v1, %v2
+  %larger = select i1 %v1_greater, i16 %v1, i16 %v2
+  ret i16 %larger
+}
+
+define <2 x i16> @_Z3maxDv2_tS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {  ; unsigned 16-bit max, 2 lanes, computed in i32
+  %wide1 = zext <2 x i16> %v1 to <2 x i32>
+  %wide2 = zext <2 x i16> %v2 to <2 x i32>
+  %wide_max = tail call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %wide1, <2 x i32> %wide2) nounwind readnone
+  %narrow = trunc <2 x i32> %wide_max to <2 x i16>
+  ret <2 x i16> %narrow
+}
+
+define <3 x i16> @_Z3maxDv3_tS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {  ; unsigned 16-bit max, 3 lanes, via a 4-lane i32 op
+  %wide1 = zext <3 x i16> %v1 to <3 x i32>
+  %wide2 = zext <3 x i16> %v2 to <3 x i32>
+  %pad1 = shufflevector <3 x i32> %wide1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %pad2 = shufflevector <3 x i32> %wide2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %max4 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %pad1, <4 x i32> %pad2) nounwind readnone
+  %max3 = shufflevector <4 x i32> %max4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  %narrow = trunc <3 x i32> %max3 to <3 x i16>
+  ret <3 x i16> %narrow
+}
+
+define <4 x i16> @_Z3maxDv4_tS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {  ; unsigned 16-bit max, 4 lanes, computed in i32
+  %wide1 = zext <4 x i16> %v1 to <4 x i32>
+  %wide2 = zext <4 x i16> %v2 to <4 x i32>
+  %wide_max = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %wide1, <4 x i32> %wide2) nounwind readnone
+  %narrow = trunc <4 x i32> %wide_max to <4 x i16>
+  ret <4 x i16> %narrow
+}
+
+define i32 @_Z3maxjj(i32 %v1, i32 %v2) nounwind readnone {  ; scalar unsigned 32-bit max
+  %v1_greater = icmp ugt i32 %v1, %v2
+  %larger = select i1 %v1_greater, i32 %v1, i32 %v2
+  ret i32 %larger
+}
+
+define <2 x i32> @_Z3maxDv2_jS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {  ; unsigned 32-bit max, 2 lanes
+  %larger = tail call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
+  ret <2 x i32> %larger
+}
+
+define <3 x i32> @_Z3maxDv3_jS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {  ; unsigned 32-bit max, 3 lanes, via a 4-lane op
+  %pad1 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %pad2 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %max4 = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %pad1, <4 x i32> %pad2) nounwind readnone
+  %max3 = shufflevector <4 x i32> %max4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  ret <3 x i32> %max3
+}
+
+define <4 x i32> @_Z3maxDv4_jS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {  ; unsigned 32-bit max, 4 lanes
+  %larger = tail call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
+  ret <4 x i32> %larger
+}
+
+define i64 @_Z3maxyy(i64 %v1, i64 %v2) nounwind readnone {  ; scalar unsigned 64-bit max
+  %v1_greater = icmp ugt i64 %v1, %v2
+  %larger = select i1 %v1_greater, i64 %v1, i64 %v2
+  ret i64 %larger
+}
+
+; TODO:  long vector types
+
+define float @_Z3maxff(float %v1, float %v2) nounwind readnone {  ; scalar float max: forwards to @_Z4fmaxff
+  %1 = tail call float @_Z4fmaxff(float %v1, float %v2)
+  ret float %1
+}
+
+define <2 x float> @_Z3maxDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone {  ; 2-lane float max: forwards to @_Z4fmaxDv2_fS_
+  %1 = tail call <2 x float> @_Z4fmaxDv2_fS_(<2 x float> %v1, <2 x float> %v2)
+  ret <2 x float> %1
+}
+
+define <2 x float> @_Z3maxDv2_ff(<2 x float> %v1, float %v2) nounwind readnone {  ; 2-lane vector/scalar float max: forwards to @_Z4fmaxDv2_ff
+  %1 = tail call <2 x float> @_Z4fmaxDv2_ff(<2 x float> %v1, float %v2)
+  ret <2 x float> %1
+}
+
+define <3 x float> @_Z3maxDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone {  ; 3-lane float max: forwards to @_Z4fmaxDv3_fS_
+  %1 = tail call <3 x float> @_Z4fmaxDv3_fS_(<3 x float> %v1, <3 x float> %v2)
+  ret <3 x float> %1
+}
+
+define <3 x float> @_Z3maxDv3_ff(<3 x float> %v1, float %v2) nounwind readnone {  ; 3-lane vector/scalar float max: forwards to @_Z4fmaxDv3_ff
+  %1 = tail call <3 x float> @_Z4fmaxDv3_ff(<3 x float> %v1, float %v2)
+  ret <3 x float> %1
+}
+
+define <4 x float> @_Z3maxDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone {  ; 4-lane float max: forwards to @_Z4fmaxDv4_fS_
+  %1 = tail call <4 x float> @_Z4fmaxDv4_fS_(<4 x float> %v1, <4 x float> %v2)
+  ret <4 x float> %1
+}
+
+define <4 x float> @_Z3maxDv4_ff(<4 x float> %v1, float %v2) nounwind readnone {  ; 4-lane vector/scalar float max: forwards to @_Z4fmaxDv4_ff
+  %1 = tail call <4 x float> @_Z4fmaxDv4_ff(<4 x float> %v1, float %v2)
+  ret <4 x float> %1
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;                  MIN                   ;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define signext i8 @_Z3mincc(i8 signext %v1, i8 signext %v2) nounwind readnone {  ; scalar signed 8-bit min
+  %v1_less = icmp slt i8 %v1, %v2
+  %smaller = select i1 %v1_less, i8 %v1, i8 %v2
+  ret i8 %smaller
+}
+
+define <2 x i8> @_Z3minDv2_cS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {  ; signed 8-bit min, 2 lanes, computed in i32
+  %wide1 = sext <2 x i8> %v1 to <2 x i32>
+  %wide2 = sext <2 x i8> %v2 to <2 x i32>
+  %wide_min = tail call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %wide1, <2 x i32> %wide2) nounwind readnone
+  %narrow = trunc <2 x i32> %wide_min to <2 x i8>
+  ret <2 x i8> %narrow
+}
+
+define <3 x i8> @_Z3minDv3_cS_(<3 x i8> %v1, <3 x i8> %v2) nounwind readnone {  ; signed 8-bit min, 3 lanes, via a 4-lane i32 op
+  %wide1 = sext <3 x i8> %v1 to <3 x i32>
+  %wide2 = sext <3 x i8> %v2 to <3 x i32>
+  %pad1 = shufflevector <3 x i32> %wide1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %pad2 = shufflevector <3 x i32> %wide2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %min4 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %pad1, <4 x i32> %pad2) nounwind readnone
+  %min3 = shufflevector <4 x i32> %min4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  %narrow = trunc <3 x i32> %min3 to <3 x i8>
+  ret <3 x i8> %narrow
+}
+
+define <4 x i8> @_Z3minDv4_cS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {  ; signed 8-bit min, 4 lanes, computed in i32
+  %wide1 = sext <4 x i8> %v1 to <4 x i32>
+  %wide2 = sext <4 x i8> %v2 to <4 x i32>
+  %wide_min = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %wide1, <4 x i32> %wide2) nounwind readnone
+  %narrow = trunc <4 x i32> %wide_min to <4 x i8>
+  ret <4 x i8> %narrow
+}
+
+define signext i16 @_Z3minss(i16 signext %v1, i16 signext %v2) nounwind readnone {  ; scalar signed 16-bit min
+  %v1_less = icmp slt i16 %v1, %v2
+  %smaller = select i1 %v1_less, i16 %v1, i16 %v2
+  ret i16 %smaller
+}
+
+define <2 x i16> @_Z3minDv2_sS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {  ; signed 16-bit min, 2 lanes, computed in i32
+  %wide1 = sext <2 x i16> %v1 to <2 x i32>
+  %wide2 = sext <2 x i16> %v2 to <2 x i32>
+  %wide_min = tail call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %wide1, <2 x i32> %wide2) nounwind readnone
+  %narrow = trunc <2 x i32> %wide_min to <2 x i16>
+  ret <2 x i16> %narrow
+}
+
+define <3 x i16> @_Z3minDv3_sS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {  ; signed 16-bit min, 3 lanes, via a 4-lane i32 op
+  %wide1 = sext <3 x i16> %v1 to <3 x i32>
+  %wide2 = sext <3 x i16> %v2 to <3 x i32>
+  %pad1 = shufflevector <3 x i32> %wide1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %pad2 = shufflevector <3 x i32> %wide2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %min4 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %pad1, <4 x i32> %pad2) nounwind readnone
+  %min3 = shufflevector <4 x i32> %min4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  %narrow = trunc <3 x i32> %min3 to <3 x i16>
+  ret <3 x i16> %narrow
+}
+
+define <4 x i16> @_Z3minDv4_sS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {  ; signed 16-bit min, 4 lanes, computed in i32
+  %wide1 = sext <4 x i16> %v1 to <4 x i32>
+  %wide2 = sext <4 x i16> %v2 to <4 x i32>
+  %wide_min = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %wide1, <4 x i32> %wide2) nounwind readnone
+  %narrow = trunc <4 x i32> %wide_min to <4 x i16>
+  ret <4 x i16> %narrow
+}
+
+define i32 @_Z3minii(i32 %v1, i32 %v2) nounwind readnone {  ; scalar signed 32-bit min
+  %v1_less = icmp slt i32 %v1, %v2
+  %smaller = select i1 %v1_less, i32 %v1, i32 %v2
+  ret i32 %smaller
+}
+
+define <2 x i32> @_Z3minDv2_iS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {  ; signed 32-bit min, 2 lanes
+  %smaller = tail call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
+  ret <2 x i32> %smaller
+}
+
+define <3 x i32> @_Z3minDv3_iS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {  ; signed 32-bit min, 3 lanes, via a 4-lane op
+  %pad1 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %pad2 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %min4 = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %pad1, <4 x i32> %pad2) nounwind readnone
+  %min3 = shufflevector <4 x i32> %min4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  ret <3 x i32> %min3
+}
+
+define <4 x i32> @_Z3minDv4_iS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {  ; signed 32-bit min, 4 lanes
+  %smaller = tail call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
+  ret <4 x i32> %smaller
+}
+
+define i64 @_Z3minxx(i64 %v1, i64 %v2) nounwind readnone {  ; scalar signed 64-bit min
+  %v1_less = icmp slt i64 %v1, %v2
+  %smaller = select i1 %v1_less, i64 %v1, i64 %v2
+  ret i64 %smaller
+}
+
+; TODO:  long vector types
+
+define zeroext i8 @_Z3minhh(i8 zeroext %v1, i8 zeroext %v2) nounwind readnone {  ; scalar unsigned 8-bit min
+  %v1_less = icmp ult i8 %v1, %v2
+  %smaller = select i1 %v1_less, i8 %v1, i8 %v2
+  ret i8 %smaller
+}
+
+define <2 x i8> @_Z3minDv2_hS_(<2 x i8> %v1, <2 x i8> %v2) nounwind readnone {  ; unsigned 8-bit min, 2 lanes, computed in i32
+  %wide1 = zext <2 x i8> %v1 to <2 x i32>
+  %wide2 = zext <2 x i8> %v2 to <2 x i32>
+  %wide_min = tail call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %wide1, <2 x i32> %wide2) nounwind readnone
+  %narrow = trunc <2 x i32> %wide_min to <2 x i8>
+  ret <2 x i8> %narrow
+}
+
+define <3 x i8> @_Z3minDv3_hS_(<3 x i8> %v1, <3 x i8> %v2) nounwind readnone {  ; unsigned 8-bit min, 3 lanes, via a 4-lane i32 op
+  %wide1 = zext <3 x i8> %v1 to <3 x i32>
+  %wide2 = zext <3 x i8> %v2 to <3 x i32>
+  %pad1 = shufflevector <3 x i32> %wide1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %pad2 = shufflevector <3 x i32> %wide2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %min4 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %pad1, <4 x i32> %pad2) nounwind readnone
+  %min3 = shufflevector <4 x i32> %min4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  %narrow = trunc <3 x i32> %min3 to <3 x i8>
+  ret <3 x i8> %narrow
+}
+
+define <4 x i8> @_Z3minDv4_hS_(<4 x i8> %v1, <4 x i8> %v2) nounwind readnone {  ; unsigned 8-bit min, 4 lanes, computed in i32
+  %wide1 = zext <4 x i8> %v1 to <4 x i32>
+  %wide2 = zext <4 x i8> %v2 to <4 x i32>
+  %wide_min = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %wide1, <4 x i32> %wide2) nounwind readnone
+  %narrow = trunc <4 x i32> %wide_min to <4 x i8>
+  ret <4 x i8> %narrow
+}
+
+define zeroext i16 @_Z3mintt(i16 zeroext %v1, i16 zeroext %v2) nounwind readnone {  ; scalar unsigned 16-bit min
+  %v1_less = icmp ult i16 %v1, %v2
+  %smaller = select i1 %v1_less, i16 %v1, i16 %v2
+  ret i16 %smaller
+}
+
+define <2 x i16> @_Z3minDv2_tS_(<2 x i16> %v1, <2 x i16> %v2) nounwind readnone {  ; unsigned 16-bit min, 2 lanes, computed in i32
+  %wide1 = zext <2 x i16> %v1 to <2 x i32>
+  %wide2 = zext <2 x i16> %v2 to <2 x i32>
+  %wide_min = tail call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %wide1, <2 x i32> %wide2) nounwind readnone
+  %narrow = trunc <2 x i32> %wide_min to <2 x i16>
+  ret <2 x i16> %narrow
+}
+
+define <3 x i16> @_Z3minDv3_tS_(<3 x i16> %v1, <3 x i16> %v2) nounwind readnone {  ; unsigned 16-bit min, 3 lanes, via a 4-lane i32 op
+  %wide1 = zext <3 x i16> %v1 to <3 x i32>
+  %wide2 = zext <3 x i16> %v2 to <3 x i32>
+  %pad1 = shufflevector <3 x i32> %wide1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %pad2 = shufflevector <3 x i32> %wide2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %min4 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %pad1, <4 x i32> %pad2) nounwind readnone
+  %min3 = shufflevector <4 x i32> %min4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  %narrow = trunc <3 x i32> %min3 to <3 x i16>
+  ret <3 x i16> %narrow
+}
+
+define <4 x i16> @_Z3minDv4_tS_(<4 x i16> %v1, <4 x i16> %v2) nounwind readnone {  ; unsigned 16-bit min, 4 lanes, computed in i32
+  %wide1 = zext <4 x i16> %v1 to <4 x i32>
+  %wide2 = zext <4 x i16> %v2 to <4 x i32>
+  %wide_min = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %wide1, <4 x i32> %wide2) nounwind readnone
+  %narrow = trunc <4 x i32> %wide_min to <4 x i16>
+  ret <4 x i16> %narrow
+}
+
+define i32 @_Z3minjj(i32 %v1, i32 %v2) nounwind readnone {  ; scalar unsigned 32-bit min
+  %v1_less = icmp ult i32 %v1, %v2
+  %smaller = select i1 %v1_less, i32 %v1, i32 %v2
+  ret i32 %smaller
+}
+
+define <2 x i32> @_Z3minDv2_jS_(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone {  ; unsigned 32-bit min, 2 lanes
+  %smaller = tail call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %v1, <2 x i32> %v2) nounwind readnone
+  ret <2 x i32> %smaller
+}
+
+define <3 x i32> @_Z3minDv3_jS_(<3 x i32> %v1, <3 x i32> %v2) nounwind readnone {  ; unsigned 32-bit min, 3 lanes, via a 4-lane op
+  %pad1 = shufflevector <3 x i32> %v1, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %pad2 = shufflevector <3 x i32> %v2, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %min4 = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %pad1, <4 x i32> %pad2) nounwind readnone
+  %min3 = shufflevector <4 x i32> %min4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  ret <3 x i32> %min3
+}
+
+define <4 x i32> @_Z3minDv4_jS_(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone {  ; unsigned 32-bit min, 4 lanes
+  %smaller = tail call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %v1, <4 x i32> %v2) nounwind readnone
+  ret <4 x i32> %smaller
+}
+
+define i64 @_Z3minyy(i64 %v1, i64 %v2) nounwind readnone {  ; scalar unsigned 64-bit min
+  %v1_less = icmp ult i64 %v1, %v2
+  %smaller = select i1 %v1_less, i64 %v1, i64 %v2
+  ret i64 %smaller
+}
+
+; TODO:  long vector types
+
+define float @_Z3minff(float %v1, float %v2) nounwind readnone {  ; scalar float min: forwards to @_Z4fminff
+  %1 = tail call float @_Z4fminff(float %v1, float %v2)
+  ret float %1
+}
+
+define <2 x float> @_Z3minDv2_fS_(<2 x float> %v1, <2 x float> %v2) nounwind readnone {  ; 2-lane float min: forwards to @_Z4fminDv2_fS_
+  %1 = tail call <2 x float> @_Z4fminDv2_fS_(<2 x float> %v1, <2 x float> %v2)
+  ret <2 x float> %1
+}
+
+define <2 x float> @_Z3minDv2_ff(<2 x float> %v1, float %v2) nounwind readnone {  ; 2-lane vector/scalar float min: forwards to @_Z4fminDv2_ff
+  %1 = tail call <2 x float> @_Z4fminDv2_ff(<2 x float> %v1, float %v2)
+  ret <2 x float> %1
+}
+
+define <3 x float> @_Z3minDv3_fS_(<3 x float> %v1, <3 x float> %v2) nounwind readnone {  ; 3-lane float min: forwards to @_Z4fminDv3_fS_
+  %1 = tail call <3 x float> @_Z4fminDv3_fS_(<3 x float> %v1, <3 x float> %v2)
+  ret <3 x float> %1
+}
+
+define <3 x float> @_Z3minDv3_ff(<3 x float> %v1, float %v2) nounwind readnone {  ; 3-lane vector/scalar float min: forwards to @_Z4fminDv3_ff
+  %1 = tail call <3 x float> @_Z4fminDv3_ff(<3 x float> %v1, float %v2)
+  ret <3 x float> %1
+}
+
+define <4 x float> @_Z3minDv4_fS_(<4 x float> %v1, <4 x float> %v2) nounwind readnone {  ; 4-lane float min: forwards to @_Z4fminDv4_fS_
+  %1 = tail call <4 x float> @_Z4fminDv4_fS_(<4 x float> %v1, <4 x float> %v2)
+  ret <4 x float> %1
+}
+
+define <4 x float> @_Z3minDv4_ff(<4 x float> %v1, float %v2) nounwind readnone {  ; 4-lane vector/scalar float min: forwards to @_Z4fminDv4_ff
+  %1 = tail call <4 x float> @_Z4fminDv4_ff(<4 x float> %v1, float %v2)
+  ret <4 x float> %1
+}
+
diff --git a/lib/ScriptCRT/clamp.c b/lib/ScriptCRT/clamp.c
deleted file mode 100644
index c7e2c39..0000000
--- a/lib/ScriptCRT/clamp.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (C) 2012 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-#include "rs_types.rsh"
-
-extern float __attribute__((overloadable)) clamp(float amount, float low, float high) {
-    return amount < low ? low : (amount > high ? high : amount);
-}
-
-extern float2 __attribute__((overloadable)) clamp(float2 amount, float2 low, float2 high) {
-    float2 r;
-    r.x = amount.x < low.x ? low.x : (amount.x > high.x ? high.x : amount.x);
-    r.y = amount.y < low.y ? low.y : (amount.y > high.y ? high.y : amount.y);
-    return r;
-}
-
-extern float3 __attribute__((overloadable)) clamp(float3 amount, float3 low, float3 high) {
-    float3 r;
-    r.x = amount.x < low.x ? low.x : (amount.x > high.x ? high.x : amount.x);
-    r.y = amount.y < low.y ? low.y : (amount.y > high.y ? high.y : amount.y);
-    r.z = amount.z < low.z ? low.z : (amount.z > high.z ? high.z : amount.z);
-    return r;
-}
-
-extern float4 __attribute__((overloadable)) clamp(float4 amount, float4 low, float4 high) {
-    float4 r;
-    r.x = amount.x < low.x ? low.x : (amount.x > high.x ? high.x : amount.x);
-    r.y = amount.y < low.y ? low.y : (amount.y > high.y ? high.y : amount.y);
-    r.z = amount.z < low.z ? low.z : (amount.z > high.z ? high.z : amount.z);
-    r.w = amount.w < low.w ? low.w : (amount.w > high.w ? high.w : amount.w);
-    return r;
-}
-
-extern float2 __attribute__((overloadable)) clamp(float2 amount, float low, float high) {
-    float2 r;
-    r.x = amount.x < low ? low : (amount.x > high ? high : amount.x);
-    r.y = amount.y < low ? low : (amount.y > high ? high : amount.y);
-    return r;
-}
-
-extern float3 __attribute__((overloadable)) clamp(float3 amount, float low, float high) {
-    float3 r;
-    r.x = amount.x < low ? low : (amount.x > high ? high : amount.x);
-    r.y = amount.y < low ? low : (amount.y > high ? high : amount.y);
-    r.z = amount.z < low ? low : (amount.z > high ? high : amount.z);
-    return r;
-}
-
-extern float4 __attribute__((overloadable)) clamp(float4 amount, float low, float high) {
-    float4 r;
-    r.x = amount.x < low ? low : (amount.x > high ? high : amount.x);
-    r.y = amount.y < low ? low : (amount.y > high ? high : amount.y);
-    r.z = amount.z < low ? low : (amount.z > high ? high : amount.z);
-    r.w = amount.w < low ? low : (amount.w > high ? high : amount.w);
-    return r;
-}
-
-
diff --git a/lib/ScriptCRT/convert.ll b/lib/ScriptCRT/convert.ll
index e590ad1..f45850d 100644
--- a/lib/ScriptCRT/convert.ll
+++ b/lib/ScriptCRT/convert.ll
@@ -1,6 +1,11 @@
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
 target triple = "armv7-none-linux-gnueabi"
 
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;                  FLOAT                 ;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
 define <2 x float> @_Z14convert_float2Dv2_h(<2 x i8> %in) nounwind readnone alwaysinline {
   %1 = uitofp <2 x i8> %in to <2 x float>
   ret <2 x float> %1
@@ -103,35 +108,127 @@
   ret <4 x float> %in
 }
 
-;---
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;                  CHAR                  ;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+define <4 x i8> @_Z13convert_char4Dv4_f(<4 x float> %in) nounwind readnone alwaysinline {
+  %1 = fptosi <4 x float> %in to <4 x i8>
+  ret <4 x i8> %1
+}
+
+define <3 x i8> @_Z13convert_char3Dv3_f(<3 x float> %in) nounwind readnone alwaysinline {
+  %1 = fptosi <3 x float> %in to <3 x i8>
+  ret <3 x i8> %1
+}
+
+define <2 x i8> @_Z13convert_char2Dv2_f(<2 x float> %in) nounwind readnone alwaysinline {
+  %1 = fptosi <2 x float> %in to <2 x i8>
+  ret <2 x i8> %1
+}
+
+define <4 x i8> @_Z13convert_char4Dv4_h(<4 x i8> %in) nounwind readnone alwaysinline {
+  ret <4 x i8> %in
+}
+
+define <3 x i8> @_Z13convert_char3Dv3_h(<3 x i8> %in) nounwind readnone alwaysinline {
+  ret <3 x i8> %in
+}
+
+define <2 x i8> @_Z13convert_char2Dv2_h(<2 x i8> %in) nounwind readnone alwaysinline {
+  ret <2 x i8> %in
+}
+
+define <4 x i8> @_Z13convert_char4Dv4_c(<4 x i8> %in) nounwind readnone alwaysinline {
+  ret <4 x i8> %in
+}
+
+define <3 x i8> @_Z13convert_char3Dv3_c(<3 x i8> %in) nounwind readnone alwaysinline {
+  ret <3 x i8> %in
+}
+
+define <2 x i8> @_Z13convert_char2Dv2_c(<2 x i8> %in) nounwind readnone alwaysinline {
+  ret <2 x i8> %in
+}
+
+define <4 x i8> @_Z13convert_char4Dv4_t(<4 x i16> %in) nounwind readnone alwaysinline {
+  %1 = trunc <4 x i16> %in to <4 x i8>
+  ret <4 x i8> %1
+}
+
+define <3 x i8> @_Z13convert_char3Dv3_t(<3 x i16> %in) nounwind readnone alwaysinline {
+  %1 = trunc <3 x i16> %in to <3 x i8>
+  ret <3 x i8> %1
+}
+
+define <2 x i8> @_Z13convert_char2Dv2_t(<2 x i16> %in) nounwind readnone alwaysinline {
+  %1 = trunc <2 x i16> %in to <2 x i8>
+  ret <2 x i8> %1
+}
+
+define <4 x i8> @_Z13convert_char4Dv4_s(<4 x i16> %in) nounwind readnone alwaysinline {
+  %1 = trunc <4 x i16> %in to <4 x i8>
+  ret <4 x i8> %1
+}
+
+define <3 x i8> @_Z13convert_char3Dv3_s(<3 x i16> %in) nounwind readnone alwaysinline {
+  %1 = trunc <3 x i16> %in to <3 x i8>
+  ret <3 x i8> %1
+}
+
+define <2 x i8> @_Z13convert_char2Dv2_s(<2 x i16> %in) nounwind readnone alwaysinline {
+  %1 = trunc <2 x i16> %in to <2 x i8>
+  ret <2 x i8> %1
+}
+
+define <4 x i8> @_Z13convert_char4Dv4_j(<4 x i32> %in) nounwind readnone alwaysinline {
+  %1 = trunc <4 x i32> %in to <4 x i8>
+  ret <4 x i8> %1
+}
+
+define <3 x i8> @_Z13convert_char3Dv3_j(<3 x i32> %in) nounwind readnone alwaysinline {
+  %1 = trunc <3 x i32> %in to <3 x i8>
+  ret <3 x i8> %1
+}
+
+define <2 x i8> @_Z13convert_char2Dv2_j(<2 x i32> %in) nounwind readnone alwaysinline {
+  %1 = trunc <2 x i32> %in to <2 x i8>
+  ret <2 x i8> %1
+}
+
+define <4 x i8> @_Z13convert_char4Dv4_i(<4 x i32> %in) nounwind readnone alwaysinline {
+  %1 = trunc <4 x i32> %in to <4 x i8>
+  ret <4 x i8> %1
+}
+
+define <3 x i8> @_Z13convert_char3Dv3_i(<3 x i32> %in) nounwind readnone alwaysinline {
+  %1 = trunc <3 x i32> %in to <3 x i8>
+  ret <3 x i8> %1
+}
+
+define <2 x i8> @_Z13convert_char2Dv2_i(<2 x i32> %in) nounwind readnone alwaysinline {
+  %1 = trunc <2 x i32> %in to <2 x i8>
+  ret <2 x i8> %1
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;                  UCHAR                 ;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 define <4 x i8> @_Z14convert_uchar4Dv4_f(<4 x float> %in) nounwind readnone alwaysinline {
-  %1 = fptoui <4 x float> %in to <4 x i32>
-  %2 = trunc <4 x i32> %1 to <4 x i16>
-  %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %4 = trunc <8 x i16> %3 to <8 x i8>
-  %5 = shufflevector <8 x i8> %4, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  ret <4 x i8> %5
+  %1 = fptoui <4 x float> %in to <4 x i8>
+  ret <4 x i8> %1
 }
 
 define <3 x i8> @_Z14convert_uchar3Dv3_f(<3 x float> %in) nounwind readnone alwaysinline {
-  %in2 = shufflevector <3 x float> %in, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %1 = fptoui <4 x float> %in2 to <4 x i32>
-  %2 = trunc <4 x i32> %1 to <4 x i16>
-  %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %4 = trunc <8 x i16> %3 to <8 x i8>
-  %5 = shufflevector <8 x i8> %4, <8 x i8> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  ret <3 x i8> %5
+  %1 = fptoui <3 x float> %in to <3 x i8>
+  ret <3 x i8> %1
 }
 
 define <2 x i8> @_Z14convert_uchar2Dv2_f(<2 x float> %in) nounwind readnone alwaysinline {
-  %in2 = shufflevector <2 x float> %in, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %1 = fptoui <4 x float> %in2 to <4 x i32>
-  %2 = trunc <4 x i32> %1 to <4 x i16>
-  %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %4 = trunc <8 x i16> %3 to <8 x i8>
-  %5 = shufflevector <8 x i8> %4, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
-  ret <2 x i8> %5
+  %1 = fptoui <2 x float> %in to <2 x i8>
+  ret <2 x i8> %1
 }
 
 define <4 x i8> @_Z14convert_uchar4Dv4_h(<4 x i8> %in) nounwind readnone alwaysinline {
@@ -158,99 +255,477 @@
   ret <2 x i8> %in
 }
 
-
 define <4 x i8> @_Z14convert_uchar4Dv4_t(<4 x i16> %in) nounwind readnone alwaysinline {
-  %1 = shufflevector <4 x i16> %in, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %2 = trunc <8 x i16> %1 to <8 x i8>
-  %3 = shufflevector <8 x i8> %2, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  ret <4 x i8> %3
+  %1 = trunc <4 x i16> %in to <4 x i8>
+  ret <4 x i8> %1
 }
 
 define <3 x i8> @_Z14convert_uchar3Dv3_t(<3 x i16> %in) nounwind readnone alwaysinline {
-  %1 = shufflevector <3 x i16> %in, <3 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 5, i32 5>
-  %2 = trunc <8 x i16> %1 to <8 x i8>
-  %3 = shufflevector <8 x i8> %2, <8 x i8> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  ret <3 x i8> %3
+  %1 = trunc <3 x i16> %in to <3 x i8>
+  ret <3 x i8> %1
 }
 
 define <2 x i8> @_Z14convert_uchar2Dv2_t(<2 x i16> %in) nounwind readnone alwaysinline {
-  %1 = shufflevector <2 x i16> %in, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
-  %2 = trunc <8 x i16> %1 to <8 x i8>
-  %3 = shufflevector <8 x i8> %2, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
-  ret <2 x i8> %3
+  %1 = trunc <2 x i16> %in to <2 x i8>
+  ret <2 x i8> %1
 }
 
 define <4 x i8> @_Z14convert_uchar4Dv4_s(<4 x i16> %in) nounwind readnone alwaysinline {
-  %1 = shufflevector <4 x i16> %in, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %2 = trunc <8 x i16> %1 to <8 x i8>
-  %3 = shufflevector <8 x i8> %2, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  ret <4 x i8> %3
+  %1 = trunc <4 x i16> %in to <4 x i8>
+  ret <4 x i8> %1
 }
 
 define <3 x i8> @_Z14convert_uchar3Dv3_s(<3 x i16> %in) nounwind readnone alwaysinline {
-  %1 = shufflevector <3 x i16> %in, <3 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 5, i32 5>
-  %2 = trunc <8 x i16> %1 to <8 x i8>
-  %3 = shufflevector <8 x i8> %2, <8 x i8> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  ret <3 x i8> %3
+  %1 = trunc <3 x i16> %in to <3 x i8>
+  ret <3 x i8> %1
 }
 
 define <2 x i8> @_Z14convert_uchar2Dv2_s(<2 x i16> %in) nounwind readnone alwaysinline {
-  %1 = shufflevector <2 x i16> %in, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
-  %2 = trunc <8 x i16> %1 to <8 x i8>
-  %3 = shufflevector <8 x i8> %2, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
-  ret <2 x i8> %3
+  %1 = trunc <2 x i16> %in to <2 x i8>
+  ret <2 x i8> %1
 }
 
-
 define <4 x i8> @_Z14convert_uchar4Dv4_j(<4 x i32> %in) nounwind readnone alwaysinline {
-  %1 = trunc <4 x i32> %in to <4 x i16>
-  %2 = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %3 = trunc <8 x i16> %2 to <8 x i8>
-  %4 = shufflevector <8 x i8> %3, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  ret <4 x i8> %4
+  %1 = trunc <4 x i32> %in to <4 x i8>
+  ret <4 x i8> %1
 }
 
 define <3 x i8> @_Z14convert_uchar3Dv3_j(<3 x i32> %in) nounwind readnone alwaysinline {
-  %1 = shufflevector <3 x i32> %in, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %2 = trunc <4 x i32> %1 to <4 x i16>
-  %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %4 = trunc <8 x i16> %3 to <8 x i8>
-  %5 = shufflevector <8 x i8> %4, <8 x i8> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  ret <3 x i8> %5
+  %1 = trunc <3 x i32> %in to <3 x i8>
+  ret <3 x i8> %1
 }
 
 define <2 x i8> @_Z14convert_uchar2Dv2_j(<2 x i32> %in) nounwind readnone alwaysinline {
-  %1 = shufflevector <2 x i32> %in, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %2 = trunc <4 x i32> %1 to <4 x i16>
-  %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %4 = trunc <8 x i16> %3 to <8 x i8>
-  %5 = shufflevector <8 x i8> %4, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
-  ret <2 x i8> %5
+  %1 = trunc <2 x i32> %in to <2 x i8>
+  ret <2 x i8> %1
 }
 
 define <4 x i8> @_Z14convert_uchar4Dv4_i(<4 x i32> %in) nounwind readnone alwaysinline {
-  %1 = trunc <4 x i32> %in to <4 x i16>
-  %2 = shufflevector <4 x i16> %1, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %3 = trunc <8 x i16> %2 to <8 x i8>
-  %4 = shufflevector <8 x i8> %3, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  ret <4 x i8> %4
+  %1 = trunc <4 x i32> %in to <4 x i8>
+  ret <4 x i8> %1
 }
 
 define <3 x i8> @_Z14convert_uchar3Dv3_i(<3 x i32> %in) nounwind readnone alwaysinline {
-  %1 = shufflevector <3 x i32> %in, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %2 = trunc <4 x i32> %1 to <4 x i16>
-  %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %4 = trunc <8 x i16> %3 to <8 x i8>
-  %5 = shufflevector <8 x i8> %4, <8 x i8> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  ret <3 x i8> %5
+  %1 = trunc <3 x i32> %in to <3 x i8>
+  ret <3 x i8> %1
 }
 
 define <2 x i8> @_Z14convert_uchar2Dv2_i(<2 x i32> %in) nounwind readnone alwaysinline {
-  %1 = shufflevector <2 x i32> %in, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %2 = trunc <4 x i32> %1 to <4 x i16>
-  %3 = shufflevector <4 x i16> %2, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %4 = trunc <8 x i16> %3 to <8 x i8>
-  %5 = shufflevector <8 x i8> %4, <8 x i8> undef, <2 x i32> <i32 0, i32 1>
-  ret <2 x i8> %5
+  %1 = trunc <2 x i32> %in to <2 x i8>
+  ret <2 x i8> %1
 }
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;                  SHORT                 ;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <4 x i16> @_Z14convert_short4Dv4_f(<4 x float> %in) nounwind readnone alwaysinline {
+  %1 = fptosi <4 x float> %in to <4 x i16>
+  ret <4 x i16> %1
+}
+
+define <3 x i16> @_Z14convert_short3Dv3_f(<3 x float> %in) nounwind readnone alwaysinline {
+  %1 = fptosi <3 x float> %in to <3 x i16>
+  ret <3 x i16> %1
+}
+
+define <2 x i16> @_Z14convert_short2Dv2_f(<2 x float> %in) nounwind readnone alwaysinline {
+  %1 = fptosi <2 x float> %in to <2 x i16>
+  ret <2 x i16> %1
+}
+
+define <4 x i16> @_Z14convert_short4Dv4_h(<4 x i8> %in) nounwind readnone alwaysinline {
+  %1 = zext <4 x i8> %in to <4 x i16>
+  ret <4 x i16> %1
+}
+
+define <3 x i16> @_Z14convert_short3Dv3_h(<3 x i8> %in) nounwind readnone alwaysinline {
+  %1 = zext <3 x i8> %in to <3 x i16>
+  ret <3 x i16> %1
+}
+
+define <2 x i16> @_Z14convert_short2Dv2_h(<2 x i8> %in) nounwind readnone alwaysinline {
+  %1 = zext <2 x i8> %in to <2 x i16>
+  ret <2 x i16> %1
+}
+
+define <4 x i16> @_Z14convert_short4Dv4_c(<4 x i8> %in) nounwind readnone alwaysinline {
+  %1 = sext <4 x i8> %in to <4 x i16>
+  ret <4 x i16> %1
+}
+
+define <3 x i16> @_Z14convert_short3Dv3_c(<3 x i8> %in) nounwind readnone alwaysinline {
+  %1 = sext <3 x i8> %in to <3 x i16>
+  ret <3 x i16> %1
+}
+
+define <2 x i16> @_Z14convert_short2Dv2_c(<2 x i8> %in) nounwind readnone alwaysinline {
+  %1 = sext <2 x i8> %in to <2 x i16>
+  ret <2 x i16> %1
+}
+
+define <4 x i16> @_Z14convert_short4Dv4_t(<4 x i16> %in) nounwind readnone alwaysinline {
+  ret <4 x i16> %in
+}
+
+define <3 x i16> @_Z14convert_short3Dv3_t(<3 x i16> %in) nounwind readnone alwaysinline {
+  ret <3 x i16> %in
+}
+
+define <2 x i16> @_Z14convert_short2Dv2_t(<2 x i16> %in) nounwind readnone alwaysinline {
+  ret <2 x i16> %in
+}
+
+define <4 x i16> @_Z14convert_short4Dv4_s(<4 x i16> %in) nounwind readnone alwaysinline {
+  ret <4 x i16> %in
+}
+
+define <3 x i16> @_Z14convert_short3Dv3_s(<3 x i16> %in) nounwind readnone alwaysinline {
+  ret <3 x i16> %in
+}
+
+define <2 x i16> @_Z14convert_short2Dv2_s(<2 x i16> %in) nounwind readnone alwaysinline {
+  ret <2 x i16> %in
+}
+
+define <4 x i16> @_Z14convert_short4Dv4_j(<4 x i32> %in) nounwind readnone alwaysinline {
+  %1 = trunc <4 x i32> %in to <4 x i16>
+  ret <4 x i16> %1
+}
+
+define <3 x i16> @_Z14convert_short3Dv3_j(<3 x i32> %in) nounwind readnone alwaysinline {
+  %1 = trunc <3 x i32> %in to <3 x i16>
+  ret <3 x i16> %1
+}
+
+define <2 x i16> @_Z14convert_short2Dv2_j(<2 x i32> %in) nounwind readnone alwaysinline {
+  %1 = trunc <2 x i32> %in to <2 x i16>
+  ret <2 x i16> %1
+}
+
+define <4 x i16> @_Z14convert_short4Dv4_i(<4 x i32> %in) nounwind readnone alwaysinline {
+  %1 = trunc <4 x i32> %in to <4 x i16>
+  ret <4 x i16> %1
+}
+
+define <3 x i16> @_Z14convert_short3Dv3_i(<3 x i32> %in) nounwind readnone alwaysinline {
+  %1 = trunc <3 x i32> %in to <3 x i16>
+  ret <3 x i16> %1
+}
+
+define <2 x i16> @_Z14convert_short2Dv2_i(<2 x i32> %in) nounwind readnone alwaysinline {
+  %1 = trunc <2 x i32> %in to <2 x i16>
+  ret <2 x i16> %1
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;                 USHORT                 ;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <4 x i16> @_Z15convert_ushort4Dv4_f(<4 x float> %in) nounwind readnone alwaysinline {
+  %1 = fptoui <4 x float> %in to <4 x i16>
+  ret <4 x i16> %1
+}
+
+define <3 x i16> @_Z15convert_ushort3Dv3_f(<3 x float> %in) nounwind readnone alwaysinline {
+  %1 = fptoui <3 x float> %in to <3 x i16>
+  ret <3 x i16> %1
+}
+
+define <2 x i16> @_Z15convert_ushort2Dv2_f(<2 x float> %in) nounwind readnone alwaysinline {
+  %1 = fptoui <2 x float> %in to <2 x i16>
+  ret <2 x i16> %1
+}
+
+define <4 x i16> @_Z15convert_ushort4Dv4_h(<4 x i8> %in) nounwind readnone alwaysinline {
+  %1 = zext <4 x i8> %in to <4 x i16>
+  ret <4 x i16> %1
+}
+
+define <3 x i16> @_Z15convert_ushort3Dv3_h(<3 x i8> %in) nounwind readnone alwaysinline {
+  %1 = zext <3 x i8> %in to <3 x i16>
+  ret <3 x i16> %1
+}
+
+define <2 x i16> @_Z15convert_ushort2Dv2_h(<2 x i8> %in) nounwind readnone alwaysinline {
+  %1 = zext <2 x i8> %in to <2 x i16>
+  ret <2 x i16> %1
+}
+
+define <4 x i16> @_Z15convert_ushort4Dv4_c(<4 x i8> %in) nounwind readnone alwaysinline {
+  %1 = sext <4 x i8> %in to <4 x i16>  ; source is signed char: sign-extend (as the char->short/int converts do), so -1 becomes 0xFFFF
+  ret <4 x i16> %1
+}
+
+define <3 x i16> @_Z15convert_ushort3Dv3_c(<3 x i8> %in) nounwind readnone alwaysinline {
+  %1 = sext <3 x i8> %in to <3 x i16>  ; source is signed char: sign-extend (as the char->short/int converts do), so -1 becomes 0xFFFF
+  ret <3 x i16> %1
+}
+
+define <2 x i16> @_Z15convert_ushort2Dv2_c(<2 x i8> %in) nounwind readnone alwaysinline {
+  %1 = sext <2 x i8> %in to <2 x i16>  ; source is signed char: sign-extend (as the char->short/int converts do), so -1 becomes 0xFFFF
+  ret <2 x i16> %1
+}
+
+define <4 x i16> @_Z15convert_ushort4Dv4_t(<4 x i16> %in) nounwind readnone alwaysinline {
+  ret <4 x i16> %in
+}
+
+define <3 x i16> @_Z15convert_ushort3Dv3_t(<3 x i16> %in) nounwind readnone alwaysinline {
+  ret <3 x i16> %in
+}
+
+define <2 x i16> @_Z15convert_ushort2Dv2_t(<2 x i16> %in) nounwind readnone alwaysinline {
+  ret <2 x i16> %in
+}
+
+define <4 x i16> @_Z15convert_ushort4Dv4_s(<4 x i16> %in) nounwind readnone alwaysinline {
+  ret <4 x i16> %in
+}
+
+define <3 x i16> @_Z15convert_ushort3Dv3_s(<3 x i16> %in) nounwind readnone alwaysinline {
+  ret <3 x i16> %in
+}
+
+define <2 x i16> @_Z15convert_ushort2Dv2_s(<2 x i16> %in) nounwind readnone alwaysinline {
+  ret <2 x i16> %in
+}
+
+define <4 x i16> @_Z15convert_ushort4Dv4_j(<4 x i32> %in) nounwind readnone alwaysinline {
+  %1 = trunc <4 x i32> %in to <4 x i16>
+  ret <4 x i16> %1
+}
+
+define <3 x i16> @_Z15convert_ushort3Dv3_j(<3 x i32> %in) nounwind readnone alwaysinline {
+  %1 = trunc <3 x i32> %in to <3 x i16>
+  ret <3 x i16> %1
+}
+
+define <2 x i16> @_Z15convert_ushort2Dv2_j(<2 x i32> %in) nounwind readnone alwaysinline {
+  %1 = trunc <2 x i32> %in to <2 x i16>
+  ret <2 x i16> %1
+}
+
+define <4 x i16> @_Z15convert_ushort4Dv4_i(<4 x i32> %in) nounwind readnone alwaysinline {
+  %1 = trunc <4 x i32> %in to <4 x i16>
+  ret <4 x i16> %1
+}
+
+define <3 x i16> @_Z15convert_ushort3Dv3_i(<3 x i32> %in) nounwind readnone alwaysinline {
+  %1 = trunc <3 x i32> %in to <3 x i16>
+  ret <3 x i16> %1
+}
+
+define <2 x i16> @_Z15convert_ushort2Dv2_i(<2 x i32> %in) nounwind readnone alwaysinline {
+  %1 = trunc <2 x i32> %in to <2 x i16>
+  ret <2 x i16> %1
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;                   INT                  ;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <4 x i32> @_Z12convert_int4Dv4_f(<4 x float> %in) nounwind readnone alwaysinline {
+  %1 = fptosi <4 x float> %in to <4 x i32>
+  ret <4 x i32> %1
+}
+
+define <3 x i32> @_Z12convert_int3Dv3_f(<3 x float> %in) nounwind readnone alwaysinline {
+  %1 = fptosi <3 x float> %in to <3 x i32>
+  ret <3 x i32> %1
+}
+
+define <2 x i32> @_Z12convert_int2Dv2_f(<2 x float> %in) nounwind readnone alwaysinline {
+  %1 = fptosi <2 x float> %in to <2 x i32>
+  ret <2 x i32> %1
+}
+
+define <4 x i32> @_Z12convert_int4Dv4_h(<4 x i8> %in) nounwind readnone alwaysinline {
+  %1 = zext <4 x i8> %in to <4 x i32>
+  ret <4 x i32> %1
+}
+
+define <3 x i32> @_Z12convert_int3Dv3_h(<3 x i8> %in) nounwind readnone alwaysinline {
+  %1 = zext <3 x i8> %in to <3 x i32>
+  ret <3 x i32> %1
+}
+
+define <2 x i32> @_Z12convert_int2Dv2_h(<2 x i8> %in) nounwind readnone alwaysinline {
+  %1 = zext <2 x i8> %in to <2 x i32>
+  ret <2 x i32> %1
+}
+
+define <4 x i32> @_Z12convert_int4Dv4_c(<4 x i8> %in) nounwind readnone alwaysinline {
+  %1 = sext <4 x i8> %in to <4 x i32>
+  ret <4 x i32> %1
+}
+
+define <3 x i32> @_Z12convert_int3Dv3_c(<3 x i8> %in) nounwind readnone alwaysinline {
+  %1 = sext <3 x i8> %in to <3 x i32>
+  ret <3 x i32> %1
+}
+
+define <2 x i32> @_Z12convert_int2Dv2_c(<2 x i8> %in) nounwind readnone alwaysinline {
+  %1 = sext <2 x i8> %in to <2 x i32>
+  ret <2 x i32> %1
+}
+
+define <4 x i32> @_Z12convert_int4Dv4_t(<4 x i16> %in) nounwind readnone alwaysinline {
+  %1 = zext <4 x i16> %in to <4 x i32>
+  ret <4 x i32> %1
+}
+
+define <3 x i32> @_Z12convert_int3Dv3_t(<3 x i16> %in) nounwind readnone alwaysinline {
+  %1 = zext <3 x i16> %in to <3 x i32>
+  ret <3 x i32> %1
+}
+
+define <2 x i32> @_Z12convert_int2Dv2_t(<2 x i16> %in) nounwind readnone alwaysinline {
+  %1 = zext <2 x i16> %in to <2 x i32>
+  ret <2 x i32> %1
+}
+
+define <4 x i32> @_Z12convert_int4Dv4_s(<4 x i16> %in) nounwind readnone alwaysinline {
+  %1 = sext <4 x i16> %in to <4 x i32>
+  ret <4 x i32> %1
+}
+
+define <3 x i32> @_Z12convert_int3Dv3_s(<3 x i16> %in) nounwind readnone alwaysinline {
+  %1 = sext <3 x i16> %in to <3 x i32>
+  ret <3 x i32> %1
+}
+
+define <2 x i32> @_Z12convert_int2Dv2_s(<2 x i16> %in) nounwind readnone alwaysinline {
+  %1 = sext <2 x i16> %in to <2 x i32>
+  ret <2 x i32> %1
+}
+
+define <4 x i32> @_Z12convert_int4Dv4_j(<4 x i32> %in) nounwind readnone alwaysinline {
+  ret <4 x i32> %in
+}
+
+define <3 x i32> @_Z12convert_int3Dv3_j(<3 x i32> %in) nounwind readnone alwaysinline {
+  ret <3 x i32> %in
+}
+
+define <2 x i32> @_Z12convert_int2Dv2_j(<2 x i32> %in) nounwind readnone alwaysinline {
+  ret <2 x i32> %in
+}
+
+define <4 x i32> @_Z12convert_int4Dv4_i(<4 x i32> %in) nounwind readnone alwaysinline {
+  ret <4 x i32> %in
+}
+
+define <3 x i32> @_Z12convert_int3Dv3_i(<3 x i32> %in) nounwind readnone alwaysinline {
+  ret <3 x i32> %in
+}
+
+define <2 x i32> @_Z12convert_int2Dv2_i(<2 x i32> %in) nounwind readnone alwaysinline {
+  ret <2 x i32> %in
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;                  UINT                  ;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <4 x i32> @_Z13convert_uint4Dv4_f(<4 x float> %in) nounwind readnone alwaysinline {
+  %1 = fptoui <4 x float> %in to <4 x i32>
+  ret <4 x i32> %1
+}
+
+define <3 x i32> @_Z13convert_uint3Dv3_f(<3 x float> %in) nounwind readnone alwaysinline {
+  %1 = fptoui <3 x float> %in to <3 x i32>
+  ret <3 x i32> %1
+}
+
+define <2 x i32> @_Z13convert_uint2Dv2_f(<2 x float> %in) nounwind readnone alwaysinline {
+  %1 = fptoui <2 x float> %in to <2 x i32>
+  ret <2 x i32> %1
+}
+
+define <4 x i32> @_Z13convert_uint4Dv4_h(<4 x i8> %in) nounwind readnone alwaysinline {
+  %1 = zext <4 x i8> %in to <4 x i32>
+  ret <4 x i32> %1
+}
+
+define <3 x i32> @_Z13convert_uint3Dv3_h(<3 x i8> %in) nounwind readnone alwaysinline {
+  %1 = zext <3 x i8> %in to <3 x i32>
+  ret <3 x i32> %1
+}
+
+define <2 x i32> @_Z13convert_uint2Dv2_h(<2 x i8> %in) nounwind readnone alwaysinline {
+  %1 = zext <2 x i8> %in to <2 x i32>
+  ret <2 x i32> %1
+}
+
+define <4 x i32> @_Z13convert_uint4Dv4_c(<4 x i8> %in) nounwind readnone alwaysinline {
+  %1 = sext <4 x i8> %in to <4 x i32>  ; source is signed char: sign-extend (as convert_int* from char does), so -1 becomes 0xFFFFFFFF
+  ret <4 x i32> %1
+}
+
+define <3 x i32> @_Z13convert_uint3Dv3_c(<3 x i8> %in) nounwind readnone alwaysinline {
+  %1 = sext <3 x i8> %in to <3 x i32>  ; source is signed char: sign-extend (as convert_int* from char does), so -1 becomes 0xFFFFFFFF
+  ret <3 x i32> %1
+}
+
+define <2 x i32> @_Z13convert_uint2Dv2_c(<2 x i8> %in) nounwind readnone alwaysinline {
+  %1 = sext <2 x i8> %in to <2 x i32>  ; source is signed char: sign-extend (as convert_int* from char does), so -1 becomes 0xFFFFFFFF
+  ret <2 x i32> %1
+}
+
+define <4 x i32> @_Z13convert_uint4Dv4_t(<4 x i16> %in) nounwind readnone alwaysinline {
+  %1 = zext <4 x i16> %in to <4 x i32>
+  ret <4 x i32> %1
+}
+
+define <3 x i32> @_Z13convert_uint3Dv3_t(<3 x i16> %in) nounwind readnone alwaysinline {
+  %1 = zext <3 x i16> %in to <3 x i32>
+  ret <3 x i32> %1
+}
+
+define <2 x i32> @_Z13convert_uint2Dv2_t(<2 x i16> %in) nounwind readnone alwaysinline {
+  %1 = zext <2 x i16> %in to <2 x i32>
+  ret <2 x i32> %1
+}
+
+define <4 x i32> @_Z13convert_uint4Dv4_s(<4 x i16> %in) nounwind readnone alwaysinline {
+  %1 = sext <4 x i16> %in to <4 x i32>  ; source is signed short: sign-extend (as convert_int* from short does), so -1 becomes 0xFFFFFFFF
+  ret <4 x i32> %1
+}
+
+define <3 x i32> @_Z13convert_uint3Dv3_s(<3 x i16> %in) nounwind readnone alwaysinline {
+  %1 = sext <3 x i16> %in to <3 x i32>  ; source is signed short: sign-extend (as convert_int* from short does), so -1 becomes 0xFFFFFFFF
+  ret <3 x i32> %1
+}
+
+define <2 x i32> @_Z13convert_uint2Dv2_s(<2 x i16> %in) nounwind readnone alwaysinline {
+  %1 = sext <2 x i16> %in to <2 x i32>  ; source is signed short: sign-extend (as convert_int* from short does), so -1 becomes 0xFFFFFFFF
+  ret <2 x i32> %1
+}
+
+define <4 x i32> @_Z13convert_uint4Dv4_j(<4 x i32> %in) nounwind readnone alwaysinline {
+  ret <4 x i32> %in
+}
+
+define <3 x i32> @_Z13convert_uint3Dv3_j(<3 x i32> %in) nounwind readnone alwaysinline {
+  ret <3 x i32> %in
+}
+
+define <2 x i32> @_Z13convert_uint2Dv2_j(<2 x i32> %in) nounwind readnone alwaysinline {
+  ret <2 x i32> %in
+}
+
+define <4 x i32> @_Z13convert_uint4Dv4_i(<4 x i32> %in) nounwind readnone alwaysinline {
+  ret <4 x i32> %in
+}
+
+define <3 x i32> @_Z13convert_uint3Dv3_i(<3 x i32> %in) nounwind readnone alwaysinline {
+  ret <3 x i32> %in
+}
+
+define <2 x i32> @_Z13convert_uint2Dv2_i(<2 x i32> %in) nounwind readnone alwaysinline {
+  ret <2 x i32> %in
+}
diff --git a/lib/ScriptCRT/math.ll b/lib/ScriptCRT/math.ll
new file mode 100644
index 0000000..4ea2b10
--- /dev/null
+++ b/lib/ScriptCRT/math.ll
@@ -0,0 +1,16 @@
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
+target triple = "armv7-none-linux-gnueabi"
+
+declare float @llvm.sqrt.f32(float)
+declare float @llvm.pow.f32(float, float)
+
+define float @_Z4sqrtf(float %v) {
+  %1 = tail call float @llvm.sqrt.f32(float %v)
+  ret float %1
+}
+
+define float @_Z3powf(float %v1, float %v2) {  ; NOTE(review): symbol encodes a single 'f' parameter but this takes two floats; pow(float, float) would presumably mangle as _Z3powff - confirm which symbol callers resolve
+  %1 = tail call float @llvm.pow.f32(float  %v1, float %v2)  ; forwards both operands unchanged to the llvm.pow.f32 intrinsic declared above
+  ret float %1
+}
+
diff --git a/lib/ScriptCRT/neon/clamp.ll b/lib/ScriptCRT/neon/clamp.ll
deleted file mode 100644
index 4bcbdaa..0000000
--- a/lib/ScriptCRT/neon/clamp.ll
+++ /dev/null
@@ -1,81 +0,0 @@
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64"
-target triple = "armv7-none-linux-gnueabi"
-
-define internal <4 x float> @smear_4f(float %in) nounwind readnone alwaysinline {
-  %1 = insertelement <4 x float> undef, float %in, i32 0
-  %2 = insertelement <4 x float> %1, float %in, i32 1
-  %3 = insertelement <4 x float> %2, float %in, i32 2
-  %4 = insertelement <4 x float> %3, float %in, i32 3
-  ret <4 x float> %4
-}
-
-define internal <2 x float> @smear_2f(float %in) nounwind readnone alwaysinline {
-  %1 = insertelement <2 x float> undef, float %in, i32 0
-  %2 = insertelement <2 x float> %1, float %in, i32 1
-  ret <2 x float> %2
-}
-
-declare <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float>, <2 x float>) nounwind readnone
-declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone
-declare <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone
-declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone
-
-define <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %value, <4 x float> %low, <4 x float> %high) nounwind readonly {
-  %1 = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %value, <4 x float> %high) nounwind readnone
-  %2 = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %1, <4 x float> %low) nounwind readnone
-  ret <4 x float> %2
-}
-
-define <4 x float> @_Z5clampDv4_fff(<4 x float> %value, float %low, float %high) nounwind readonly {
-  %_high = tail call <4 x float> @smear_4f(float %high) nounwind readnone
-  %_low = tail call <4 x float> @smear_4f(float %low) nounwind readnone
-  %out = tail call <4 x float> @_Z5clampDv4_fS_S_(<4 x float> %value, <4 x float> %_low, <4 x float> %_high) nounwind readonly
-  ret <4 x float> %out
-}
-
-define <3 x float> @_Z5clampDv3_fS_S_(<3 x float> %value, <3 x float> %low, <3 x float> %high) nounwind readonly {
-  %_value = shufflevector <3 x float> %value, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %_low = shufflevector <3 x float> %low, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %_high = shufflevector <3 x float> %high, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %a = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %_value, <4 x float> %_high) nounwind readnone
-  %b = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %a, <4 x float> %_low) nounwind readnone
-  %c = shufflevector <4 x float> %b, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  ret <3 x float> %c
-}
-
-define <3 x float> @_Z5clampDv3_fff(<3 x float> %value, float %low, float %high) nounwind readonly {
-  %_value = shufflevector <3 x float> %value, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %_high = tail call <4 x float> @smear_4f(float %high) nounwind readnone
-  %_low = tail call <4 x float> @smear_4f(float %low) nounwind readnone
-  %a = tail call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %_value, <4 x float> %_high) nounwind readnone
-  %b = tail call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %a, <4 x float> %_low) nounwind readnone
-  %c = shufflevector <4 x float> %b, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  ret <3 x float> %c
-}
-
-
-define <2 x float> @_Z5clampDv2_fS_S_(<2 x float> %value, <2 x float> %low, <2 x float> %high) nounwind readonly {
-  %1 = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %value, <2 x float> %high) nounwind readnone
-  %2 = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %1, <2 x float> %low) nounwind readnone
-  ret <2 x float> %2
-}
-
-define <2 x float> @_Z5clampDv2_fff(<2 x float> %value, float %low, float %high) nounwind readonly {
-  %_high = tail call <2 x float> @smear_2f(float %high) nounwind readnone
-  %_low = tail call <2 x float> @smear_2f(float %low) nounwind readnone
-  %a = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %value, <2 x float> %_high) nounwind readnone
-  %b = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %a, <2 x float> %_low) nounwind readnone
-  ret <2 x float> %b
-}
-
-
-define float @_Z5clampfff(float %value, float %low, float %high) nounwind readonly {
-  %_value = tail call <2 x float> @smear_2f(float %value) nounwind readnone
-  %_low = tail call <2 x float> @smear_2f(float %low) nounwind readnone
-  %_high = tail call <2 x float> @smear_2f(float %high) nounwind readnone
-  %a = tail call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %_value, <2 x float> %_high) nounwind readnone
-  %b = tail call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %a, <2 x float> %_low) nounwind readnone
-  %c = extractelement <2 x float> %b, i32 0
-  ret float %c
-}
-
diff --git a/lib/ScriptCRT/rs_cl.c b/lib/ScriptCRT/rs_cl.c
index 8e6f8ef..b707f44 100644
--- a/lib/ScriptCRT/rs_cl.c
+++ b/lib/ScriptCRT/rs_cl.c
@@ -1,40 +1,5 @@
 #include "rs_types.rsh"
 
-// Conversions
-#define CVT_FUNC_2(typeout, typein)                             \
-extern typeout##2 __attribute__((overloadable))             \
-        convert_##typeout##2(typein##2 v) {                     \
-    typeout##2 r = {(typeout)v.x, (typeout)v.y};                \
-    return r;                                                   \
-}                                                               \
-extern typeout##3 __attribute__((overloadable))             \
-        convert_##typeout##3(typein##3 v) {                     \
-    typeout##3 r = {(typeout)v.x, (typeout)v.y, (typeout)v.z};  \
-    return r;                                                   \
-}                                                               \
-extern typeout##4 __attribute__((overloadable))             \
-        convert_##typeout##4(typein##4 v) {                     \
-    typeout##4 r = {(typeout)v.x, (typeout)v.y, (typeout)v.z,   \
-                    (typeout)v.w};                              \
-    return r;                                                   \
-}
-
-#define CVT_FUNC(type)  CVT_FUNC_2(type, uchar)     \
-                        CVT_FUNC_2(type, char)      \
-                        CVT_FUNC_2(type, ushort)    \
-                        CVT_FUNC_2(type, short)     \
-                        CVT_FUNC_2(type, uint)      \
-                        CVT_FUNC_2(type, int)       \
-                        CVT_FUNC_2(type, float)
-
-CVT_FUNC(char)
-//CVT_FUNC(uchar)
-CVT_FUNC(short)
-CVT_FUNC(ushort)
-CVT_FUNC(int)
-CVT_FUNC(uint)
-//CVT_FUNC(float)
-
 // Float ops, 6.11.2
 
 #define FN_FUNC_FN(fnc)                                         \
@@ -447,13 +412,7 @@
 extern float __attribute__((overloadable)) fma(float, float, float);
 FN_FUNC_FN_FN_FN(fma)
 
-extern float __attribute__((overloadable)) fmax(float, float);
-FN_FUNC_FN_FN(fmax);
-FN_FUNC_FN_F(fmax);
-
 extern float __attribute__((overloadable)) fmin(float, float);
-FN_FUNC_FN_FN(fmin);
-FN_FUNC_FN_F(fmin);
 
 extern float __attribute__((overloadable)) fmod(float, float);
 FN_FUNC_FN_FN(fmod)
@@ -738,12 +697,6 @@
 UIN_FUNC_IN(abs)
 IN_FUNC_IN(clz)
 
-IN_FUNC_IN_IN_BODY(min, (v1 < v2 ? v1 : v2))
-FN_FUNC_FN_F(min)
-
-IN_FUNC_IN_IN_BODY(max, (v1 > v2 ? v1 : v2))
-FN_FUNC_FN_F(max)
-
 
 // 6.11.4
 
@@ -912,8 +865,6 @@
     return v / length(v);
 }
 
-#undef CVT_FUNC
-#undef CVT_FUNC_2
 #undef FN_FUNC_FN
 #undef IN_FUNC_FN
 #undef FN_FUNC_FN_FN
diff --git a/lib/ScriptCRT/rs_sample.c b/lib/ScriptCRT/rs_sample.c
index 670bea3..b41e7f1 100644
--- a/lib/ScriptCRT/rs_sample.c
+++ b/lib/ScriptCRT/rs_sample.c
@@ -200,7 +200,7 @@
         break;                                                                                  \
     }                                                                                           \
                                                                                                 \
-    return result;                                                                              \
+    return result * 0.003921569f;                                                                              \
 } // End of body of the bilinear sampling function
 
 // Body of the nearest sampling function
@@ -227,7 +227,7 @@
         break;                                                                                  \
     }                                                                                           \
                                                                                                 \
-    return result;                                                                              \
+    return result * 0.003921569f;                                                                              \
 } // End of body of the nearest sampling function
 
 static float4 __attribute__((overloadable))