am da2f0714: Merge "Fix x86 build for SSSE3."

* commit 'da2f071483b148bba216d40070d87f92785b527f':
  Fix x86 build for SSSE3.
diff --git a/Android.mk b/Android.mk
index 98ff0a6..f6aab50 100644
--- a/Android.mk
+++ b/Android.mk
@@ -268,6 +268,8 @@
 
 LOCAL_STATIC_LIBRARIES := libcutils libutils liblog
 
+LOCAL_CLANG := true
+
 include $(BUILD_HOST_STATIC_LIBRARY)
 
 LLVM_ROOT_PATH := external/llvm
diff --git a/api/gen_runtime.cpp b/api/gen_runtime.cpp
index 65682e5..bfc3d6d 100644
--- a/api/gen_runtime.cpp
+++ b/api/gen_runtime.cpp
@@ -211,7 +211,18 @@
      * convert.
      */
     string mCleanName;
-    string mTest;  // How to test.  One of "scalar", "vector", "noverify", "limited", and "none".
+    /* How to test.  One of:
+     * "scalar": Generate test code that checks entries of each vector independently.  E.g. for
+     *           sin(float3), the test code will call the CoreMathVerifier.computeSin 3 times.
+     * "vector": Generate test code that calls the CoreMathVerifier only once for each vector.
+     *           This is useful for APIs like dot() or length().
+     * "noverify": Generate test code that calls the API but doesn't verify the returned value.
+     * "limited": Like "scalar" but tests a limited range of input values.
+     * "custom": Like "scalar" but instead of calling CoreMathVerifier.computeXXX() to compute
+     *           the expected value, we instead call CoreMathVerifier.verifyXXX().  This method
+     *           returns a string that contains the error message, or null if there's no error.
+     */
+    string mTest;
     string mPrecisionLimit;  // Maximum precision required when checking output of this function.
 
     vector<vector<string> > mReplaceables;
@@ -296,6 +307,7 @@
     string mJavaArgumentsClassName;
     string mJavaArgumentsNClassName;
     string mJavaVerifierComputeMethodName;
+    string mJavaVerifierVerifyMethodName;
     string mJavaCheckMethodName;
     string mJavaVerifyMethodName;
 
@@ -305,8 +317,8 @@
 
     void writeJavaSection(ofstream& file) const;
     void writeJavaArgumentClass(ofstream& file, bool scalar) const;
-    void writeJavaCheckMethod(ofstream& file, bool generateCallToVerify) const;
-    void writeJavaVerifyScalarMethod(ofstream& file) const;
+    void writeJavaCheckMethod(ofstream& file, bool generateCallToVerifier) const;
+    void writeJavaVerifyScalarMethod(ofstream& file, bool verifierValidates) const;
     void writeJavaVerifyVectorMethod(ofstream& file) const;
     void writeJavaVerifyFunctionHeader(ofstream& file) const;
     void writeJavaInputAllocationDefinition(ofstream& file, const string& indent,
@@ -322,14 +334,15 @@
                                                     const string& seed, char vectorSize,
                                                     const Type& compatibleType,
                                                     const Type& generatedType) const;
-    void writeJavaCallToRs(ofstream& file, bool relaxed, bool generateCallToVerify) const;
+    void writeJavaCallToRs(ofstream& file, bool relaxed, bool generateCallToVerifier) const;
 
     void writeJavaTestAndSetValid(ofstream& file, int indent, const ParameterDefinition& p,
                                   const string& argsIndex, const string& actualIndex) const;
     void writeJavaTestOneValue(ofstream& file, int indent, const ParameterDefinition& p,
                                const string& argsIndex, const string& actualIndex) const;
     void writeJavaAppendOutputToMessage(ofstream& file, int indent, const ParameterDefinition& p,
-                                        const string& argsIndex, const string& actualIndex) const;
+                                        const string& argsIndex, const string& actualIndex,
+                                        bool verifierValidates) const;
     void writeJavaAppendInputToMessage(ofstream& file, int indent, const ParameterDefinition& p,
                                        const string& actual) const;
     void writeJavaAppendNewLineToMessage(ofstream& file, int indent) const;
@@ -349,10 +362,6 @@
     Permutation(Function* function, Specification* specification, int i1, int i2, int i3, int i4);
     void writeFiles(ofstream& headerFile, ofstream& rsFile, ofstream& javaFile,
                     int versionOfTestFiles);
-#define DISABLE_LONG_AND_DOUBLE_TESTS
-#ifdef DISABLE_LONG_AND_DOUBLE_TESTS
-    bool hasLongOrDoubleParameter() const;
-#endif
 };
 
 // Table of type equivalences
@@ -950,7 +959,7 @@
 
         if (s.compare(0, 5, "test:") == 0) {
             trim(&s, 5);
-            if (s == "scalar" || s == "vector" || s == "noverify" || s == "none") {
+            if (s == "scalar" || s == "vector" || s == "noverify" || s == "custom" || s == "none") {
                 spec->mTest = s;
             } else if (s.compare(0, 7, "limited") == 0) {
                 spec->mTest = "limited";
@@ -1144,32 +1153,12 @@
         mJavaVerifyMethodName += capitalize(p.rsType);
     }
     mJavaVerifierComputeMethodName = "compute" + capitalize(mCleanName);
+    mJavaVerifierVerifyMethodName = "verify" + capitalize(mCleanName);
 }
 
-#ifdef DISABLE_LONG_AND_DOUBLE_TESTS
-// TODO Remove once we have long/double copyTo/copyFrom
-bool Permutation::hasLongOrDoubleParameter() const {
-    for (size_t i = 0; i < mParams.size(); i++) {
-        const ParameterDefinition& p = *mParams[i];
-        if (p.javaBaseType == "long" || p.javaBaseType == "double") {
-            return true;
-        }
-    }
-    return false;
-}
-#endif
-
 void Permutation::writeFiles(ofstream& headerFile, ofstream& rsFile, ofstream& javaFile,
                              int versionOfTestFiles) {
     writeHeaderSection(headerFile);
-#ifdef DISABLE_LONG_AND_DOUBLE_TESTS
-    if (hasLongOrDoubleParameter()) {
-        printf("Warning: skipping a test for %s as we don't support long or double arguments (due "
-               "to Allocation not supporting them).\n",
-               mName.c_str());
-        return;
-    }
-#endif
     if (mSpecification->relevantForVersion(versionOfTestFiles) && mTest != "none") {
         writeRsSection(rsFile);
         writeJavaSection(javaFile);
@@ -1392,7 +1381,11 @@
     if (mTest == "scalar" || mTest == "limited") {
         writeJavaArgumentClass(file, true);
         writeJavaCheckMethod(file, true);
-        writeJavaVerifyScalarMethod(file);
+        writeJavaVerifyScalarMethod(file, false);
+    } else if (mTest == "custom") {
+        writeJavaArgumentClass(file, true);
+        writeJavaCheckMethod(file, true);
+        writeJavaVerifyScalarMethod(file, true);
     } else if (mTest == "vector") {
         writeJavaArgumentClass(file, false);
         writeJavaCheckMethod(file, true);
@@ -1432,7 +1425,7 @@
     mFunction->writeJavaArgumentClassDefinition(name, s);
 }
 
-void Permutation::writeJavaCheckMethod(ofstream& file, bool generateCallToVerify) const {
+void Permutation::writeJavaCheckMethod(ofstream& file, bool generateCallToVerifier) const {
     file << tab(1) << "private void " << mJavaCheckMethodName << "() {\n";
     // Generate the input allocations and initialization.
     for (size_t i = 0; i < mParams.size(); i++) {
@@ -1450,8 +1443,8 @@
                  << ");\n";
         }
     }
-    writeJavaCallToRs(file, false, generateCallToVerify);
-    writeJavaCallToRs(file, true, generateCallToVerify);
+    writeJavaCallToRs(file, false, generateCallToVerifier);
+    writeJavaCallToRs(file, true, generateCallToVerifier);
     file << tab(1) << "}\n\n";
 }
 
@@ -1571,7 +1564,7 @@
     }
 }
 
-void Permutation::writeJavaVerifyScalarMethod(ofstream& file) const {
+void Permutation::writeJavaVerifyScalarMethod(ofstream& file, bool verifierValidates) const {
     writeJavaVerifyFunctionHeader(file);
     string vectorSize = "1";
     for (size_t i = 0; i < mParams.size(); i++) {
@@ -1603,16 +1596,31 @@
         }
     }
 
-    file << tab(4) << "// Figure out what the outputs should have been.\n";
-    file << tab(4) << "Floaty.setRelaxed(relaxed);\n";
-    file << tab(4) << "CoreMathVerifier." << mJavaVerifierComputeMethodName << "(args);\n";
-
-    file << tab(4) << "// Figure out what the outputs should have been.\n";
-    file << tab(4) << "boolean valid = true;\n";
-    for (size_t i = 0; i < mParams.size(); i++) {
-        const ParameterDefinition& p = *mParams[i];
-        if (p.isOutParameter) {
-            writeJavaTestAndSetValid(file, 4, p, "", "[i * " + p.vectorWidth + " + j]");
+    if (verifierValidates) {
+        file << tab(4) << "// Extract the outputs.\n";
+        for (size_t i = 0; i < mParams.size(); i++) {
+            const ParameterDefinition& p = *mParams[i];
+            if (p.isOutParameter) {
+                file << tab(4) << "args." << p.variableName << " = " << p.javaArrayName
+                     << "[i * " + p.vectorWidth + " + j];\n";
+            }
+        }
+        file << tab(4) << "// Ask the CoreMathVerifier to validate.\n";
+        file << tab(4) << "Floaty.setRelaxed(relaxed);\n";
+        file << tab(4) << "String errorMessage = CoreMathVerifier." << mJavaVerifierVerifyMethodName
+             << "(args, relaxed);\n";
+        file << tab(4) << "boolean valid = errorMessage == null;\n";
+    } else {
+        file << tab(4) << "// Figure out what the outputs should have been.\n";
+        file << tab(4) << "Floaty.setRelaxed(relaxed);\n";
+        file << tab(4) << "CoreMathVerifier." << mJavaVerifierComputeMethodName << "(args);\n";
+        file << tab(4) << "// Validate the outputs.\n";
+        file << tab(4) << "boolean valid = true;\n";
+        for (size_t i = 0; i < mParams.size(); i++) {
+            const ParameterDefinition& p = *mParams[i];
+            if (p.isOutParameter) {
+                writeJavaTestAndSetValid(file, 4, p, "", "[i * " + p.vectorWidth + " + j]");
+            }
         }
     }
 
@@ -1621,11 +1629,15 @@
     for (size_t i = 0; i < mParams.size(); i++) {
         const ParameterDefinition& p = *mParams[i];
         if (p.isOutParameter) {
-            writeJavaAppendOutputToMessage(file, 5, p, "", "[i * " + p.vectorWidth + " + j]");
+            writeJavaAppendOutputToMessage(file, 5, p, "", "[i * " + p.vectorWidth + " + j]",
+                                           verifierValidates);
         } else {
             writeJavaAppendInputToMessage(file, 5, p, "args." + p.variableName);
         }
     }
+    if (verifierValidates) {
+        file << tab(5) << "message.append(errorMessage);\n";
+    }
 
     file << tab(5) << "assertTrue(\"Incorrect output for " << mJavaCheckMethodName << "\" +\n";
     file << tab(7) << "(relaxed ? \"_relaxed\" : \"\") + \":\\n\" + message.toString(), valid);\n";
@@ -1673,24 +1685,35 @@
 
 void Permutation::writeJavaAppendOutputToMessage(ofstream& file, int indent,
                                                  const ParameterDefinition& p,
-                                                 const string& argsIndex,
-                                                 const string& actualIndex) const {
-    const string expected = "args." + p.variableName + argsIndex;
-    const string actual = p.javaArrayName + actualIndex;
-    file << tab(indent) << "message.append(\"Expected output " + p.variableName + ": \");\n";
-    if (p.isFloatType) {
-        writeJavaAppendFloatyVariableToMessage(file, indent, expected);
+                                                 const string& argsIndex, const string& actualIndex,
+                                                 bool verifierValidates) const {
+    if (verifierValidates) {
+        const string actual = "args." + p.variableName + argsIndex;
+        file << tab(indent) << "message.append(\"Output " + p.variableName + ": \");\n";
+        if (p.isFloatType) {
+            writeJavaAppendFloatyVariableToMessage(file, indent, actual);
+        } else {
+            writeJavaAppendVariableToMessage(file, indent, p, actual);
+        }
+        writeJavaAppendNewLineToMessage(file, indent);
     } else {
-        writeJavaAppendVariableToMessage(file, indent, p, expected);
-    }
-    writeJavaAppendNewLineToMessage(file, indent);
-    file << tab(indent) << "message.append(\"Actual   output " + p.variableName + ": \");\n";
-    writeJavaAppendVariableToMessage(file, indent, p, actual);
+        const string expected = "args." + p.variableName + argsIndex;
+        const string actual = p.javaArrayName + actualIndex;
+        file << tab(indent) << "message.append(\"Expected output " + p.variableName + ": \");\n";
+        if (p.isFloatType) {
+            writeJavaAppendFloatyVariableToMessage(file, indent, expected);
+        } else {
+            writeJavaAppendVariableToMessage(file, indent, p, expected);
+        }
+        writeJavaAppendNewLineToMessage(file, indent);
+        file << tab(indent) << "message.append(\"Actual   output " + p.variableName + ": \");\n";
+        writeJavaAppendVariableToMessage(file, indent, p, actual);
 
-    writeJavaTestOneValue(file, indent, p, argsIndex, actualIndex);
-    file << tab(indent + 1) << "message.append(\" FAIL\");\n";
-    file << tab(indent) << "}\n";
-    writeJavaAppendNewLineToMessage(file, indent);
+        writeJavaTestOneValue(file, indent, p, argsIndex, actualIndex);
+        file << tab(indent + 1) << "message.append(\" FAIL\");\n";
+        file << tab(indent) << "}\n";
+        writeJavaAppendNewLineToMessage(file, indent);
+    }
 }
 
 void Permutation::writeJavaAppendInputToMessage(ofstream& file, int indent,
@@ -1755,12 +1778,12 @@
 void Permutation::writeJavaAppendVectorOutputToMessage(ofstream& file, int indent,
                                                        const ParameterDefinition& p) const {
     if (p.mVectorSize == "1") {
-        writeJavaAppendOutputToMessage(file, indent, p, "", "[i]");
+        writeJavaAppendOutputToMessage(file, indent, p, "", "[i]", false);
 
     } else {
         file << tab(indent) << "for (int j = 0; j < " << p.mVectorSize << " ; j++) {\n";
         writeJavaAppendOutputToMessage(file, indent + 1, p, "[j]",
-                                       "[i * " + p.vectorWidth + " + j]");
+                                       "[i * " + p.vectorWidth + " + j]", false);
         file << tab(indent) << "}\n";
     }
 }
@@ -1834,7 +1857,7 @@
     file << tab(1) << "}\n\n";
 }
 
-void Permutation::writeJavaCallToRs(ofstream& file, bool relaxed, bool generateCallToVerify) const {
+void Permutation::writeJavaCallToRs(ofstream& file, bool relaxed, bool generateCallToVerifier) const {
     string script = "script";
     if (relaxed) {
         script += "Relaxed";
@@ -1869,7 +1892,7 @@
         file << mParams[mReturnIndex]->variableName << ");\n";
     }
 
-    if (generateCallToVerify) {
+    if (generateCallToVerifier) {
         file << tab(3) << mJavaVerifyMethodName << "(";
         for (size_t i = 0; i < mParams.size(); i++) {
             const ParameterDefinition& p = *mParams[i];
diff --git a/api/generate.sh b/api/generate.sh
index 4aa933c..7bafa1a 100755
--- a/api/generate.sh
+++ b/api/generate.sh
@@ -17,7 +17,7 @@
 
 set -e
 g++ gen_runtime.cpp -Wall -o gen_runtime
-./gen_runtime -v 19 rs_core_math.spec
+./gen_runtime -v 21 rs_core_math.spec
 mv Test*.java ../../../cts/tests/tests/renderscript/src/android/renderscript/cts/
 mv Test*.rs ../../../cts/tests/src/android/renderscript/cts/
 mv rs_core_math.rsh ../scriptc/
diff --git a/api/rs_core_math.spec b/api/rs_core_math.spec
index b55ca97..192364e 100644
--- a/api/rs_core_math.spec
+++ b/api/rs_core_math.spec
@@ -489,6 +489,7 @@
 comment:
  Return the integer exponent of a value
 version: 9
+test: custom
 end:
 
 start:
@@ -986,7 +987,7 @@
 arg: #2#1 v2
 comment:
  Return the minimum value from two arguments
-version: 20
+version: 21
 end:
 
 start:
@@ -1078,7 +1079,7 @@
 arg: #2#1 v2
 comment:
  Return the maximum value from two arguments
-version: 20
+version: 21
 end:
 
 start:
@@ -1239,7 +1240,7 @@
      return 0.f;
  else
      return 1.f;
-version: 20
+version: 21
 end:
 
 start:
@@ -1498,7 +1499,6 @@
 comment:
  acos
 version: 21
-test: noverify
 end:
 
 start:
@@ -1510,7 +1510,6 @@
 comment:
  acosh
 version: 21
-test: noverify
 end:
 
 start:
@@ -1522,7 +1521,6 @@
 comment:
  acospi
 version: 21
-test: noverify
 end:
 
 start:
@@ -1534,7 +1532,6 @@
 comment:
  asin
 version: 21
-test: noverify
 end:
 
 start:
@@ -1546,7 +1543,6 @@
 comment:
  asinh
 version: 21
-test: noverify
 end:
 
 start:
@@ -1558,7 +1554,6 @@
 comment:
  Return the inverse sine divided by PI.
 version: 21
-test: noverify
 end:
 
 start:
@@ -1570,7 +1565,6 @@
 comment:
  Return the inverse tangent.
 version: 21
-test: noverify
 end:
 
 start:
@@ -1583,7 +1577,6 @@
 comment:
  Return the inverse tangent of y / x.
 version: 21
-test: noverify
 end:
 
 start:
@@ -1595,7 +1588,6 @@
 comment:
  Return the inverse hyperbolic tangent.
 version: 21
-test: noverify
 end:
 
 start:
@@ -1607,7 +1599,6 @@
 comment:
  Return the inverse tangent divided by PI.
 version: 21
-test: noverify
 end:
 
 start:
@@ -1620,7 +1611,6 @@
 comment:
  Return the inverse tangent of y / x, divided by PI.
 version: 21
-test: noverify
 end:
 
 start:
@@ -1632,7 +1622,6 @@
 comment:
  Return the cube root.
 version: 21
-test: noverify
 end:
 
 start:
@@ -1644,7 +1633,6 @@
 comment:
  Return the cosine.
 version: 21
-test: noverify
 end:
 
 start:
@@ -1656,7 +1644,6 @@
 comment:
  Return the hypebolic cosine.
 version: 21
-test: noverify
 end:
 
 start:
@@ -1668,7 +1655,6 @@
 comment:
  Return the cosine of the value * PI.
 version: 21
-test: noverify
 end:
 
 start:
@@ -1680,7 +1666,6 @@
 comment:
  Return (e ^ value) - 1.
 version: 21
-test: noverify
 end:
 
 start:
@@ -1693,7 +1678,7 @@
 comment:
  Compute the approximate distance between two points.
 version: 21
-test: noverify
+test: vector
 end:
 
 start:
@@ -1706,7 +1691,6 @@
 comment:
  Compute the approximate division result of two values.
 version: 21
-test: noverify
 end:
 
 start:
@@ -1719,7 +1703,6 @@
 comment:
  Return native_sqrt(x*x + y*y)
 version: 21
-test: noverify
 end:
 
 start:
@@ -1755,7 +1738,6 @@
 comment:
  Return the natural logarithm of (v + 1.0f)
 version: 21
-test: noverify
 end:
 
 start:
@@ -1779,6 +1761,7 @@
 comment:
  Compute the Nth root of a value.
 version: 21
+# TODO re-enable once how to handle zero is decided
 test: noverify
 end:
 
@@ -1791,7 +1774,6 @@
 comment:
  Return (1 / sqrt(value)).
 version: 21
-test: noverify
 end:
 
 start:
@@ -1803,7 +1785,6 @@
 comment:
  Return the sine of a value specified in radians.
 version: 21
-test: noverify
 end:
 
 start:
@@ -1820,7 +1801,6 @@
  @param v The incoming value in radians
  @param *cosptr cosptr[0] will be set to the cosine value.
 version: 21
-test: noverify
 end:
 
 start:
@@ -1832,7 +1812,6 @@
 comment:
  Return the hyperbolic sine of a value specified in radians.
 version: 21
-test: noverify
 end:
 
 start:
@@ -1844,7 +1823,6 @@
 comment:
  Return the sin(v * PI).
 version: 21
-test: noverify
 end:
 
 start:
@@ -1856,7 +1834,6 @@
 comment:
  Return the aproximate sqrt(v).
 version: 21
-test: noverify
 end:
 
 start:
@@ -1868,7 +1845,6 @@
 comment:
  Return the tangent of a value.
 version: 21
-test: noverify
 end:
 
 start:
@@ -1880,7 +1856,6 @@
 comment:
  Return the hyperbolic tangent of a value.
 version: 21
-test: noverify
 end:
 
 start:
@@ -1892,7 +1867,6 @@
 comment:
  Return tan(v * PI)
 version: 21
-test: noverify
 end:
 
 
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index ebfe6a8..3bd7d1e 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -38,9 +38,9 @@
 	rsCpuIntrinsicConvolve5x5.cpp \
 	rsCpuIntrinsicHistogram.cpp \
 	rsCpuIntrinsicLoopFilter.cpp \
-	rsCpuIntrinsicYuvToRGB.cpp \
 	rsCpuIntrinsicResize.cpp \
-	rsCpuIntrinsicLUT.cpp
+	rsCpuIntrinsicLUT.cpp \
+	rsCpuIntrinsicYuvToRGB.cpp
 
 LOCAL_CFLAGS_arm64 += -DARCH_ARM_USE_INTRINSICS -DARCH_ARM64_USE_INTRINSICS -DARCH_ARM64_HAVE_NEON
 
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index f40f411..5b07b16 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -651,14 +651,14 @@
     case RS_SCRIPT_INTRINSIC_ID_HISTOGRAM:
         i = rsdIntrinsic_Histogram(this, s, e);
         break;
-    case RS_SCRIPT_INTRINSIC_ID_RESIZE:
-        i = rsdIntrinsic_Resize(this, s, e);
-        break;
 #ifndef RS_COMPATIBILITY_LIB
     case RS_SCRIPT_INTRINSIC_ID_LOOP_FILTER:
         i = rsdIntrinsic_LoopFilter(this, s, e);
         break;
 #endif
+    case RS_SCRIPT_INTRINSIC_ID_RESIZE:
+        i = rsdIntrinsic_Resize(this, s, e);
+        break;
 
     default:
         rsAssert(0);
diff --git a/cpu_ref/rsCpuIntrinsic3DLUT.cpp b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
index ce7c9c6..86d0478 100644
--- a/cpu_ref/rsCpuIntrinsic3DLUT.cpp
+++ b/cpu_ref/rsCpuIntrinsic3DLUT.cpp
@@ -63,8 +63,8 @@
                                         uint32_t outstep) {
     RsdCpuScriptIntrinsic3DLUT *cp = (RsdCpuScriptIntrinsic3DLUT *)p->usr;
 
-    uchar4 *out = (uchar4 *)p->out + xstart;
-    uchar4 *in = (uchar4 *)p->ins[0] + xstart;
+    uchar4 *out = (uchar4 *)p->out;
+    uchar4 *in = (uchar4 *)p->ins[0];
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
diff --git a/cpu_ref/rsCpuIntrinsicBlur.cpp b/cpu_ref/rsCpuIntrinsicBlur.cpp
index ebbb036..26d589e 100644
--- a/cpu_ref/rsCpuIntrinsicBlur.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlur.cpp
@@ -128,7 +128,7 @@
         gPtr++;
     }
 
-    out->xyzw = blurredPixel;
+    out[0] = blurredPixel;
 }
 
 static void OneVU1(const RsExpandKernelParams *p, float *out, int32_t x, int32_t y,
@@ -163,6 +163,7 @@
 static void OneVFU4(float4 *out,
                     const uchar *ptrIn, int iStride, const float* gPtr, int ct,
                     int x1, int x2) {
+    out += x1;
 #if defined(ARCH_X86_HAVE_SSSE3)
     if (gArchUseSIMD) {
         int t = (x2 - x1);
@@ -195,6 +196,7 @@
                     const uchar *ptrIn, int iStride, const float* gPtr, int ct, int x1, int x2) {
 
     int len = x2 - x1;
+    out += x1;
 
     while((x2 > x1) && (((uintptr_t)ptrIn) & 0x3)) {
         const uchar *pi = ptrIn;
@@ -293,7 +295,7 @@
     uint32_t x2 = xend;
 
 #if defined(ARCH_ARM_USE_INTRINSICS)
-    if (gArchUseSIMD) {
+    if (gArchUseSIMD && !xstart && (xend == p->dimX)) {
         rsdIntrinsicBlurU4_K(out, (uchar4 const *)(pin + stride * p->y), p->dimX, p->dimY,
                  stride, x1, p->y, x2 - x1, cp->mIradius, cp->mIp + cp->mIradius);
         return;
@@ -313,9 +315,10 @@
     int y = p->y;
     if ((y > cp->mIradius) && (y < ((int)p->dimY - cp->mIradius))) {
         const uchar *pi = pin + (y - cp->mIradius) * stride;
-        OneVFU4(fout, pi, stride, cp->mFp, cp->mIradius * 2 + 1, x1, x2);
+        OneVFU4(fout, pi, stride, cp->mFp, cp->mIradius * 2 + 1, 0, p->dimX);
     } else {
-        while(x2 > x1) {
+        x1 = 0;
+        while(p->dimX > x1) {
             OneVU4(p, fout, x1, y, pin, stride, cp->mFp, cp->mIradius);
             fout++;
             x1++;
@@ -362,9 +365,9 @@
     uint32_t x2 = xend;
 
 #if defined(ARCH_ARM_USE_INTRINSICS)
-    if (gArchUseSIMD) {
+    if (gArchUseSIMD && !xstart && (xend == p->dimX)) {
         rsdIntrinsicBlurU1_K(out, pin + stride * p->y, p->dimX, p->dimY,
-                 stride, x1, p->y, x2 - x1, cp->mIradius, cp->mIp + cp->mIradius);
+                 stride, 0, p->y, p->dimX, cp->mIradius, cp->mIp + cp->mIradius);
         return;
     }
 #endif
@@ -373,9 +376,10 @@
     int y = p->y;
     if ((y > cp->mIradius) && (y < ((int)p->dimY - cp->mIradius -1))) {
         const uchar *pi = pin + (y - cp->mIradius) * stride;
-        OneVFU1(fout, pi, stride, cp->mFp, cp->mIradius * 2 + 1, x1, x2);
+        OneVFU1(fout, pi, stride, cp->mFp, cp->mIradius * 2 + 1, 0, p->dimX);
     } else {
-        while(x2 > x1) {
+        x1 = 0;
+        while(p->dimX > x1) {
             OneVU1(p, fout, x1, y, pin, stride, cp->mFp, cp->mIradius);
             fout++;
             x1++;
diff --git a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
index 4e15ea7..d9c0ded 100644
--- a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
+++ b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
@@ -885,9 +885,8 @@
 
     uint32_t instep = p->inEStrides[0];
 
-    uchar *out = (uchar *)p->out    + outstep * xstart;
-    uchar *in  = (uchar *)p->ins[0] + instep  * xstart;
-
+    uchar *out = (uchar *)p->out;
+    uchar *in = (uchar *)p->ins[0];
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
diff --git a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
index 1a546db..e263e74 100644
--- a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
+++ b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
@@ -393,7 +393,7 @@
 #if defined(ARCH_ARM_USE_INTRINSICS)
     if(gArchUseSIMD && ((x1 + 3) < x2)) {
         uint32_t len = (x2 - x1 - 3) >> 1;
-        rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->mIp, len);
+        rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2, py4 + x1 - 2, cp->mIp, len);
         out += len << 1;
         x1 += len << 1;
     }
diff --git a/cpu_ref/rsCpuIntrinsicResize.cpp b/cpu_ref/rsCpuIntrinsicResize.cpp
index 826661e..fa0e8ee 100644
--- a/cpu_ref/rsCpuIntrinsicResize.cpp
+++ b/cpu_ref/rsCpuIntrinsicResize.cpp
@@ -66,10 +66,6 @@
     mAlloc.set(static_cast<Allocation *>(data));
 }
 
-
-extern "C" void rsdIntrinsicConvolve3x3_K(void *dst, const void *y0, const void *y1,
-                                          const void *y2, const short *coef, uint32_t count);
-
 static float4 cubicInterpolate(float4 p0,float4 p1,float4 p2,float4 p3, float x) {
     return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + 4.f * p2 - p3
             + x * (3.f * (p1 - p2) + p3 - p0)));
diff --git a/driver/rsdAllocation.cpp b/driver/rsdAllocation.cpp
index e34d0c3..413f1ad 100644
--- a/driver/rsdAllocation.cpp
+++ b/driver/rsdAllocation.cpp
@@ -317,7 +317,13 @@
 
     size_t o = alloc->mHal.drvState.lod[0].stride * rsMax(alloc->mHal.drvState.lod[0].dimY, 1u) *
             rsMax(alloc->mHal.drvState.lod[0].dimZ, 1u);
-    if(alloc->mHal.drvState.lodCount > 1) {
+    if (alloc->mHal.state.yuv) {
+        o += DeriveYUVLayout(alloc->mHal.state.yuv, &alloc->mHal.drvState);
+
+        for (uint32_t ct = 1; ct < alloc->mHal.drvState.lodCount; ct++) {
+            offsets[ct] = (size_t)alloc->mHal.drvState.lod[ct].mallocPtr;
+        }
+    } else if(alloc->mHal.drvState.lodCount > 1) {
         uint32_t tx = alloc->mHal.drvState.lod[0].dimX;
         uint32_t ty = alloc->mHal.drvState.lod[0].dimY;
         uint32_t tz = alloc->mHal.drvState.lod[0].dimZ;
@@ -333,12 +339,6 @@
             if (ty > 1) ty >>= 1;
             if (tz > 1) tz >>= 1;
         }
-    } else if (alloc->mHal.state.yuv) {
-        o += DeriveYUVLayout(alloc->mHal.state.yuv, &alloc->mHal.drvState);
-
-        for (uint32_t ct = 1; ct < alloc->mHal.drvState.lodCount; ct++) {
-            offsets[ct] = (size_t)alloc->mHal.drvState.lod[ct].mallocPtr;
-        }
     }
 
     alloc->mHal.drvState.faceOffset = o;
diff --git a/driver/rsdGL.cpp b/driver/rsdGL.cpp
index e803487..e58b0f2 100644
--- a/driver/rsdGL.cpp
+++ b/driver/rsdGL.cpp
@@ -14,7 +14,6 @@
  * limitations under the License.
  */
 
-#include <ui/FramebufferNativeWindow.h>
 #include <ui/PixelFormat.h>
 
 #include <system/window.h>
diff --git a/driver/rsdRuntimeStubs.cpp b/driver/rsdRuntimeStubs.cpp
index 8f5dbbe..9d2d0c3 100644
--- a/driver/rsdRuntimeStubs.cpp
+++ b/driver/rsdRuntimeStubs.cpp
@@ -46,24 +46,24 @@
 typedef unsigned char uchar2 __attribute__((ext_vector_type(2)));
 typedef unsigned char uchar3 __attribute__((ext_vector_type(3)));
 typedef unsigned char uchar4 __attribute__((ext_vector_type(4)));
-typedef short short2 __attribute__((ext_vector_type(2)));
-typedef short short3 __attribute__((ext_vector_type(3)));
-typedef short short4 __attribute__((ext_vector_type(4)));
-typedef unsigned short ushort2 __attribute__((ext_vector_type(2)));
-typedef unsigned short ushort3 __attribute__((ext_vector_type(3)));
-typedef unsigned short ushort4 __attribute__((ext_vector_type(4)));
+typedef int16_t short2 __attribute__((ext_vector_type(2)));
+typedef int16_t short3 __attribute__((ext_vector_type(3)));
+typedef int16_t short4 __attribute__((ext_vector_type(4)));
+typedef uint16_t ushort2 __attribute__((ext_vector_type(2)));
+typedef uint16_t ushort3 __attribute__((ext_vector_type(3)));
+typedef uint16_t ushort4 __attribute__((ext_vector_type(4)));
 typedef int32_t int2 __attribute__((ext_vector_type(2)));
 typedef int32_t int3 __attribute__((ext_vector_type(3)));
 typedef int32_t int4 __attribute__((ext_vector_type(4)));
 typedef uint32_t uint2 __attribute__((ext_vector_type(2)));
 typedef uint32_t uint3 __attribute__((ext_vector_type(3)));
 typedef uint32_t uint4 __attribute__((ext_vector_type(4)));
-typedef long long long2 __attribute__((ext_vector_type(2)));
-typedef long long long3 __attribute__((ext_vector_type(3)));
-typedef long long long4 __attribute__((ext_vector_type(4)));
-typedef unsigned long long ulong2 __attribute__((ext_vector_type(2)));
-typedef unsigned long long ulong3 __attribute__((ext_vector_type(3)));
-typedef unsigned long long ulong4 __attribute__((ext_vector_type(4)));
+typedef int64_t long2 __attribute__((ext_vector_type(2)));
+typedef int64_t long3 __attribute__((ext_vector_type(3)));
+typedef int64_t long4 __attribute__((ext_vector_type(4)));
+typedef uint64_t ulong2 __attribute__((ext_vector_type(2)));
+typedef uint64_t ulong3 __attribute__((ext_vector_type(3)));
+typedef uint64_t ulong4 __attribute__((ext_vector_type(4)));
 
 typedef uint8_t uchar;
 typedef uint16_t ushort;
diff --git a/driver/runtime/Android.mk b/driver/runtime/Android.mk
index cec21d4..33fa630 100755
--- a/driver/runtime/Android.mk
+++ b/driver/runtime/Android.mk
@@ -157,7 +157,7 @@
 
 include $(CLEAR_VARS)
 
-BCC_RS_TRIPLE := aarch64-none-linux-gnueabi
+BCC_RS_TRIPLE := aarch64-linux-android
 LOCAL_MODULE := librsrt_arm64.bc
 LOCAL_IS_HOST_MODULE := true
 LOCAL_SRC_FILES := $(clcore_files)
diff --git a/driver/runtime/arch/clamp.c b/driver/runtime/arch/clamp.c
index c2c2226..23014ce 100644
--- a/driver/runtime/arch/clamp.c
+++ b/driver/runtime/arch/clamp.c
@@ -16,6 +16,11 @@
 
 #include "rs_types.rsh"
 
+typedef unsigned long long ull;
+typedef unsigned long long ull2 __attribute__((ext_vector_type(2)));
+typedef unsigned long long ull3 __attribute__((ext_vector_type(3)));
+typedef unsigned long long ull4 __attribute__((ext_vector_type(4)));
+
 #define S_CLAMP(T) \
 extern T __attribute__((overloadable)) clamp(T amount, T low, T high) {             \
     return amount < low ? low : (amount > high ? high : amount);                    \
@@ -97,6 +102,7 @@
 #endif
 V_CLAMP(long);
 V_CLAMP(ulong);
+V_CLAMP(ull);
 
 #undef _CLAMP
 
diff --git a/driver/runtime/ll64/allocation.ll b/driver/runtime/ll64/allocation.ll
index fcbf0f2..2c1bf4c 100644
--- a/driver/runtime/ll64/allocation.ll
+++ b/driver/runtime/ll64/allocation.ll
@@ -1,5 +1,5 @@
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64-none-linux-gnueabi"
+target triple = "aarch64-linux-android"
 
 %struct.rs_allocation = type { i64*, i64*, i64*, i64* }
 
@@ -43,10 +43,11 @@
 }
 
 !22 = metadata !{metadata !"char2", metadata !15}
-define void @rsSetElementAtImpl_char2(%struct.rs_allocation* %a, <2 x i8> %val, i32 %x, i32 %y, i32 %z) #2 {
+define void @rsSetElementAtImpl_char2(%struct.rs_allocation* %a, i16 %val, i32 %x, i32 %y, i32 %z) #2 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 2, i32 %x, i32 %y, i32 %z) #10
   %2 = bitcast i8* %1 to <2 x i8>*
-  store <2 x i8> %val, <2 x i8>* %2, align 2, !tbaa !22
+  %3 = bitcast i16 %val to <2 x i8>  ; reinterpret the i16 argument as the <2 x i8> value to store
+  store <2 x i8> %3, <2 x i8>* %2, align 2, !tbaa !22  ; !22 is the "char2" tag (!26 is "uchar2")
   ret void
 }
 
@@ -58,11 +59,12 @@
 }
 
 !23 = metadata !{metadata !"char3", metadata !15}
-define void @rsSetElementAtImpl_char3(%struct.rs_allocation* %a, <3 x i8> %val, i32 %x, i32 %y, i32 %z) #2 {
+define void @rsSetElementAtImpl_char3(%struct.rs_allocation* %a, i32 %val, i32 %x, i32 %y, i32 %z) #2 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 4, i32 %x, i32 %y, i32 %z) #10
-  %2 = shufflevector <3 x i8> %val, <3 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
-  %3 = bitcast i8* %1 to <4 x i8>*
-  store <4 x i8> %2, <4 x i8>* %3, align 4, !tbaa !23
+  %2 = bitcast i32 %val to <4 x i8>  ; reinterpret the i32-packed argument as four i8 lanes
+  %3 = shufflevector <4 x i8> %2, <4 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>  ; lane 3 becomes undef
+  %4 = bitcast i8* %1 to <4 x i8>*
+  store <4 x i8> %3, <4 x i8>* %4, align 4, !tbaa !23  ; !23 is the "char3" tag (!27 is "uchar3")
   ret void
 }
 
@@ -103,10 +105,11 @@
 }
 
 !26 = metadata !{metadata !"uchar2", metadata !15}
-define void @rsSetElementAtImpl_uchar2(%struct.rs_allocation* %a, <2 x i8> %val, i32 %x, i32 %y, i32 %z) #2 {
+define void @rsSetElementAtImpl_uchar2(%struct.rs_allocation* %a, i16 %val, i32 %x, i32 %y, i32 %z) #2 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 2, i32 %x, i32 %y, i32 %z) #10
   %2 = bitcast i8* %1 to <2 x i8>*
-  store <2 x i8> %val, <2 x i8>* %2, align 2, !tbaa !26
+  %3 = bitcast i16 %val to <2 x i8>
+  store <2 x i8> %3, <2 x i8>* %2, align 2, !tbaa !26
   ret void
 }
 
@@ -118,11 +121,12 @@
 }
 
 !27 = metadata !{metadata !"uchar3", metadata !15}
-define void @rsSetElementAtImpl_uchar3(%struct.rs_allocation* %a, <3 x i8> %val, i32 %x, i32 %y, i32 %z) #2 {
+define void @rsSetElementAtImpl_uchar3(%struct.rs_allocation* %a, i32 %val, i32 %x, i32 %y, i32 %z) #2 {
   %1 = tail call i8* @rsOffset(%struct.rs_allocation* %a, i32 4, i32 %x, i32 %y, i32 %z) #10
-  %2 = shufflevector <3 x i8> %val, <3 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
-  %3 = bitcast i8* %1 to <4 x i8>*
-  store <4 x i8> %2, <4 x i8>* %3, align 4, !tbaa !27
+  %2 = bitcast i32 %val to <4 x i8>
+  %3 = shufflevector <4 x i8> %2, <4 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+  %4 = bitcast i8* %1 to <4 x i8>*
+  store <4 x i8> %3, <4 x i8>* %4, align 4, !tbaa !27
   ret void
 }
 
diff --git a/driver/runtime/ll64/math.ll b/driver/runtime/ll64/math.ll
index 8c735d1..d803773 100644
--- a/driver/runtime/ll64/math.ll
+++ b/driver/runtime/ll64/math.ll
@@ -1,5 +1,5 @@
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64-none-linux-gnueabi"
+target triple = "aarch64-linux-android"
 
 declare float @llvm.sqrt.f32(float)
 declare float @llvm.pow.f32(float, float)
diff --git a/driver/runtime/rs_cl.c b/driver/runtime/rs_cl.c
index 27b2734..a79ad2a 100644
--- a/driver/runtime/rs_cl.c
+++ b/driver/runtime/rs_cl.c
@@ -1292,23 +1292,23 @@
     return r;
 }
 
-extern int64_t _Z3minll(int64_t v1, int64_t v2) {
+extern long __attribute__((overloadable)) min(long v1, long v2) {
     return v1 < v2 ? v1 : v2;
 }
-extern long2 _Z3minDv2_lS_(long2 v1, long2 v2) {
+extern long2 __attribute__((overloadable)) min(long2 v1, long2 v2) {
     long2 r;
     r.x = v1.x < v2.x ? v1.x : v2.x;
     r.y = v1.y < v2.y ? v1.y : v2.y;
     return r;
 }
-extern long3 _Z3minDv3_lS_(long3 v1, long3 v2) {
+extern long3 __attribute__((overloadable)) min(long3 v1, long3 v2) {
     long3 r;
     r.x = v1.x < v2.x ? v1.x : v2.x;
     r.y = v1.y < v2.y ? v1.y : v2.y;
     r.z = v1.z < v2.z ? v1.z : v2.z;
     return r;
 }
-extern long4 _Z3minDv4_lS_(long4 v1, long4 v2) {
+extern long4 __attribute__((overloadable)) min(long4 v1, long4 v2) {
     long4 r;
     r.x = v1.x < v2.x ? v1.x : v2.x;
     r.y = v1.y < v2.y ? v1.y : v2.y;
@@ -1317,23 +1317,23 @@
     return r;
 }
 
-extern uint64_t _Z3minyy(uint64_t v1, uint64_t v2) {
+extern ulong __attribute__((overloadable)) min(ulong v1, ulong v2) {
     return v1 < v2 ? v1 : v2;
 }
-extern ulong2 _Z3minDv2_yS_(ulong2 v1, ulong2 v2) {
+extern ulong2 __attribute__((overloadable)) min(ulong2 v1, ulong2 v2) {
     ulong2 r;
     r.x = v1.x < v2.x ? v1.x : v2.x;
     r.y = v1.y < v2.y ? v1.y : v2.y;
     return r;
 }
-extern ulong3 _Z3minDv3_yS_(ulong3 v1, ulong3 v2) {
+extern ulong3 __attribute__((overloadable)) min(ulong3 v1, ulong3 v2) {
     ulong3 r;
     r.x = v1.x < v2.x ? v1.x : v2.x;
     r.y = v1.y < v2.y ? v1.y : v2.y;
     r.z = v1.z < v2.z ? v1.z : v2.z;
     return r;
 }
-extern ulong4 _Z3minDv4_yS_(ulong4 v1, ulong4 v2) {
+extern ulong4 __attribute__((overloadable)) min(ulong4 v1, ulong4 v2) {
     ulong4 r;
     r.x = v1.x < v2.x ? v1.x : v2.x;
     r.y = v1.y < v2.y ? v1.y : v2.y;
@@ -1370,23 +1370,23 @@
     return r;
 }
 
-extern int64_t _Z3maxll(int64_t v1, int64_t v2) {
+extern long __attribute__((overloadable)) max(long v1, long v2) {
     return v1 > v2 ? v1 : v2;
 }
-extern long2 _Z3maxDv2_lS_(long2 v1, long2 v2) {
+extern long2 __attribute__((overloadable)) max(long2 v1, long2 v2) {
     long2 r;
     r.x = v1.x > v2.x ? v1.x : v2.x;
     r.y = v1.y > v2.y ? v1.y : v2.y;
     return r;
 }
-extern long3 _Z3maxDv3_lS_(long3 v1, long3 v2) {
+extern long3 __attribute__((overloadable)) max(long3 v1, long3 v2) {
     long3 r;
     r.x = v1.x > v2.x ? v1.x : v2.x;
     r.y = v1.y > v2.y ? v1.y : v2.y;
     r.z = v1.z > v2.z ? v1.z : v2.z;
     return r;
 }
-extern long4 _Z3maxDv4_lS_(long4 v1, long4 v2) {
+extern long4 __attribute__((overloadable)) max(long4 v1, long4 v2) {
     long4 r;
     r.x = v1.x > v2.x ? v1.x : v2.x;
     r.y = v1.y > v2.y ? v1.y : v2.y;
@@ -1395,23 +1395,23 @@
     return r;
 }
 
-extern uint64_t _Z3maxyy(uint64_t v1, uint64_t v2) {
+extern ulong __attribute__((overloadable)) max(ulong v1, ulong v2) {
     return v1 > v2 ? v1 : v2;
 }
-extern ulong2 _Z3maxDv2_yS_(ulong2 v1, ulong2 v2) {
+extern ulong2 __attribute__((overloadable)) max(ulong2 v1, ulong2 v2) {
     ulong2 r;
     r.x = v1.x > v2.x ? v1.x : v2.x;
     r.y = v1.y > v2.y ? v1.y : v2.y;
     return r;
 }
-extern ulong3 _Z3maxDv3_yS_(ulong3 v1, ulong3 v2) {
+extern ulong3 __attribute__((overloadable)) max(ulong3 v1, ulong3 v2) {
     ulong3 r;
     r.x = v1.x > v2.x ? v1.x : v2.x;
     r.y = v1.y > v2.y ? v1.y : v2.y;
     r.z = v1.z > v2.z ? v1.z : v2.z;
     return r;
 }
-extern ulong4 _Z3maxDv4_yS_(ulong4 v1, ulong4 v2) {
+extern ulong4 __attribute__((overloadable)) max(ulong4 v1, ulong4 v2) {
     ulong4 r;
     r.x = v1.x > v2.x ? v1.x : v2.x;
     r.y = v1.y > v2.y ? v1.y : v2.y;
diff --git a/java/Android.mk b/java/Android.mk
index fdcbf97..c8f64ce 100644
--- a/java/Android.mk
+++ b/java/Android.mk
@@ -1,11 +1,7 @@
 LOCAL_PATH:=$(call my-dir)
 
-ifneq ($(TARGET_ARCH), arm64)
-
 # Only build our tests if we doing a top-level build. Do not build the
 # tests if we are just doing an mm or mmm in frameworks/rs.
 ifeq (,$(ONE_SHOT_MAKEFILE))
 include $(call all-makefiles-under,$(LOCAL_PATH))
 endif
-
-endif
\ No newline at end of file
diff --git a/java/tests/Balls/Android.mk b/java/tests/Balls/Android.mk
index 77281ce..e1cea04 100644
--- a/java/tests/Balls/Android.mk
+++ b/java/tests/Balls/Android.mk
@@ -22,5 +22,6 @@
 LOCAL_SRC_FILES := $(call all-java-files-under, src) $(call all-renderscript-files-under, src)
 
 LOCAL_PACKAGE_NAME := RsBalls
+LOCAL_SDK_VERSION := 14
 
 include $(BUILD_PACKAGE)
diff --git a/java/tests/ComputePerf/src/com/example/android/rs/computeperf/LaunchTest.java b/java/tests/ComputePerf/src/com/example/android/rs/computeperf/LaunchTest.java
index 8e73189..fc06068 100644
--- a/java/tests/ComputePerf/src/com/example/android/rs/computeperf/LaunchTest.java
+++ b/java/tests/ComputePerf/src/com/example/android/rs/computeperf/LaunchTest.java
@@ -23,26 +23,24 @@
     private RenderScript mRS;
     private Allocation mAllocationX;
     private Allocation mAllocationXY;
-    private ScriptC_launchtestxlw mScript_xlw;
-    private ScriptC_launchtestxyw mScript_xyw;
+    private ScriptC_launchtest mScript;
 
     LaunchTest(RenderScript rs, Resources res) {
         mRS = rs;
-        mScript_xlw = new ScriptC_launchtestxlw(mRS);
-        mScript_xyw = new ScriptC_launchtestxyw(mRS);
-        final int dim = mScript_xlw.get_dim();
+        mScript = new ScriptC_launchtest(mRS);
+        final int dim = mScript.get_dim();
 
         mAllocationX = Allocation.createSized(rs, Element.U8(rs), dim);
         Type.Builder tb = new Type.Builder(rs, Element.U8(rs));
         tb.setX(dim);
         tb.setY(dim);
         mAllocationXY = Allocation.createTyped(rs, tb.create());
-        mScript_xlw.bind_buf(mAllocationXY);
+        mScript.set_gBuf(mAllocationXY);
     }
 
     public long XLW() {
         long t = java.lang.System.currentTimeMillis();
-        mScript_xlw.forEach_root(mAllocationX);
+        mScript.forEach_k_x(mAllocationX);
         mRS.finish();
         t = java.lang.System.currentTimeMillis() - t;
         return t;
@@ -50,7 +48,7 @@
 
     public long XYW() {
         long t = java.lang.System.currentTimeMillis();
-        mScript_xyw.forEach_root(mAllocationXY);
+        mScript.forEach_k_xy(mAllocationXY);
         mRS.finish();
         t = java.lang.System.currentTimeMillis() - t;
         return t;
diff --git a/java/tests/ComputePerf/src/com/example/android/rs/computeperf/launchtestxlw.rs b/java/tests/ComputePerf/src/com/example/android/rs/computeperf/launchtest.rs
similarity index 79%
rename from java/tests/ComputePerf/src/com/example/android/rs/computeperf/launchtestxlw.rs
rename to java/tests/ComputePerf/src/com/example/android/rs/computeperf/launchtest.rs
index 7b81dfe..cc70e4e 100644
--- a/java/tests/ComputePerf/src/com/example/android/rs/computeperf/launchtestxlw.rs
+++ b/java/tests/ComputePerf/src/com/example/android/rs/computeperf/launchtest.rs
@@ -18,13 +18,15 @@
 #pragma rs java_package_name(com.example.android.rs.computeperf)
 
 const int dim = 2048;
-uint8_t *buf;
+rs_allocation gBuf;
 
-void root(uchar *v_out, uint32_t x) {
-    uint8_t *p = buf;
-    p += x * dim;
+void __attribute__((kernel)) k_x(uchar in, uint32_t x) {  // 'in' is unused; only the launch index x matters
     for (int i=0; i<dim; i++) {
-        p[i] = 1;
+        rsSetElementAt_uchar(gBuf, 1, i, x);  // write 1 at X = i, Y = x of gBuf
     }
 }
 
+uchar __attribute__((kernel)) k_xy(uint32_t x, uint32_t y) {  // launched by LaunchTest.XYW() over a 2D allocation
+    return 0;  // constant result: x and y are ignored
+}
+
diff --git a/java/tests/ComputePerf/src/com/example/android/rs/computeperf/launchtestxyw.rs b/java/tests/ComputePerf/src/com/example/android/rs/computeperf/launchtestxyw.rs
deleted file mode 100644
index 7f7aa95..0000000
--- a/java/tests/ComputePerf/src/com/example/android/rs/computeperf/launchtestxyw.rs
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright (C) 2011 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma version(1)
-#pragma rs java_package_name(com.example.android.rs.computeperf)
-
-void root(uchar *v_out, uint32_t x, uint32_t y) {
-    *v_out = 0;
-}
-
diff --git a/java/tests/HelloComputeNDK/Android.mk b/java/tests/HelloComputeNDK/Android.mk
index 5f1bd17..d4194b5 100644
--- a/java/tests/HelloComputeNDK/Android.mk
+++ b/java/tests/HelloComputeNDK/Android.mk
@@ -26,6 +26,7 @@
 
 LOCAL_PACKAGE_NAME := HelloComputeNDK
 LOCAL_SDK_VERSION := 14
+LOCAL_32_BIT_ONLY := true
 
 LOCAL_JNI_SHARED_LIBRARIES := libhellocomputendk
 
diff --git a/java/tests/ImageProcessing/src/com/android/rs/image/ImageProcessingActivity.java b/java/tests/ImageProcessing/src/com/android/rs/image/ImageProcessingActivity.java
index 3b5e168..7cf7caf 100644
--- a/java/tests/ImageProcessing/src/com/android/rs/image/ImageProcessingActivity.java
+++ b/java/tests/ImageProcessing/src/com/android/rs/image/ImageProcessingActivity.java
@@ -101,7 +101,9 @@
         USAGE_IO ("Usage io"),
         ARTISTIC_1("Artistic 1"),
         HISTOGRAM ("Histogram"),
-        MANDELBROT_DOUBLE ("Mandelbrot fp64");
+        MANDELBROT_DOUBLE ("Mandelbrot fp64"),
+        RESIZE_BICUBIC_SCRIPT ("Resize BiCubic Script"),
+        RESIZE_BICUBIC_INTRINSIC ("Resize BiCubic Intrinsic");
 
 
         private final String name;
@@ -366,6 +368,12 @@
         case MANDELBROT_DOUBLE:
             mTest = new Mandelbrot(true);
             break;
+        case RESIZE_BICUBIC_SCRIPT:
+            mTest = new Resize(false);
+            break;
+        case RESIZE_BICUBIC_INTRINSIC:
+            mTest = new Resize(true);
+            break;
         }
 
         mTest.createBaseTest(this);
@@ -401,6 +409,7 @@
         mBitmapOut = Bitmap.createBitmap(mInPixelsAllocation.getType().getX(),
                                          mInPixelsAllocation.getType().getY(),
                                          Bitmap.Config.ARGB_8888);
+        mBitmapOut.setHasAlpha(false);
         mOutPixelsAllocation = Allocation.createFromBitmap(mRS, mBitmapOut);
 
         mDisplayView = (ImageView) findViewById(R.id.display);
diff --git a/java/tests/ImageProcessing/src/com/android/rs/image/Resize.java b/java/tests/ImageProcessing/src/com/android/rs/image/Resize.java
new file mode 100644
index 0000000..86e1645
--- /dev/null
+++ b/java/tests/ImageProcessing/src/com/android/rs/image/Resize.java
@@ -0,0 +1,81 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.rs.image;
+
+import java.lang.Math;
+
+import android.renderscript.Allocation;
+import android.renderscript.Element;
+import android.renderscript.Matrix4f;
+import android.renderscript.RenderScript;
+import android.renderscript.Script;
+import android.renderscript.ScriptC;
+import android.renderscript.ScriptIntrinsicResize;
+import android.renderscript.Type;
+import android.util.Log;
+/** Image-processing test case: bicubic-resizes a 1/32-scale copy of the input into the output. */
+public class Resize extends TestBase {
+    private ScriptC_resize mScript;  // script exposing the nearest and bicubic kernels
+    private ScriptIntrinsicResize mIntrinsic;  // intrinsic counterpart of the script's bicubic kernel
+    // Reduced-size image built in createTest(); the source for both resize paths.
+    private Allocation mScratchAllocation;
+    private int mWidth;  // X size of mInPixelsAllocation
+    private int mHeight;  // Y size of mInPixelsAllocation
+    private boolean mUseIntrinsic;  // true: runTest() uses mIntrinsic, false: mScript
+    // Only records which implementation runTest() should launch.
+    public Resize(boolean useIntrinsic) {
+        mUseIntrinsic = useIntrinsic;
+    }
+    // Builds the scratch image once, then points both the script and the intrinsic at it.
+    public void createTest(android.content.res.Resources res) {
+        mWidth = mInPixelsAllocation.getType().getX();
+        mHeight = mInPixelsAllocation.getType().getY();
+        float scale = 1.f / 32.f;  // scratch image is 1/32 of the input in each dimension
+
+        Type t = Type.createXY(mRS, mInPixelsAllocation.getElement(),
+                               (int)(mWidth * scale), (int)(mHeight * scale));
+        mScratchAllocation = Allocation.createTyped(mRS, t);
+
+        // Fill the scratch image: nearest() samples the input with a step of 1/scale (32).
+        mScript = new ScriptC_resize(mRS);
+        mScript.set_gIn(mInPixelsAllocation);
+        mScript.set_gWidthIn(mWidth);
+        mScript.set_gHeightIn(mHeight);
+        mScript.set_scale(1.f / scale);
+        mScript.forEach_nearest(mScratchAllocation);
+
+        // Re-target the script globals at the scratch image for the timed bicubic runs.
+        mScript.set_gIn(mScratchAllocation);
+        mScript.set_gWidthIn(t.getX());
+        mScript.set_gHeightIn(t.getY());
+        mScript.set_scale(scale);
+        //mScript.forEach_nearest(mScratchAllocation);
+
+        mIntrinsic = ScriptIntrinsicResize.create(mRS);
+        mIntrinsic.setInput(mScratchAllocation);
+    }
+    // Runs one bicubic resize from the scratch image into mOutPixelsAllocation.
+    public void runTest() {
+        if (mUseIntrinsic) {
+            mIntrinsic.forEach_bicubic(mOutPixelsAllocation);
+        } else {
+            mScript.forEach_bicubic(mOutPixelsAllocation);
+            //mScript.forEach_nearest(mOutPixelsAllocation);
+        }
+    }
+
+}
diff --git a/java/tests/ImageProcessing/src/com/android/rs/image/resize.rs b/java/tests/ImageProcessing/src/com/android/rs/image/resize.rs
new file mode 100644
index 0000000..ec283be
--- /dev/null
+++ b/java/tests/ImageProcessing/src/com/android/rs/image/resize.rs
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ip.rsh"
+#pragma rs_fp_relaxed
+
+int32_t gWidthIn;
+int32_t gHeightIn;
+rs_allocation gIn;
+float scale;
+
+// Point-sampling kernel: each output cell copies the gIn texel at (x * scale, y * scale).
+uchar4 __attribute__((kernel)) nearest(uint32_t x, uint32_t y) {
+    float xf = clamp(x * scale, 0.f, (float)gWidthIn - 1.f);  // keep the source X inside gIn
+    float yf = clamp(y * scale, 0.f, (float)gHeightIn - 1.f);  // keep the source Y inside gIn
+    uint32_t ix = xf;  // float -> integer conversion truncates the fractional part
+    uint32_t iy = yf;
+
+    uchar4 tmp = rsGetElementAt_uchar4(gIn, ix, iy);
+    tmp.a = 0xff;  // force the alpha channel to fully opaque
+    return tmp;
+}
+
+// Cubic blend of four samples; evaluates to p1 at x = 0 and to p2 at x = 1.
+static float4 cubicInterpolate (float4 p0,float4 p1,float4 p2,float4 p3 , float x) {
+    return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + 4.f * p2 - p3
+            + x * (3.f * (p1 - p2) + p3 - p0)));  // polynomial in x written in nested (Horner) form
+}
+// Bicubic kernel: filters a 4x4 neighbourhood of gIn around (x * scale, y * scale).
+uchar4 __attribute__((kernel)) bicubic(uint32_t x, uint32_t y) {
+    float xf = x * scale;
+    float yf = y * scale;
+    // Taps span floor-1 .. floor+2 so that cubicInterpolate's p1/p2 bracket the sample position.
+    int startx = (int) floor(xf - 1);
+    int starty = (int) floor(yf - 1);
+    xf = xf - floor(xf);  // keep only the fractional offset between p1 and p2
+    yf = yf - floor(yf);
+    int maxx = gWidthIn - 1;
+    int maxy = gHeightIn - 1;
+    // Clamp every tap to [0, max] on both sides: edge texels are replicated, never read out of range.
+    uint32_t xs0 = (uint32_t) min(maxx, max(0, startx + 0));
+    uint32_t xs1 = (uint32_t) min(maxx, max(0, startx + 1));
+    uint32_t xs2 = (uint32_t) min(maxx, max(0, startx + 2));
+    uint32_t xs3 = (uint32_t) min(maxx, max(0, startx + 3));
+
+    uint32_t ys0 = (uint32_t) min(maxy, max(0, starty + 0));
+    uint32_t ys1 = (uint32_t) min(maxy, max(0, starty + 1));
+    uint32_t ys2 = (uint32_t) min(maxy, max(0, starty + 2));
+    uint32_t ys3 = (uint32_t) min(maxy, max(0, starty + 3));
+    // Horizontal pass: one cubic blend per source row ys0..ys3.
+    float4 p00 = convert_float4(rsGetElementAt_uchar4(gIn, xs0, ys0));
+    float4 p01 = convert_float4(rsGetElementAt_uchar4(gIn, xs1, ys0));
+    float4 p02 = convert_float4(rsGetElementAt_uchar4(gIn, xs2, ys0));
+    float4 p03 = convert_float4(rsGetElementAt_uchar4(gIn, xs3, ys0));
+    float4 p0  = cubicInterpolate(p00, p01, p02, p03, xf);
+
+    float4 p10 = convert_float4(rsGetElementAt_uchar4(gIn, xs0, ys1));
+    float4 p11 = convert_float4(rsGetElementAt_uchar4(gIn, xs1, ys1));
+    float4 p12 = convert_float4(rsGetElementAt_uchar4(gIn, xs2, ys1));
+    float4 p13 = convert_float4(rsGetElementAt_uchar4(gIn, xs3, ys1));
+    float4 p1  = cubicInterpolate(p10, p11, p12, p13, xf);
+
+    float4 p20 = convert_float4(rsGetElementAt_uchar4(gIn, xs0, ys2));
+    float4 p21 = convert_float4(rsGetElementAt_uchar4(gIn, xs1, ys2));
+    float4 p22 = convert_float4(rsGetElementAt_uchar4(gIn, xs2, ys2));
+    float4 p23 = convert_float4(rsGetElementAt_uchar4(gIn, xs3, ys2));
+    float4 p2  = cubicInterpolate(p20, p21, p22, p23, xf);
+
+    float4 p30 = convert_float4(rsGetElementAt_uchar4(gIn, xs0, ys3));
+    float4 p31 = convert_float4(rsGetElementAt_uchar4(gIn, xs1, ys3));
+    float4 p32 = convert_float4(rsGetElementAt_uchar4(gIn, xs2, ys3));
+    float4 p33 = convert_float4(rsGetElementAt_uchar4(gIn, xs3, ys3));
+    float4 p3  = cubicInterpolate(p30, p31, p32, p33, xf);
+    // Vertical pass across the four row results, then saturate to the uchar range.
+    float4 p  = cubicInterpolate(p0, p1, p2, p3, yf);
+    p = clamp(p, 0.f, 255.f);
+    return convert_uchar4(p);
+}
+
diff --git a/java/tests/ImageProcessing2/src/com/android/rs/image/Vignette.java b/java/tests/ImageProcessing2/src/com/android/rs/image/Vignette.java
index ea6da83..e24c548 100644
--- a/java/tests/ImageProcessing2/src/com/android/rs/image/Vignette.java
+++ b/java/tests/ImageProcessing2/src/com/android/rs/image/Vignette.java
@@ -116,11 +116,9 @@
     public void createTest(android.content.res.Resources res) {
         if (approx) {
             if (relaxed)
-                mScript_approx_relaxed = new ScriptC_vignette_approx_relaxed(
-                        mRS, res, R.raw.vignette_approx_relaxed);
+                mScript_approx_relaxed = new ScriptC_vignette_approx_relaxed(mRS);
             else
-                mScript_approx_full = new ScriptC_vignette_approx_full(
-                        mRS, res, R.raw.vignette_approx_full);
+                mScript_approx_full = new ScriptC_vignette_approx_full(mRS);
         } else if (relaxed)
             mScript_relaxed = new ScriptC_vignette_relaxed(mRS);
         else
diff --git a/java/tests/ImageProcessing_jb/Android.mk b/java/tests/ImageProcessing_jb/Android.mk
index 680c1de..65925b8 100644
--- a/java/tests/ImageProcessing_jb/Android.mk
+++ b/java/tests/ImageProcessing_jb/Android.mk
@@ -24,6 +24,6 @@
 #LOCAL_STATIC_JAVA_LIBRARIES := android.renderscript
 
 LOCAL_PACKAGE_NAME := ImageProcessingJB
-LOCAL_SDK_VERSION := 18
+#LOCAL_SDK_VERSION := 18
 
 include $(BUILD_PACKAGE)
diff --git a/java/tests/ImageProcessing_jb/src/com/android/rs/image/IPTestListJB.java b/java/tests/ImageProcessing_jb/src/com/android/rs/image/IPTestListJB.java
index 4bf99e3..398f9c1 100644
--- a/java/tests/ImageProcessing_jb/src/com/android/rs/image/IPTestListJB.java
+++ b/java/tests/ImageProcessing_jb/src/com/android/rs/image/IPTestListJB.java
@@ -71,7 +71,9 @@
         WHITE_BALANCE ("White Balance", RELAXED_FP, 160.1f),
         COLOR_CUBE ("Color Cube", RELAXED_FP, 85.3f),
         COLOR_CUBE_3D_INTRINSIC ("Color Cube (3D LUT intrinsic)", INTRINSIC, 49.5f),
-        ARTISTIC1 ("Artistic 1", RELAXED_FP, 120.f);
+        ARTISTIC1 ("Artistic 1", RELAXED_FP, 120.f),
+        RESIZE_BI_SCRIPT ("Resize BiCubic Script", RELAXED_FP, 100.f),
+        RESIZE_BI_INTRINSIC ("Resize BiCubic Intrinsic", INTRINSIC, 100.f);
 
 
         private final String name;
@@ -177,6 +179,10 @@
             return new ColorCube(true);
         case ARTISTIC1:
             return new Artistic1();
+        case RESIZE_BI_SCRIPT:
+            return new Resize(false);
+        case RESIZE_BI_INTRINSIC:
+            return new Resize(true);
         }
         return null;
     }
diff --git a/java/tests/ImageProcessing_jb/src/com/android/rs/image/Resize.java b/java/tests/ImageProcessing_jb/src/com/android/rs/image/Resize.java
new file mode 100644
index 0000000..014b5d7
--- /dev/null
+++ b/java/tests/ImageProcessing_jb/src/com/android/rs/image/Resize.java
@@ -0,0 +1,81 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.rs.imagejb;
+
+import java.lang.Math;
+
+import android.renderscript.Allocation;
+import android.renderscript.Element;
+import android.renderscript.Matrix4f;
+import android.renderscript.RenderScript;
+import android.renderscript.Script;
+import android.renderscript.ScriptC;
+import android.renderscript.ScriptIntrinsicResize;
+import android.renderscript.Type;
+import android.util.Log;
+/** Image-processing test case: bicubic-resizes a 1/32-scale copy of the input into the output. */
+public class Resize extends TestBase {
+    private ScriptC_resize mScript;  // script exposing the nearest and bicubic kernels
+    private ScriptIntrinsicResize mIntrinsic;  // intrinsic counterpart of the script's bicubic kernel
+    // Reduced-size image built in createTest(); the source for both resize paths.
+    private Allocation mScratchAllocation;
+    private int mWidth;  // X size of mInPixelsAllocation
+    private int mHeight;  // Y size of mInPixelsAllocation
+    private boolean mUseIntrinsic;  // true: runTest() uses mIntrinsic, false: mScript
+    // Only records which implementation runTest() should launch.
+    public Resize(boolean useIntrinsic) {
+        mUseIntrinsic = useIntrinsic;
+    }
+    // Builds the scratch image once, then points both the script and the intrinsic at it.
+    public void createTest(android.content.res.Resources res) {
+        mWidth = mInPixelsAllocation.getType().getX();
+        mHeight = mInPixelsAllocation.getType().getY();
+        float scale = 1.f / 32.f;  // scratch image is 1/32 of the input in each dimension
+
+        Type t = Type.createXY(mRS, mInPixelsAllocation.getElement(),
+                               (int)(mWidth * scale), (int)(mHeight * scale));
+        mScratchAllocation = Allocation.createTyped(mRS, t);
+
+        // Fill the scratch image: nearest() samples the input with a step of 1/scale (32).
+        mScript = new ScriptC_resize(mRS);
+        mScript.set_gIn(mInPixelsAllocation);
+        mScript.set_gWidthIn(mWidth);
+        mScript.set_gHeightIn(mHeight);
+        mScript.set_scale(1.f / scale);
+        mScript.forEach_nearest(mScratchAllocation);
+
+        // Re-target the script globals at the scratch image for the timed bicubic runs.
+        mScript.set_gIn(mScratchAllocation);
+        mScript.set_gWidthIn(t.getX());
+        mScript.set_gHeightIn(t.getY());
+        mScript.set_scale(scale);
+        //mScript.forEach_nearest(mScratchAllocation);
+
+        mIntrinsic = ScriptIntrinsicResize.create(mRS);
+        mIntrinsic.setInput(mScratchAllocation);
+    }
+    // Runs one bicubic resize from the scratch image into mOutPixelsAllocation.
+    public void runTest() {
+        if (mUseIntrinsic) {
+            mIntrinsic.forEach_bicubic(mOutPixelsAllocation);
+        } else {
+            mScript.forEach_bicubic(mOutPixelsAllocation);
+            //mScript.forEach_nearest(mOutPixelsAllocation);
+        }
+    }
+
+}
diff --git a/java/tests/ImageProcessing_jb/src/com/android/rs/image/Vignette.java b/java/tests/ImageProcessing_jb/src/com/android/rs/image/Vignette.java
index 3f860fb..7984386 100644
--- a/java/tests/ImageProcessing_jb/src/com/android/rs/image/Vignette.java
+++ b/java/tests/ImageProcessing_jb/src/com/android/rs/image/Vignette.java
@@ -120,11 +120,9 @@
     public void createTest(android.content.res.Resources res) {
         if (approx) {
             if (relaxed)
-                mScript_approx_relaxed = new ScriptC_vignette_approx_relaxed(
-                        mRS, res, R.raw.vignette_approx_relaxed);
+                mScript_approx_relaxed = new ScriptC_vignette_approx_relaxed(mRS);
             else
-                mScript_approx_full = new ScriptC_vignette_approx_full(
-                        mRS, res, R.raw.vignette_approx_full);
+                mScript_approx_full = new ScriptC_vignette_approx_full(mRS);
         } else if (relaxed)
             mScript_relaxed = new ScriptC_vignette_relaxed(mRS);
         else
diff --git a/java/tests/ImageProcessing_jb/src/com/android/rs/image/resize.rs b/java/tests/ImageProcessing_jb/src/com/android/rs/image/resize.rs
new file mode 100644
index 0000000..ec283be
--- /dev/null
+++ b/java/tests/ImageProcessing_jb/src/com/android/rs/image/resize.rs
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ip.rsh"
+#pragma rs_fp_relaxed
+
+int32_t gWidthIn;
+int32_t gHeightIn;
+rs_allocation gIn;
+float scale;
+
+// Point-sampling kernel: each output cell copies the gIn texel at (x * scale, y * scale).
+uchar4 __attribute__((kernel)) nearest(uint32_t x, uint32_t y) {
+    float xf = clamp(x * scale, 0.f, (float)gWidthIn - 1.f);  // keep the source X inside gIn
+    float yf = clamp(y * scale, 0.f, (float)gHeightIn - 1.f);  // keep the source Y inside gIn
+    uint32_t ix = xf;  // float -> integer conversion truncates the fractional part
+    uint32_t iy = yf;
+
+    uchar4 tmp = rsGetElementAt_uchar4(gIn, ix, iy);
+    tmp.a = 0xff;  // force the alpha channel to fully opaque
+    return tmp;
+}
+
+// Cubic blend of four samples; evaluates to p1 at x = 0 and to p2 at x = 1.
+static float4 cubicInterpolate (float4 p0,float4 p1,float4 p2,float4 p3 , float x) {
+    return p1 + 0.5f * x * (p2 - p0 + x * (2.f * p0 - 5.f * p1 + 4.f * p2 - p3
+            + x * (3.f * (p1 - p2) + p3 - p0)));  // polynomial in x written in nested (Horner) form
+}
+// Bicubic kernel: filters a 4x4 neighbourhood of gIn around (x * scale, y * scale).
+uchar4 __attribute__((kernel)) bicubic(uint32_t x, uint32_t y) {
+    float xf = x * scale;
+    float yf = y * scale;
+    // Taps span floor-1 .. floor+2 so that cubicInterpolate's p1/p2 bracket the sample position.
+    int startx = (int) floor(xf - 1);
+    int starty = (int) floor(yf - 1);
+    xf = xf - floor(xf);  // keep only the fractional offset between p1 and p2
+    yf = yf - floor(yf);
+    int maxx = gWidthIn - 1;
+    int maxy = gHeightIn - 1;
+    // Clamp every tap to [0, max] on both sides: edge texels are replicated, never read out of range.
+    uint32_t xs0 = (uint32_t) min(maxx, max(0, startx + 0));
+    uint32_t xs1 = (uint32_t) min(maxx, max(0, startx + 1));
+    uint32_t xs2 = (uint32_t) min(maxx, max(0, startx + 2));
+    uint32_t xs3 = (uint32_t) min(maxx, max(0, startx + 3));
+
+    uint32_t ys0 = (uint32_t) min(maxy, max(0, starty + 0));
+    uint32_t ys1 = (uint32_t) min(maxy, max(0, starty + 1));
+    uint32_t ys2 = (uint32_t) min(maxy, max(0, starty + 2));
+    uint32_t ys3 = (uint32_t) min(maxy, max(0, starty + 3));
+    // Horizontal pass: one cubic blend per source row ys0..ys3.
+    float4 p00 = convert_float4(rsGetElementAt_uchar4(gIn, xs0, ys0));
+    float4 p01 = convert_float4(rsGetElementAt_uchar4(gIn, xs1, ys0));
+    float4 p02 = convert_float4(rsGetElementAt_uchar4(gIn, xs2, ys0));
+    float4 p03 = convert_float4(rsGetElementAt_uchar4(gIn, xs3, ys0));
+    float4 p0  = cubicInterpolate(p00, p01, p02, p03, xf);
+
+    float4 p10 = convert_float4(rsGetElementAt_uchar4(gIn, xs0, ys1));
+    float4 p11 = convert_float4(rsGetElementAt_uchar4(gIn, xs1, ys1));
+    float4 p12 = convert_float4(rsGetElementAt_uchar4(gIn, xs2, ys1));
+    float4 p13 = convert_float4(rsGetElementAt_uchar4(gIn, xs3, ys1));
+    float4 p1  = cubicInterpolate(p10, p11, p12, p13, xf);
+
+    float4 p20 = convert_float4(rsGetElementAt_uchar4(gIn, xs0, ys2));
+    float4 p21 = convert_float4(rsGetElementAt_uchar4(gIn, xs1, ys2));
+    float4 p22 = convert_float4(rsGetElementAt_uchar4(gIn, xs2, ys2));
+    float4 p23 = convert_float4(rsGetElementAt_uchar4(gIn, xs3, ys2));
+    float4 p2  = cubicInterpolate(p20, p21, p22, p23, xf);
+
+    float4 p30 = convert_float4(rsGetElementAt_uchar4(gIn, xs0, ys3));
+    float4 p31 = convert_float4(rsGetElementAt_uchar4(gIn, xs1, ys3));
+    float4 p32 = convert_float4(rsGetElementAt_uchar4(gIn, xs2, ys3));
+    float4 p33 = convert_float4(rsGetElementAt_uchar4(gIn, xs3, ys3));
+    float4 p3  = cubicInterpolate(p30, p31, p32, p33, xf);
+    // Vertical pass across the four row results, then saturate to the uchar range.
+    float4 p  = cubicInterpolate(p0, p1, p2, p3, yf);
+    p = clamp(p, 0.f, 255.f);
+    return convert_uchar4(p);
+}
+
diff --git a/rsAllocation.cpp b/rsAllocation.cpp
index fad26cd..f0b9be3 100644
--- a/rsAllocation.cpp
+++ b/rsAllocation.cpp
@@ -495,16 +495,17 @@
 #ifndef RS_COMPATIBILITY_LIB
 void Allocation::NewBufferListener::onFrameAvailable() {
     intptr_t ip = (intptr_t)alloc;
-    rsc->sendMessageToClient(nullptr, RS_MESSAGE_TO_CLIENT_NEW_BUFFER, ip, 0, true);
+    rsc->sendMessageToClient(&ip, RS_MESSAGE_TO_CLIENT_NEW_BUFFER, 0, sizeof(ip), true);
 }
 #endif
 
 void * Allocation::getSurface(const Context *rsc) {
 #ifndef RS_COMPATIBILITY_LIB
     // Configure GrallocConsumer to be in asynchronous mode
-    sp<BufferQueue> bq = new BufferQueue();
-    mGrallocConsumer = new GrallocConsumer(this, bq);
-    sp<IGraphicBufferProducer> bp = bq;
+    sp<IGraphicBufferProducer> bp;
+    sp<IGraphicBufferConsumer> bc;
+    BufferQueue::createBufferQueue(&bp, &bc);
+    mGrallocConsumer = new GrallocConsumer(this, bc);
     bp->incStrong(nullptr);
 
     mBufferListener = new NewBufferListener();
diff --git a/rsContext.cpp b/rsContext.cpp
index a83eb9f..253182c 100644
--- a/rsContext.cpp
+++ b/rsContext.cpp
@@ -23,7 +23,6 @@
 
 #ifndef RS_COMPATIBILITY_LIB
 #include "rsMesh.h"
-#include <ui/FramebufferNativeWindow.h>
 #include <gui/DisplayEventReceiver.h>
 #endif
 
@@ -313,6 +312,11 @@
     rsc->props.mLogVisual = getProp("debug.rs.visual") != 0;
     rsc->props.mDebugMaxThreads = getProp("debug.rs.max-threads");
 
+    if (getProp("debug.rs.debug") != 0) {
+        ALOGD("Forcing debug context due to debug.rs.debug.");
+        rsc->mContextType = RS_CONTEXT_TYPE_DEBUG;
+    }
+
     bool loadDefault = true;
 
     // Provide a mechanism for dropping in a different RS driver.
diff --git a/rsThreadIO.cpp b/rsThreadIO.cpp
index 87ecdf2..f259591 100644
--- a/rsThreadIO.cpp
+++ b/rsThreadIO.cpp
@@ -213,7 +213,7 @@
 
     //ALOGE("sendToClient %i %i %i", cmdID, usrID, (int)dataLen);
     ClientCmdHeader hdr;
-    hdr.bytes = dataLen;
+    hdr.bytes = (uint32_t)dataLen;
     hdr.cmdID = cmdID;
     hdr.userID = usrID;
 
diff --git a/rsType.cpp b/rsType.cpp
index d9d97b0..c0cda91 100644
--- a/rsType.cpp
+++ b/rsType.cpp
@@ -83,7 +83,11 @@
         mHal.state.lodCount = rsMax(l2x, l2y);
         mHal.state.lodCount = rsMax(mHal.state.lodCount, l2z);
     } else {
-        mHal.state.lodCount = 1;
+        if (mHal.state.dimYuv) {
+            mHal.state.lodCount = 3;
+        } else {
+            mHal.state.lodCount = 1;
+        }
     }
     if (mHal.state.lodCount != oldLODCount) {
         if (oldLODCount) {
@@ -100,14 +104,16 @@
     uint32_t ty = mHal.state.dimY;
     uint32_t tz = mHal.state.dimZ;
     mCellCount = 0;
-    for (uint32_t lod=0; lod < mHal.state.lodCount; lod++) {
-        mHal.state.lodDimX[lod] = tx;
-        mHal.state.lodDimY[lod] = ty;
-        mHal.state.lodDimZ[lod]  = tz;
-        mCellCount += tx * rsMax(ty, 1u) * rsMax(tz, 1u);
-        if (tx > 1) tx >>= 1;
-        if (ty > 1) ty >>= 1;
-        if (tz > 1) tz >>= 1;
+    if (!mHal.state.dimYuv) {
+        for (uint32_t lod=0; lod < mHal.state.lodCount; lod++) {
+            mHal.state.lodDimX[lod] = tx;
+            mHal.state.lodDimY[lod] = ty;
+            mHal.state.lodDimZ[lod]  = tz;
+            mCellCount += tx * rsMax(ty, 1u) * rsMax(tz, 1u);
+            if (tx > 1) tx >>= 1;
+            if (ty > 1) ty >>= 1;
+            if (tz > 1) tz >>= 1;
+        }
     }
 
     if (mHal.state.faces) {
@@ -121,6 +127,7 @@
         mHal.state.lodDimY[1] = mHal.state.lodDimY[0] / 2;
         mHal.state.lodDimX[2] = mHal.state.lodDimX[0] / 2;
         mHal.state.lodDimY[2] = mHal.state.lodDimY[0] / 2;
+        mCellCount += mHal.state.lodDimX[0] * mHal.state.lodDimY[0];
         mCellCount += mHal.state.lodDimX[1] * mHal.state.lodDimY[1];
         mCellCount += mHal.state.lodDimX[2] * mHal.state.lodDimY[2];
 
diff --git a/rsg_generator.c b/rsg_generator.c
index 75ea1a3..2558f67 100644
--- a/rsg_generator.c
+++ b/rsg_generator.c
@@ -712,19 +712,6 @@
             printPlaybackCpp(f);
         }
         break;
-
-        case '4': // rsgApiStream.cpp
-        {
-            printFileHeader(f);
-            printPlaybackCpp(f);
-        }
-
-        case '5': // rsgApiStreamReplay.cpp
-        {
-            printFileHeader(f);
-            printPlaybackCpp(f);
-        }
-        break;
     }
     fclose(f);
     return 0;
diff --git a/scriptc/rs_atomic.rsh b/scriptc/rs_atomic.rsh
index fef05bb..ba847cf 100644
--- a/scriptc/rs_atomic.rsh
+++ b/scriptc/rs_atomic.rsh
@@ -177,7 +177,7 @@
 
 #endif //defined(RS_VERSION) && (RS_VERSION >= 14)
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+#if (defined(RS_VERSION) && (RS_VERSION >= 20))   // TODO: api 21
 
 /**
  * Atomic add one to the value at addr.
diff --git a/scriptc/rs_core_math.rsh b/scriptc/rs_core_math.rsh
index c7cc331..44d3828 100644
--- a/scriptc/rs_core_math.rsh
+++ b/scriptc/rs_core_math.rsh
@@ -6325,290 +6325,290 @@
 }
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the maximum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern char __attribute__((const, overloadable))max(char v1, char v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the maximum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern char2 __attribute__((const, overloadable))max(char2 v1, char2 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the maximum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern char3 __attribute__((const, overloadable))max(char3 v1, char3 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the maximum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern char4 __attribute__((const, overloadable))max(char4 v1, char4 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the maximum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern uchar __attribute__((const, overloadable))max(uchar v1, uchar v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the maximum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern uchar2 __attribute__((const, overloadable))max(uchar2 v1, uchar2 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the maximum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern uchar3 __attribute__((const, overloadable))max(uchar3 v1, uchar3 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the maximum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern uchar4 __attribute__((const, overloadable))max(uchar4 v1, uchar4 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the maximum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern short __attribute__((const, overloadable))max(short v1, short v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the maximum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern short2 __attribute__((const, overloadable))max(short2 v1, short2 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the maximum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern short3 __attribute__((const, overloadable))max(short3 v1, short3 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the maximum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern short4 __attribute__((const, overloadable))max(short4 v1, short4 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the maximum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern ushort __attribute__((const, overloadable))max(ushort v1, ushort v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the maximum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern ushort2 __attribute__((const, overloadable))max(ushort2 v1, ushort2 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the maximum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern ushort3 __attribute__((const, overloadable))max(ushort3 v1, ushort3 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the maximum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern ushort4 __attribute__((const, overloadable))max(ushort4 v1, ushort4 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the maximum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern int __attribute__((const, overloadable))max(int v1, int v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the maximum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern int2 __attribute__((const, overloadable))max(int2 v1, int2 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the maximum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern int3 __attribute__((const, overloadable))max(int3 v1, int3 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the maximum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern int4 __attribute__((const, overloadable))max(int4 v1, int4 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the maximum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern uint __attribute__((const, overloadable))max(uint v1, uint v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the maximum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern uint2 __attribute__((const, overloadable))max(uint2 v1, uint2 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the maximum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern uint3 __attribute__((const, overloadable))max(uint3 v1, uint3 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the maximum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern uint4 __attribute__((const, overloadable))max(uint4 v1, uint4 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the maximum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern long __attribute__((const, overloadable))max(long v1, long v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the maximum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern long2 __attribute__((const, overloadable))max(long2 v1, long2 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the maximum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern long3 __attribute__((const, overloadable))max(long3 v1, long3 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the maximum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern long4 __attribute__((const, overloadable))max(long4 v1, long4 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the maximum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern ulong __attribute__((const, overloadable))max(ulong v1, ulong v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the maximum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern ulong2 __attribute__((const, overloadable))max(ulong2 v1, ulong2 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the maximum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern ulong3 __attribute__((const, overloadable))max(ulong3 v1, ulong3 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the maximum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern ulong4 __attribute__((const, overloadable))max(ulong4 v1, ulong4 v2);
 #endif
@@ -6985,290 +6985,290 @@
 }
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the minimum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern char __attribute__((const, overloadable))min(char v1, char v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the minimum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern char2 __attribute__((const, overloadable))min(char2 v1, char2 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the minimum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern char3 __attribute__((const, overloadable))min(char3 v1, char3 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the minimum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern char4 __attribute__((const, overloadable))min(char4 v1, char4 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the minimum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern uchar __attribute__((const, overloadable))min(uchar v1, uchar v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the minimum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern uchar2 __attribute__((const, overloadable))min(uchar2 v1, uchar2 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the minimum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern uchar3 __attribute__((const, overloadable))min(uchar3 v1, uchar3 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the minimum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern uchar4 __attribute__((const, overloadable))min(uchar4 v1, uchar4 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the minimum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern short __attribute__((const, overloadable))min(short v1, short v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the minimum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern short2 __attribute__((const, overloadable))min(short2 v1, short2 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the minimum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern short3 __attribute__((const, overloadable))min(short3 v1, short3 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the minimum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern short4 __attribute__((const, overloadable))min(short4 v1, short4 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the minimum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern ushort __attribute__((const, overloadable))min(ushort v1, ushort v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the minimum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern ushort2 __attribute__((const, overloadable))min(ushort2 v1, ushort2 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the minimum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern ushort3 __attribute__((const, overloadable))min(ushort3 v1, ushort3 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the minimum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern ushort4 __attribute__((const, overloadable))min(ushort4 v1, ushort4 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the minimum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern int __attribute__((const, overloadable))min(int v1, int v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the minimum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern int2 __attribute__((const, overloadable))min(int2 v1, int2 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the minimum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern int3 __attribute__((const, overloadable))min(int3 v1, int3 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the minimum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern int4 __attribute__((const, overloadable))min(int4 v1, int4 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the minimum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern uint __attribute__((const, overloadable))min(uint v1, uint v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the minimum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern uint2 __attribute__((const, overloadable))min(uint2 v1, uint2 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the minimum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern uint3 __attribute__((const, overloadable))min(uint3 v1, uint3 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the minimum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern uint4 __attribute__((const, overloadable))min(uint4 v1, uint4 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the minimum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern long __attribute__((const, overloadable))min(long v1, long v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the minimum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern long2 __attribute__((const, overloadable))min(long2 v1, long2 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the minimum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern long3 __attribute__((const, overloadable))min(long3 v1, long3 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the minimum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern long4 __attribute__((const, overloadable))min(long4 v1, long4 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the minimum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern ulong __attribute__((const, overloadable))min(ulong v1, ulong v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the minimum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern ulong2 __attribute__((const, overloadable))min(ulong2 v1, ulong2 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the minimum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern ulong3 __attribute__((const, overloadable))min(ulong3 v1, ulong3 v2);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * Return the minimum value from two arguments
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern ulong4 __attribute__((const, overloadable))min(ulong4 v1, ulong4 v2);
 #endif
@@ -9653,38 +9653,38 @@
 extern float4 __attribute__((const, overloadable))step(float4 edge, float v);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * if (v < edge)
  * return 0.f;
  * else
  * return 1.f;
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern float2 __attribute__((const, overloadable))step(float edge, float2 v);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * if (v < edge)
  * return 0.f;
  * else
  * return 1.f;
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern float3 __attribute__((const, overloadable))step(float edge, float3 v);
 #endif
 
-#if (defined(RS_VERSION) && (RS_VERSION >= 20))
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
 /*
  * if (v < edge)
  * return 0.f;
  * else
  * return 1.f;
  *
- * Supported by API versions 20 and newer.
+ * Supported by API versions 21 and newer.
  */
 extern float4 __attribute__((const, overloadable))step(float edge, float4 v);
 #endif
diff --git a/scriptc/rs_types.rsh b/scriptc/rs_types.rsh
index 33cd7da..f1fc60b 100644
--- a/scriptc/rs_types.rsh
+++ b/scriptc/rs_types.rsh
@@ -72,7 +72,11 @@
 /**
  * 64 bit integer type
  */
-typedef long long int64_t;
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+    typedef long int64_t;
+#else
+    typedef long long int64_t;
+#endif
 /**
  * 8 bit unsigned integer type
  */
@@ -88,7 +92,11 @@
 /**
  * 64 bit unsigned integer type
  */
-typedef unsigned long long uint64_t;
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+    typedef unsigned long uint64_t;
+#else
+    typedef unsigned long long uint64_t;
+#endif
 /**
  * 8 bit unsigned integer type
  */