Merge "Split the RsForEachStubParamStruct in two."
diff --git a/api/rs_core_math.spec b/api/rs_core_math.spec
index 7adfce0..b55ca97 100644
--- a/api/rs_core_math.spec
+++ b/api/rs_core_math.spec
@@ -1487,3 +1487,412 @@
 # test: limited
 test: noverify
 end:
+
+
+start:
+w: 1, 2, 3, 4
+t: f32
+name: native_acos
+ret: #2#1
+arg: #2#1 v range(-1,1)
+comment:
+ Return the inverse cosine.
+version: 21
+test: noverify
+end:
+
+start:
+w: 1, 2, 3, 4
+t: f32
+name: native_acosh
+ret: #2#1
+arg: #2#1
+comment:
+ Return the inverse hyperbolic cosine.
+version: 21
+test: noverify
+end:
+
+start:
+w: 1, 2, 3, 4
+t: f32
+name: native_acospi
+ret: #2#1
+arg: #2#1 v range(-1,1)
+comment:
+ Return the inverse cosine divided by PI.
+version: 21
+test: noverify
+end:
+
+start:
+w: 1, 2, 3, 4
+t: f32
+name: native_asin
+ret: #2#1
+arg: #2#1 v range(-1,1)
+comment:
+ Return the inverse sine.
+version: 21
+test: noverify
+end:
+
+start:
+w: 1, 2, 3, 4
+t: f32
+name: native_asinh
+ret: #2#1
+arg: #2#1
+comment:
+ Return the inverse hyperbolic sine.
+version: 21
+test: noverify
+end:
+
+start:
+w: 1, 2, 3, 4
+t: f32
+name: native_asinpi
+ret: #2#1
+arg: #2#1 v range(-1,1)
+comment:
+ Return the inverse sine divided by PI.
+version: 21
+test: noverify
+end:
+
+start:
+w: 1, 2, 3, 4
+t: f32
+name: native_atan
+ret: #2#1
+arg: #2#1 v range(-1,1)
+comment:
+ Return the inverse tangent.
+version: 21
+test: noverify
+end:
+
+start:
+w: 1, 2, 3, 4
+t: f32
+name: native_atan2
+ret: #2#1
+arg: #2#1 y
+arg: #2#1 x
+comment:
+ Return the inverse tangent of y / x.
+version: 21
+test: noverify
+end:
+
+start:
+w: 1, 2, 3, 4
+t: f32
+name: native_atanh
+ret: #2#1
+arg: #2#1
+comment:
+ Return the inverse hyperbolic tangent.
+version: 21
+test: noverify
+end:
+
+start:
+w: 1, 2, 3, 4
+t: f32
+name: native_atanpi
+ret: #2#1
+arg: #2#1 v range(-1,1)
+comment:
+ Return the inverse tangent divided by PI.
+version: 21
+test: noverify
+end:
+
+start:
+w: 1, 2, 3, 4
+t: f32
+name: native_atan2pi
+ret: #2#1
+arg: #2#1 y
+arg: #2#1 x
+comment:
+ Return the inverse tangent of y / x, divided by PI.
+version: 21
+test: noverify
+end:
+
+start:
+w: 1, 2, 3, 4
+t: f32
+name: native_cbrt
+ret: #2#1
+arg: #2#1
+comment:
+ Return the cube root.
+version: 21
+test: noverify
+end:
+
+start:
+w: 1, 2, 3, 4
+t: f32
+name: native_cos
+ret: #2#1
+arg: #2#1
+comment:
+ Return the cosine.
+version: 21
+test: noverify
+end:
+
+start:
+w: 1, 2, 3, 4
+t: f32
+name: native_cosh
+ret: #2#1
+arg: #2#1
+comment:
+ Return the hyperbolic cosine.
+version: 21
+test: noverify
+end:
+
+start:
+w: 1, 2, 3, 4
+t: f32
+name: native_cospi
+ret: #2#1
+arg: #2#1
+comment:
+ Return the cosine of the value * PI.
+version: 21
+test: noverify
+end:
+
+start:
+w: 1, 2, 3, 4
+t: f32
+name: native_expm1
+ret: #2#1
+arg: #2#1
+comment:
+ Return (e ^ value) - 1.
+version: 21
+test: noverify
+end:
+
+start:
+w: 1, 2, 3, 4
+t: f32
+name: native_distance
+ret: #2
+arg: #2#1 lhs
+arg: #2#1 rhs
+comment:
+ Compute the approximate distance between two points.
+version: 21
+test: noverify
+end:
+
+start:
+w: 1, 2, 3, 4
+t: f32
+name: native_divide
+ret: #2#1
+arg: #2#1 lhs
+arg: #2#1 rhs
+comment:
+ Compute the approximate division result of two values.
+version: 21
+test: noverify
+end:
+
+start:
+w: 1, 2, 3, 4
+t: f32
+name: native_hypot
+ret: #2#1
+arg: #2#1 x
+arg: #2#1 y
+comment:
+ Return native_sqrt(x*x + y*y)
+version: 21
+test: noverify
+end:
+
+start:
+w: 1, 2, 3, 4
+t: f32
+name: native_normalize
+ret: #2#1
+arg: #2#1 v
+comment:
+ Normalize a vector.
+version: 21
+test: vector
+end:
+
+start:
+w: 1, 2, 3, 4
+t: f32
+name: native_length
+ret: #2
+arg: #2#1 v
+comment:
+ Compute the approximate length of a vector.
+version: 21
+test: vector
+end:
+
+start:
+w: 1, 2, 3, 4
+t: f32
+name: native_log1p
+ret: #2#1
+arg: #2#1
+comment:
+ Return the natural logarithm of (v + 1.0f)
+version: 21
+test: noverify
+end:
+
+start:
+w: 1, 2, 3, 4
+t: f32
+name: native_recip
+ret: #2#1
+arg: #2#1 v
+comment:
+ Return the approximate reciprocal of a value.
+version: 21
+end:
+
+start:
+w: 1, 2, 3, 4
+t: f32
+name: native_rootn
+ret: #2#1
+arg: #2#1 v
+arg: int#1 n
+comment:
+ Compute the Nth root of a value.
+version: 21
+test: noverify
+end:
+
+start:
+w: 1, 2, 3, 4
+t: f32
+name: native_rsqrt
+ret: #2#1
+arg: #2#1
+comment:
+ Return (1 / sqrt(value)).
+version: 21
+test: noverify
+end:
+
+start:
+w: 1, 2, 3, 4
+t: f32
+name: native_sin
+ret: #2#1
+arg: #2#1
+comment:
+ Return the sine of a value specified in radians.
+version: 21
+test: noverify
+end:
+
+start:
+w: 1, 2, 3, 4
+t: f32
+name: native_sincos
+ret: #2#1
+arg: #2#1 v
+arg: #2#1 *cosptr
+comment:
+ Return the sine and cosine of a value.
+
+ @return sine
+ @param v The incoming value in radians
+ @param *cosptr cosptr[0] will be set to the cosine value.
+version: 21
+test: noverify
+end:
+
+start:
+w: 1, 2, 3, 4
+t: f32
+name: native_sinh
+ret: #2#1
+arg: #2#1
+comment:
+ Return the hyperbolic sine of a value specified in radians.
+version: 21
+test: noverify
+end:
+
+start:
+w: 1, 2, 3, 4
+t: f32
+name: native_sinpi
+ret: #2#1
+arg: #2#1
+comment:
+ Return sin(v * PI).
+version: 21
+test: noverify
+end:
+
+start:
+w: 1, 2, 3, 4
+t: f32
+name: native_sqrt
+ret: #2#1
+arg: #2#1
+comment:
+ Return the approximate sqrt(v).
+version: 21
+test: noverify
+end:
+
+start:
+w: 1, 2, 3, 4
+t: f32
+name: native_tan
+ret: #2#1
+arg: #2#1
+comment:
+ Return the tangent of a value.
+version: 21
+test: noverify
+end:
+
+start:
+w: 1, 2, 3, 4
+t: f32
+name: native_tanh
+ret: #2#1
+arg: #2#1
+comment:
+ Return the hyperbolic tangent of a value.
+version: 21
+test: noverify
+end:
+
+start:
+w: 1, 2, 3, 4
+t: f32
+name: native_tanpi
+ret: #2#1
+arg: #2#1
+comment:
+ Return tan(v * PI)
+version: 21
+test: noverify
+end:
+
+
diff --git a/cpu_ref/linkloader/android/librsloader.cpp b/cpu_ref/linkloader/android/librsloader.cpp
index 71ad6c5..7fbaa7c 100644
--- a/cpu_ref/linkloader/android/librsloader.cpp
+++ b/cpu_ref/linkloader/android/librsloader.cpp
@@ -25,7 +25,6 @@
 #define LOG_TAG "bcc"
 #include "cutils/log.h"
 
-#include <llvm/ADT/OwningPtr.h>
 #include <llvm/Support/ELF.h>
 
 #if defined(__LP64__) || defined(__x86_64__)
@@ -70,16 +69,16 @@
   ArchiveReaderLE AR(buf, buf_size);
 
 #if defined(__LP64__) || defined(__x86_64__)
-  llvm::OwningPtr<ELFObject<64> > object(ELFObject<64>::read(AR));
+  std::unique_ptr<ELFObject<64> > object(ELFObject<64>::read(AR));
 #else
-  llvm::OwningPtr<ELFObject<32> > object(ELFObject<32>::read(AR));
+  std::unique_ptr<ELFObject<32> > object(ELFObject<32>::read(AR));
 #endif
   if (!object) {
     ALOGE("Unable to load the ELF object.");
     return NULL;
   }
 
-  return wrap(object.take());
+  return wrap(object.release());
 }
 
 extern "C" int rsloaderRelocateExecutable(RSExecRef object_,
diff --git a/cpu_ref/linkloader/include/ELFHeader.h b/cpu_ref/linkloader/include/ELFHeader.h
index 98fa920..e6c66f6 100644
--- a/cpu_ref/linkloader/include/ELFHeader.h
+++ b/cpu_ref/linkloader/include/ELFHeader.h
@@ -20,8 +20,6 @@
 #include "ELFTypes.h"
 #include "ELF.h"
 
-#include <llvm/ADT/OwningPtr.h>
-
 #include <string.h>
 
 class ELFHeaderHelperMixin {
@@ -155,7 +153,7 @@
       return 0;
     }
 
-    llvm::OwningPtr<ELFHeader> header(new ELFHeader());
+    std::unique_ptr<ELFHeader> header(new ELFHeader());
     if (!header->serialize(AR)) {
       // Unable to read the structure.  Return NULL.
       return 0;
@@ -166,7 +164,7 @@
       return 0;
     }
 
-    return header.take();
+    return header.release();
   }
 
   void print();
diff --git a/cpu_ref/linkloader/include/ELFObject.h b/cpu_ref/linkloader/include/ELFObject.h
index d000c58..0c195b9 100644
--- a/cpu_ref/linkloader/include/ELFObject.h
+++ b/cpu_ref/linkloader/include/ELFObject.h
@@ -22,8 +22,6 @@
 
 #include "utils/rsl_assert.h"
 
-#include <llvm/ADT/OwningPtr.h>
-
 #include <string>
 #include <vector>
 
@@ -33,8 +31,8 @@
   ELF_TYPE_INTRO_TO_TEMPLATE_SCOPE(Bitwidth);
 
 private:
-  llvm::OwningPtr<ELFHeaderTy> header;
-  llvm::OwningPtr<ELFSectionHeaderTableTy> shtab;
+  std::unique_ptr<ELFHeaderTy> header;
+  std::unique_ptr<ELFSectionHeaderTableTy> shtab;
   std::vector<ELFSectionTy *> stab;
 
   MemChunk SHNCommonData;
diff --git a/cpu_ref/linkloader/include/ELFReloc.h b/cpu_ref/linkloader/include/ELFReloc.h
index c6bf759..84754b9 100644
--- a/cpu_ref/linkloader/include/ELFReloc.h
+++ b/cpu_ref/linkloader/include/ELFReloc.h
@@ -20,7 +20,6 @@
 #include "ELFTypes.h"
 #include "utils/rsl_assert.h"
 
-#include <llvm/ADT/OwningPtr.h>
 #include <string>
 #include <stdint.h>
 
diff --git a/cpu_ref/linkloader/include/ELFSection.h b/cpu_ref/linkloader/include/ELFSection.h
index 74c4c13..1548413 100644
--- a/cpu_ref/linkloader/include/ELFSection.h
+++ b/cpu_ref/linkloader/include/ELFSection.h
@@ -18,7 +18,6 @@
 #define ELF_SECTION_H
 
 #include "ELFTypes.h"
-#include <llvm/ADT/OwningPtr.h>
 
 template <unsigned Bitwidth>
 class ELFSection {
diff --git a/cpu_ref/linkloader/include/ELFSectionBits.h b/cpu_ref/linkloader/include/ELFSectionBits.h
index 2d80c1a..b6e4590 100644
--- a/cpu_ref/linkloader/include/ELFSectionBits.h
+++ b/cpu_ref/linkloader/include/ELFSectionBits.h
@@ -21,8 +21,6 @@
 #include "ELFSection.h"
 #include "MemChunk.h"
 
-#include <llvm/ADT/OwningPtr.h>
-
 template <unsigned Bitwidth>
 class ELFSectionBits : public ELFSection<Bitwidth> {
 protected:
diff --git a/cpu_ref/linkloader/include/ELFSectionHeader.h b/cpu_ref/linkloader/include/ELFSectionHeader.h
index 7871315..a8881f2 100644
--- a/cpu_ref/linkloader/include/ELFSectionHeader.h
+++ b/cpu_ref/linkloader/include/ELFSectionHeader.h
@@ -19,7 +19,6 @@
 
 #include "ELFTypes.h"
 
-#include <llvm/ADT/OwningPtr.h>
 #include <stdint.h>
 
 class ELFSectionHeaderHelperMixin {
diff --git a/cpu_ref/linkloader/include/ELFSectionHeaderTable.h b/cpu_ref/linkloader/include/ELFSectionHeaderTable.h
index fb7f9ed..3192cfe 100644
--- a/cpu_ref/linkloader/include/ELFSectionHeaderTable.h
+++ b/cpu_ref/linkloader/include/ELFSectionHeaderTable.h
@@ -19,7 +19,6 @@
 
 #include "ELFTypes.h"
 
-#include <llvm/ADT/OwningPtr.h>
 #include <llvm/ADT/StringMap.h>
 
 #include <vector>
diff --git a/cpu_ref/linkloader/include/ELFSymbol.h b/cpu_ref/linkloader/include/ELFSymbol.h
index d78ba5a..65a97cc 100644
--- a/cpu_ref/linkloader/include/ELFSymbol.h
+++ b/cpu_ref/linkloader/include/ELFSymbol.h
@@ -20,8 +20,6 @@
 #include "ELFTypes.h"
 #include "ELF.h"
 
-#include <llvm/ADT/OwningPtr.h>
-
 #include <string>
 #include <algorithm>
 
diff --git a/cpu_ref/linkloader/include/impl/ELFObject.hxx b/cpu_ref/linkloader/include/impl/ELFObject.hxx
index 27badd9..d0307de 100644
--- a/cpu_ref/linkloader/include/impl/ELFObject.hxx
+++ b/cpu_ref/linkloader/include/impl/ELFObject.hxx
@@ -36,7 +36,7 @@
 template <typename Archiver>
 inline ELFObject<Bitwidth> *
 ELFObject<Bitwidth>::read(Archiver &AR) {
-  llvm::OwningPtr<ELFObjectTy> object(new ELFObjectTy());
+  std::unique_ptr<ELFObjectTy> object(new ELFObjectTy());
 
   // Read header
   object->header.reset(ELFHeaderTy::read(AR));
@@ -57,9 +57,9 @@
       object->stab.push_back(NULL);
       progbits_ndx.push_back(i);
     } else {
-      llvm::OwningPtr<ELFSectionTy> sec(
+      std::unique_ptr<ELFSectionTy> sec(
         ELFSectionTy::read(AR, object.get(), (*object->shtab)[i]));
-      object->stab.push_back(sec.take());
+      object->stab.push_back(sec.release());
     }
   }
 
@@ -72,12 +72,12 @@
   for (size_t i = 0; i < progbits_ndx.size(); ++i) {
     size_t index = progbits_ndx[i];
 
-    llvm::OwningPtr<ELFSectionTy> sec(
+    std::unique_ptr<ELFSectionTy> sec(
       ELFSectionTy::read(AR, object.get(), (*object->shtab)[index]));
-    object->stab[index] = sec.take();
+    object->stab[index] = sec.release();
   }
 
-  return object.take();
+  return object.release();
 }
 
 template <unsigned Bitwidth>
diff --git a/cpu_ref/linkloader/include/impl/ELFReloc.hxx b/cpu_ref/linkloader/include/impl/ELFReloc.hxx
index 6b55e02..e60b66e 100644
--- a/cpu_ref/linkloader/include/impl/ELFReloc.hxx
+++ b/cpu_ref/linkloader/include/impl/ELFReloc.hxx
@@ -32,7 +32,7 @@
     return 0;
   }
 
-  llvm::OwningPtr<ELFRelocTy> sh(new ELFRelocTy());
+  std::unique_ptr<ELFRelocTy> sh(new ELFRelocTy());
 
   if (!sh->serializeRela(AR)) {
     // Unable to read the structure.  Return NULL.
@@ -47,7 +47,7 @@
   // Set the section header index
   sh->index = index;
 
-  return sh.take();
+  return sh.release();
 }
 
 template <unsigned Bitwidth>
@@ -60,7 +60,7 @@
     return 0;
   }
 
-  llvm::OwningPtr<ELFRelocTy> sh(new ELFRelocTy());
+  std::unique_ptr<ELFRelocTy> sh(new ELFRelocTy());
 
   sh->r_addend = 0;
   if (!sh->serializeRel(AR)) {
@@ -75,7 +75,7 @@
   // Set the section header index
   sh->index = index;
 
-  return sh.take();
+  return sh.release();
 }
 
 template <unsigned Bitwidth>
diff --git a/cpu_ref/linkloader/include/impl/ELFSectionHeader.hxx b/cpu_ref/linkloader/include/impl/ELFSectionHeader.hxx
index 78a5495..0c9568d 100644
--- a/cpu_ref/linkloader/include/impl/ELFSectionHeader.hxx
+++ b/cpu_ref/linkloader/include/impl/ELFSectionHeader.hxx
@@ -41,7 +41,7 @@
     return 0;
   }
 
-  llvm::OwningPtr<ELFSectionHeaderTy> sh(new ELFSectionHeaderTy());
+  std::unique_ptr<ELFSectionHeaderTy> sh(new ELFSectionHeaderTy());
 
   if (!sh->serialize(AR)) {
     // Unable to read the structure.  Return NULL.
@@ -59,7 +59,7 @@
   // Set the owner elf object
   sh->owner = owner;
 
-  return sh.take();
+  return sh.release();
 }
 
 template <unsigned Bitwidth>
diff --git a/cpu_ref/linkloader/include/impl/ELFSectionHeaderTable.hxx b/cpu_ref/linkloader/include/impl/ELFSectionHeaderTable.hxx
index 1f693a4..570cae5 100644
--- a/cpu_ref/linkloader/include/impl/ELFSectionHeaderTable.hxx
+++ b/cpu_ref/linkloader/include/impl/ELFSectionHeaderTable.hxx
@@ -41,7 +41,7 @@
   }
 
   // Allocate a new section header table and assign the owner.
-  llvm::OwningPtr<ELFSectionHeaderTable> tab(new ELFSectionHeaderTable());
+  std::unique_ptr<ELFSectionHeaderTable> tab(new ELFSectionHeaderTable());
 
   // Get ELF header
   ELFHeaderTy const *header = owner->getHeader();
@@ -53,7 +53,7 @@
   AR.seek(header->getSectionHeaderTableOffset(), true);
 
   for (size_t i = 0; i < header->getSectionHeaderNum(); ++i) {
-    llvm::OwningPtr<ELFSectionHeaderTy> sh(
+    std::unique_ptr<ELFSectionHeaderTy> sh(
       ELFSectionHeaderTy::read(AR, owner, i));
 
     if (!sh) {
@@ -61,10 +61,10 @@
       return 0;
     }
 
-    tab->table.push_back(sh.take());
+    tab->table.push_back(sh.release());
   }
 
-  return tab.take();
+  return tab.release();
 }
 
 template <unsigned Bitwidth>
diff --git a/cpu_ref/linkloader/include/impl/ELFSectionNoBits.hxx b/cpu_ref/linkloader/include/impl/ELFSectionNoBits.hxx
index 6f9f713..d77398d 100644
--- a/cpu_ref/linkloader/include/impl/ELFSectionNoBits.hxx
+++ b/cpu_ref/linkloader/include/impl/ELFSectionNoBits.hxx
@@ -28,7 +28,7 @@
 template <typename Archiver>
 inline ELFSectionNoBits<Bitwidth> *
 ELFSectionNoBits<Bitwidth>::read(Archiver &AR, ELFSectionHeaderTy const *sh) {
-  llvm::OwningPtr<ELFSectionNoBits> result(new ELFSectionNoBits());
+  std::unique_ptr<ELFSectionNoBits> result(new ELFSectionNoBits());
 
   if (!result->chunk.allocate(sh->getSize())) {
     return NULL;
@@ -36,7 +36,7 @@
 
   result->sh = sh;
 
-  return result.take();
+  return result.release();
 }
 
 #endif // ELF_SECTION_NOBITS_HXX
diff --git a/cpu_ref/linkloader/include/impl/ELFSectionProgBits.hxx b/cpu_ref/linkloader/include/impl/ELFSectionProgBits.hxx
index cdec650..93919c8 100644
--- a/cpu_ref/linkloader/include/impl/ELFSectionProgBits.hxx
+++ b/cpu_ref/linkloader/include/impl/ELFSectionProgBits.hxx
@@ -35,7 +35,7 @@
                                    ELFSectionHeaderTy const *sh) {
   int machine = owner->getHeader()->getMachine();
   ELFSectionProgBits *secp = new ELFSectionProgBits(machine);
-  llvm::OwningPtr<ELFSectionProgBits> result(secp);
+  std::unique_ptr<ELFSectionProgBits> result(secp);
   size_t max_num_stubs = 0;
   // Align section boundary to 4 bytes.
   size_t section_size = (sh->getSize() + 3) / 4 * 4;
@@ -82,7 +82,7 @@
     return NULL;
   }
 
-  return result.take();
+  return result.release();
 }
 
 #endif // ELF_SECTION_PROGBITS_HXX
diff --git a/cpu_ref/linkloader/include/impl/ELFSectionRelTable.hxx b/cpu_ref/linkloader/include/impl/ELFSectionRelTable.hxx
index 5a1a853..42c7a7a 100644
--- a/cpu_ref/linkloader/include/impl/ELFSectionRelTable.hxx
+++ b/cpu_ref/linkloader/include/impl/ELFSectionRelTable.hxx
@@ -57,7 +57,7 @@
 
   rsl_assert(sh->getType() == SHT_REL || sh->getType() == SHT_RELA);
 
-  llvm::OwningPtr<ELFSectionRelTable> rt(new ELFSectionRelTable());
+  std::unique_ptr<ELFSectionRelTable> rt(new ELFSectionRelTable());
 
   // Seek to the start of the table
   AR.seek(sh->getOffset(), true);
@@ -84,7 +84,7 @@
     return 0;
   }
 
-  return rt.take();
+  return rt.release();
 }
 
 template <unsigned Bitwidth>
diff --git a/cpu_ref/linkloader/include/impl/ELFSectionStrTab.hxx b/cpu_ref/linkloader/include/impl/ELFSectionStrTab.hxx
index 4e9eb4e..f8944ca 100644
--- a/cpu_ref/linkloader/include/impl/ELFSectionStrTab.hxx
+++ b/cpu_ref/linkloader/include/impl/ELFSectionStrTab.hxx
@@ -20,7 +20,6 @@
 #include "utils/helper.h"
 #include "utils/raw_ostream.h"
 
-#include <llvm/ADT/OwningPtr.h>
 #include <llvm/Support/Format.h>
 #include <llvm/Support/raw_ostream.h>
 
@@ -30,7 +29,7 @@
 ELFSectionStrTab<Bitwidth>::read(Archiver &AR,
                                  ELFSectionHeaderTy const *sh) {
 
-  llvm::OwningPtr<ELFSectionStrTab> st(new ELFSectionStrTab());
+  std::unique_ptr<ELFSectionStrTab> st(new ELFSectionStrTab());
   st->buf.resize(sh->getSize());
 
   // Save section_header
@@ -46,7 +45,7 @@
     return 0;
   }
 
-  return st.take();
+  return st.release();
 }
 
 template <unsigned Bitwidth>
diff --git a/cpu_ref/linkloader/include/impl/ELFSectionSymTab.hxx b/cpu_ref/linkloader/include/impl/ELFSectionSymTab.hxx
index b179383..2b796e3 100644
--- a/cpu_ref/linkloader/include/impl/ELFSectionSymTab.hxx
+++ b/cpu_ref/linkloader/include/impl/ELFSectionSymTab.hxx
@@ -89,7 +89,7 @@
                                  ELFObjectTy *owner,
                                  ELFSectionHeaderTy const *sh) {
 
-  llvm::OwningPtr<ELFSectionSymTabTy> st(new ELFSectionSymTabTy());
+  std::unique_ptr<ELFSectionSymTabTy> st(new ELFSectionSymTabTy());
 
   // Assert that entry size will be the same as standard.
   rsl_assert(sh->getEntrySize() == TypeTraits<ELFSymbolTy>::size);
@@ -108,7 +108,7 @@
     return 0;
   }
 
-  return st.take();
+  return st.release();
 }
 
 template <unsigned Bitwidth>
diff --git a/cpu_ref/linkloader/include/impl/ELFSymbol.hxx b/cpu_ref/linkloader/include/impl/ELFSymbol.hxx
index b3c6087..d4579ed 100644
--- a/cpu_ref/linkloader/include/impl/ELFSymbol.hxx
+++ b/cpu_ref/linkloader/include/impl/ELFSymbol.hxx
@@ -51,7 +51,7 @@
     return 0;
   }
 
-  llvm::OwningPtr<ELFSymbolTy> sh(new ELFSymbolTy());
+  std::unique_ptr<ELFSymbolTy> sh(new ELFSymbolTy());
 
   if (!sh->serialize(AR)) {
     // Unable to read the structure.  Return NULL.
@@ -69,7 +69,7 @@
   // Set the owner elf object
   sh->owner = owner;
 
-  return sh.take();
+  return sh.release();
 }
 
 template <unsigned Bitwidth>
diff --git a/cpu_ref/linkloader/main.cpp b/cpu_ref/linkloader/main.cpp
index 2ead7ec..072595f 100644
--- a/cpu_ref/linkloader/main.cpp
+++ b/cpu_ref/linkloader/main.cpp
@@ -19,8 +19,6 @@
 #include "utils/serialize.h"
 #include "ELF.h"
 
-#include <llvm/ADT/OwningPtr.h>
-
 #include <fcntl.h>
 #include <stdlib.h>
 #include <sys/mman.h>
@@ -142,7 +140,7 @@
 
 template <unsigned Bitwidth, typename Archiver>
 void dump_and_run_object(Archiver &AR, int argc, char **argv) {
-  llvm::OwningPtr<ELFObject<Bitwidth> > object(ELFObject<Bitwidth>::read(AR));
+  std::unique_ptr<ELFObject<Bitwidth> > object(ELFObject<Bitwidth>::read(AR));
 
   if (!object) {
     llvm::errs() << "ERROR: Unable to load object\n";
diff --git a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
index 0d296ea..a194048 100644
--- a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
+++ b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
@@ -130,24 +130,32 @@
     void (*column[4])(void);
     void (*store)(void);
     void (*load)(void);
+    void (*store_end)(void);
+    void (*load_end)(void);
 } FunctionTab_t;
 
-extern "C" size_t rsdIntrinsicColorMatrix_int_K(
+extern "C" void rsdIntrinsicColorMatrix_int_K(
              void *out, void const *in, size_t count,
              FunctionTab_t const *fns,
              int16_t const *mult, int32_t const *add);
 
-extern "C" void rsdIntrinsicColorMatrixSetup_int_K(
-             FunctionTab_t const *fns,
-             uint32_t mask, int dt, int st);
-
-extern "C" size_t rsdIntrinsicColorMatrix_float_K(
+extern "C" void rsdIntrinsicColorMatrix_float_K(
              void *out, void const *in, size_t count,
              FunctionTab_t const *fns,
              float const *mult, float const *add);
 
+/* The setup functions fill in function tables to be used by above functions;
+ * this code also eliminates jump-to-another-jump cases by short-circuiting
+ * empty functions.  While it's not performance critical, it works out easier
+ * to write the set-up code in assembly than to try to expose the same symbols
+ * and write the code in C.
+ */
+extern "C" void rsdIntrinsicColorMatrixSetup_int_K(
+             FunctionTab_t *fns,
+             uint32_t mask, int dt, int st);
+
 extern "C" void rsdIntrinsicColorMatrixSetup_float_K(
-             FunctionTab_t const *fns,
+             FunctionTab_t *fns,
              uint32_t mask, int dt, int st);
 #endif
 
@@ -874,8 +882,8 @@
                                               uint32_t xstart, uint32_t xend,
                                               uint32_t instep, uint32_t outstep) {
     RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)p->usr;
-    uchar *out = (uchar *)p->out;
-    uchar *in = (uchar *)p->in;
+    uchar *out = (uchar *)p->out + outstep * xstart;
+    uchar *in = (uchar *)p->in + instep * xstart;
     uint32_t x1 = xstart;
     uint32_t x2 = xend;
 
@@ -902,15 +910,14 @@
             }
 #if defined(ARCH_ARM64_USE_INTRINSICS)
             else {
-                size_t done;
                 if (cp->mLastKey.u.inType == RS_TYPE_FLOAT_32 || cp->mLastKey.u.outType == RS_TYPE_FLOAT_32) {
-                    done = len - rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa);
+                    rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa);
                 } else {
-                    done = len - rsdIntrinsicColorMatrix_int_K(out, in, len, &cp->mFnTab, cp->ip, cp->ipa);
+                    rsdIntrinsicColorMatrix_int_K(out, in, len, &cp->mFnTab, cp->ip, cp->ipa);
                 }
-                x1 += done;
-                out += outstep * done;
-                in += instep * done;
+                x1 += len;
+                out += outstep * len;
+                in += instep * len;
             }
 #endif
         }
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_ColorMatrix.S b/cpu_ref/rsCpuIntrinsics_advsimd_ColorMatrix.S
index 7a6d4c5..3fcfa25 100644
--- a/cpu_ref/rsCpuIntrinsics_advsimd_ColorMatrix.S
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_ColorMatrix.S
@@ -595,8 +595,377 @@
             ld1         {v20.4s}, [x1], #16
             br          x4
 
+colormatrix_int_stu1_end:
+            uqxtn       v12.8b, v8.8h
+            tbz         x2, #2, 1f
+            st1         {v12.s}[1], [x0], #4
+1:          tbz         x2, #1, 1f
+            st1         {v12.h}[1], [x0], #2
+1:          tbz         x2, #0, 1f
+            st1         {v12.b}[1], [x0], #1
+1:          b           colormatrix_int_realend
 
-/* size_t rsdIntrinsicColorMatrix_int_K(
+colormatrix_int_stu2_end:
+            uqxtn       v12.8b, v8.8h
+            uqxtn       v13.8b, v9.8h
+            zip1        v12.16b, v12.16b, v13.16b
+            tbz         x2, #2, 1f
+            st1         {v12.d}[1], [x0], #8
+1:          tbz         x2, #1, 1f
+            st1         {v12.s}[1], [x0], #4
+1:          tbz         x2, #0, 1f
+            st1         {v12.h}[1], [x0], #2
+1:          b           colormatrix_int_realend
+
+colormatrix_int_stu3_end:
+            uqxtn       v12.8b, v8.8h
+            uqxtn       v13.8b, v9.8h
+            uqxtn       v14.8b, v10.8h
+            movi        v15.8b, #0
+            tbz         x2, #2, 1f
+            st4         {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
+            st4         {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
+            st4         {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
+            st4         {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
+1:          tbz         x2, #1, 1f
+            st4         {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
+            st4         {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
+1:          tbz         x2, #0, 1f
+            st4         {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
+1:          b           colormatrix_int_realend
+
+colormatrix_int_stu4_end:
+            uqxtn       v12.8b, v8.8h
+            uqxtn       v13.8b, v9.8h
+            uqxtn       v14.8b, v10.8h
+            uqxtn       v15.8b, v11.8h
+            tbz         x2, #2, 1f
+            st4         {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
+            st4         {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
+            st4         {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
+            st4         {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
+1:          tbz         x2, #1, 1f
+            st4         {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
+            st4         {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
+1:          tbz         x2, #0, 1f
+            st4         {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
+1:          b           colormatrix_int_realend
+
+
+colormatrix_int_ldu1_end:
+            tbz         x2, #2, 1f
+            ld1         {v15.s}[3], [x1], #4
+1:          tbz         x2, #1, 1f
+            ld1         {v15.h}[5], [x1], #2
+1:          tbz         x2, #0, 1f
+            ld1         {v15.b}[9], [x1], #1
+1:          uxtl2       v12.8h, v15.16b
+            br          x4
+
+colormatrix_int_ldu2_end:
+            tbz         x2, #2, 1f
+            ld1         {v15.d}[1], [x1], #8
+1:          tbz         x2, #1, 1f
+            ld1         {v15.s}[1], [x1], #4
+1:          tbz         x2, #0, 1f
+            ld1         {v15.h}[1], [x1], #2
+1:          uzp1        v14.16b, v15.16b, v15.16b
+            uzp2        v15.16b, v15.16b, v15.16b
+            uxtl        v12.8h, v14.8b
+            uxtl        v13.8h, v15.8b
+            br          x4
+
+colormatrix_int_ldu3_end:
+            tbz         x2, #2, 1f
+            ld4         {v12.b,v13.b,v14.b,v15.b}[4], [x1], #4
+            ld4         {v12.b,v13.b,v14.b,v15.b}[5], [x1], #4
+            ld4         {v12.b,v13.b,v14.b,v15.b}[6], [x1], #4
+            ld4         {v12.b,v13.b,v14.b,v15.b}[7], [x1], #4
+1:          tbz         x2, #1, 1f
+            ld4         {v12.b,v13.b,v14.b,v15.b}[2], [x1], #4
+            ld4         {v12.b,v13.b,v14.b,v15.b}[3], [x1], #4
+1:          tbz         x2, #0, 1f
+            ld4         {v12.b,v13.b,v14.b,v15.b}[1], [x1], #4
+1:          uxtl        v12.8h, v12.8b
+            uxtl        v13.8h, v13.8b
+            uxtl        v14.8h, v14.8b
+            br          x4
+
+colormatrix_int_ldu4_end:
+            tbz         x2, #2, 1f
+            ld4         {v12.b,v13.b,v14.b,v15.b}[4], [x1], #4
+            ld4         {v12.b,v13.b,v14.b,v15.b}[5], [x1], #4
+            ld4         {v12.b,v13.b,v14.b,v15.b}[6], [x1], #4
+            ld4         {v12.b,v13.b,v14.b,v15.b}[7], [x1], #4
+1:          tbz         x2, #1, 1f
+            ld4         {v12.b,v13.b,v14.b,v15.b}[2], [x1], #4
+            ld4         {v12.b,v13.b,v14.b,v15.b}[3], [x1], #4
+1:          tbz         x2, #0, 1f
+            ld4         {v12.b,v13.b,v14.b,v15.b}[1], [x1], #4
+1:          uxtl        v12.8h, v12.8b
+            uxtl        v13.8h, v13.8b
+            uxtl        v14.8h, v14.8b
+            uxtl        v15.8h, v15.8b
+            br          x4
+
+colormatrix_float_stu1_end:
+            fcvtzs      v12.4s, v8.4s, #1
+            fcvtzs      v13.4s, v16.4s, #1
+            sqrshrun    v12.4h, v12.4s, #1
+            sqrshrun2   v12.8h, v13.4s, #1
+            uqxtn       v12.8b, v12.8h
+            tbz         x2, #2, 1f
+            st1         {v12.s}[1], [x0], #4
+1:          tbz         x2, #1, 1f
+            st1         {v12.h}[1], [x0], #2
+1:          tbz         x2, #0, 1f
+            st1         {v12.b}[1], [x0], #1
+1:          b           colormatrix_float_realend
+
+colormatrix_float_stu2_end:
+            fcvtzs      v12.4s, v8.4s, #1
+            fcvtzs      v13.4s, v9.4s, #1
+            fcvtzs      v14.4s, v16.4s, #1
+            fcvtzs      v15.4s, v17.4s, #1
+            sqrshrun    v12.4h, v12.4s, #1
+            sqrshrun    v13.4h, v13.4s, #1
+            sqrshrun    v14.4h, v14.4s, #1
+            sqrshrun    v15.4h, v15.4s, #1
+            zip1        v12.8h, v12.8h, v13.8h
+            zip1        v13.8h, v14.8h, v15.8h
+            uqxtn       v12.8b, v12.8h
+            uqxtn2      v12.16b, v13.8h
+            tbz         x2, #2, 1f
+            st1         {v12.d}[1], [x0], #8
+1:          tbz         x2, #1, 1f
+            st1         {v12.s}[1], [x0], #4
+1:          tbz         x2, #0, 1f
+            st1         {v12.h}[1], [x0], #2
+1:          b           colormatrix_float_realend
+
+colormatrix_float_stu3_end:
+            fcvtzs      v24.4s, v8.4s, #1
+            fcvtzs      v25.4s, v9.4s, #1
+            fcvtzs      v26.4s, v10.4s, #1
+            fcvtzs      v28.4s, v16.4s, #1
+            fcvtzs      v29.4s, v17.4s, #1
+            fcvtzs      v30.4s, v18.4s, #1
+            sqrshrun    v24.4h, v24.4s, #1
+            sqrshrun    v25.4h, v25.4s, #1
+            sqrshrun    v26.4h, v26.4s, #1
+            sqrshrun2   v24.8h, v28.4s, #1
+            sqrshrun2   v25.8h, v29.4s, #1
+            sqrshrun2   v26.8h, v30.4s, #1
+            uqxtn       v12.8b, v24.8h
+            uqxtn       v13.8b, v25.8h
+            uqxtn       v14.8b, v26.8h
+            movi        v15.8b, #0
+            tbz         x2, #2, 1f
+            st4         {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
+            st4         {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
+            st4         {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
+            st4         {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
+1:          tbz         x2, #1, 1f
+            st4         {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
+            st4         {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
+1:          tbz         x2, #0, 1f
+            st4         {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
+1:          b           colormatrix_float_realend
+
+colormatrix_float_stu4_end:     /* tail store, 4 bytes per element: x2 bits 2/1/0 select 4, 2 and 1 elements */
+            fcvtzs      v24.4s, v8.4s, #1       /* float -> signed fixed-point with 1 fractional bit */
+            fcvtzs      v25.4s, v9.4s, #1
+            fcvtzs      v26.4s, v10.4s, #1
+            fcvtzs      v27.4s, v11.4s, #1
+            fcvtzs      v28.4s, v16.4s, #1      /* same conversion for v16-v19 */
+            fcvtzs      v29.4s, v17.4s, #1
+            fcvtzs      v30.4s, v18.4s, #1
+            fcvtzs      v31.4s, v19.4s, #1
+            sqrshrun    v24.4h, v24.4s, #1      /* rounding shift removes the fractional bit; saturate to u16, lanes 0-3 */
+            sqrshrun    v25.4h, v25.4s, #1
+            sqrshrun    v26.4h, v26.4s, #1
+            sqrshrun    v27.4h, v27.4s, #1
+            sqrshrun2   v24.8h, v28.4s, #1      /* values that came from v16-v19 fill lanes 4-7 */
+            sqrshrun2   v25.8h, v29.4s, #1
+            sqrshrun2   v26.8h, v30.4s, #1
+            sqrshrun2   v27.8h, v31.4s, #1
+            uqxtn       v12.8b, v24.8h          /* saturate u16 -> u8 */
+            uqxtn       v13.8b, v25.8h
+            uqxtn       v14.8b, v26.8h
+            uqxtn       v15.8b, v27.8h
+            tbz         x2, #2, 1f              /* bit 2 clear: skip the four-element store */
+            st4         {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4  /* lanes 4-7: one byte from each of v12-v15 per element */
+            st4         {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
+            st4         {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
+            st4         {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
+1:          tbz         x2, #1, 1f              /* bit 1 clear: skip the two-element store */
+            st4         {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4  /* lanes 2-3 */
+            st4         {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
+1:          tbz         x2, #0, 1f              /* bit 0 clear: skip the single-element store */
+            st4         {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4  /* lane 1; lane 0 is never written on this path */
+1:          b           colormatrix_float_realend  /* epilogue: reloads v8-v15 from the stack and returns */
+
+colormatrix_float_stf1_end:     /* tail store, one 32-bit value per element: x2 bits 2/1/0 select 4, 2 and 1 elements */
+            tbz         x2, #2, 1f              /* bit 2 clear: skip the four-element store */
+            st1         {v16.4s}, [x0], #16     /* four elements, all of v16 */
+1:          tbz         x2, #1, 1f              /* bit 1 clear: skip the two-element store */
+            st1         {v8.d}[1], [x0], #8     /* two elements: upper 64 bits of v8 (32-bit lanes 2-3) */
+1:          tbz         x2, #0, 1f              /* bit 0 clear: skip the single-element store */
+            st1         {v8.s}[1], [x0], #4     /* one element: 32-bit lane 1 of v8 */
+1:          b           colormatrix_float_realend  /* epilogue: reloads v8-v15 from the stack and returns */
+
+colormatrix_float_stf2_end:     /* tail store, two 32-bit values per element (interleaved by st2) */
+            tbz         x2, #2, 1f              /* bit 2 clear: skip the four-element store */
+            st2         {v16.4s, v17.4s}, [x0], #32  /* four elements from v16/v17 */
+1:          tbz         x2, #1, 1f              /* bit 1 clear: skip the two-element store */
+            st2         {v8.s,v9.s}[2], [x0], #8     /* lanes 2-3 of v8/v9, 8 bytes per element */
+            st2         {v8.s,v9.s}[3], [x0], #8
+1:          tbz         x2, #0, 1f              /* bit 0 clear: skip the single-element store */
+            st2         {v8.s,v9.s}[1], [x0], #8     /* lane 1 of v8/v9 */
+1:          b           colormatrix_float_realend  /* epilogue: reloads v8-v15 from the stack and returns */
+
+colormatrix_float_stf3_end:     /* zeroes the fourth register of each group, then falls through to the stf4 tail store */
+            movi        v11.16b, #0             /* fourth value of lanes 1-3 elements is written as zero */
+            movi        v19.16b, #0             /* likewise for the four elements held in v16-v19 */
+colormatrix_float_stf4_end:     /* tail store, four 32-bit values (16 bytes) per element, interleaved by st4 */
+            tbz         x2, #2, 1f              /* bit 2 clear: skip the four-element store */
+            st4         {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64  /* four elements from v16-v19 */
+1:          tbz         x2, #1, 1f              /* bit 1 clear: skip the two-element store */
+            st4         {v8.s,v9.s,v10.s,v11.s}[2], [x0], #16     /* lanes 2-3 of v8-v11 */
+            st4         {v8.s,v9.s,v10.s,v11.s}[3], [x0], #16
+1:          tbz         x2, #0, 1f              /* bit 0 clear: skip the single-element store */
+            st4         {v8.s,v9.s,v10.s,v11.s}[1], [x0], #16     /* lane 1 of v8-v11 */
+1:          b           colormatrix_float_realend  /* epilogue: reloads v8-v15 from the stack and returns */
+
+colormatrix_float_ldu1_end:     /* tail load, one byte per element; result as floats in v12 (lanes 0-3) and v20 (lanes 4-7) */
+            tbz         x2, #2, 1f              /* bit 2 clear: skip the four-element load */
+            ld1         {v15.s}[1], [x1], #4    /* four bytes into byte lanes 4-7 of v15 */
+1:          tbz         x2, #1, 1f              /* bit 1 clear: skip the two-element load */
+            ld1         {v15.h}[1], [x1], #2    /* two bytes into byte lanes 2-3 */
+1:          tbz         x2, #0, 1f              /* bit 0 clear: skip the single-element load */
+            ld1         {v15.b}[1], [x1], #1    /* one byte into byte lane 1 */
+1:          uxtl        v15.8h, v15.8b          /* zero-extend u8 -> u16 */
+            uxtl        v12.4s, v15.4h          /* u16 -> u32, lanes 0-3 */
+            uxtl2       v20.4s, v15.8h          /* u16 -> u32, lanes 4-7 */
+            ucvtf       v12.4s, v12.4s          /* unsigned integer -> float */
+            ucvtf       v20.4s, v20.4s
+            br          x4                      /* continue at the address held in x4 (set outside this routine) */
+
+colormatrix_float_ldu2_end:     /* tail load, two bytes per element; floats end up in v12/v13 (lanes 0-3) and v20/v21 (lanes 4-7) */
+            tbz         x2, #2, 1f              /* bit 2 clear: skip the four-element load */
+            ld1         {v15.d}[1], [x1], #8    /* eight bytes into the upper half of v15 */
+1:          tbz         x2, #1, 1f              /* bit 1 clear: skip the two-element load */
+            ld1         {v15.s}[1], [x1], #4    /* four bytes into byte lanes 4-7 */
+1:          tbz         x2, #0, 1f              /* bit 0 clear: skip the single-element load */
+            ld1         {v15.h}[1], [x1], #2    /* two bytes into byte lanes 2-3 */
+1:          uxtl        v14.8h, v15.8b          /* zero-extend the low eight bytes to u16 */
+            uxtl2       v15.8h, v15.16b         /* zero-extend the high eight bytes to u16 */
+            uzp1        v12.8h, v14.8h, v14.8h  /* even-indexed halfwords: first byte of each element */
+            uzp2        v13.8h, v14.8h, v14.8h  /* odd-indexed halfwords: second byte of each element */
+            uzp1        v20.8h, v15.8h, v15.8h  /* same de-interleave for the upper four elements */
+            uzp2        v21.8h, v15.8h, v15.8h
+            uxtl        v12.4s, v12.4h          /* u16 -> u32 */
+            uxtl        v13.4s, v13.4h
+            uxtl        v20.4s, v20.4h
+            uxtl        v21.4s, v21.4h
+            ucvtf       v12.4s, v12.4s          /* unsigned integer -> float */
+            ucvtf       v13.4s, v13.4s
+            ucvtf       v20.4s, v20.4s
+            ucvtf       v21.4s, v21.4s
+            br          x4                      /* continue at the address held in x4 (set outside this routine) */
+
+colormatrix_float_ldu3_end:     /* tail load, 4 bytes read per element but only the first three converted (v12-v14 / v20-v22) */
+            tbz         x2, #2, 1f              /* bit 2 clear: skip the four-element load */
+            ld4         {v20.b,v21.b,v22.b,v23.b}[4], [x1], #4  /* lanes 4-7: de-interleave one 4-byte element per lane */
+            ld4         {v20.b,v21.b,v22.b,v23.b}[5], [x1], #4
+            ld4         {v20.b,v21.b,v22.b,v23.b}[6], [x1], #4
+            ld4         {v20.b,v21.b,v22.b,v23.b}[7], [x1], #4
+1:          tbz         x2, #1, 1f              /* bit 1 clear: skip the two-element load */
+            ld4         {v20.b,v21.b,v22.b,v23.b}[2], [x1], #4  /* lanes 2-3 */
+            ld4         {v20.b,v21.b,v22.b,v23.b}[3], [x1], #4
+1:          tbz         x2, #0, 1f              /* bit 0 clear: skip the single-element load */
+            ld4         {v20.b,v21.b,v22.b,v23.b}[1], [x1], #4  /* lane 1 */
+1:          uxtl        v20.8h, v20.8b          /* zero-extend u8 -> u16; v23 is loaded above but not widened here */
+            uxtl        v21.8h, v21.8b
+            uxtl        v22.8h, v22.8b
+            uxtl        v12.4s, v20.4h          /* lanes 0-3 widened to u32 in v12-v14 */
+            uxtl        v13.4s, v21.4h
+            uxtl        v14.4s, v22.4h
+            uxtl2       v20.4s, v20.8h          /* lanes 4-7 widened to u32 in place in v20-v22 */
+            uxtl2       v21.4s, v21.8h
+            uxtl2       v22.4s, v22.8h
+            ucvtf       v12.4s, v12.4s          /* unsigned integer -> float */
+            ucvtf       v13.4s, v13.4s
+            ucvtf       v14.4s, v14.4s
+            ucvtf       v20.4s, v20.4s
+            ucvtf       v21.4s, v21.4s
+            ucvtf       v22.4s, v22.4s
+            br          x4                      /* continue at the address held in x4 (set outside this routine) */
+
+colormatrix_float_ldu4_end:     /* tail load, 4 bytes per element; floats end up in v12-v15 (lanes 0-3) and v20-v23 (lanes 4-7) */
+            tbz         x2, #2, 1f              /* bit 2 clear: skip the four-element load */
+            ld4         {v20.b,v21.b,v22.b,v23.b}[4], [x1], #4  /* lanes 4-7: de-interleave one 4-byte element per lane */
+            ld4         {v20.b,v21.b,v22.b,v23.b}[5], [x1], #4
+            ld4         {v20.b,v21.b,v22.b,v23.b}[6], [x1], #4
+            ld4         {v20.b,v21.b,v22.b,v23.b}[7], [x1], #4
+1:          tbz         x2, #1, 1f              /* bit 1 clear: skip the two-element load */
+            ld4         {v20.b,v21.b,v22.b,v23.b}[2], [x1], #4  /* lanes 2-3 */
+            ld4         {v20.b,v21.b,v22.b,v23.b}[3], [x1], #4
+1:          tbz         x2, #0, 1f              /* bit 0 clear: skip the single-element load */
+            ld4         {v20.b,v21.b,v22.b,v23.b}[1], [x1], #4  /* lane 1 */
+1:          uxtl        v20.8h, v20.8b          /* zero-extend u8 -> u16 */
+            uxtl        v21.8h, v21.8b
+            uxtl        v22.8h, v22.8b
+            uxtl        v23.8h, v23.8b
+            uxtl        v12.4s, v20.4h          /* lanes 0-3 widened to u32 in v12-v15 */
+            uxtl        v13.4s, v21.4h
+            uxtl        v14.4s, v22.4h
+            uxtl        v15.4s, v23.4h
+            uxtl2       v20.4s, v20.8h          /* lanes 4-7 widened to u32 in place in v20-v23 */
+            uxtl2       v21.4s, v21.8h
+            uxtl2       v22.4s, v22.8h
+            uxtl2       v23.4s, v23.8h
+            ucvtf       v12.4s, v12.4s          /* unsigned integer -> float */
+            ucvtf       v13.4s, v13.4s
+            ucvtf       v14.4s, v14.4s
+            ucvtf       v15.4s, v15.4s
+            ucvtf       v20.4s, v20.4s
+            ucvtf       v21.4s, v21.4s
+            ucvtf       v22.4s, v22.4s
+            ucvtf       v23.4s, v23.4s
+            br          x4                      /* continue at the address held in x4 (set outside this routine) */
+
+colormatrix_float_ldf1_end:     /* tail load, one 32-bit value per element, no conversion: v12 lanes 1-3 and v20 */
+            tbz         x2, #2, 1f              /* bit 2 clear: skip the four-element load */
+            ld1         {v20.4s}, [x1], #16     /* four elements into v20 */
+1:          tbz         x2, #1, 1f              /* bit 1 clear: skip the two-element load */
+            ld1         {v12.d}[1], [x1], #8    /* two elements into the upper 64 bits of v12 (32-bit lanes 2-3) */
+1:          tbz         x2, #0, 1f              /* bit 0 clear: skip the single-element load */
+            ld1         {v12.s}[1], [x1], #4    /* one element into 32-bit lane 1 of v12 */
+1:          br          x4                      /* continue at the address held in x4 (set outside this routine) */
+
+colormatrix_float_ldf2_end:     /* tail load, two 32-bit values per element (de-interleaved by ld2), no conversion */
+            tbz         x2, #2, 1f              /* bit 2 clear: skip the four-element load */
+            ld2         {v20.4s,v21.4s}, [x1], #32   /* four elements into v20/v21 */
+1:          tbz         x2, #1, 1f              /* bit 1 clear: skip the two-element load */
+            ld2         {v12.s,v13.s}[2], [x1], #8   /* lanes 2-3 of v12/v13, 8 bytes per element */
+            ld2         {v12.s,v13.s}[3], [x1], #8
+1:          tbz         x2, #0, 1f              /* bit 0 clear: skip the single-element load */
+            ld2         {v12.s,v13.s}[1], [x1], #8   /* lane 1 of v12/v13 */
+1:          br          x4                      /* continue at the address held in x4 (set outside this routine) */
+
+colormatrix_float_ldf3_end:     /* shares the ldf4 code below: both labels read 16 bytes per element */
+colormatrix_float_ldf4_end:     /* tail load, four 32-bit values per element (de-interleaved by ld4), no conversion */
+            tbz         x2, #2, 1f              /* bit 2 clear: skip the four-element load */
+            ld4         {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64  /* four elements into v20-v23 */
+1:          tbz         x2, #1, 1f              /* bit 1 clear: skip the two-element load */
+            ld4         {v12.s,v13.s,v14.s,v15.s}[2], [x1], #16   /* lanes 2-3 of v12-v15 */
+            ld4         {v12.s,v13.s,v14.s,v15.s}[3], [x1], #16
+1:          tbz         x2, #0, 1f              /* bit 0 clear: skip the single-element load */
+            ld4         {v12.s,v13.s,v14.s,v15.s}[1], [x1], #16   /* lane 1 of v12-v15 */
+1:          br          x4                      /* continue at the address held in x4 (set outside this routine) */
+
+/* void rsdIntrinsicColorMatrix_int_K(
  *          void *out,              // x0
  *          void const *in,         // x1
  *          size_t count,           // x2
@@ -605,7 +974,6 @@
  *          int32_t const *add);    // x5
  */
 ENTRY(rsdIntrinsicColorMatrix_int_K)
-            stp         x8,x9, [sp, #-16]!
             sub         x7, sp, #32
             sub         sp, sp, #64
             st1         {v8.1d-v11.1d}, [sp]
@@ -636,10 +1004,23 @@
             br          x9
 
 colormatrix_int_end:
+            adds        x2, x2, #8              /* add 8 back to the count; C and Z feed the bls below */
+            bls         colormatrix_int_realend /* taken when the sum is zero or the add produced no carry: go to the epilogue */
+            mov         x16, x8                 /* keep the old x8 for the comparisons below */
+            ldp         x8, x9, [x3], #16       /* load the next two pointers from [x3] into x8/x9, post-increment x3 */
+            cmp         x4, x16
+            csel        x4, x8, x4, eq          /* each of x4-x7 that equals the old x8 is replaced by the new x8 */
+            cmp         x5, x16
+            csel        x5, x8, x5, eq
+            cmp         x6, x16
+            csel        x6, x8, x6, eq
+            cmp         x7, x16
+            csel        x7, x8, x7, eq
+            br          x9                      /* jump to the address just loaded into x9 */
+
+colormatrix_int_realend:
             ld1         {v8.1d-v11.1d}, [sp], #32
             ld1         {v12.1d-v15.1d}, [sp], #32
-            ldp         x8,x9, [sp], #16
-            add         x0, x2, #8
             ret
 END(rsdIntrinsicColorMatrix_int_K)
 
@@ -650,28 +1031,35 @@
  *          int st);            // x3
  */
 ENTRY(rsdIntrinsicColorMatrixSetup_int_K)
-            adr         x4, 2f
-            ldrsh       x2, [x4, x2, LSL #1]
-            add         x2, x2, x4
-            adr         x4, 3f
-            ldrsh       x3, [x4, x3, LSL #1]
-            add         x3, x3, x4
+            adr         x7, 2f                  /* x7 = base of table 2 (the stu* entries) */
+            add         x4, x7, x2, LSL #2      /* each index now covers two halfwords, so scale by 4 bytes */
+            ldrsh       x2, [x4], #2            /* first halfword: offset of the main routine */
+            ldrsh       x4, [x4]                /* second halfword: offset of the matching *_end routine */
+            add         x2, x2, x7              /* offsets are relative to the table base */
+            add         x4, x4, x7
+            adr         x7, 3f                  /* x7 = base of table 3 (the ldu* entries) */
+            add         x5, x7, x3, LSL #2      /* same 4-byte scaling for the x3 index */
+            ldrsh       x3, [x5], #2            /* first halfword: offset of the main routine */
+            ldrsh       x5, [x5]                /* second halfword: offset of the matching *_end routine */
+            add         x3, x3, x7
+            add         x5, x5, x7
             stp         x2, x3, [x0, #32]
+            stp         x4, x5, [x0, #48]       /* the two *_end addresses go right after the main pair */
 
 /* For each column function, if the matrix is all zeroes then write NULL,
  * otherwise look up the appropriate function and store that. */
 
             mov         x3, #4
-            adr         x4, 4f
+            adr         x7, 4f                  /* x7 = base of table 4 (column routine offsets) */
 1:          ands        x2, x1, #15
             beq         9f
             and         x2, x1, #31
             lsl         x2, x2, #3
-            ldrsh       x2, [x4, x2]
-            add         x2, x2, x4
+            ldrsh       x2, [x7, x2]            /* signed 16-bit offset at x7 + index*8 */
+            add         x2, x2, x7              /* offset is relative to the current value of x7 */
 9:          str         x2, [x0], #8
             lsr         x1, x1, #5
-            add         x4, x4, #2
+            add         x7, x7, #2              /* step the base by one halfword for the next iteration */
             subs        x3, x3, #1
             bne         1b
 
@@ -690,13 +1078,21 @@
 
             .align 4
 2:          .hword      colormatrix_int_stu1-2b
+            .hword      colormatrix_int_stu1_end-2b
             .hword      colormatrix_int_stu2-2b
+            .hword      colormatrix_int_stu2_end-2b
             .hword      colormatrix_int_stu3-2b
+            .hword      colormatrix_int_stu3_end-2b
             .hword      colormatrix_int_stu4-2b
+            .hword      colormatrix_int_stu4_end-2b
 3:          .hword      colormatrix_int_ldu1-3b
+            .hword      colormatrix_int_ldu1_end-3b
             .hword      colormatrix_int_ldu2-3b
+            .hword      colormatrix_int_ldu2_end-3b
             .hword      colormatrix_int_ldu3-3b
+            .hword      colormatrix_int_ldu3_end-3b
             .hword      colormatrix_int_ldu4-3b
+            .hword      colormatrix_int_ldu4_end-3b
 4:
 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
             .hword      colormatrix_int_col0_\i-4b
@@ -713,7 +1109,7 @@
 END(rsdIntrinsicColorMatrixSetup_int_K)
 
 
-/* size_t rsdIntrinsicColorMatrix_float_K(
+/* void rsdIntrinsicColorMatrix_float_K(
  *          void *out,              // x0
  *          void const *in,         // x1
  *          size_t count,           // x2
@@ -722,7 +1118,6 @@
  *          float const *add);      // x5
  */
 ENTRY(rsdIntrinsicColorMatrix_float_K)
-            stp         x8,x9, [sp, #-16]!
             sub         x7, sp, #32
             sub         sp, sp, #64
             st1         {v8.1d-v11.1d}, [sp]
@@ -753,10 +1148,23 @@
             br          x9
 
 colormatrix_float_end:
+            adds        x2, x2, #8              /* add 8 back to the count; C and Z feed the bls below */
+            bls         colormatrix_float_realend  /* zero or no carry: take this function's own epilogue, not the int one */
+            mov         x16, x8                 /* keep the old x8 for the comparisons below */
+            ldp         x8,x9, [x3], #16        /* load the next two pointers from [x3] into x8/x9, post-increment x3 */
+            cmp         x4, x16
+            csel        x4, x8, x4, eq          /* each of x4-x7 that equals the old x8 is replaced by the new x8 */
+            cmp         x5, x16
+            csel        x5, x8, x5, eq
+            cmp         x6, x16
+            csel        x6, x8, x6, eq
+            cmp         x7, x16
+            csel        x7, x8, x7, eq
+            br          x9                      /* jump to the address just loaded into x9 */
+
+colormatrix_float_realend:
             ld1         {v8.1d-v11.1d}, [sp], #32
             ld1         {v12.1d-v15.1d}, [sp], #32
-            ldp         x8,x9, [sp], #16
-            add         x0, x2, #8
             ret
 END(rsdIntrinsicColorMatrix_float_K)
 
@@ -767,28 +1175,35 @@
  *          int st);            // x3
  */
 ENTRY(rsdIntrinsicColorMatrixSetup_float_K)
-            adr         x4, 2f
-            ldrsh       x2, [x4, x2, LSL #1]
-            add         x2, x2, x4
-            adr         x4, 3f
-            ldrsh       x3, [x4, x3, LSL #1]
-            add         x3, x3, x4
+            adr         x7, 2f                  /* x7 = base of table 2 (the stu* / stf* entries) */
+            add         x4, x7, x2, LSL #2      /* each index now covers two halfwords, so scale by 4 bytes */
+            ldrsh       x2, [x4], #2            /* first halfword: offset of the main routine */
+            ldrsh       x4, [x4]                /* second halfword: offset of the matching *_end routine */
+            add         x2, x2, x7              /* offsets are relative to the table base */
+            add         x4, x4, x7
+            adr         x7, 3f                  /* x7 = base of table 3 (the ldu* / ldf* entries) */
+            add         x5, x7, x3, LSL #2      /* same 4-byte scaling for the x3 index */
+            ldrsh       x3, [x5], #2            /* first halfword: offset of the main routine */
+            ldrsh       x5, [x5]                /* second halfword: offset of the matching *_end routine */
+            add         x3, x3, x7
+            add         x5, x5, x7
             stp         x2, x3, [x0, #32]
+            stp         x4, x5, [x0, #48]       /* the two *_end addresses go right after the main pair */
 
 /* For each column function, if the matrix is all zeroes then write NULL,
  * otherwise look up the appropriate function and store that. */
 
             mov         x3, #4
-            adr         x4, 4f
+            adr         x7, 4f                  /* x7 = base of table 4 (column routine offsets) */
 1:          ands        x2, x1, #15
             beq         9f
             and         x2, x1, #31
             lsl         x2, x2, #3
-            ldrsh       x2, [x4, x2]
-            add         x2, x2, x4
+            ldrsh       x2, [x7, x2]            /* signed 16-bit offset at x7 + index*8 */
+            add         x2, x2, x7              /* offset is relative to the current value of x7 */
 9:          str         x2, [x0], #8
             lsr         x1, x1, #5
-            add         x4, x4, #2
+            add         x7, x7, #2              /* step the base by one halfword for the next iteration */
             subs        x3, x3, #1
             bne         1b
 
@@ -807,21 +1222,37 @@
 
             .align 4
 2:          .hword      colormatrix_float_stu1-2b
+            .hword      colormatrix_float_stu1_end-2b
             .hword      colormatrix_float_stu2-2b
+            .hword      colormatrix_float_stu2_end-2b
             .hword      colormatrix_float_stu3-2b
+            .hword      colormatrix_float_stu3_end-2b
             .hword      colormatrix_float_stu4-2b
+            .hword      colormatrix_float_stu4_end-2b
             .hword      colormatrix_float_stf1-2b
+            .hword      colormatrix_float_stf1_end-2b
             .hword      colormatrix_float_stf2-2b
+            .hword      colormatrix_float_stf2_end-2b
             .hword      colormatrix_float_stf3-2b
+            .hword      colormatrix_float_stf3_end-2b
             .hword      colormatrix_float_stf4-2b
+            .hword      colormatrix_float_stf4_end-2b
 3:          .hword      colormatrix_float_ldu1-3b
+            .hword      colormatrix_float_ldu1_end-3b
             .hword      colormatrix_float_ldu2-3b
+            .hword      colormatrix_float_ldu2_end-3b
             .hword      colormatrix_float_ldu3-3b
+            .hword      colormatrix_float_ldu3_end-3b
             .hword      colormatrix_float_ldu4-3b
+            .hword      colormatrix_float_ldu4_end-3b
             .hword      colormatrix_float_ldf1-3b
+            .hword      colormatrix_float_ldf1_end-3b
             .hword      colormatrix_float_ldf2-3b
+            .hword      colormatrix_float_ldf2_end-3b
             .hword      colormatrix_float_ldf3-3b
+            .hword      colormatrix_float_ldf3_end-3b
             .hword      colormatrix_float_ldf4-3b
+            .hword      colormatrix_float_ldf4_end-3b
 4:
 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
             .hword      colormatrix_float_col0_\i-4b
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
index c4fec9f..d6e0c4b 100644
--- a/cpu_ref/rsCpuScript.cpp
+++ b/cpu_ref/rsCpuScript.cpp
@@ -733,7 +733,7 @@
     // Check for a platform specific library
 #if defined(ARCH_ARM_HAVE_NEON) && !defined(DISABLE_CLCORE_NEON)
     enum bcinfo::RSFloatPrecision prec = ME.getRSFloatPrecision();
-    if (prec == bcinfo::RS_FP_Imprecise || prec == bcinfo::RS_FP_Relaxed) {
+    if (prec == bcinfo::RS_FP_Relaxed) {
         // NEON-capable ARMv7a devices can use an accelerated math library
         // for all reduced precision scripts.
         // ARMv8 does not use NEON, as ASIMD can be used with all precision
diff --git a/driver/rsdAllocation.cpp b/driver/rsdAllocation.cpp
index 248e7b6..bbe0d7a 100644
--- a/driver/rsdAllocation.cpp
+++ b/driver/rsdAllocation.cpp
@@ -1213,5 +1213,3 @@
     }
 #endif
 }
-
-
diff --git a/driver/runtime/rs_cl.c b/driver/runtime/rs_cl.c
index 5970676..d91d975 100644
--- a/driver/runtime/rs_cl.c
+++ b/driver/runtime/rs_cl.c
@@ -590,6 +590,10 @@
 
 #if !defined(__i386__) && !defined(__x86_64__)
 FN_FUNC_FN(sqrt)
+#else  /* x86 / x86_64: FN_FUNC_FN(sqrt) is not expanded, so only declare the vector overloads */
+extern float2 __attribute__((overloadable)) sqrt(float2);  /* definitions presumably live elsewhere in the x86 build - not visible here */
+extern float3 __attribute__((overloadable)) sqrt(float3);
+extern float4 __attribute__((overloadable)) sqrt(float4);
 #endif // !defined(__i386__) && !defined(__x86_64__)
 
 FN_FUNC_FN(rsqrt)
@@ -1416,6 +1420,92 @@
     return r;
 }
 
+#define THUNK_NATIVE_F(fn) /* native_<fn>(v) forwards to fn(v) for float, float2, float3, float4 */ \
+    float __attribute__((overloadable)) native_##fn(float v) { return fn(v);} \
+    float2 __attribute__((overloadable)) native_##fn(float2 v) { return fn(v);} \
+    float3 __attribute__((overloadable)) native_##fn(float3 v) { return fn(v);} \
+    float4 __attribute__((overloadable)) native_##fn(float4 v) { return fn(v);}
+
+#define THUNK_NATIVE_F_F(fn) /* two same-typed float arguments: native_<fn>(v1, v2) forwards to fn(v1, v2) */ \
+    float __attribute__((overloadable)) native_##fn(float v1, float v2) { return fn(v1, v2);} \
+    float2 __attribute__((overloadable)) native_##fn(float2 v1, float2 v2) { return fn(v1, v2);} \
+    float3 __attribute__((overloadable)) native_##fn(float3 v1, float3 v2) { return fn(v1, v2);} \
+    float4 __attribute__((overloadable)) native_##fn(float4 v1, float4 v2) { return fn(v1, v2);}
+
+#define THUNK_NATIVE_F_FP(fn) /* float argument plus a pointer to the same type; the pointer is passed through unchanged */ \
+    float __attribute__((overloadable)) native_##fn(float v1, float *v2) { return fn(v1, v2);} \
+    float2 __attribute__((overloadable)) native_##fn(float2 v1, float2 *v2) { return fn(v1, v2);} \
+    float3 __attribute__((overloadable)) native_##fn(float3 v1, float3 *v2) { return fn(v1, v2);} \
+    float4 __attribute__((overloadable)) native_##fn(float4 v1, float4 *v2) { return fn(v1, v2);}
+
+#define THUNK_NATIVE_F_I(fn) /* float argument plus an int argument of the same vector width */ \
+    float __attribute__((overloadable)) native_##fn(float v1, int v2) { return fn(v1, v2);} \
+    float2 __attribute__((overloadable)) native_##fn(float2 v1, int2 v2) { return fn(v1, v2);} \
+    float3 __attribute__((overloadable)) native_##fn(float3 v1, int3 v2) { return fn(v1, v2);} \
+    float4 __attribute__((overloadable)) native_##fn(float4 v1, int4 v2) { return fn(v1, v2);}
+
+THUNK_NATIVE_F(acos)  /* each line below defines the four native_<fn> overloads for one function */
+THUNK_NATIVE_F(acosh)
+THUNK_NATIVE_F(acospi)
+THUNK_NATIVE_F(asin)
+THUNK_NATIVE_F(asinh)
+THUNK_NATIVE_F(asinpi)
+THUNK_NATIVE_F(atan)
+THUNK_NATIVE_F_F(atan2)
+THUNK_NATIVE_F(atanh)
+THUNK_NATIVE_F(atanpi)
+THUNK_NATIVE_F_F(atan2pi)
+THUNK_NATIVE_F(cbrt)
+THUNK_NATIVE_F(cos)
+THUNK_NATIVE_F(cosh)
+THUNK_NATIVE_F(cospi)
+THUNK_NATIVE_F(expm1)
+THUNK_NATIVE_F_F(hypot)
+THUNK_NATIVE_F(log1p)
+THUNK_NATIVE_F_I(rootn)
+THUNK_NATIVE_F(rsqrt)
+THUNK_NATIVE_F(sqrt)
+THUNK_NATIVE_F(sin)
+THUNK_NATIVE_F_FP(sincos)
+THUNK_NATIVE_F(sinh)
+THUNK_NATIVE_F(sinpi)
+THUNK_NATIVE_F(tan)
+THUNK_NATIVE_F(tanh)
+THUNK_NATIVE_F(tanpi)
+
+#undef THUNK_NATIVE_F  /* the helper macros are local to this section */
+#undef THUNK_NATIVE_F_F
+#undef THUNK_NATIVE_F_I
+#undef THUNK_NATIVE_F_FP
+
+float __attribute__((overloadable)) native_normalize(float v) { return fast_normalize(v);}  /* native_normalize forwards to fast_normalize */
+float2 __attribute__((overloadable)) native_normalize(float2 v) { return fast_normalize(v);}
+float3 __attribute__((overloadable)) native_normalize(float3 v) { return fast_normalize(v);}
+float4 __attribute__((overloadable)) native_normalize(float4 v) { return fast_normalize(v);}
+
+float __attribute__((overloadable)) native_distance(float v1, float v2) { return fast_distance(v1, v2);}  /* native_distance forwards to fast_distance; the result is a scalar float for every width */
+float __attribute__((overloadable)) native_distance(float2 v1, float2 v2) { return fast_distance(v1, v2);}
+float __attribute__((overloadable)) native_distance(float3 v1, float3 v2) { return fast_distance(v1, v2);}
+float __attribute__((overloadable)) native_distance(float4 v1, float4 v2) { return fast_distance(v1, v2);}
+
+float __attribute__((overloadable)) native_length(float v) { return fast_length(v);}  /* native_length forwards to fast_length; the result is a scalar float for every width */
+float __attribute__((overloadable)) native_length(float2 v) { return fast_length(v);}
+float __attribute__((overloadable)) native_length(float3 v) { return fast_length(v);}
+float __attribute__((overloadable)) native_length(float4 v) { return fast_length(v);}
+
+float __attribute__((overloadable)) native_divide(float v1, float v2) { return v1 / v2;}  /* native_divide is the plain / operator on the operands */
+float2 __attribute__((overloadable)) native_divide(float2 v1, float2 v2) { return v1 / v2;}
+float3 __attribute__((overloadable)) native_divide(float3 v1, float3 v2) { return v1 / v2;}
+float4 __attribute__((overloadable)) native_divide(float4 v1, float4 v2) { return v1 / v2;}
+
+float __attribute__((overloadable)) native_recip(float v) { return 1.f / v;}  /* native_recip divides 1.f by the argument */
+float2 __attribute__((overloadable)) native_recip(float2 v) { return ((float2)1.f) / v;}  /* vector forms cast 1.f to the vector type first */
+float3 __attribute__((overloadable)) native_recip(float3 v) { return ((float3)1.f) / v;}
+float4 __attribute__((overloadable)) native_recip(float4 v) { return ((float4)1.f) / v;}
+
+
+
+
 
 #undef FN_FUNC_FN
 #undef IN_FUNC_FN
diff --git a/java/tests/HelloComputeNDK/Android.mk b/java/tests/HelloComputeNDK/Android.mk
index 5dbe19f..58b95aa 100644
--- a/java/tests/HelloComputeNDK/Android.mk
+++ b/java/tests/HelloComputeNDK/Android.mk
@@ -28,4 +28,4 @@
 LOCAL_JNI_SHARED_LIBRARIES := libhellocomputendk
 
 include $(BUILD_PACKAGE)
-include $(LOCAL_PATH)/libhellocomputendk/Android.mk
\ No newline at end of file
+include $(LOCAL_PATH)/libhellocomputendk/Android.mk
diff --git a/java/tests/HelloComputeNDK/libhellocomputendk/Android.mk b/java/tests/HelloComputeNDK/libhellocomputendk/Android.mk
index 815d530..2b197f1 100644
--- a/java/tests/HelloComputeNDK/libhellocomputendk/Android.mk
+++ b/java/tests/HelloComputeNDK/libhellocomputendk/Android.mk
@@ -28,8 +28,9 @@
 LOCAL_C_INCLUDES += frameworks/rs
 LOCAL_C_INCLUDES += external/stlport/stlport bionic/ bionic/libstdc++/include
 
+LOCAL_LDFLAGS := -Wl,-Bsymbolic
 LOCAL_SHARED_LIBRARIES := libdl liblog libjnigraphics
-LOCAL_STATIC_LIBRARIES := libRScpp_static libstlport_static
+LOCAL_STATIC_LIBRARIES := libRScpp_static
 LOCAL_32_BIT_ONLY := true
 
 include $(BUILD_SHARED_LIBRARY)
diff --git a/rsScript.cpp b/rsScript.cpp
index dd962d1..ea1b3ac 100644
--- a/rsScript.cpp
+++ b/rsScript.cpp
@@ -192,13 +192,6 @@
                        const void *params, size_t paramLen,
                        const RsScriptCall *sc, size_t scLen) {
     Script *s = static_cast<Script *>(vs);
-    // The rs.spec generated code does not handle the absence of an actual
-    // input for sc. Instead, it retains an existing pointer value (the prior
-    // field in the packed data object). This can cause confusion because
-    // drivers might now inspect bogus sc data.
-    if (scLen == 0) {
-        sc = NULL;
-    }
     s->runForEach(rsc, slot,
                   static_cast<const Allocation *>(vain), static_cast<Allocation *>(vaout),
                   params, paramLen, sc);
@@ -211,14 +204,6 @@
                             size_t paramLen, const RsScriptCall *sc,
                             size_t scLen) {
     Script *s = static_cast<Script *>(vs);
-    // The rs.spec generated code does not handle the absence of an actual
-    // input for sc. Instead, it retains an existing pointer value (the prior
-    // field in the packed data object). This can cause confusion because
-    // drivers might now inspect bogus sc data.
-    if (scLen == 0) {
-        sc = NULL;
-    }
-
     Allocation **ains = (Allocation**)(vains);
 
     s->runForEach(rsc, slot,
diff --git a/rsg_generator.c b/rsg_generator.c
index be2dacc..75ea1a3 100644
--- a/rsg_generator.c
+++ b/rsg_generator.c
@@ -294,7 +294,9 @@
                 const VarType *vt = &api->params[ct2];
                 needFlush += vt->ptrLevel;
                 if (vt->ptrLevel && hasInlineDataPointers(api)) {
-                    fprintf(f, "    if (dataSize < io->getMaxInlineSize()) {\n");
+                    fprintf(f, "    if (%s_length == 0) {\n", vt->name);
+                    fprintf(f, "        cmd->%s = NULL;\n", vt->name);
+                    fprintf(f, "    } else if (dataSize < io->getMaxInlineSize()) {\n");
                     fprintf(f, "        memcpy(payload, %s, %s_length);\n", vt->name, vt->name);
                     fprintf(f, "        cmd->%s = (", vt->name);
                     printVarType(f, vt);
@@ -489,7 +491,8 @@
             needFlush += vt->ptrLevel;
 
             if (hasInlineDataPointers(api) && vt->ptrLevel) {
-                fprintf(f, ",\n           (const %s *)&baseData[(intptr_t)cmd->%s]", vt->typeName, vt->name);
+                fprintf(f, ",\n           cmd->%s_length == 0 ? NULL : (const %s *)&baseData[(intptr_t)cmd->%s]",
+                        vt->name, vt->typeName, vt->name);
             } else {
                 fprintf(f, ",\n           cmd->%s", vt->name);
             }
diff --git a/scriptc/rs_core_math.rsh b/scriptc/rs_core_math.rsh
index 585b91a..c7cc331 100644
--- a/scriptc/rs_core_math.rsh
+++ b/scriptc/rs_core_math.rsh
@@ -7397,6 +7397,618 @@
 extern float __attribute__((const, overloadable))nan(uint);
 #endif
 
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_acos(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_acos(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_acos(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_acos(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_acosh(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_acosh(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_acosh(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_acosh(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse cosine divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_acospi(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse cosine divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_acospi(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse cosine divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_acospi(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse cosine divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_acospi(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse sine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_asin(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse sine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_asin(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse sine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_asin(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse sine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_asin(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic sine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_asinh(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic sine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_asinh(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic sine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_asinh(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic sine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_asinh(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse sine divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_asinpi(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse sine divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_asinpi(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse sine divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_asinpi(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse sine divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_asinpi(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_atan(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_atan(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_atan(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_atan(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent of y / x.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_atan2(float y, float x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent of y / x.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_atan2(float2 y, float2 x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent of y / x.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_atan2(float3 y, float3 x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent of y / x.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_atan2(float4 y, float4 x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent of y / x, divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_atan2pi(float y, float x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent of y / x, divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_atan2pi(float2 y, float2 x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent of y / x, divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_atan2pi(float3 y, float3 x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent of y / x, divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_atan2pi(float4 y, float4 x);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic tangent.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_atanh(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic tangent.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_atanh(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic tangent.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_atanh(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse hyperbolic tangent.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_atanh(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_atanpi(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_atanpi(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_atanpi(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the inverse tangent divided by PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_atanpi(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cube root.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_cbrt(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cube root.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_cbrt(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cube root.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_cbrt(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cube root.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_cbrt(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_cos(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_cos(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_cos(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_cos(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_cosh(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_cosh(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_cosh(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic cosine.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_cosh(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cosine of the value * PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_cospi(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cosine of the value * PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_cospi(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cosine of the value * PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_cospi(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the cosine of the value * PI.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_cospi(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate distance between two points.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_distance(float lhs, float rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate distance between two points.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_distance(float2 lhs, float2 rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate distance between two points.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_distance(float3 lhs, float3 rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate distance between two points.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_distance(float4 lhs, float4 rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate division result of two values.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_divide(float lhs, float rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate division result of two values.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_divide(float2 lhs, float2 rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate division result of two values.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_divide(float3 lhs, float3 rhs);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate division result of two values.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_divide(float4 lhs, float4 rhs);
+#endif
+
 #if (defined(RS_VERSION) && (RS_VERSION >= 18))
 /*
  * Fast approximate exp
@@ -7529,6 +8141,114 @@
 extern float4 __attribute__((const, overloadable))native_exp2(float4 v);
 #endif
 
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return (e ^ value) - 1.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_expm1(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return (e ^ value) - 1.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_expm1(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return (e ^ value) - 1.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_expm1(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return (e ^ value) - 1.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_expm1(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return native_sqrt(x*x + y*y)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_hypot(float x, float y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return native_sqrt(x*x + y*y)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_hypot(float2 x, float2 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return native_sqrt(x*x + y*y)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_hypot(float3 x, float3 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return native_sqrt(x*x + y*y)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_hypot(float4 x, float4 y);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate length of a vector.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_length(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate length of a vector.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_length(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate length of a vector.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_length(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the approximate length of a vector.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_length(float4 v);
+#endif
+
 #if (defined(RS_VERSION) && (RS_VERSION >= 18))
 /*
  * Fast approximate log
@@ -7601,6 +8321,42 @@
 extern float4 __attribute__((const, overloadable))native_log10(float4 v);
 #endif
 
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the natural logarithm of (v + 1.0f)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_log1p(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the natural logarithm of (v + 1.0f)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_log1p(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the natural logarithm of (v + 1.0f)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_log1p(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the natural logarithm of (v + 1.0f)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_log1p(float4);
+#endif
+
 #if (defined(RS_VERSION) && (RS_VERSION >= 18))
 /*
  * Fast approximate log2
@@ -7637,6 +8393,42 @@
 extern float4 __attribute__((const, overloadable))native_log2(float4 v);
 #endif
 
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Normalize a vector.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_normalize(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Normalize a vector.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_normalize(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Normalize a vector.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_normalize(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Normalize a vector.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_normalize(float4 v);
+#endif
+
 #if (defined(RS_VERSION) && (RS_VERSION >= 18))
 /*
  * Fast approximate v ^ y
@@ -7681,6 +8473,418 @@
 extern float4 __attribute__((const, overloadable))native_powr(float4 v, float4 y);
 #endif
 
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the approximate reciprocal of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_recip(float v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the approximate reciprocal of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_recip(float2 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the approximate reciprocal of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_recip(float3 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the approximate reciprocal of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_recip(float4 v);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the Nth root of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_rootn(float v, int n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the Nth root of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_rootn(float2 v, int2 n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the Nth root of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_rootn(float3 v, int3 n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Compute the Nth root of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_rootn(float4 v, int4 n);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return (1 / sqrt(value)).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_rsqrt(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return (1 / sqrt(value)).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_rsqrt(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return (1 / sqrt(value)).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_rsqrt(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return (1 / sqrt(value)).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_rsqrt(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the sine of a value specified in radians.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_sin(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the sine of a value specified in radians.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_sin(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the sine of a value specified in radians.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_sin(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the sine of a value specified in radians.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_sin(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the sine and cosine of a value.
+ *
+ * @return sine
+ * @param v The incoming value in radians
+ * @param *cosptr cosptr[0] will be set to the cosine value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((overloadable))native_sincos(float v, float* cosptr);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the sine and cosine of a value.
+ *
+ * @return sine
+ * @param v The incoming value in radians
+ * @param *cosptr cosptr[0] will be set to the cosine value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((overloadable))native_sincos(float2 v, float2* cosptr);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the sine and cosine of a value.
+ *
+ * @return sine
+ * @param v The incoming value in radians
+ * @param *cosptr cosptr[0] will be set to the cosine value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((overloadable))native_sincos(float3 v, float3* cosptr);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the sine and cosine of a value.
+ *
+ * @return sine
+ * @param v The incoming value in radians
+ * @param *cosptr cosptr[0] will be set to the cosine value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((overloadable))native_sincos(float4 v, float4* cosptr);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic sine of a value specified in radians.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_sinh(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic sine of a value specified in radians.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_sinh(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic sine of a value specified in radians.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_sinh(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic sine of a value specified in radians.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_sinh(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return sin(v * PI).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_sinpi(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return sin(v * PI).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_sinpi(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return sin(v * PI).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_sinpi(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return sin(v * PI).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_sinpi(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the approximate sqrt(v).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_sqrt(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the approximate sqrt(v).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_sqrt(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the approximate sqrt(v).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_sqrt(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the approximate sqrt(v).
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_sqrt(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the tangent of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_tan(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the tangent of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_tan(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the tangent of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_tan(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the tangent of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_tan(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic tangent of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_tanh(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic tangent of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_tanh(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic tangent of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_tanh(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return the hyperbolic tangent of a value.
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_tanh(float4);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return tan(v * PI)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float __attribute__((const, overloadable))native_tanpi(float);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return tan(v * PI)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float2 __attribute__((const, overloadable))native_tanpi(float2);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return tan(v * PI)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float3 __attribute__((const, overloadable))native_tanpi(float3);
+#endif
+
+#if (defined(RS_VERSION) && (RS_VERSION >= 21))
+/*
+ * Return tan(v * PI)
+ *
+ * Supported by API versions 21 and newer.
+ */
+extern float4 __attribute__((const, overloadable))native_tanpi(float4);
+#endif
+
 #if (defined(RS_VERSION) && (RS_VERSION >= 9))
 /*
  * Return the next floating point number from x towards y.