am 83d9692e: Restore missing min and max API.

* commit '83d9692ee33de81b19738356a342682593686c34':
  Restore missing min and max API.
diff --git a/Android.mk b/Android.mk
index e0556e2..4a453e2 100644
--- a/Android.mk
+++ b/Android.mk
@@ -14,7 +14,18 @@
 # limitations under the License.
 #
 LOCAL_PATH := $(call my-dir)
-SLANG_ENABLE_ASSERTIONS := false
+
+FORCE_BUILD_LLVM_DISABLE_NDEBUG ?= false
+# Legality check: FORCE_BUILD_LLVM_DISABLE_NDEBUG should consist of one word -- either "true" or "false".
+ifneq "$(words $(FORCE_BUILD_LLVM_DISABLE_NDEBUG))$(words $(filter-out true false,$(FORCE_BUILD_LLVM_DISABLE_NDEBUG)))" "10"
+  $(error FORCE_BUILD_LLVM_DISABLE_NDEBUG may only be true, false, or unset)
+endif
+
+FORCE_BUILD_LLVM_DEBUG ?= false
+# Legality check: FORCE_BUILD_LLVM_DEBUG should consist of one word -- either "true" or "false".
+ifneq "$(words $(FORCE_BUILD_LLVM_DEBUG))$(words $(filter-out true false,$(FORCE_BUILD_LLVM_DEBUG)))" "10"
+  $(error FORCE_BUILD_LLVM_DEBUG may only be true, false, or unset)
+endif
 
 # The prebuilt tools should be used when we are doing app-only build.
 ifeq ($(TARGET_BUILD_APPS),)
@@ -22,7 +33,7 @@
 
 local_cflags_for_slang := -Wall -Werror -std=c++11
 ifeq ($(TARGET_BUILD_VARIANT),eng)
-local_cflags_for_slang += -O0
+local_cflags_for_slang += -O0 -D__ENABLE_INTERNAL_OPTIONS
 else
 ifeq ($(TARGET_BUILD_VARIANT),userdebug)
 else
@@ -34,10 +45,6 @@
 include $(LOCAL_PATH)/rs_version.mk
 local_cflags_for_slang += $(RS_VERSION_DEFINE)
 
-ifeq ($(SLANG_ENABLE_ASSERTIONS),true)
-local_cflags_for_slang += -D_DEBUG -UNDEBUG
-endif
-
 static_libraries_needed_by_slang := \
 	libLLVMBitWriter_2_9 \
 	libLLVMBitWriter_2_9_func \
@@ -55,12 +62,16 @@
 
 LOCAL_MODULE := libslang
 LOCAL_MODULE_TAGS := optional
-ifneq ($(HOST_OS),windows)
-LOCAL_CLANG := true
-endif
 
 LOCAL_CFLAGS += $(local_cflags_for_slang)
 
+ifeq ($(HOST_OS),windows)
+# Keep missing-field-initializers as a warning (not a -Werror failure) for mingw.
+LOCAL_CFLAGS += -Wno-error=missing-field-initializers
+else
+LOCAL_CLANG := true
+endif
+
 TBLGEN_TABLES :=    \
 	AttrList.inc	\
 	Attrs.inc	\
@@ -74,6 +85,7 @@
 
 LOCAL_SRC_FILES :=	\
 	slang.cpp	\
+	slang_bitcode_gen.cpp	\
 	slang_backend.cpp	\
 	slang_pragma_recorder.cpp	\
 	slang_diagnostic_buffer.cpp
@@ -118,15 +130,19 @@
 
 LOCAL_IS_HOST_MODULE := true
 LOCAL_MODULE := llvm-rs-cc
-ifneq ($(HOST_OS),windows)
+
+LOCAL_CFLAGS += $(local_cflags_for_slang)
+
+ifeq ($(HOST_OS),windows)
+# Keep missing-field-initializers as a warning (not a -Werror failure) for mingw.
+LOCAL_CFLAGS += -Wno-error=missing-field-initializers
+else
 LOCAL_CLANG := true
 endif
 LOCAL_MODULE_TAGS := optional
 
 LOCAL_MODULE_CLASS := EXECUTABLES
 
-LOCAL_CFLAGS += $(local_cflags_for_slang)
-
 TBLGEN_TABLES :=    \
 	AttrList.inc    \
 	Attrs.inc    \
@@ -152,11 +168,13 @@
 	slang_rs_export_element.cpp	\
 	slang_rs_export_var.cpp	\
 	slang_rs_export_func.cpp	\
-	slang_rs_export_foreach.cpp \
+	slang_rs_export_foreach.cpp	\
+	slang_rs_export_reduce.cpp	\
 	slang_rs_object_ref_count.cpp	\
 	slang_rs_reflection.cpp \
 	slang_rs_reflection_cpp.cpp \
 	slang_rs_reflect_utils.cpp \
+	slang_rs_special_func.cpp	\
 	strip_unknown_attributes.cpp
 
 LOCAL_C_INCLUDES += frameworks/compile/libbcc/include
diff --git a/BitWriter_2_9/BitcodeWriter.cpp b/BitWriter_2_9/BitcodeWriter.cpp
index 4b914c0..c1b4ca8 100644
--- a/BitWriter_2_9/BitcodeWriter.cpp
+++ b/BitWriter_2_9/BitcodeWriter.cpp
@@ -615,7 +615,7 @@
   }
 
   unsigned MDLocationAbbrev = 0;
-  if (VE.hasMDLocation()) {
+  if (VE.hasDILocation()) {
     // TODO(srhines): Should be unreachable for RenderScript.
     // Abbrev for METADATA_LOCATION.
     //
@@ -1250,7 +1250,7 @@
     Vals.push_back(cast<LoadInst>(I).isVolatile());
     break;
   case Instruction::Store:
-    Code = bitc::FUNC_CODE_INST_STORE;
+    Code = bitc::FUNC_CODE_INST_STORE_OLD;
     PushValueAndType(I.getOperand(1), InstID, Vals, VE);  // ptrty + ptr
     Vals.push_back(VE.getValueID(I.getOperand(0)));       // val.
     Vals.push_back(Log2_32(cast<StoreInst>(I).getAlignment())+1);
@@ -1376,7 +1376,7 @@
 
   bool NeedsMetadataAttachment = false;
 
-  MDLocation *LastDL = nullptr;;
+  DILocation *LastDL = nullptr;;
 
   // Finally, emit all the instructions, in order.
   for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
@@ -1391,7 +1391,7 @@
       NeedsMetadataAttachment |= I->hasMetadataOtherThanDebugLoc();
 
       // If the instruction has a debug location, emit it.
-      MDLocation *DL = I->getDebugLoc();
+      DILocation *DL = I->getDebugLoc();
       if (!DL)
         continue;
 
@@ -1408,8 +1408,6 @@
       Stream.EmitRecord(FUNC_CODE_DEBUG_LOC_2_7, Vals);
       Vals.clear();
 
-      // Fixme(pirama): The following line is missing from upstream
-      // https://llvm.org/bugs/show_bug.cgi?id=23436
       LastDL = DL;
     }
 
diff --git a/BitWriter_2_9/ValueEnumerator.cpp b/BitWriter_2_9/ValueEnumerator.cpp
index 02a6754..59d15df 100644
--- a/BitWriter_2_9/ValueEnumerator.cpp
+++ b/BitWriter_2_9/ValueEnumerator.cpp
@@ -33,7 +33,7 @@
 
 /// ValueEnumerator - Enumerate module-level information.
 ValueEnumerator::ValueEnumerator(const llvm::Module &M)
-    : HasMDString(false), HasMDLocation(false) {
+    : HasMDString(false), HasDILocation(false) {
   // Enumerate the global variables.
   for (llvm::Module::const_global_iterator I = M.global_begin(), E = M.global_end();
        I != E; ++I)
@@ -111,7 +111,7 @@
 
         // Don't enumerate the location directly -- it has a special record
         // type -- but enumerate its operands.
-        if (MDLocation *L = I.getDebugLoc())
+        if (DILocation *L = I.getDebugLoc())
           EnumerateMDNodeOperands(L);
       }
   }
@@ -277,7 +277,7 @@
     EnumerateValue(C->getValue());
 
   HasMDString |= isa<MDString>(MD);
-  HasMDLocation |= isa<MDLocation>(MD);
+  HasDILocation |= isa<DILocation>(MD);
 
   // Replace the dummy ID inserted above with the correct one.  MDValueMap may
   // have changed by inserting operands, so we need a fresh lookup here.
diff --git a/BitWriter_2_9/ValueEnumerator.h b/BitWriter_2_9/ValueEnumerator.h
index 7d8a746..7dcfe78 100644
--- a/BitWriter_2_9/ValueEnumerator.h
+++ b/BitWriter_2_9/ValueEnumerator.h
@@ -61,7 +61,7 @@
   typedef llvm::DenseMap<const llvm::Metadata *, unsigned> MetadataMapType;
   MetadataMapType MDValueMap;
   bool HasMDString;
-  bool HasMDLocation;
+  bool HasDILocation;
 
   typedef llvm::DenseMap<llvm::AttributeSet, unsigned> AttributeGroupMapType;
   AttributeGroupMapType AttributeGroupMap;
@@ -115,7 +115,7 @@
   }
 
   bool hasMDString() const { return HasMDString; }
-  bool hasMDLocation() const { return HasMDLocation; }
+  bool hasDILocation() const { return HasDILocation; }
 
   unsigned getTypeID(llvm::Type *T) const {
     TypeMapType::const_iterator I = TypeMap.find(T);
diff --git a/BitWriter_2_9_func/BitcodeWriter.cpp b/BitWriter_2_9_func/BitcodeWriter.cpp
index d42803d..a187c0f 100644
--- a/BitWriter_2_9_func/BitcodeWriter.cpp
+++ b/BitWriter_2_9_func/BitcodeWriter.cpp
@@ -614,7 +614,7 @@
   }
 
   unsigned MDLocationAbbrev = 0;
-  if (VE.hasMDLocation()) {
+  if (VE.hasDILocation()) {
     // TODO(srhines): Should be unreachable for RenderScript.
     // Abbrev for METADATA_LOCATION.
     //
@@ -1281,7 +1281,7 @@
     if (cast<StoreInst>(I).isAtomic())
       Code = bitc::FUNC_CODE_INST_STOREATOMIC;
     else
-      Code = bitc::FUNC_CODE_INST_STORE;
+      Code = bitc::FUNC_CODE_INST_STORE_OLD;
     PushValueAndType(I.getOperand(1), InstID, Vals, VE);  // ptrty + ptr
     Vals.push_back(VE.getValueID(I.getOperand(0)));       // val.
     Vals.push_back(Log2_32(cast<StoreInst>(I).getAlignment())+1);
@@ -1438,7 +1438,7 @@
 
   bool NeedsMetadataAttachment = false;
 
-  MDLocation *LastDL = nullptr;
+  DILocation *LastDL = nullptr;
 
   // Finally, emit all the instructions, in order.
   for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
@@ -1453,7 +1453,7 @@
       NeedsMetadataAttachment |= I->hasMetadataOtherThanDebugLoc();
 
       // If the instruction has a debug location, emit it.
-      MDLocation *DL = I->getDebugLoc();
+      DILocation *DL = I->getDebugLoc();
       if (!DL)
         continue;
 
diff --git a/BitWriter_2_9_func/ValueEnumerator.cpp b/BitWriter_2_9_func/ValueEnumerator.cpp
index 0602318..79b6f98 100644
--- a/BitWriter_2_9_func/ValueEnumerator.cpp
+++ b/BitWriter_2_9_func/ValueEnumerator.cpp
@@ -33,7 +33,7 @@
 
 /// ValueEnumerator - Enumerate module-level information.
 ValueEnumerator::ValueEnumerator(const llvm::Module &M)
-    : HasMDString(false), HasMDLocation(false) {
+    : HasMDString(false), HasDILocation(false) {
   // Enumerate the global variables.
   for (llvm::Module::const_global_iterator I = M.global_begin(), E = M.global_end();
        I != E; ++I)
@@ -111,7 +111,7 @@
 
         // Don't enumerate the location directly -- it has a special record
         // type -- but enumerate its operands.
-        if (MDLocation *L = I.getDebugLoc())
+        if (DILocation *L = I.getDebugLoc())
           EnumerateMDNodeOperands(L);
       }
   }
@@ -277,7 +277,7 @@
     EnumerateValue(C->getValue());
 
   HasMDString |= isa<MDString>(MD);
-  HasMDLocation |= isa<MDLocation>(MD);
+  HasDILocation |= isa<DILocation>(MD);
 
   // Replace the dummy ID inserted above with the correct one.  MDValueMap may
   // have changed by inserting operands, so we need a fresh lookup here.
diff --git a/BitWriter_2_9_func/ValueEnumerator.h b/BitWriter_2_9_func/ValueEnumerator.h
index a1b1a7e..677dcd3 100644
--- a/BitWriter_2_9_func/ValueEnumerator.h
+++ b/BitWriter_2_9_func/ValueEnumerator.h
@@ -61,7 +61,7 @@
   typedef llvm::DenseMap<const llvm::Metadata *, unsigned> MetadataMapType;
   MetadataMapType MDValueMap;
   bool HasMDString;
-  bool HasMDLocation;
+  bool HasDILocation;
 
   typedef llvm::DenseMap<llvm::AttributeSet, unsigned> AttributeGroupMapType;
   AttributeGroupMapType AttributeGroupMap;
@@ -115,7 +115,7 @@
   }
 
   bool hasMDString() const { return HasMDString; }
-  bool hasMDLocation() const { return HasMDLocation; }
+  bool hasDILocation() const { return HasDILocation; }
 
   unsigned getTypeID(llvm::Type *T) const {
     TypeMapType::const_iterator I = TypeMap.find(T);
diff --git a/BitWriter_3_2/BitcodeWriter.cpp b/BitWriter_3_2/BitcodeWriter.cpp
index 3d12f17..fff4490 100644
--- a/BitWriter_3_2/BitcodeWriter.cpp
+++ b/BitWriter_3_2/BitcodeWriter.cpp
@@ -643,7 +643,7 @@
   }
 
   unsigned MDLocationAbbrev = 0;
-  if (VE.hasMDLocation()) {
+  if (VE.hasDILocation()) {
     // TODO(srhines): Should be unreachable for RenderScript.
     // Abbrev for METADATA_LOCATION.
     //
@@ -1323,7 +1323,7 @@
     if (cast<StoreInst>(I).isAtomic())
       Code = bitc::FUNC_CODE_INST_STOREATOMIC;
     else
-      Code = bitc::FUNC_CODE_INST_STORE;
+      Code = bitc::FUNC_CODE_INST_STORE_OLD;
     PushValueAndType(I.getOperand(1), InstID, Vals, VE);  // ptrty + ptr
     Vals.push_back(VE.getValueID(I.getOperand(0)));       // val.
     Vals.push_back(Log2_32(cast<StoreInst>(I).getAlignment())+1);
@@ -1513,7 +1513,7 @@
 
   bool NeedsMetadataAttachment = false;
 
-  MDLocation *LastDL = nullptr;
+  DILocation *LastDL = nullptr;
 
   // Finally, emit all the instructions, in order.
   for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
@@ -1528,7 +1528,7 @@
       NeedsMetadataAttachment |= I->hasMetadataOtherThanDebugLoc();
 
       // If the instruction has a debug location, emit it.
-      MDLocation *DL = I->getDebugLoc();
+      DILocation *DL = I->getDebugLoc();
       if (!DL)
         continue;
 
diff --git a/BitWriter_3_2/ValueEnumerator.cpp b/BitWriter_3_2/ValueEnumerator.cpp
index 6b1a33d..ff96493 100644
--- a/BitWriter_3_2/ValueEnumerator.cpp
+++ b/BitWriter_3_2/ValueEnumerator.cpp
@@ -33,7 +33,7 @@
 
 /// ValueEnumerator - Enumerate module-level information.
 ValueEnumerator::ValueEnumerator(const llvm::Module &M)
-    : HasMDString(false), HasMDLocation(false) {
+    : HasMDString(false), HasDILocation(false) {
   // Enumerate the global variables.
   for (llvm::Module::const_global_iterator I = M.global_begin(), E = M.global_end();
        I != E; ++I)
@@ -111,7 +111,7 @@
 
         // Don't enumerate the location directly -- it has a special record
         // type -- but enumerate its operands.
-        if (MDLocation *L = I.getDebugLoc())
+        if (DILocation *L = I.getDebugLoc())
           EnumerateMDNodeOperands(L);
       }
   }
@@ -277,7 +277,7 @@
     EnumerateValue(C->getValue());
 
   HasMDString |= isa<MDString>(MD);
-  HasMDLocation |= isa<MDLocation>(MD);
+  HasDILocation |= isa<DILocation>(MD);
 
   // Replace the dummy ID inserted above with the correct one.  MDValueMap may
   // have changed by inserting operands, so we need a fresh lookup here.
diff --git a/BitWriter_3_2/ValueEnumerator.h b/BitWriter_3_2/ValueEnumerator.h
index 2895a21..74e1638 100644
--- a/BitWriter_3_2/ValueEnumerator.h
+++ b/BitWriter_3_2/ValueEnumerator.h
@@ -64,7 +64,7 @@
   typedef llvm::DenseMap<const llvm::Metadata *, unsigned> MetadataMapType;
   MetadataMapType MDValueMap;
   bool HasMDString;
-  bool HasMDLocation;
+  bool HasDILocation;
 
   typedef llvm::DenseMap<llvm::AttributeSet, unsigned> AttributeGroupMapType;
   AttributeGroupMapType AttributeGroupMap;
@@ -118,7 +118,7 @@
   }
 
   bool hasMDString() const { return HasMDString; }
-  bool hasMDLocation() const { return HasMDLocation; }
+  bool hasDILocation() const { return HasDILocation; }
 
   unsigned getTypeID(llvm::Type *T) const {
     TypeMapType::const_iterator I = TypeMap.find(T);
diff --git a/RSCCOptions.td b/RSCCOptions.td
index 96cdaa8..2339cd7 100644
--- a/RSCCOptions.td
+++ b/RSCCOptions.td
@@ -60,8 +60,8 @@
   HelpText<"Build ASTs then convert to LLVM, but emit nothing">;
 }
 
-def m32 : Flag<["-"], "m32">, HelpText<"Emit 32-bit C++ code">;
-def m64 : Flag<["-"], "m64">, HelpText<"Emit 64-bit C++ code">;
+def m32 : Flag<["-"], "m32">, HelpText<"Emit 32-bit code (only for C++, unless eng build)">;
+def m64 : Flag<["-"], "m64">, HelpText<"Emit 64-bit code (only for C++, unless eng build)">;
 
 def emit_g : Flag<["-"], "g">,
   HelpText<"Emit LLVM Debug Metadata">;
@@ -102,6 +102,7 @@
 let Group = M_Group in {
 
   def MD : Flag<["-"], "MD">, HelpText<"Emit .d dependency files">;
+  def MP : Flag<["-"], "MP">, HelpText<"Also emit phony target for dependency files">;
 
   def M : Flag<["-"], "M">;
   def emit_dep : Flag<["-"], "emit-dep">, Alias<M>;
@@ -123,6 +124,21 @@
   HelpText<"Reflect C++ classes">;
 
 //===----------------------------------------------------------------------===//
+// Diagnostic Options
+//===----------------------------------------------------------------------===//
+
+def ast_print : Flag<["-"], "ast-print">,
+  HelpText<"Print clang AST prior to llvm IR generation">;
+
+def debug : Flag<["-"], "debug">,
+  HelpText<"Enable debug output">;
+
+def print_after_all : Flag<["-"], "print-after-all">,
+  HelpText<"Print llvm IR after each pass">;
+def print_before_all : Flag<["-"], "print-before-all">,
+  HelpText<"Print llvm IR before each pass">;
+
+//===----------------------------------------------------------------------===//
 // Misc Options
 //===----------------------------------------------------------------------===//
 
diff --git a/lit-tests/bitcode_wrapper/bitcode_wrapper_test.ll b/lit-tests/bitcode_wrapper/bitcode_wrapper_test.ll
new file mode 100644
index 0000000..77e32c0
--- /dev/null
+++ b/lit-tests/bitcode_wrapper/bitcode_wrapper_test.ll
@@ -0,0 +1,51 @@
+; This test assembles this file to bitcode with all supported target
+; API versions, then checks that the bitcode file was generated and
+; has the right magic number.
+
+; RUN: %llvm-rs-as -target-api 11 %s -o %t11
+; RUN: xxd -ps -l 4 %t11 | FileCheck %s
+; RUN: %llvm-rs-as -target-api 12 %s -o %t12
+; RUN: xxd -ps -l 4 %t12 | FileCheck %s
+; RUN: %llvm-rs-as -target-api 13 %s -o %t13
+; RUN: xxd -ps -l 4 %t13 | FileCheck %s
+; RUN: %llvm-rs-as -target-api 14 %s -o %t14
+; RUN: xxd -ps -l 4 %t14 | FileCheck %s
+; RUN: %llvm-rs-as -target-api 15 %s -o %t15
+; RUN: xxd -ps -l 4 %t15 | FileCheck %s
+; RUN: %llvm-rs-as -target-api 16 %s -o %t16
+; RUN: xxd -ps -l 4 %t16 | FileCheck %s
+; RUN: %llvm-rs-as -target-api 17 %s -o %t17
+; RUN: xxd -ps -l 4 %t17 | FileCheck %s
+; RUN: %llvm-rs-as -target-api 18 %s -o %t18
+; RUN: xxd -ps -l 4 %t18 | FileCheck %s
+; RUN: %llvm-rs-as -target-api 19 %s -o %t19
+; RUN: xxd -ps -l 4 %t19 | FileCheck %s
+; RUN: %llvm-rs-as -target-api 20 %s -o %t20
+; RUN: xxd -ps -l 4 %t20 | FileCheck %s
+; RUN: %llvm-rs-as -target-api 21 %s -o %t21
+; RUN: xxd -ps -l 4 %t21 | FileCheck %s
+; RUN: %llvm-rs-as -target-api 22 %s -o %t22
+; RUN: xxd -ps -l 4 %t22 | FileCheck %s
+; RUN: %llvm-rs-as -target-api 23 %s -o %t23
+; RUN: xxd -ps -l 4 %t23 | FileCheck %s
+
+; RUN: %llvm-rs-as -target-api 0 %s -o %t0
+; RUN: xxd -ps -l 4 %t0 | FileCheck %s
+
+; Check for the magic number.
+
+; CHECK: dec0170b
+
+; ModuleID = 'kernel.bc'
+target datalayout = "e-p:32:32-i64:64-v128:64:128-n32-S64"
+target triple = "armv7-none-linux-gnueabi"
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+!\23pragma = !{!3, !4}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, !"min_enum_size", i32 4}
+!2 = !{!"clang version 3.6 "}
+!3 = !{!"version", !"1"}
+!4 = !{!"java_package_name", !"foo"}
diff --git a/lit-tests/debug/debug_disabled.rs b/lit-tests/debug/debug_disabled.rs
index 4485697..d3ddd57 100644
--- a/lit-tests/debug/debug_disabled.rs
+++ b/lit-tests/debug/debug_disabled.rs
@@ -1,6 +1,6 @@
 // RUN: %Slang %s
 // RUN: %rs-filecheck-wrapper %s
-// CHECK-NOT: DW_TAG_subprogram
+// CHECK-NOT: DILocation
 
 #pragma version(1)
 #pragma rs java_package_name(foo)
diff --git a/lit-tests/debug/debug_enabled.rs b/lit-tests/debug/debug_enabled.rs
index 4632744..7f2856a 100644
--- a/lit-tests/debug/debug_enabled.rs
+++ b/lit-tests/debug/debug_enabled.rs
@@ -1,6 +1,6 @@
 // RUN: %Slang -g %s
 // RUN: %rs-filecheck-wrapper %s
-// CHECK: DW_TAG_subprogram
+// CHECK: DILocation
 
 #pragma version(1)
 #pragma rs java_package_name(foo)
diff --git a/lit-tests/lit.cfg b/lit-tests/lit.cfg
index de16162..e123f09 100644
--- a/lit-tests/lit.cfg
+++ b/lit-tests/lit.cfg
@@ -6,9 +6,10 @@
 config.name = 'slang_lit_tests'
 
 # suffixes: A list of file extensions to treat as test files.
-config.suffixes = ['.rs']
+config.suffixes = ['.rs', '.ll']
 
 # testFormat: The test format to use to interpret tests.
+import lit.formats
 config.test_format = lit.formats.ShTest()
 
 # Get the base build directory for the android source tree from environment.
@@ -34,18 +35,21 @@
         return tool
 
     # Otherwise look in the path.
+    import lit.util
     tool = lit.util.which(binary_name, PATH)
 
     if not tool:
-        lit.fatal("couldn't find " + binary_name + " program in " + PATH + " , try setting "
-                  + env_var + " in your environment")
+        lit_config.fatal("couldn't find " + binary_name + " program in " + PATH + " , try setting "
+                         + env_var + " in your environment")
 
     return os.path.abspath(tool)
 
 config.slang = inferTool('llvm-rs-cc', 'SLANG', os.path.join(config.base_path, 'out', 'host', 'linux-x86', 'bin')).replace('\\', '/')
+config.llvm_rs_as = inferTool('llvm-rs-as', 'LLVM_RS_AS', os.path.join(config.base_path, 'out', 'host', 'linux-x86', 'bin')).replace('\\', '/')
 
 config.filecheck = inferTool('FileCheck', 'FILECHECK', config.environment['PATH'])
 config.rs_filecheck_wrapper = inferTool('rs-filecheck-wrapper.sh', 'RS_FILECHECK_WRAPPER', os.path.join(config.base_path, 'frameworks', 'compile', 'slang', 'lit-tests'))
+config.scriptc_filecheck_wrapper = inferTool('scriptc-filecheck-wrapper.sh', 'SCRIPTC_FILECHECK_WRAPPER', os.path.join(config.base_path, 'frameworks', 'compile', 'slang', 'lit-tests'))
 
 # Use most up-to-date headers for includes.
 config.slang_includes = "-I " + os.path.join(config.base_path, 'frameworks', 'rs', 'scriptc') + " " \
@@ -55,12 +59,16 @@
                      + " -output-dep-dir " + config.test_exec_root \
                      + " -java-reflection-path-base " + config.test_exec_root
 
-if not lit.quiet:
-    lit.note('using slang: %r' % config.slang)
-    lit.note('using FileCheck: %r' % config.filecheck)
-    lit.note('using rs-filecheck-wrapper.sh: %r' % config.rs_filecheck_wrapper)
-    lit.note('using output directory: %r' % config.test_exec_root)
+if not lit_config.quiet:
+    lit_config.note('using slang: %r' % config.slang)
+    lit_config.note('using llvm-rs-as: %r' % config.llvm_rs_as)
+    lit_config.note('using FileCheck: %r' % config.filecheck)
+    lit_config.note('using rs-filecheck-wrapper.sh: %r' % config.rs_filecheck_wrapper)
+    lit_config.note('using output directory: %r' % config.test_exec_root)
 
 # Tools configuration substitutions
 config.substitutions.append( ('%Slang', ' ' + config.slang + ' ' + config.slang_includes + ' ' + config.slang_options ) )
+config.substitutions.append( ('%llvm-rs-as', config.llvm_rs_as) )
 config.substitutions.append( ('%rs-filecheck-wrapper', ' ' + config.rs_filecheck_wrapper + ' ' + config.test_exec_root + ' ' + config.filecheck + ' ') )
+config.substitutions.append( ('%scriptc-filecheck-wrapper', ' ' + config.scriptc_filecheck_wrapper + ' --output=' + config.test_exec_root + ' --filecheck=' + config.filecheck + ' ') )
+lit_config.note(config.substitutions)
diff --git a/lit-tests/reduce_metadata/reduce.rs b/lit-tests/reduce_metadata/reduce.rs
new file mode 100644
index 0000000..82da76d
--- /dev/null
+++ b/lit-tests/reduce_metadata/reduce.rs
@@ -0,0 +1,349 @@
+// Check for generation of reduce metadata.
+
+// RUN: %Slang -target-api 0 %s
+// RUN: %rs-filecheck-wrapper %s
+
+#pragma version(1)
+#pragma rs java_package_name(foo)
+
+// CHECK-NOT: foreach
+// CHECK: !\23rs_export_reduce =
+// CHECK-NOT: foreach
+
+// CHECK: !{!"mul_bool"}
+bool __attribute__((kernel("reduce")))
+mul_bool(bool lhs, bool rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_char"}
+char __attribute__((kernel("reduce")))
+mul_char(char lhs, char rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_char2"}
+char2 __attribute__((kernel("reduce")))
+mul_char2(char2 lhs, char2 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_char3"}
+char3 __attribute__((kernel("reduce")))
+mul_char3(char3 lhs, char3 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_char4"}
+char4 __attribute__((kernel("reduce")))
+mul_char4(char4 lhs, char4 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_double"}
+double __attribute__((kernel("reduce")))
+mul_double(double lhs, double rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_double2"}
+double2 __attribute__((kernel("reduce")))
+mul_double2(double2 lhs, double2 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_double3"}
+double3 __attribute__((kernel("reduce")))
+mul_double3(double3 lhs, double3 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_double4"}
+double4 __attribute__((kernel("reduce")))
+mul_double4(double4 lhs, double4 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_float"}
+float __attribute__((kernel("reduce")))
+mul_float(float lhs, float rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_float2"}
+float2 __attribute__((kernel("reduce")))
+mul_float2(float2 lhs, float2 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_float3"}
+float3 __attribute__((kernel("reduce")))
+mul_float3(float3 lhs, float3 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_float4"}
+float4 __attribute__((kernel("reduce")))
+mul_float4(float4 lhs, float4 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_int"}
+int __attribute__((kernel("reduce")))
+mul_int(int lhs, int rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_int2"}
+int2 __attribute__((kernel("reduce")))
+mul_int2(int2 lhs, int2 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_int3"}
+int3 __attribute__((kernel("reduce")))
+mul_int3(int3 lhs, int3 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_int4"}
+int4 __attribute__((kernel("reduce")))
+mul_int4(int4 lhs, int4 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_long"}
+long __attribute__((kernel("reduce")))
+mul_long(long lhs, long rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_long2"}
+long2 __attribute__((kernel("reduce")))
+mul_long2(long2 lhs, long2 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_long3"}
+long3 __attribute__((kernel("reduce")))
+mul_long3(long3 lhs, long3 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_long4"}
+long4 __attribute__((kernel("reduce")))
+mul_long4(long4 lhs, long4 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_short"}
+short __attribute__((kernel("reduce")))
+mul_short(short lhs, short rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_short2"}
+short2 __attribute__((kernel("reduce")))
+mul_short2(short2 lhs, short2 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_short3"}
+short3 __attribute__((kernel("reduce")))
+mul_short3(short3 lhs, short3 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_short4"}
+short4 __attribute__((kernel("reduce")))
+mul_short4(short4 lhs, short4 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_uchar"}
+uchar __attribute__((kernel("reduce")))
+mul_uchar(uchar lhs, uchar rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_uchar2"}
+uchar2 __attribute__((kernel("reduce")))
+mul_uchar2(uchar2 lhs, uchar2 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_uchar3"}
+uchar3 __attribute__((kernel("reduce")))
+mul_uchar3(uchar3 lhs, uchar3 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_uchar4"}
+uchar4 __attribute__((kernel("reduce")))
+mul_uchar4(uchar4 lhs, uchar4 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_uint"}
+uint __attribute__((kernel("reduce")))
+mul_uint(uint lhs, uint rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_uint2"}
+uint2 __attribute__((kernel("reduce")))
+mul_uint2(uint2 lhs, uint2 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_uint3"}
+uint3 __attribute__((kernel("reduce")))
+mul_uint3(uint3 lhs, uint3 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_uint4"}
+uint4 __attribute__((kernel("reduce")))
+mul_uint4(uint4 lhs, uint4 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_ulong"}
+ulong __attribute__((kernel("reduce")))
+mul_ulong(ulong lhs, ulong rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_ulong2"}
+ulong2 __attribute__((kernel("reduce")))
+mul_ulong2(ulong2 lhs, ulong2 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_ulong3"}
+ulong3 __attribute__((kernel("reduce")))
+mul_ulong3(ulong3 lhs, ulong3 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_ulong4"}
+ulong4 __attribute__((kernel("reduce")))
+mul_ulong4(ulong4 lhs, ulong4 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_ushort"}
+ushort __attribute__((kernel("reduce")))
+mul_ushort(ushort lhs, ushort rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_ushort2"}
+ushort2 __attribute__((kernel("reduce")))
+mul_ushort2(ushort2 lhs, ushort2 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_ushort3"}
+ushort3 __attribute__((kernel("reduce")))
+mul_ushort3(ushort3 lhs, ushort3 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: !{!"mul_ushort4"}
+ushort4 __attribute__((kernel("reduce")))
+mul_ushort4(ushort4 lhs, ushort4 rhs) {
+  return lhs * rhs;
+}
+
+
+struct indirect {
+  bool elem_bool;
+  char elem_char;
+  char2 elem_char2;
+  char3 elem_char3;
+  char4 elem_char4;
+  double elem_double;
+  double2 elem_double2;
+  double3 elem_double3;
+  double4 elem_double4;
+  float elem_float;
+  float2 elem_float2;
+  float3 elem_float3;
+  float4 elem_float4;
+  int elem_int;
+  int2 elem_int2;
+  int3 elem_int3;
+  int4 elem_int4;
+  long elem_long;
+  long2 elem_long2;
+  long3 elem_long3;
+  long4 elem_long4;
+  short elem_short;
+  short2 elem_short2;
+  short3 elem_short3;
+  short4 elem_short4;
+  uchar elem_uchar;
+  uchar2 elem_uchar2;
+  uchar3 elem_uchar3;
+  uchar4 elem_uchar4;
+  uint elem_uint;
+  uint2 elem_uint2;
+  uint3 elem_uint3;
+  uint4 elem_uint4;
+  ulong elem_ulong;
+  ulong2 elem_ulong2;
+  ulong3 elem_ulong3;
+  ulong4 elem_ulong4;
+  ushort elem_ushort;
+  ushort2 elem_ushort2;
+  ushort3 elem_ushort3;
+  ushort4 elem_ushort4;
+};
+
+// CHECK: !{!"mul_indirect"}
+struct indirect __attribute__((kernel("reduce")))
+mul_indirect(struct indirect lhs, struct indirect rhs) {
+  lhs.elem_bool *= rhs.elem_bool;
+  lhs.elem_char *= rhs.elem_char;
+  lhs.elem_char2 *= rhs.elem_char2;
+  lhs.elem_char3 *= rhs.elem_char3;
+  lhs.elem_char4 *= rhs.elem_char4;
+  lhs.elem_double *= rhs.elem_double;
+  lhs.elem_double2 *= rhs.elem_double2;
+  lhs.elem_double3 *= rhs.elem_double3;
+  lhs.elem_double4 *= rhs.elem_double4;
+  lhs.elem_float *= rhs.elem_float;
+  lhs.elem_float2 *= rhs.elem_float2;
+  lhs.elem_float3 *= rhs.elem_float3;
+  lhs.elem_float4 *= rhs.elem_float4;
+  lhs.elem_int *= rhs.elem_int;
+  lhs.elem_int2 *= rhs.elem_int2;
+  lhs.elem_int3 *= rhs.elem_int3;
+  lhs.elem_int4 *= rhs.elem_int4;
+  lhs.elem_long *= rhs.elem_long;
+  lhs.elem_long2 *= rhs.elem_long2;
+  lhs.elem_long3 *= rhs.elem_long3;
+  lhs.elem_long4 *= rhs.elem_long4;
+  lhs.elem_short *= rhs.elem_short;
+  lhs.elem_short2 *= rhs.elem_short2;
+  lhs.elem_short3 *= rhs.elem_short3;
+  lhs.elem_short4 *= rhs.elem_short4;
+  lhs.elem_uchar *= rhs.elem_uchar;
+  lhs.elem_uchar2 *= rhs.elem_uchar2;
+  lhs.elem_uchar3 *= rhs.elem_uchar3;
+  lhs.elem_uchar4 *= rhs.elem_uchar4;
+  lhs.elem_uint *= rhs.elem_uint;
+  lhs.elem_uint2 *= rhs.elem_uint2;
+  lhs.elem_uint3 *= rhs.elem_uint3;
+  lhs.elem_uint4 *= rhs.elem_uint4;
+  lhs.elem_ulong *= rhs.elem_ulong;
+  lhs.elem_ulong2 *= rhs.elem_ulong2;
+  lhs.elem_ulong3 *= rhs.elem_ulong3;
+  lhs.elem_ulong4 *= rhs.elem_ulong4;
+  lhs.elem_ushort *= rhs.elem_ushort;
+  lhs.elem_ushort2 *= rhs.elem_ushort2;
+  lhs.elem_ushort3 *= rhs.elem_ushort3;
+  lhs.elem_ushort4 *= rhs.elem_ushort4;
+  return lhs;
+}
diff --git a/lit-tests/reduce_reflection/reduce_reflection.rs b/lit-tests/reduce_reflection/reduce_reflection.rs
new file mode 100644
index 0000000..93e850b
--- /dev/null
+++ b/lit-tests/reduce_reflection/reduce_reflection.rs
@@ -0,0 +1,542 @@
+// RUN: %Slang -target-api 0 %s
+// RUN: %scriptc-filecheck-wrapper --lang=Java %s
+
+#pragma version(1)
+#pragma rs java_package_name(foo)
+
+// CHECK: mExportReduceIdx_mul_half
+// Array variants of kernels with the half type are unsupported.
+// CHECK-NOT: reduce_mul_half({{.*}} in[],
+// CHECK: public void reduce_mul_half(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_half(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+half __attribute__((kernel("reduce"))) mul_half(half lhs, half rhs) {
+  return lhs * rhs;  // Reduce kernel body: product of the two half operands.
+}
+
+// CHECK: mExportReduceIdx_mul_half2
+// CHECK-NOT: reduce_mul_half2({{.*}} in[],
+// CHECK: public void reduce_mul_half2(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_half2(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+half2 __attribute__((kernel("reduce"))) mul_half2(half2 lhs, half2 rhs) {
+  return lhs * rhs;  // Reduce kernel body: product of the two half2 operands.
+}
+
+// CHECK: mExportReduceIdx_mul_half3
+// CHECK-NOT: reduce_mul_half3({{.*}} in[],
+// CHECK: public void reduce_mul_half3(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_half3(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+half3 __attribute__((kernel("reduce"))) mul_half3(half3 lhs, half3 rhs) {
+  return lhs * rhs;  // Reduce kernel body: product of the two half3 operands.
+}
+
+// CHECK: mExportReduceIdx_mul_half4
+// CHECK-NOT: reduce_mul_half4({{.*}} in[],
+// CHECK: public void reduce_mul_half4(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_half4(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+half4 __attribute__((kernel("reduce"))) mul_half4(half4 lhs, half4 rhs) {
+  return lhs * rhs;  // Reduce kernel body: product of the two half4 operands.
+}
+
+// CHECK: mExportReduceIdx_mul_bool
+// CHECK: public boolean reduce_mul_bool(byte[] in)
+// CHECK: public boolean reduce_mul_bool(byte[] in, int x1, int x2)
+// CHECK: public void reduce_mul_bool(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_bool(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+bool __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two bool values.
+mul_bool(bool lhs, bool rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_char
+// CHECK: public byte reduce_mul_char(byte[] in)
+// CHECK: public byte reduce_mul_char(byte[] in, int x1, int x2)
+// CHECK: public void reduce_mul_char(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_char(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+char __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two char values.
+mul_char(char lhs, char rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_char2
+// CHECK: public Byte2 reduce_mul_char2(byte[] in)
+// CHECK: public Byte2 reduce_mul_char2(byte[] in, int x1, int x2)
+// CHECK: public void reduce_mul_char2(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_char2(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+char2 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two char2 values.
+mul_char2(char2 lhs, char2 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_char3
+// CHECK: public Byte3 reduce_mul_char3(byte[] in)
+// CHECK: public Byte3 reduce_mul_char3(byte[] in, int x1, int x2)
+// CHECK: public void reduce_mul_char3(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_char3(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+char3 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two char3 values.
+mul_char3(char3 lhs, char3 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_char4
+// CHECK: public Byte4 reduce_mul_char4(byte[] in)
+// CHECK: public Byte4 reduce_mul_char4(byte[] in, int x1, int x2)
+// CHECK: public void reduce_mul_char4(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_char4(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+char4 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two char4 values.
+mul_char4(char4 lhs, char4 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_double
+// CHECK: public double reduce_mul_double(double[] in)
+// CHECK: public double reduce_mul_double(double[] in, int x1, int x2)
+// CHECK: public void reduce_mul_double(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_double(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+double __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two double values.
+mul_double(double lhs, double rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_double2
+// CHECK: public Double2 reduce_mul_double2(double[] in)
+// CHECK: public Double2 reduce_mul_double2(double[] in, int x1, int x2)
+// CHECK: public void reduce_mul_double2(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_double2(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+double2 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two double2 values.
+mul_double2(double2 lhs, double2 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_double3
+// CHECK: public Double3 reduce_mul_double3(double[] in)
+// CHECK: public Double3 reduce_mul_double3(double[] in, int x1, int x2)
+// CHECK: public void reduce_mul_double3(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_double3(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+double3 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two double3 values.
+mul_double3(double3 lhs, double3 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_double4
+// CHECK: public Double4 reduce_mul_double4(double[] in)
+// CHECK: public Double4 reduce_mul_double4(double[] in, int x1, int x2)
+// CHECK: public void reduce_mul_double4(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_double4(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+double4 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two double4 values.
+mul_double4(double4 lhs, double4 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_float
+// CHECK: public float reduce_mul_float(float[] in)
+// CHECK: public float reduce_mul_float(float[] in, int x1, int x2)
+// CHECK: public void reduce_mul_float(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_float(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+float __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two float values.
+mul_float(float lhs, float rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_float2
+// CHECK: public Float2 reduce_mul_float2(float[] in)
+// CHECK: public Float2 reduce_mul_float2(float[] in, int x1, int x2)
+// CHECK: public void reduce_mul_float2(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_float2(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+float2 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two float2 values.
+mul_float2(float2 lhs, float2 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_float3
+// CHECK: public Float3 reduce_mul_float3(float[] in)
+// CHECK: public Float3 reduce_mul_float3(float[] in, int x1, int x2)
+// CHECK: public void reduce_mul_float3(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_float3(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+float3 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two float3 values.
+mul_float3(float3 lhs, float3 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_float4
+// CHECK: public Float4 reduce_mul_float4(float[] in)
+// CHECK: public Float4 reduce_mul_float4(float[] in, int x1, int x2)
+// CHECK: public void reduce_mul_float4(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_float4(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+float4 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two float4 values.
+mul_float4(float4 lhs, float4 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_int
+// CHECK: public int reduce_mul_int(int[] in)
+// CHECK: public int reduce_mul_int(int[] in, int x1, int x2)
+// CHECK: public void reduce_mul_int(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_int(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+int __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two int values.
+mul_int(int lhs, int rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_int2
+// CHECK: public Int2 reduce_mul_int2(int[] in)
+// CHECK: public Int2 reduce_mul_int2(int[] in, int x1, int x2)
+// CHECK: public void reduce_mul_int2(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_int2(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+int2 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two int2 values.
+mul_int2(int2 lhs, int2 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_int3
+// CHECK: public Int3 reduce_mul_int3(int[] in)
+// CHECK: public Int3 reduce_mul_int3(int[] in, int x1, int x2)
+// CHECK: public void reduce_mul_int3(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_int3(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+int3 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two int3 values.
+mul_int3(int3 lhs, int3 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_int4
+// CHECK: public Int4 reduce_mul_int4(int[] in)
+// CHECK: public Int4 reduce_mul_int4(int[] in, int x1, int x2)
+// CHECK: public void reduce_mul_int4(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_int4(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+int4 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two int4 values.
+mul_int4(int4 lhs, int4 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_long
+// CHECK: public long reduce_mul_long(long[] in)
+// CHECK: public long reduce_mul_long(long[] in, int x1, int x2)
+// CHECK: public void reduce_mul_long(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_long(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+long __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two long values.
+mul_long(long lhs, long rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_long2
+// CHECK: public Long2 reduce_mul_long2(long[] in)
+// CHECK: public Long2 reduce_mul_long2(long[] in, int x1, int x2)
+// CHECK: public void reduce_mul_long2(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_long2(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+long2 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two long2 values.
+mul_long2(long2 lhs, long2 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_long3
+// CHECK: public Long3 reduce_mul_long3(long[] in)
+// CHECK: public Long3 reduce_mul_long3(long[] in, int x1, int x2)
+// CHECK: public void reduce_mul_long3(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_long3(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+long3 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two long3 values.
+mul_long3(long3 lhs, long3 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_long4
+// CHECK: public Long4 reduce_mul_long4(long[] in)
+// CHECK: public Long4 reduce_mul_long4(long[] in, int x1, int x2)
+// CHECK: public void reduce_mul_long4(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_long4(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+long4 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two long4 values.
+mul_long4(long4 lhs, long4 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_short
+// CHECK: public short reduce_mul_short(short[] in)
+// CHECK: public short reduce_mul_short(short[] in, int x1, int x2)
+// CHECK: public void reduce_mul_short(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_short(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+short __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two short values.
+mul_short(short lhs, short rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_short2
+// CHECK: public Short2 reduce_mul_short2(short[] in)
+// CHECK: public Short2 reduce_mul_short2(short[] in, int x1, int x2)
+// CHECK: public void reduce_mul_short2(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_short2(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+short2 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two short2 values.
+mul_short2(short2 lhs, short2 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_short3
+// CHECK: public Short3 reduce_mul_short3(short[] in)
+// CHECK: public Short3 reduce_mul_short3(short[] in, int x1, int x2)
+// CHECK: public void reduce_mul_short3(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_short3(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+short3 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two short3 values.
+mul_short3(short3 lhs, short3 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_short4
+// CHECK: public Short4 reduce_mul_short4(short[] in)
+// CHECK: public Short4 reduce_mul_short4(short[] in, int x1, int x2)
+// CHECK: public void reduce_mul_short4(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_short4(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+short4 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two short4 values.
+mul_short4(short4 lhs, short4 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_uchar
+// CHECK: public short reduce_mul_uchar(byte[] in)
+// CHECK: public short reduce_mul_uchar(byte[] in, int x1, int x2)
+// CHECK: public void reduce_mul_uchar(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_uchar(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+uchar __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two uchar values.
+mul_uchar(uchar lhs, uchar rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_uchar2
+// CHECK: public Short2 reduce_mul_uchar2(byte[] in)
+// CHECK: public Short2 reduce_mul_uchar2(byte[] in, int x1, int x2)
+// CHECK: public void reduce_mul_uchar2(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_uchar2(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+uchar2 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two uchar2 values.
+mul_uchar2(uchar2 lhs, uchar2 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_uchar3
+// CHECK: public Short3 reduce_mul_uchar3(byte[] in)
+// CHECK: public Short3 reduce_mul_uchar3(byte[] in, int x1, int x2)
+// CHECK: public void reduce_mul_uchar3(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_uchar3(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+uchar3 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two uchar3 values.
+mul_uchar3(uchar3 lhs, uchar3 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_uchar4
+// CHECK: public Short4 reduce_mul_uchar4(byte[] in)
+// CHECK: public Short4 reduce_mul_uchar4(byte[] in, int x1, int x2)
+// CHECK: public void reduce_mul_uchar4(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_uchar4(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+uchar4 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two uchar4 values.
+mul_uchar4(uchar4 lhs, uchar4 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_uint
+// CHECK: public long reduce_mul_uint(int[] in)
+// CHECK: public long reduce_mul_uint(int[] in, int x1, int x2)
+// CHECK: public void reduce_mul_uint(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_uint(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+uint __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two uint values.
+mul_uint(uint lhs, uint rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_uint2
+// CHECK: public Long2 reduce_mul_uint2(int[] in)
+// CHECK: public Long2 reduce_mul_uint2(int[] in, int x1, int x2)
+// CHECK: public void reduce_mul_uint2(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_uint2(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+uint2 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two uint2 values.
+mul_uint2(uint2 lhs, uint2 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_uint3
+// CHECK: public Long3 reduce_mul_uint3(int[] in)
+// CHECK: public Long3 reduce_mul_uint3(int[] in, int x1, int x2)
+// CHECK: public void reduce_mul_uint3(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_uint3(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+uint3 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two uint3 values.
+mul_uint3(uint3 lhs, uint3 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_uint4
+// CHECK: public Long4 reduce_mul_uint4(int[] in)
+// CHECK: public Long4 reduce_mul_uint4(int[] in, int x1, int x2)
+// CHECK: public void reduce_mul_uint4(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_uint4(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+uint4 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two uint4 values.
+mul_uint4(uint4 lhs, uint4 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_ulong
+// CHECK: public long reduce_mul_ulong(long[] in)
+// CHECK: public long reduce_mul_ulong(long[] in, int x1, int x2)
+// CHECK: public void reduce_mul_ulong(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_ulong(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+ulong __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two ulong values.
+mul_ulong(ulong lhs, ulong rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_ulong2
+// CHECK: public Long2 reduce_mul_ulong2(long[] in)
+// CHECK: public Long2 reduce_mul_ulong2(long[] in, int x1, int x2)
+// CHECK: public void reduce_mul_ulong2(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_ulong2(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+ulong2 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two ulong2 values.
+mul_ulong2(ulong2 lhs, ulong2 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_ulong3
+// CHECK: public Long3 reduce_mul_ulong3(long[] in)
+// CHECK: public Long3 reduce_mul_ulong3(long[] in, int x1, int x2)
+// CHECK: public void reduce_mul_ulong3(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_ulong3(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+ulong3 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two ulong3 values.
+mul_ulong3(ulong3 lhs, ulong3 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_ulong4
+// CHECK: public Long4 reduce_mul_ulong4(long[] in)
+// CHECK: public Long4 reduce_mul_ulong4(long[] in, int x1, int x2)
+// CHECK: public void reduce_mul_ulong4(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_ulong4(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+ulong4 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two ulong4 values.
+mul_ulong4(ulong4 lhs, ulong4 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_ushort
+// CHECK: public int reduce_mul_ushort(short[] in)
+// CHECK: public int reduce_mul_ushort(short[] in, int x1, int x2)
+// CHECK: public void reduce_mul_ushort(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_ushort(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+ushort __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two ushort values.
+mul_ushort(ushort lhs, ushort rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_ushort2
+// CHECK: public Int2 reduce_mul_ushort2(short[] in)
+// CHECK: public Int2 reduce_mul_ushort2(short[] in, int x1, int x2)
+// CHECK: public void reduce_mul_ushort2(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_ushort2(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+ushort2 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two ushort2 values.
+mul_ushort2(ushort2 lhs, ushort2 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_ushort3
+// CHECK: public Int3 reduce_mul_ushort3(short[] in)
+// CHECK: public Int3 reduce_mul_ushort3(short[] in, int x1, int x2)
+// CHECK: public void reduce_mul_ushort3(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_ushort3(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+ushort3 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two ushort3 values.
+mul_ushort3(ushort3 lhs, ushort3 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: mExportReduceIdx_mul_ushort4
+// CHECK: public Int4 reduce_mul_ushort4(short[] in)
+// CHECK: public Int4 reduce_mul_ushort4(short[] in, int x1, int x2)
+// CHECK: public void reduce_mul_ushort4(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_ushort4(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+ushort4 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two ushort4 values.
+mul_ushort4(ushort4 lhs, ushort4 rhs) {
+  return lhs * rhs;
+}
+
+
+struct indirect {  // One field per element type used by the mul_* kernels above, half types excluded.
+  bool elem_bool;
+  char elem_char;
+  char2 elem_char2;
+  char3 elem_char3;
+  char4 elem_char4;
+  double elem_double;
+  double2 elem_double2;
+  double3 elem_double3;
+  double4 elem_double4;
+  float elem_float;
+  float2 elem_float2;
+  float3 elem_float3;
+  float4 elem_float4;
+  int elem_int;
+  int2 elem_int2;
+  int3 elem_int3;
+  int4 elem_int4;
+  long elem_long;
+  long2 elem_long2;
+  long3 elem_long3;
+  long4 elem_long4;
+  short elem_short;
+  short2 elem_short2;
+  short3 elem_short3;
+  short4 elem_short4;
+  uchar elem_uchar;
+  uchar2 elem_uchar2;
+  uchar3 elem_uchar3;
+  uchar4 elem_uchar4;
+  uint elem_uint;
+  uint2 elem_uint2;
+  uint3 elem_uint3;
+  uint4 elem_uint4;
+  ulong elem_ulong;
+  ulong2 elem_ulong2;
+  ulong3 elem_ulong3;
+  ulong4 elem_ulong4;
+  ushort elem_ushort;
+  ushort2 elem_ushort2;
+  ushort3 elem_ushort3;
+  ushort4 elem_ushort4;
+};
+
+// CHECK: mExportReduceIdx_mul_indirect
+// CHECK: public void reduce_mul_indirect(Allocation ain, Allocation aout)
+// CHECK: public void reduce_mul_indirect(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+struct indirect __attribute__((kernel("reduce")))  // Reduce kernel over struct indirect.
+mul_indirect(struct indirect lhs, struct indirect rhs) {
+  lhs.elem_bool *= rhs.elem_bool;  // Each field of lhs is multiplied by the same-named field of rhs.
+  lhs.elem_char *= rhs.elem_char;
+  lhs.elem_char2 *= rhs.elem_char2;
+  lhs.elem_char3 *= rhs.elem_char3;
+  lhs.elem_char4 *= rhs.elem_char4;
+  lhs.elem_double *= rhs.elem_double;
+  lhs.elem_double2 *= rhs.elem_double2;
+  lhs.elem_double3 *= rhs.elem_double3;
+  lhs.elem_double4 *= rhs.elem_double4;
+  lhs.elem_float *= rhs.elem_float;
+  lhs.elem_float2 *= rhs.elem_float2;
+  lhs.elem_float3 *= rhs.elem_float3;
+  lhs.elem_float4 *= rhs.elem_float4;
+  lhs.elem_int *= rhs.elem_int;
+  lhs.elem_int2 *= rhs.elem_int2;
+  lhs.elem_int3 *= rhs.elem_int3;
+  lhs.elem_int4 *= rhs.elem_int4;
+  lhs.elem_long *= rhs.elem_long;
+  lhs.elem_long2 *= rhs.elem_long2;
+  lhs.elem_long3 *= rhs.elem_long3;
+  lhs.elem_long4 *= rhs.elem_long4;
+  lhs.elem_short *= rhs.elem_short;
+  lhs.elem_short2 *= rhs.elem_short2;
+  lhs.elem_short3 *= rhs.elem_short3;
+  lhs.elem_short4 *= rhs.elem_short4;
+  lhs.elem_uchar *= rhs.elem_uchar;
+  lhs.elem_uchar2 *= rhs.elem_uchar2;
+  lhs.elem_uchar3 *= rhs.elem_uchar3;
+  lhs.elem_uchar4 *= rhs.elem_uchar4;
+  lhs.elem_uint *= rhs.elem_uint;
+  lhs.elem_uint2 *= rhs.elem_uint2;
+  lhs.elem_uint3 *= rhs.elem_uint3;
+  lhs.elem_uint4 *= rhs.elem_uint4;
+  lhs.elem_ulong *= rhs.elem_ulong;
+  lhs.elem_ulong2 *= rhs.elem_ulong2;
+  lhs.elem_ulong3 *= rhs.elem_ulong3;
+  lhs.elem_ulong4 *= rhs.elem_ulong4;
+  lhs.elem_ushort *= rhs.elem_ushort;
+  lhs.elem_ushort2 *= rhs.elem_ushort2;
+  lhs.elem_ushort3 *= rhs.elem_ushort3;
+  lhs.elem_ushort4 *= rhs.elem_ushort4;
+  return lhs;  // lhs is a by-value parameter, so this returns the updated copy.
+}
diff --git a/lit-tests/reduce_reflection/reduce_reflection_cpp.rs b/lit-tests/reduce_reflection/reduce_reflection_cpp.rs
new file mode 100644
index 0000000..0da4d0a
--- /dev/null
+++ b/lit-tests/reduce_reflection/reduce_reflection_cpp.rs
@@ -0,0 +1,408 @@
+// RUN: %Slang -target-api 0 -reflect-c++ %s
+// RUN: %scriptc-filecheck-wrapper --lang=C++ %s
+
+#pragma version(1)
+#pragma rs java_package_name(foo)
+
+// CHECK: void reduce_mul_half(android::RSC::sp<android::RSC::Allocation> ain,
+// Array variants of kernels with the half type are unsupported.
+// CHECK-NOT: half reduce_mul_half(const half in[],
+half __attribute__((kernel("reduce"))) mul_half(half lhs, half rhs) {
+  return lhs * rhs;  // Reduce kernel body: product of the two half operands.
+}
+
+// CHECK: void reduce_mul_half2(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK-NOT: Half2 reduce_mul_half2(const half in[],
+half2 __attribute__((kernel("reduce"))) mul_half2(half2 lhs, half2 rhs) {
+  return lhs * rhs;  // Reduce kernel body: product of the two half2 operands.
+}
+
+// CHECK: void reduce_mul_half3(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK-NOT: Half3 reduce_mul_half3(const half in[],
+half3 __attribute__((kernel("reduce"))) mul_half3(half3 lhs, half3 rhs) {
+  return lhs * rhs;  // Reduce kernel body: product of the two half3 operands.
+}
+
+// CHECK: void reduce_mul_half4(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK-NOT: Half4 reduce_mul_half4(const half in[],
+half4 __attribute__((kernel("reduce"))) mul_half4(half4 lhs, half4 rhs) {
+  return lhs * rhs;  // Reduce kernel body: product of the two half4 operands.
+}
+
+// CHECK: void reduce_mul_bool(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: bool reduce_mul_bool(const bool in[],
+bool __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two bool values.
+mul_bool(bool lhs, bool rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: void reduce_mul_char(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: int8_t reduce_mul_char(const int8_t in[],
+char __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two char values.
+mul_char(char lhs, char rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: void reduce_mul_char2(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: Byte2 reduce_mul_char2(const int8_t in[],
+char2 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two char2 values.
+mul_char2(char2 lhs, char2 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: void reduce_mul_char3(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: Byte3 reduce_mul_char3(const int8_t in[],
+char3 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two char3 values.
+mul_char3(char3 lhs, char3 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: void reduce_mul_char4(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: Byte4 reduce_mul_char4(const int8_t in[],
+char4 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two char4 values.
+mul_char4(char4 lhs, char4 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: void reduce_mul_double(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: double reduce_mul_double(const double in[],
+double __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two double values.
+mul_double(double lhs, double rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: void reduce_mul_double2(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: Double2 reduce_mul_double2(const double in[],
+double2 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two double2 values.
+mul_double2(double2 lhs, double2 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: void reduce_mul_double3(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: Double3 reduce_mul_double3(const double in[],
+double3 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two double3 values.
+mul_double3(double3 lhs, double3 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: void reduce_mul_double4(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: Double4 reduce_mul_double4(const double in[],
+double4 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two double4 values.
+mul_double4(double4 lhs, double4 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: void reduce_mul_float(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: float reduce_mul_float(const float in[],
+float __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two float values.
+mul_float(float lhs, float rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: void reduce_mul_float2(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: Float2 reduce_mul_float2(const float in[],
+float2 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two float2 values.
+mul_float2(float2 lhs, float2 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: void reduce_mul_float3(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: Float3 reduce_mul_float3(const float in[],
+float3 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two float3 values.
+mul_float3(float3 lhs, float3 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: void reduce_mul_float4(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: Float4 reduce_mul_float4(const float in[],
+float4 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two float4 values.
+mul_float4(float4 lhs, float4 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: void reduce_mul_int(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: int32_t reduce_mul_int(const int32_t in[],
+int __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two int values.
+mul_int(int lhs, int rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: void reduce_mul_int2(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: Int2 reduce_mul_int2(const int32_t in[],
+int2 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two int2 values.
+mul_int2(int2 lhs, int2 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: void reduce_mul_int3(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: Int3 reduce_mul_int3(const int32_t in[],
+int3 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two int3 values.
+mul_int3(int3 lhs, int3 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: void reduce_mul_int4(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: Int4 reduce_mul_int4(const int32_t in[],
+int4 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two int4 values.
+mul_int4(int4 lhs, int4 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: void reduce_mul_long(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: int64_t reduce_mul_long(const int64_t in[],
+long __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two long values.
+mul_long(long lhs, long rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: void reduce_mul_long2(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: Long2 reduce_mul_long2(const int64_t in[],
+long2 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two long2 values.
+mul_long2(long2 lhs, long2 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: void reduce_mul_long3(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: Long3 reduce_mul_long3(const int64_t in[],
+long3 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two long3 values.
+mul_long3(long3 lhs, long3 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: void reduce_mul_long4(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: Long4 reduce_mul_long4(const int64_t in[],
+long4 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two long4 values.
+mul_long4(long4 lhs, long4 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: void reduce_mul_short(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: int16_t reduce_mul_short(const int16_t in[],
+short __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two short values.
+mul_short(short lhs, short rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: void reduce_mul_short2(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: Short2 reduce_mul_short2(const int16_t in[],
+short2 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two short2 values.
+mul_short2(short2 lhs, short2 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: void reduce_mul_short3(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: Short3 reduce_mul_short3(const int16_t in[],
+short3 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two short3 values.
+mul_short3(short3 lhs, short3 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: void reduce_mul_short4(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: Short4 reduce_mul_short4(const int16_t in[],
+short4 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two short4 values.
+mul_short4(short4 lhs, short4 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: void reduce_mul_uchar(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: uint8_t reduce_mul_uchar(const uint8_t in[],
+uchar __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two uchar values.
+mul_uchar(uchar lhs, uchar rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: void reduce_mul_uchar2(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: UByte2 reduce_mul_uchar2(const uint8_t in[],
+uchar2 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two uchar2 values.
+mul_uchar2(uchar2 lhs, uchar2 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: void reduce_mul_uchar3(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: UByte3 reduce_mul_uchar3(const uint8_t in[],
+uchar3 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two uchar3 values.
+mul_uchar3(uchar3 lhs, uchar3 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: void reduce_mul_uchar4(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: UByte4 reduce_mul_uchar4(const uint8_t in[],
+uchar4 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two uchar4 values.
+mul_uchar4(uchar4 lhs, uchar4 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: void reduce_mul_uint(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: uint32_t reduce_mul_uint(const uint32_t in[],
+uint __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two uint values.
+mul_uint(uint lhs, uint rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: void reduce_mul_uint2(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: UInt2 reduce_mul_uint2(const uint32_t in[],
+uint2 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two uint2 values.
+mul_uint2(uint2 lhs, uint2 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: void reduce_mul_uint3(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: UInt3 reduce_mul_uint3(const uint32_t in[],
+uint3 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two uint3 values.
+mul_uint3(uint3 lhs, uint3 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: void reduce_mul_uint4(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: UInt4 reduce_mul_uint4(const uint32_t in[],
+uint4 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two uint4 values.
+mul_uint4(uint4 lhs, uint4 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: void reduce_mul_ulong(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: uint64_t reduce_mul_ulong(const uint64_t in[],
+ulong __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two ulong values.
+mul_ulong(ulong lhs, ulong rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: void reduce_mul_ulong2(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: ULong2 reduce_mul_ulong2(const uint64_t in[],
+ulong2 __attribute__((kernel("reduce")))  // Reduce kernel: multiplies two ulong2 values.
+mul_ulong2(ulong2 lhs, ulong2 rhs) {
+  return lhs * rhs;
+}
+
+// CHECK: void reduce_mul_ulong3(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: ULong3 reduce_mul_ulong3(const uint64_t in[],
+ulong3 __attribute__((kernel("reduce")))  // reduce kernel: combines two ulong3 values into one
+mul_ulong3(ulong3 lhs, ulong3 rhs) {
+  return lhs * rhs;  // the combining operation is multiplication
+}
+
+// CHECK: void reduce_mul_ulong4(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: ULong4 reduce_mul_ulong4(const uint64_t in[],
+ulong4 __attribute__((kernel("reduce")))  // reduce kernel: combines two ulong4 values into one
+mul_ulong4(ulong4 lhs, ulong4 rhs) {
+  return lhs * rhs;  // the combining operation is multiplication
+}
+
+// CHECK: void reduce_mul_ushort(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: uint16_t reduce_mul_ushort(const uint16_t in[],
+ushort __attribute__((kernel("reduce")))  // reduce kernel: combines two ushort values into one
+mul_ushort(ushort lhs, ushort rhs) {
+  return lhs * rhs;  // NOTE(review): under the usual C integer promotions the product is an int converted back to ushort on return -- confirm
+}
+
+// CHECK: void reduce_mul_ushort2(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: UShort2 reduce_mul_ushort2(const uint16_t in[],
+ushort2 __attribute__((kernel("reduce")))  // reduce kernel: combines two ushort2 values into one
+mul_ushort2(ushort2 lhs, ushort2 rhs) {
+  return lhs * rhs;  // the combining operation is multiplication
+}
+
+// CHECK: void reduce_mul_ushort3(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: UShort3 reduce_mul_ushort3(const uint16_t in[],
+ushort3 __attribute__((kernel("reduce")))  // reduce kernel: combines two ushort3 values into one
+mul_ushort3(ushort3 lhs, ushort3 rhs) {
+  return lhs * rhs;  // the combining operation is multiplication
+}
+
+// CHECK: void reduce_mul_ushort4(android::RSC::sp<android::RSC::Allocation> ain,
+// CHECK: UShort4 reduce_mul_ushort4(const uint16_t in[],
+ushort4 __attribute__((kernel("reduce")))  // reduce kernel: combines two ushort4 values into one
+mul_ushort4(ushort4 lhs, ushort4 rhs) {
+  return lhs * rhs;  // the combining operation is multiplication
+}
+
+struct indirect {  // one member per element type: bool, then scalar and 2/3/4-wide variants of char..ushort
+  bool elem_bool;
+  char elem_char;
+  char2 elem_char2;
+  char3 elem_char3;
+  char4 elem_char4;
+  double elem_double;
+  double2 elem_double2;
+  double3 elem_double3;
+  double4 elem_double4;
+  float elem_float;
+  float2 elem_float2;
+  float3 elem_float3;
+  float4 elem_float4;
+  int elem_int;
+  int2 elem_int2;
+  int3 elem_int3;
+  int4 elem_int4;
+  long elem_long;
+  long2 elem_long2;
+  long3 elem_long3;
+  long4 elem_long4;
+  short elem_short;
+  short2 elem_short2;
+  short3 elem_short3;
+  short4 elem_short4;
+  uchar elem_uchar;
+  uchar2 elem_uchar2;
+  uchar3 elem_uchar3;
+  uchar4 elem_uchar4;
+  uint elem_uint;
+  uint2 elem_uint2;
+  uint3 elem_uint3;
+  uint4 elem_uint4;
+  ulong elem_ulong;
+  ulong2 elem_ulong2;
+  ulong3 elem_ulong3;
+  ulong4 elem_ulong4;
+  ushort elem_ushort;
+  ushort2 elem_ushort2;
+  ushort3 elem_ushort3;
+  ushort4 elem_ushort4;
+};
+
+// CHECK: void reduce_mul_indirect(android::RSC::sp<android::RSC::Allocation> ain,
+struct indirect __attribute__((kernel("reduce")))  // reduce kernel: combines two 'struct indirect' values into one
+mul_indirect(struct indirect lhs, struct indirect rhs) {
+  lhs.elem_bool *= rhs.elem_bool;  // for bool operands the product is true only when both are true
+  lhs.elem_char *= rhs.elem_char;  // every remaining member is multiplied by its counterpart in rhs
+  lhs.elem_char2 *= rhs.elem_char2;
+  lhs.elem_char3 *= rhs.elem_char3;
+  lhs.elem_char4 *= rhs.elem_char4;
+  lhs.elem_double *= rhs.elem_double;
+  lhs.elem_double2 *= rhs.elem_double2;
+  lhs.elem_double3 *= rhs.elem_double3;
+  lhs.elem_double4 *= rhs.elem_double4;
+  lhs.elem_float *= rhs.elem_float;
+  lhs.elem_float2 *= rhs.elem_float2;
+  lhs.elem_float3 *= rhs.elem_float3;
+  lhs.elem_float4 *= rhs.elem_float4;
+  lhs.elem_int *= rhs.elem_int;
+  lhs.elem_int2 *= rhs.elem_int2;
+  lhs.elem_int3 *= rhs.elem_int3;
+  lhs.elem_int4 *= rhs.elem_int4;
+  lhs.elem_long *= rhs.elem_long;
+  lhs.elem_long2 *= rhs.elem_long2;
+  lhs.elem_long3 *= rhs.elem_long3;
+  lhs.elem_long4 *= rhs.elem_long4;
+  lhs.elem_short *= rhs.elem_short;
+  lhs.elem_short2 *= rhs.elem_short2;
+  lhs.elem_short3 *= rhs.elem_short3;
+  lhs.elem_short4 *= rhs.elem_short4;
+  lhs.elem_uchar *= rhs.elem_uchar;
+  lhs.elem_uchar2 *= rhs.elem_uchar2;
+  lhs.elem_uchar3 *= rhs.elem_uchar3;
+  lhs.elem_uchar4 *= rhs.elem_uchar4;
+  lhs.elem_uint *= rhs.elem_uint;
+  lhs.elem_uint2 *= rhs.elem_uint2;
+  lhs.elem_uint3 *= rhs.elem_uint3;
+  lhs.elem_uint4 *= rhs.elem_uint4;
+  lhs.elem_ulong *= rhs.elem_ulong;
+  lhs.elem_ulong2 *= rhs.elem_ulong2;
+  lhs.elem_ulong3 *= rhs.elem_ulong3;
+  lhs.elem_ulong4 *= rhs.elem_ulong4;
+  lhs.elem_ushort *= rhs.elem_ushort;
+  lhs.elem_ushort2 *= rhs.elem_ushort2;
+  lhs.elem_ushort3 *= rhs.elem_ushort3;
+  lhs.elem_ushort4 *= rhs.elem_ushort4;
+  return lhs;  // lhs is a by-value parameter, so the updated local copy is what gets returned
+}
diff --git a/lit-tests/rs-filecheck-wrapper.sh b/lit-tests/rs-filecheck-wrapper.sh
index 816c80a..8f6d718 100755
--- a/lit-tests/rs-filecheck-wrapper.sh
+++ b/lit-tests/rs-filecheck-wrapper.sh
@@ -9,4 +9,6 @@
 
 FILECHECK_INPUTFILE=`basename $SOURCEFILE | sed 's/\.rs\$/.ll/'`
 
-$FILECHECK -input-file $OUTDIR/$FILECHECK_INPUTFILE $SOURCEFILE
+# This runs FileCheck on both the 32 bit and the 64 bit bitcode files.
+$FILECHECK -input-file $OUTDIR/bc32/$FILECHECK_INPUTFILE $SOURCEFILE
+$FILECHECK -input-file $OUTDIR/bc64/$FILECHECK_INPUTFILE $SOURCEFILE
diff --git a/lit-tests/run-lit-tests.sh b/lit-tests/run-lit-tests.sh
new file mode 100755
index 0000000..28a9696
--- /dev/null
+++ b/lit-tests/run-lit-tests.sh
@@ -0,0 +1,6 @@
+#!/bin/bash -e
+# Runs the slang lit tests with llvm-lit; extra arguments are forwarded to lit.
+: "${ANDROID_BUILD_TOP:?ANDROID_BUILD_TOP is not set; set up the Android build environment first}"
+LIT_PATH=$ANDROID_BUILD_TOP/frameworks/compile/libbcc/tests/debuginfo/llvm-lit
+TESTS=$ANDROID_BUILD_TOP/frameworks/compile/slang/lit-tests
+"$LIT_PATH" "$TESTS" "$@"
diff --git a/lit-tests/scriptc-filecheck-wrapper.sh b/lit-tests/scriptc-filecheck-wrapper.sh
new file mode 100755
index 0000000..7e16872
--- /dev/null
+++ b/lit-tests/scriptc-filecheck-wrapper.sh
@@ -0,0 +1,76 @@
+#!/bin/bash -e
+
+# RS Invocation script to FileCheck, used to check generated Java
+# files or C++ files. This assumes that the .rs source file has the
+# Java package name "foo".
+
+print_help() {
+  help_str="Usage: %s --output=<output-dir> \
+--filecheck=<path-to-filecheck> \
+--lang=[Java/C++] \
+<.rs file>\n"
+
+  printf "$help_str" "$0"
+}
+# Options take the form --name=value; any other argument is the .rs file.
+for arg in "$@"
+do
+  case $arg in
+  --output=*)
+    outdir="${arg#*=}"
+    ;;
+  --filecheck=*)
+    filecheck="${arg#*=}"
+    ;;
+  --lang=*)
+    lang="${arg#*=}"
+    ;;
+  --help)
+    print_help
+    exit 0
+    ;;
+  *)
+    rsfile="$arg"
+    ;;
+  esac
+done
+# The output directory, FileCheck path and .rs file are all mandatory.
+if [[ (-z $outdir) || (-z $filecheck) || (-z $rsfile) ]]
+then
+  print_help
+  exit 1
+fi
+
+if [[ ! -f $rsfile ]]
+then
+  echo "Input file $rsfile doesn't exist"
+  exit 1
+fi
+# The reflected file is named after the .rs file minus its extension.
+rsfile_basename=$(basename "$rsfile")
+
+if [[ $lang == "Java" ]]
+then
+  filecheck_inputfile=foo/ScriptC_${rsfile_basename%.*}.java
+elif [[ $lang == "C++" ]]
+then
+  filecheck_inputfile=ScriptC_${rsfile_basename%.*}.h
+else
+  echo "Unknown language $lang"
+  print_help
+  exit 1
+fi
+
+if [[ ! -f $filecheck ]]
+then
+  echo "No file at supplied FileCheck path $filecheck"
+  exit 1
+fi
+
+if [[ ! -f $outdir/$filecheck_inputfile ]]
+then
+  echo "Input file $outdir/$filecheck_inputfile doesn't exist"
+  exit 1
+fi
+# Match the reflected file against the CHECK lines in the .rs source.
+"$filecheck" -input-file "$outdir/$filecheck_inputfile" "$rsfile"
diff --git a/llvm-rs-as.cpp b/llvm-rs-as.cpp
index 1f81b14..c63a1ac 100644
--- a/llvm-rs-as.cpp
+++ b/llvm-rs-as.cpp
@@ -29,9 +29,8 @@
 #include "llvm/Support/SystemUtils.h"
 #include "llvm/Support/ToolOutputFile.h"
 
-#include "BitWriter_3_2/ReaderWriter_3_2.h"
-#include "BitWriter_2_9/ReaderWriter_2_9.h"
-#include "BitWriter_2_9_func/ReaderWriter_2_9_func.h"
+#include "slang_bitcode_gen.h"
+#include "slang_version.h"
 
 #include <memory>
 using namespace llvm;
@@ -49,6 +48,11 @@
 static cl::opt<bool>
 DisableOutput("disable-output", cl::desc("Disable output"), cl::init(false));
 
+static cl::opt<uint32_t>
+TargetAPI("target-api", cl::desc("Specify RenderScript target API version "
+                                 "(0 = development API) (default is 0)"),
+          cl::init(0));
+
 static cl::opt<bool>
 DumpAsm("d", cl::desc("Print assembly as parsed"), cl::Hidden);
 
@@ -56,20 +60,8 @@
 DisableVerify("disable-verify", cl::Hidden,
               cl::desc("Do not run verifier on input LLVM (dangerous!)"));
 
-enum BCVersion {
-  BC29, BC29Func, BC32, BCHEAD
-};
 
-cl::opt<BCVersion> BitcodeVersion("bitcode-version",
-  cl::desc("Set the bitcode version to be written:"),
-  cl::values(
-    clEnumValN(BC29, "BC29", "Version 2.9"),
-     clEnumVal(BC29Func,     "Version 2.9 func"),
-     clEnumVal(BC32,         "Version 3.2"),
-     clEnumVal(BCHEAD,       "Most current version"),
-    clEnumValEnd), cl::init(BC32));
-
-static void WriteOutputFile(const Module *M) {
+static void WriteOutputFile(const Module *M, uint32_t ModuleTargetAPI) {
   // Infer the output filename if needed.
   if (OutputFilename.empty()) {
     if (InputFilename == "-") {
@@ -97,24 +89,15 @@
   }
 
   if (Force || !CheckBitcodeOutputToConsole(Out->os(), true)) {
-    switch(BitcodeVersion) {
-      case BC29:
-        llvm_2_9::WriteBitcodeToFile(M, Out->os());
-        break;
-      case BC29Func:
-        llvm_2_9_func::WriteBitcodeToFile(M, Out->os());
-        break;
-      case BC32:
-        llvm_3_2::WriteBitcodeToFile(M, Out->os());
-        break;
-      case BCHEAD:
-        llvm::WriteBitcodeToFile(M, Out->os());
-        break;
+    slang::writeBitcode(Out->os(), *M,
+        /* TargetAPI = */ ModuleTargetAPI,
+        /* OptimizationLevel = */ 3);
+
+    if (!Out->os().has_error()) {
+      // Declare success.
+      Out->keep();
     }
   }
-
-  // Declare success.
-  Out->keep();
 }
 
 int main(int argc, char **argv) {
@@ -125,6 +108,18 @@
   llvm_shutdown_obj Y;  // Call llvm_shutdown() on exit.
   cl::ParseCommandLineOptions(argc, argv, "llvm .ll -> .bc assembler\n");
 
+  // Check target API.
+  uint32_t ActualTargetAPI = (TargetAPI == 0) ? RS_DEVELOPMENT_API : TargetAPI;
+
+  if (ActualTargetAPI != RS_DEVELOPMENT_API &&
+      (ActualTargetAPI < SLANG_MINIMUM_TARGET_API ||
+       ActualTargetAPI > SLANG_MAXIMUM_TARGET_API)) {
+    errs() << "target API level '" << ActualTargetAPI << "' is out of range "
+           << "('" << SLANG_MINIMUM_TARGET_API << "' - '"
+           << SLANG_MAXIMUM_TARGET_API << "')\n";
+    return 1;
+  }
+
   // Parse the file now...
   SMDiagnostic Err;
   std::unique_ptr<Module> M(parseAssemblyFile(InputFilename, Err, Context));
@@ -147,7 +142,7 @@
   if (DumpAsm) errs() << "Here's the assembly:\n" << *M.get();
 
   if (!DisableOutput)
-    WriteOutputFile(M.get());
+    WriteOutputFile(M.get(), ActualTargetAPI);
 
   return 0;
 }
diff --git a/llvm-rs-cc.cpp b/llvm-rs-cc.cpp
index 905f2e7..631e9b7 100644
--- a/llvm-rs-cc.cpp
+++ b/llvm-rs-cc.cpp
@@ -26,13 +26,16 @@
 #include "llvm/ADT/IntrusiveRefCntPtr.h"
 
 #include "llvm/Option/OptTable.h"
+#include "llvm/Support/Allocator.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/Signals.h"
+#include "llvm/Support/StringSaver.h"
 #include "llvm/Target/TargetMachine.h"
 
+#include "os_sep.h"
 #include "rs_cc_options.h"
 #include "slang.h"
 #include "slang_assert.h"
@@ -44,14 +47,17 @@
 #include <string>
 
 namespace {
-class StringSet : public llvm::cl::StringSaver {
+class StringSet : public llvm::StringSaver {
 public:
-  const char *SaveString(const char *Str) override {
-    return Strings.insert(Str).first->c_str();
+  const char *saveImpl(llvm::StringRef Str) override {
+    return Strings.insert(Str.str()).first->c_str();
   }
 
+  StringSet() : llvm::StringSaver(A), A() {}
+
 private:
   std::set<std::string> Strings;
+  llvm::BumpPtrAllocator A;
 };
 }
 
@@ -111,7 +117,7 @@
     }
   }
 
-  return SavedStrings->SaveString(OutputFile.c_str());
+  return SavedStrings->save(OutputFile.c_str());
 }
 
 typedef std::list<std::pair<const char*, const char*> > NamePairList;
diff --git a/rs_cc_options.cpp b/rs_cc_options.cpp
index 94d1453..1f989b2 100644
--- a/rs_cc_options.cpp
+++ b/rs_cc_options.cpp
@@ -86,7 +86,7 @@
 bool ParseArguments(const llvm::ArrayRef<const char *> &ArgsIn,
                     llvm::SmallVectorImpl<const char *> &Inputs,
                     RSCCOptions &Opts, clang::DiagnosticOptions &DiagOpts,
-                    llvm::cl::StringSaver &StringSaver) {
+                    llvm::StringSaver &StringSaver) {
   // We use a different diagnostic engine for argument parsing from the rest of
   // the work.  This mimics what's done in clang.  I believe it is so the
   // argument parsing errors are well formatted while the full errors can be
@@ -153,6 +153,12 @@
       Opts.mOutputType = Slang::OT_Bitcode;
       break;
     }
+    case OPT_MP: {
+      Opts.mEmitDependency = true;
+      Opts.mOutputType = Slang::OT_Bitcode;
+      Opts.mEmitPhonyDependency = true;
+      break;
+    }
     default: { slangAssert(false && "Invalid option in M group!"); }
     }
   }
@@ -222,11 +228,22 @@
       }
     }
   } else if (lastBitwidthArg) {
-    // -m32/-m64 are forbidden for non-C++ reflection paths.
-    DiagEngine.Report(
-        DiagEngine.getCustomDiagID(clang::DiagnosticsEngine::Error,
-                                   "cannot use -m32/-m64 without specifying "
-                                   "C++ reflection (-reflect-c++)"));
+      // -m32/-m64 are forbidden for non-C++ reflection paths for non-eng builds
+      // (they would make it too easy for a developer to accidentally create and
+      // release an APK that has 32-bit or 64-bit bitcode but not both).
+#ifdef __ENABLE_INTERNAL_OPTIONS
+      if (lastBitwidthArg->getOption().matches(OPT_m32)) {
+        Opts.mBitWidth = 32;
+      } else {
+        Opts.mBitWidth = 64;
+      }
+      Opts.mEmit3264 = false;
+#else
+      DiagEngine.Report(
+          DiagEngine.getCustomDiagID(clang::DiagnosticsEngine::Error,
+                                     "cannot use -m32/-m64 without specifying "
+                                     "C++ reflection (-reflect-c++)"));
+#endif
   }
 
   Opts.mDependencyOutputDir =
@@ -237,6 +254,25 @@
   Opts.mShowVersion = Args->hasArg(OPT_version);
   Opts.mDebugEmission = Args->hasArg(OPT_emit_g);
   Opts.mVerbose = Args->hasArg(OPT_verbose);
+  Opts.mASTPrint = Args->hasArg(OPT_ast_print);
+
+  // Delegate options
+
+  std::vector<std::string> DelegatedStrings;
+  for (int Opt : std::vector<unsigned>{OPT_debug, OPT_print_after_all, OPT_print_before_all}) {
+    if (Args->hasArg(Opt)) {
+      // TODO: Don't assume that the option begins with "-"; determine this programmatically instead.
+      DelegatedStrings.push_back(std::string("-") + std::string(OptParser->getOptionName(Opt)));
+      slangAssert(OptParser->getOptionKind(Opt) == llvm::opt::Option::FlagClass);
+    }
+  }
+  if (DelegatedStrings.size()) {
+    std::vector<const char *> DelegatedCStrs;
+    DelegatedCStrs.push_back(*ArgVector.data()); // program name
+    std::for_each(DelegatedStrings.cbegin(), DelegatedStrings.cend(),
+                  [&DelegatedCStrs](const std::string &String) { DelegatedCStrs.push_back(String.c_str()); });
+    llvm::cl::ParseCommandLineOptions(DelegatedCStrs.size(), DelegatedCStrs.data());
+  }
 
   // If we are emitting both 32-bit and 64-bit bitcode, we must embed it.
 
@@ -253,11 +289,10 @@
     Opts.mTargetAPI = UINT_MAX;
   }
 
-  Opts.mEmit3264 =
-      (Opts.mTargetAPI >= 21) && (Opts.mBitcodeStorage != BCST_CPP_CODE);
-  if (Opts.mEmit3264) {
+  if ((Opts.mTargetAPI < 21) || (Opts.mBitcodeStorage == BCST_CPP_CODE))
+    Opts.mEmit3264 = false;
+  if (Opts.mEmit3264)
     Opts.mBitcodeStorage = BCST_JAVA_CODE;
-  }
 
   if (DiagEngine.hasErrorOccurred()) {
     llvm::errs() << DiagsBuffer.str();
diff --git a/rs_cc_options.h b/rs_cc_options.h
index e45dae0..cc60560 100644
--- a/rs_cc_options.h
+++ b/rs_cc_options.h
@@ -75,6 +75,10 @@
   // Emit output dependency file for each input file.
   bool mEmitDependency;
 
+  // Emit phony targets for each header dependency, which can avoid make errors
+  // when the header gets deleted. See -MP option of cc.
+  bool mEmitPhonyDependency;
+
   // The output directory for writing dependency files
   // (i.e. out/target/common/obj/APPS/.../src/renderscript).
   std::string mDependencyOutputDir;
@@ -98,6 +102,9 @@
   // Display verbose information about the compilation on stdout.
   bool mVerbose;
 
+  // Display AST.
+  bool mASTPrint;
+
   // Emit both 32-bit and 64-bit bitcode (embedded in the reflected sources).
   bool mEmit3264;
 
@@ -106,13 +113,15 @@
     mBitWidth = 32;
     mBitcodeStorage = slang::BCST_APK_RESOURCE;
     mEmitDependency = 0;
+    mEmitPhonyDependency = 0;
     mShowHelp = 0;
     mShowVersion = 0;
     mTargetAPI = RS_VERSION;
     mDebugEmission = 0;
     mOptimizationLevel = llvm::CodeGenOpt::Aggressive;
     mVerbose = false;
-    mEmit3264 = false;
+    mASTPrint = false;
+    mEmit3264 = true;
   }
 };
 
@@ -132,7 +141,7 @@
 bool ParseArguments(const llvm::ArrayRef<const char *> &ArgsIn,
                     llvm::SmallVectorImpl<const char *> &Inputs,
                     RSCCOptions &Opts, clang::DiagnosticOptions &DiagOpts,
-                    llvm::cl::StringSaver &StringSaver);
+                    llvm::StringSaver &StringSaver);
 
 } // namespace slang
 
diff --git a/slang.cpp b/slang.cpp
index 2c38359..02a77bb 100644
--- a/slang.cpp
+++ b/slang.cpp
@@ -226,9 +226,9 @@
 }
 
 clang::ASTConsumer *
-Slang::createBackend(const clang::CodeGenOptions &CodeGenOpts,
+Slang::createBackend(const RSCCOptions &Opts, const clang::CodeGenOptions &CodeGenOpts,
                      llvm::raw_ostream *OS, OutputType OT) {
-  return new Backend(mRSContext, &getDiagnostics(), CodeGenOpts,
+  return new Backend(mRSContext, &getDiagnostics(), Opts, CodeGenOpts,
                      getTargetOptions(), &mPragmas, OS, OT, getSourceManager(),
                      mAllowRSPrefix, mIsFilterscript);
 }
@@ -339,7 +339,7 @@
   return true;
 }
 
-int Slang::generateDepFile() {
+int Slang::generateDepFile(bool PhonyTarget) {
   if (mDiagEngine->hasErrorOccurred())
     return 1;
   if (mDOS.get() == nullptr)
@@ -348,6 +348,8 @@
   // Initialize options for generating dependency file
   clang::DependencyOutputOptions DepOpts;
   DepOpts.IncludeSystemHeaders = 1;
+  if (PhonyTarget)
+    DepOpts.UsePhonyTargets = 1;
   DepOpts.OutputFile = mDepOutputFileName;
   DepOpts.Targets = mAdditionalDepTargets;
   DepOpts.Targets.push_back(mDepTargetBCFileName);
@@ -386,7 +388,7 @@
   return mDiagEngine->hasErrorOccurred() ? 1 : 0;
 }
 
-int Slang::compile() {
+int Slang::compile(const RSCCOptions &Opts) {
   if (mDiagEngine->hasErrorOccurred())
     return 1;
   if (mOS.get() == nullptr)
@@ -396,7 +398,7 @@
   createPreprocessor();
   createASTContext();
 
-  mBackend.reset(createBackend(CodeGenOpts, &mOS->os(), mOT));
+  mBackend.reset(createBackend(Opts, CodeGenOpts, &mOS->os(), mOT));
 
   // Inform the diagnostic client we are processing a source file
   mDiagClient->BeginSourceFile(LangOpts, mPP.get());
@@ -534,8 +536,9 @@
           llvm::StringMapEntry<ReflectedDefinitionTy>::Create(RDKey);
       ME->setValue(std::make_pair(ERT, CurInputFile));
 
-      if (!ReflectedDefinitions.insert(ME))
-        delete ME;
+      if (!ReflectedDefinitions.insert(ME)) {
+        slangAssert(false && "Type shouldn't be in map yet!");
+      }
 
       // Take the ownership of ERT such that it won't be freed in ~RSContext().
       ERT->keep();
@@ -669,7 +672,9 @@
 
     mIsFilterscript = isFilterscript(InputFile);
 
-    if (Slang::compile() > 0)
+    CodeGenOpts.MainFileName = mInputFileName;
+
+    if (Slang::compile(Opts) > 0)
       return false;
 
     if (!Opts.mJavaReflectionPackageName.empty()) {
@@ -745,7 +750,7 @@
       if (SuppressAllWarnings) {
         getDiagnostics().setSuppressAllDiagnostics(true);
       }
-      if (generateDepFile() > 0)
+      if (generateDepFile(Opts.mEmitPhonyDependency) > 0)
         return false;
       if (SuppressAllWarnings) {
         getDiagnostics().setSuppressAllDiagnostics(false);
diff --git a/slang.h b/slang.h
index 0848451..8855ab2 100644
--- a/slang.h
+++ b/slang.h
@@ -189,7 +189,8 @@
   void initPreprocessor();
   void initASTContext();
 
-  clang::ASTConsumer *createBackend(const clang::CodeGenOptions &CodeGenOpts,
+  clang::ASTConsumer *createBackend(const RSCCOptions &Opts,
+                                    const clang::CodeGenOptions &CodeGenOpts,
                                     llvm::raw_ostream *OS,
                                     OutputType OT);
 
@@ -237,9 +238,9 @@
     mGeneratedFileNames.push_back(GeneratedFileName);
   }
 
-  int generateDepFile();
+  int generateDepFile(bool PhonyTarget);
 
-  int compile();
+  int compile(const RSCCOptions &Opts);
 
   char const *getErrorMessage() { return mDiagClient->str().c_str(); }
 
@@ -271,8 +272,7 @@
 
   void makeModuleVisible(clang::Module *Mod,
                          clang::Module::NameVisibilityKind Visibility,
-                         clang::SourceLocation ImportLoc,
-                         bool Complain = false) override {}
+                         clang::SourceLocation ImportLoc) override {}
 
   clang::GlobalModuleIndex *
   loadGlobalModuleIndex(clang::SourceLocation TriggerLoc) override {
diff --git a/slang_backend.cpp b/slang_backend.cpp
index 8f4a255..0936494 100644
--- a/slang_backend.cpp
+++ b/slang_backend.cpp
@@ -19,8 +19,6 @@
 #include <string>
 #include <vector>
 
-#include "bcinfo/BitcodeWrapper.h"
-
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/Decl.h"
 #include "clang/AST/DeclGroup.h"
@@ -64,17 +62,18 @@
 
 #include "slang_assert.h"
 #include "slang.h"
+#include "slang_bitcode_gen.h"
 #include "slang_rs_context.h"
 #include "slang_rs_export_foreach.h"
 #include "slang_rs_export_func.h"
+#include "slang_rs_export_reduce.h"
 #include "slang_rs_export_type.h"
 #include "slang_rs_export_var.h"
 #include "slang_rs_metadata.h"
 
+#include "rs_cc_options.h"
+
 #include "strip_unknown_attributes.h"
-#include "BitWriter_2_9/ReaderWriter_2_9.h"
-#include "BitWriter_2_9_func/ReaderWriter_2_9_func.h"
-#include "BitWriter_3_2/ReaderWriter_3_2.h"
 
 namespace slang {
 
@@ -138,17 +137,10 @@
   // Target Machine Options
   llvm::TargetOptions Options;
 
-  Options.NoFramePointerElim = mCodeGenOpts.DisableFPElim;
-
-  // Use hardware FPU.
-  //
-  // FIXME: Need to detect the CPU capability and decide whether to use softfp.
-  // To use softfp, change following 2 lines to
-  //
-  // Options.FloatABIType = llvm::FloatABI::Soft;
-  // Options.UseSoftFloat = true;
-  Options.FloatABIType = llvm::FloatABI::Hard;
-  Options.UseSoftFloat = false;
+  // Use soft-float ABI for ARM (which is the target used by Slang during code
+  // generation).  Codegen still uses hardware FPU by default.  To use software
+  // floating point, add 'soft-float' feature to FeaturesStr below.
+  Options.FloatABIType = llvm::FloatABI::Soft;
 
   // BCC needs all unknown symbols resolved at compilation time. So we don't
   // need any relocation model.
@@ -215,7 +207,7 @@
 }
 
 Backend::Backend(RSContext *Context, clang::DiagnosticsEngine *DiagEngine,
-                 const clang::CodeGenOptions &CodeGenOpts,
+                 const RSCCOptions &Opts, const clang::CodeGenOptions &CodeGenOpts,
                  const clang::TargetOptions &TargetOpts, PragmaList *Pragmas,
                  llvm::raw_ostream *OS, Slang::OutputType OT,
                  clang::SourceManager &SourceMgr, bool AllowRSPrefix,
@@ -224,11 +216,12 @@
       mOT(OT), mGen(nullptr), mPerFunctionPasses(nullptr),
       mPerModulePasses(nullptr), mCodeGenPasses(nullptr),
       mBufferOutStream(*mpOS), mContext(Context),
-      mSourceMgr(SourceMgr), mAllowRSPrefix(AllowRSPrefix),
+      mSourceMgr(SourceMgr), mASTPrint(Opts.mASTPrint), mAllowRSPrefix(AllowRSPrefix),
       mIsFilterscript(IsFilterscript), mExportVarMetadata(nullptr),
       mExportFuncMetadata(nullptr), mExportForEachNameMetadata(nullptr),
-      mExportForEachSignatureMetadata(nullptr), mExportTypeMetadata(nullptr),
-      mRSObjectSlotsMetadata(nullptr), mRefCount(mContext->getASTContext()),
+      mExportForEachSignatureMetadata(nullptr), mExportReduceMetadata(nullptr),
+      mExportTypeMetadata(nullptr), mRSObjectSlotsMetadata(nullptr),
+      mRefCount(mContext->getASTContext()),
       mASTChecker(Context, Context->getTargetAPI(), IsFilterscript),
       mLLVMContext(llvm::getGlobalContext()), mDiagEngine(*DiagEngine),
       mCodeGenOpts(CodeGenOpts), mPragmas(Pragmas) {
@@ -241,25 +234,12 @@
   mpModule = mGen->GetModule();
 }
 
-// Encase the Bitcode in a wrapper containing RS version information.
-void Backend::WrapBitcode(llvm::raw_string_ostream &Bitcode) {
-  bcinfo::AndroidBitcodeWrapper wrapper;
-  size_t actualWrapperLen = bcinfo::writeAndroidBitcodeWrapper(
-      &wrapper, Bitcode.str().length(), getTargetAPI(),
-      SlangVersion::CURRENT, mCodeGenOpts.OptimizationLevel);
-
-  slangAssert(actualWrapperLen > 0);
-
-  // Write out the bitcode wrapper.
-  mBufferOutStream.write(reinterpret_cast<char*>(&wrapper), actualWrapperLen);
-
-  // Write out the actual encoded bitcode.
-  mBufferOutStream << Bitcode.str();
-}
-
 void Backend::HandleTranslationUnit(clang::ASTContext &Ctx) {
   HandleTranslationUnitPre(Ctx);
 
+  if (mASTPrint)
+    Ctx.getTranslationUnitDecl()->dump();
+
   mGen->HandleTranslationUnit(Ctx);
 
   // Here, we complete a translation unit (whole translation unit is now in LLVM
@@ -346,40 +326,8 @@
       break;
     }
     case Slang::OT_Bitcode: {
-      llvm::legacy::PassManager *BCEmitPM = new llvm::legacy::PassManager();
-      std::string BCStr;
-      llvm::raw_string_ostream Bitcode(BCStr);
-      unsigned int TargetAPI = getTargetAPI();
-      switch (TargetAPI) {
-        case SLANG_HC_TARGET_API:
-        case SLANG_HC_MR1_TARGET_API:
-        case SLANG_HC_MR2_TARGET_API: {
-          // Pre-ICS targets must use the LLVM 2.9 BitcodeWriter
-          BCEmitPM->add(llvm_2_9::createBitcodeWriterPass(Bitcode));
-          break;
-        }
-        case SLANG_ICS_TARGET_API:
-        case SLANG_ICS_MR1_TARGET_API: {
-          // ICS targets must use the LLVM 2.9_func BitcodeWriter
-          BCEmitPM->add(llvm_2_9_func::createBitcodeWriterPass(Bitcode));
-          break;
-        }
-        default: {
-          if (TargetAPI != SLANG_DEVELOPMENT_TARGET_API &&
-              (TargetAPI < SLANG_MINIMUM_TARGET_API ||
-               TargetAPI > SLANG_MAXIMUM_TARGET_API)) {
-            slangAssert(false && "Invalid target API value");
-          }
-          // Switch to the 3.2 BitcodeWriter by default, and don't use
-          // LLVM's included BitcodeWriter at all (for now).
-          BCEmitPM->add(llvm_3_2::createBitcodeWriterPass(Bitcode));
-          //BCEmitPM->add(llvm::createBitcodeWriterPass(Bitcode));
-          break;
-        }
-      }
-
-      BCEmitPM->run(*mpModule);
-      WrapBitcode(Bitcode);
+      writeBitcode(mBufferOutStream, *mpModule, getTargetAPI(),
+                   mCodeGenOpts.OptimizationLevel);
       break;
     }
     case Slang::OT_Nothing: {
@@ -778,6 +726,26 @@
   }
 }
 
+void Backend::dumpExportReduceInfo(llvm::Module *M) {
+  // Create (or look up) the named metadata node on first use.
+  if (mExportReduceMetadata == nullptr)
+    mExportReduceMetadata = M->getOrInsertNamedMetadata(RS_EXPORT_REDUCE_MN);
+
+  // Add the names of the reduce-style kernel functions to the metadata node:
+  // one operand per kernel, each an MDNode holding just the kernel's name.
+  for (auto Cur = mContext->export_reduce_begin(),
+            Last = mContext->export_reduce_end(); Cur != Last; ++Cur) {
+    // A fresh single-element operand list for this kernel.
+    llvm::SmallVector<llvm::Metadata *, 1> KernelInfo;
+
+    KernelInfo.push_back(
+      llvm::MDString::get(mLLVMContext, (*Cur)->getName().c_str()));
+
+    mExportReduceMetadata->addOperand(
+      llvm::MDNode::get(mLLVMContext, KernelInfo));
+  }
+}
+
 void Backend::dumpExportTypeInfo(llvm::Module *M) {
   llvm::SmallVector<llvm::Metadata *, 1> ExportTypeInfo;
 
@@ -856,6 +824,9 @@
   if (mContext->hasExportForEach())
     dumpExportForEachInfo(M);
 
+  if (mContext->hasExportReduce())
+    dumpExportReduceInfo(M);
+
   if (mContext->hasExportType())
     dumpExportTypeInfo(M);
 }
diff --git a/slang_backend.h b/slang_backend.h
index e3dbdef..83912fd 100644
--- a/slang_backend.h
+++ b/slang_backend.h
@@ -81,12 +81,12 @@
   void CreateModulePasses();
   bool CreateCodeGenPasses();
 
-  void WrapBitcode(llvm::raw_string_ostream &Bitcode);
-
   RSContext *mContext;
 
   clang::SourceManager &mSourceMgr;
 
+  bool mASTPrint;
+
   bool mAllowRSPrefix;
 
   bool mIsFilterscript;
@@ -95,6 +95,7 @@
   llvm::NamedMDNode *mExportFuncMetadata;
   llvm::NamedMDNode *mExportForEachNameMetadata;
   llvm::NamedMDNode *mExportForEachSignatureMetadata;
+  llvm::NamedMDNode *mExportReduceMetadata;
   llvm::NamedMDNode *mExportTypeMetadata;
   llvm::NamedMDNode *mRSObjectSlotsMetadata;
 
@@ -107,6 +108,7 @@
   void dumpExportVarInfo(llvm::Module *M);
   void dumpExportFunctionInfo(llvm::Module *M);
   void dumpExportForEachInfo(llvm::Module *M);
+  void dumpExportReduceInfo(llvm::Module *M);
   void dumpExportTypeInfo(llvm::Module *M);
 
  protected:
@@ -136,6 +138,7 @@
  public:
   Backend(RSContext *Context,
             clang::DiagnosticsEngine *DiagEngine,
+            const RSCCOptions &Opts,
             const clang::CodeGenOptions &CodeGenOpts,
             const clang::TargetOptions &TargetOpts,
             PragmaList *Pragmas,
diff --git a/slang_bitcode_gen.cpp b/slang_bitcode_gen.cpp
new file mode 100644
index 0000000..83d96bf
--- /dev/null
+++ b/slang_bitcode_gen.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright 2015, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "bcinfo/BitcodeWrapper.h"
+
+#include "llvm/Support/raw_ostream.h"
+
+#include "BitWriter_2_9/ReaderWriter_2_9.h"
+#include "BitWriter_2_9_func/ReaderWriter_2_9_func.h"
+#include "BitWriter_3_2/ReaderWriter_3_2.h"
+
+#include "slang_assert.h"
+#include "slang_bitcode_gen.h"
+#include "slang_version.h"
+
+namespace slang {
+
+void writeBitcode(llvm::raw_ostream &Out,
+                  const llvm::Module &M,
+                  uint32_t TargetAPI,
+                  uint32_t OptimizationLevel) {
+  // Serialize into memory first: the wrapper written ahead of the bitcode
+  // is given the bitcode's length.
+  std::string Buffer;
+  llvm::raw_string_ostream BufferOS(Buffer);
+
+  // Pick the bitcode writer from the target API level.
+  const bool IsPreICS = TargetAPI == SLANG_HC_TARGET_API ||
+                        TargetAPI == SLANG_HC_MR1_TARGET_API ||
+                        TargetAPI == SLANG_HC_MR2_TARGET_API;
+  const bool IsICS = TargetAPI == SLANG_ICS_TARGET_API ||
+                     TargetAPI == SLANG_ICS_MR1_TARGET_API;
+
+  if (IsPreICS) {
+    // Pre-ICS targets must use the LLVM 2.9 BitcodeWriter
+    llvm_2_9::WriteBitcodeToFile(&M, BufferOS);
+  } else if (IsICS) {
+    // ICS targets must use the LLVM 2.9_func BitcodeWriter
+    llvm_2_9_func::WriteBitcodeToFile(&M, BufferOS);
+  } else {
+    const bool InSupportedRange = TargetAPI >= SLANG_MINIMUM_TARGET_API &&
+                                  TargetAPI <= SLANG_MAXIMUM_TARGET_API;
+    if (TargetAPI != SLANG_DEVELOPMENT_TARGET_API && !InSupportedRange) {
+      slangAssert(false && "Invalid target API value");
+    }
+    // Every other target uses the 3.2 BitcodeWriter; LLVM's included
+    // BitcodeWriter is not used at all (for now).
+    llvm_3_2::WriteBitcodeToFile(&M, BufferOS);
+  }
+
+  // Compiler version recorded in the wrapper alongside the target API.
+  const uint32_t CompilerVersion = SlangVersion::CURRENT;
+  const std::string &Payload = BufferOS.str();
+
+  // Create the bitcode wrapper.
+  bcinfo::AndroidBitcodeWrapper Wrapper;
+  size_t WrapperLen = bcinfo::writeAndroidBitcodeWrapper(
+        &Wrapper, Payload.length(), TargetAPI,
+        CompilerVersion, OptimizationLevel);
+
+  slangAssert(WrapperLen > 0);
+
+  // Write out the wrapper header, then the encoded bitcode.
+  Out.write(reinterpret_cast<char*>(&Wrapper), WrapperLen);
+  Out << Payload;
+}
+
+}  // namespace slang
diff --git a/slang_bitcode_gen.h b/slang_bitcode_gen.h
new file mode 100644
index 0000000..cc0e9f6
--- /dev/null
+++ b/slang_bitcode_gen.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2015, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _FRAMEWORKS_COMPILE_SLANG_SLANG_BITCODE_GEN_H_  // NOLINT
+#define _FRAMEWORKS_COMPILE_SLANG_SLANG_BITCODE_GEN_H_
+
+#include <cstdint>
+
+namespace llvm {
+  class raw_ostream;
+  class Module;
+}
+
+namespace slang {
+
+// Write the LLVM bitcode for module M to Out, preceded by a wrapper that
+// records TargetAPI, the compiler version and OptimizationLevel.
+void writeBitcode(llvm::raw_ostream &Out,
+                  const llvm::Module &M,
+                  uint32_t TargetAPI,
+                  uint32_t OptimizationLevel);
+
+} // end namespace slang
+
+#endif  // _FRAMEWORKS_COMPILE_SLANG_SLANG_BITCODE_GEN_H_  NOLINT
diff --git a/slang_rs_check_ast.cpp b/slang_rs_check_ast.cpp
index 7c96291..3d3e886 100644
--- a/slang_rs_check_ast.cpp
+++ b/slang_rs_check_ast.cpp
@@ -14,11 +14,14 @@
  * limitations under the License.
  */
 
+#include "clang/AST/Attr.h"
+
 #include "slang_rs_check_ast.h"
 
 #include "slang_assert.h"
 #include "slang.h"
 #include "slang_rs_export_foreach.h"
+#include "slang_rs_export_reduce.h"
 #include "slang_rs_export_type.h"
 
 namespace slang {
@@ -148,6 +151,31 @@
     return;
   }
 
+  if (FD->hasAttr<clang::KernelAttr>()) {
+    // Validate that the kernel attribute is not used with static.
+    if (FD->getStorageClass() == clang::SC_Static) {
+      Context->ReportError(FD->getLocation(),
+                           "Invalid use of attribute kernel with "
+                           "static function declaration: %0")
+        << FD->getName();
+      mValid = false;
+    }
+
+    // We allow no arguments to the attribute, or an expected single
+    // argument. If there is an expected single argument, we verify
+    // that it is one of the recognized kernel kinds.
+    llvm::StringRef KernelKind =
+      FD->getAttr<clang::KernelAttr>()->getKernelKind();
+
+    if (!KernelKind.empty() && !KernelKind.equals("reduce")) {
+      Context->ReportError(FD->getLocation(),
+                           "Unknown kernel attribute argument '%0' "
+                           "in declaration of function '%1'")
+        << KernelKind << FD->getName();
+      mValid = false;
+    }
+  }
+
   clang::QualType resultType = FD->getReturnType().getCanonicalType();
   bool isExtern = (FD->getFormalLinkage() == clang::ExternalLinkage);
 
@@ -169,7 +197,8 @@
   }
 
   bool saveKernel = mInKernel;
-  mInKernel = RSExportForEach::isRSForEachFunc(mTargetAPI, Context, FD);
+  mInKernel = RSExportForEach::isRSForEachFunc(mTargetAPI, FD) ||
+              RSExportReduce::isRSReduceFunc(mTargetAPI, FD);
 
   if (clang::Stmt *Body = FD->getBody()) {
     Visit(Body);
diff --git a/slang_rs_context.cpp b/slang_rs_context.cpp
index 81b3b49..94eb6be 100644
--- a/slang_rs_context.cpp
+++ b/slang_rs_context.cpp
@@ -34,11 +34,13 @@
 #include "slang_assert.h"
 #include "slang_rs_export_foreach.h"
 #include "slang_rs_export_func.h"
+#include "slang_rs_export_reduce.h"
 #include "slang_rs_export_type.h"
 #include "slang_rs_export_var.h"
 #include "slang_rs_exportable.h"
 #include "slang_rs_pragma_handler.h"
 #include "slang_rs_reflection.h"
+#include "slang_rs_special_func.h"
 
 namespace slang {
 
@@ -98,25 +100,37 @@
     return false;
   }
 
-  if (RSExportForEach::isSpecialRSFunc(mTargetAPI, FD)) {
+  // Specialized function
+  if (RSSpecialFunc::isSpecialRSFunc(mTargetAPI, FD)) {
     // Do not reflect specialized functions like init, dtor, or graphics root.
-    return RSExportForEach::validateSpecialFuncDecl(mTargetAPI, this, FD);
-  } else if (RSExportForEach::isRSForEachFunc(mTargetAPI, this, FD)) {
-    RSExportForEach *EFE = RSExportForEach::Create(this, FD);
-    if (EFE == nullptr)
-      return false;
-    else
+    return RSSpecialFunc::validateSpecialFuncDecl(mTargetAPI, this, FD);
+  }
+
+  // Foreach kernel
+  if (RSExportForEach::isRSForEachFunc(mTargetAPI, FD)) {
+    if (auto *EFE = RSExportForEach::Create(this, FD)) {
       mExportForEach.push_back(EFE);
+      return true;
+    }
+    return false;
+  }
+
+  // Reduce kernel
+  if (RSExportReduce::isRSReduceFunc(mTargetAPI, FD)) {
+    if (auto *ER = RSExportReduce::Create(this, FD)) {
+      mExportReduce.push_back(ER);
+      return true;
+    }
+    return false;
+  }
+
+  // Invokable
+  if (auto *EF = RSExportFunc::Create(this, FD)) {
+    mExportFuncs.push_back(EF);
     return true;
   }
 
-  RSExportFunc *EF = RSExportFunc::Create(this, FD);
-  if (EF == nullptr)
-    return false;
-  else
-    mExportFuncs.push_back(EF);
-
-  return true;
+  return false;
 }
 
 
@@ -258,7 +272,7 @@
   if (mExportTypes.insert(NewItem)) {
     return true;
   } else {
-    free(NewItem);
+    NewItem->Destroy(mExportTypes.getAllocator());
     return false;
   }
 }
diff --git a/slang_rs_context.h b/slang_rs_context.h
index 8e9b577..8cced4d 100644
--- a/slang_rs_context.h
+++ b/slang_rs_context.h
@@ -48,6 +48,7 @@
   class RSExportVar;
   class RSExportFunc;
   class RSExportForEach;
+  class RSExportReduce;
   class RSExportType;
 
 class RSContext {
@@ -60,6 +61,7 @@
   typedef std::list<RSExportVar*> ExportVarList;
   typedef std::list<RSExportFunc*> ExportFuncList;
   typedef std::list<RSExportForEach*> ExportForEachList;
+  typedef std::list<RSExportReduce*> ExportReduceList;
   typedef llvm::StringMap<RSExportType*> ExportTypeMap;
 
  private:
@@ -100,6 +102,7 @@
   ExportVarList mExportVars;
   ExportFuncList mExportFuncs;
   ExportForEachList mExportForEach;
+  ExportReduceList mExportReduce;
   ExportTypeMap mExportTypes;
 
  public:
@@ -198,6 +201,15 @@
   }
   inline bool hasExportForEach() const { return !mExportForEach.empty(); }
 
+  typedef ExportReduceList::const_iterator const_export_reduce_iterator;
+  const_export_reduce_iterator export_reduce_begin() const {
+    return mExportReduce.begin();
+  }
+  const_export_reduce_iterator export_reduce_end() const {
+    return mExportReduce.end();
+  }
+  inline bool hasExportReduce() const { return !mExportReduce.empty(); }
+
   typedef ExportTypeMap::iterator export_type_iterator;
   typedef ExportTypeMap::const_iterator const_export_type_iterator;
   export_type_iterator export_types_begin() { return mExportTypes.begin(); }
diff --git a/slang_rs_export_foreach.cpp b/slang_rs_export_foreach.cpp
index d539e13..6a2d89e 100644
--- a/slang_rs_export_foreach.cpp
+++ b/slang_rs_export_foreach.cpp
@@ -30,6 +30,7 @@
 #include "slang_assert.h"
 #include "slang_rs_context.h"
 #include "slang_rs_export_type.h"
+#include "slang_rs_special_func.h"
 #include "slang_version.h"
 
 namespace {
@@ -89,8 +90,15 @@
   return ret;
 }
 
+bool isRootRSFunc(const clang::FunctionDecl *FD) {
+  if (!FD) {
+    return false;
+  }
+  return FD->getName().equals("root");
 }
 
+} // end anonymous namespace
+
 namespace slang {
 
 // This function takes care of additional validation and construction of
@@ -533,14 +541,20 @@
     }
   }
 
-  if (FE->hasIns()) {
+  // Construct type information about inputs and outputs. Return null when
+  // there is an error exporting types.
 
+  bool TypeExportError = false;
+
+  if (FE->hasIns()) {
     for (InIter BI = FE->mIns.begin(), EI = FE->mIns.end(); BI != EI; BI++) {
       const clang::Type *T = (*BI)->getType().getCanonicalType().getTypePtr();
       RSExportType *InExportType = RSExportType::Create(Context, T);
 
-      if (FE->mIsKernelStyle) {
-        slangAssert(InExportType != nullptr);
+      // It is not an error if we don't export an input type for legacy
+      // kernels. This can happen in the case of a void pointer.
+      if (FE->mIsKernelStyle && !InExportType) {
+        TypeExportError = true;
       }
 
       FE->mInTypes.push_back(InExportType);
@@ -548,12 +562,21 @@
   }
 
   if (FE->mIsKernelStyle && FE->mHasReturnType) {
-    const clang::Type *T = FE->mResultType.getTypePtr();
-    FE->mOutType = RSExportType::Create(Context, T);
-    slangAssert(FE->mOutType);
+    const clang::Type *ReturnType = FE->mResultType.getTypePtr();
+    FE->mOutType = RSExportType::Create(Context, ReturnType);
+    TypeExportError |= !FE->mOutType;
   } else if (FE->mOut) {
-    const clang::Type *T = FE->mOut->getType().getCanonicalType().getTypePtr();
-    FE->mOutType = RSExportType::Create(Context, T);
+    const clang::Type *OutType =
+        FE->mOut->getType().getCanonicalType().getTypePtr();
+    FE->mOutType = RSExportType::Create(Context, OutType);
+    // It is not an error if we don't export an output type.
+    // This can happen in the case of a void pointer.
+  }
+
+  if (TypeExportError) {
+    slangAssert(Context->getDiagnostics()->hasErrorOccurred() &&
+                "Error exporting type but no diagnostic message issued!");
+    return nullptr;
   }
 
   return FE;
@@ -567,54 +590,16 @@
   return FE;
 }
 
-bool RSExportForEach::isGraphicsRootRSFunc(unsigned int targetAPI,
-                                           const clang::FunctionDecl *FD) {
-  if (FD->hasAttr<clang::KernelAttr>()) {
-    return false;
-  }
-
-  if (!isRootRSFunc(FD)) {
-    return false;
-  }
-
-  if (FD->getNumParams() == 0) {
-    // Graphics root function
-    return true;
-  }
-
-  // Check for legacy graphics root function (with single parameter).
-  if ((targetAPI < SLANG_ICS_TARGET_API) && (FD->getNumParams() == 1)) {
-    const clang::QualType &IntType = FD->getASTContext().IntTy;
-    if (FD->getReturnType().getCanonicalType() == IntType) {
-      return true;
-    }
-  }
-
-  return false;
-}
-
 bool RSExportForEach::isRSForEachFunc(unsigned int targetAPI,
-                                      slang::RSContext* Context,
                                       const clang::FunctionDecl *FD) {
-  slangAssert(Context && FD);
-  bool hasKernelAttr = FD->hasAttr<clang::KernelAttr>();
+  slangAssert(FD);
 
-  if (FD->getStorageClass() == clang::SC_Static) {
-    if (hasKernelAttr) {
-      Context->ReportError(FD->getLocation(),
-                           "Invalid use of attribute kernel with "
-                           "static function declaration: %0")
-          << FD->getName();
-    }
-    return false;
+  // Anything tagged as a kernel("") is definitely used with ForEach.
+  if (auto *Kernel = FD->getAttr<clang::KernelAttr>()) {
+    return Kernel->getKernelKind().empty();
   }
 
-  // Anything tagged as a kernel is definitely used with ForEach.
-  if (hasKernelAttr) {
-    return true;
-  }
-
-  if (isGraphicsRootRSFunc(targetAPI, FD)) {
+  if (RSSpecialFunc::isGraphicsRootRSFunc(targetAPI, FD)) {
     return false;
   }
 
@@ -642,57 +627,4 @@
   return false;
 }
 
-bool
-RSExportForEach::validateSpecialFuncDecl(unsigned int targetAPI,
-                                         slang::RSContext *Context,
-                                         clang::FunctionDecl const *FD) {
-  slangAssert(Context && FD);
-  bool valid = true;
-  const clang::ASTContext &C = FD->getASTContext();
-  const clang::QualType &IntType = FD->getASTContext().IntTy;
-
-  if (isGraphicsRootRSFunc(targetAPI, FD)) {
-    if ((targetAPI < SLANG_ICS_TARGET_API) && (FD->getNumParams() == 1)) {
-      // Legacy graphics root function
-      const clang::ParmVarDecl *PVD = FD->getParamDecl(0);
-      clang::QualType QT = PVD->getType().getCanonicalType();
-      if (QT != IntType) {
-        Context->ReportError(PVD->getLocation(),
-                             "invalid parameter type for legacy "
-                             "graphics root() function: %0")
-            << PVD->getType();
-        valid = false;
-      }
-    }
-
-    // Graphics root function, so verify that it returns an int
-    if (FD->getReturnType().getCanonicalType() != IntType) {
-      Context->ReportError(FD->getLocation(),
-                           "root() is required to return "
-                           "an int for graphics usage");
-      valid = false;
-    }
-  } else if (isInitRSFunc(FD) || isDtorRSFunc(FD)) {
-    if (FD->getNumParams() != 0) {
-      Context->ReportError(FD->getLocation(),
-                           "%0(void) is required to have no "
-                           "parameters")
-          << FD->getName();
-      valid = false;
-    }
-
-    if (FD->getReturnType().getCanonicalType() != C.VoidTy) {
-      Context->ReportError(FD->getLocation(),
-                           "%0(void) is required to have a void "
-                           "return type")
-          << FD->getName();
-      valid = false;
-    }
-  } else {
-    slangAssert(false && "must be called on root, init or .rs.dtor function!");
-  }
-
-  return valid;
-}
-
 }  // namespace slang
diff --git a/slang_rs_export_foreach.h b/slang_rs_export_foreach.h
index 581d8a1..033e9ed 100644
--- a/slang_rs_export_foreach.h
+++ b/slang_rs_export_foreach.h
@@ -160,48 +160,8 @@
     return mParamPacketType->fields_end();
   }
 
-  inline static bool isInitRSFunc(const clang::FunctionDecl *FD) {
-    if (!FD) {
-      return false;
-    }
-    const llvm::StringRef Name = FD->getName();
-    static llvm::StringRef FuncInit("init");
-    return Name.equals(FuncInit);
-  }
-
-  inline static bool isRootRSFunc(const clang::FunctionDecl *FD) {
-    if (!FD) {
-      return false;
-    }
-    const llvm::StringRef Name = FD->getName();
-    static llvm::StringRef FuncRoot("root");
-    return Name.equals(FuncRoot);
-  }
-
-  inline static bool isDtorRSFunc(const clang::FunctionDecl *FD) {
-    if (!FD) {
-      return false;
-    }
-    const llvm::StringRef Name = FD->getName();
-    static llvm::StringRef FuncDtor(".rs.dtor");
-    return Name.equals(FuncDtor);
-  }
-
-  static bool isGraphicsRootRSFunc(unsigned int targetAPI,
-                                   const clang::FunctionDecl *FD);
-
-  static bool isRSForEachFunc(unsigned int targetAPI, slang::RSContext *Context,
+  static bool isRSForEachFunc(unsigned int targetAPI,
                               const clang::FunctionDecl *FD);
-
-  inline static bool isSpecialRSFunc(unsigned int targetAPI,
-                                     const clang::FunctionDecl *FD) {
-    return isGraphicsRootRSFunc(targetAPI, FD) || isInitRSFunc(FD) ||
-           isDtorRSFunc(FD);
-  }
-
-  static bool validateSpecialFuncDecl(unsigned int targetAPI,
-                                      slang::RSContext *Context,
-                                      const clang::FunctionDecl *FD);
 };  // RSExportForEach
 
 }  // namespace slang
diff --git a/slang_rs_export_reduce.cpp b/slang_rs_export_reduce.cpp
new file mode 100644
index 0000000..eae72ad
--- /dev/null
+++ b/slang_rs_export_reduce.cpp
@@ -0,0 +1,183 @@
+/*
+ * Copyright 2015, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "slang_rs_export_reduce.h"
+
+#include <algorithm>
+#include <string>
+
+#include "clang/AST/Attr.h"
+#include "clang/AST/ASTContext.h"
+
+#include "slang_assert.h"
+#include "slang_rs_context.h"
+#include "slang_rs_export_type.h"
+#include "slang_version.h"
+
+
+namespace {
+
+bool haveReduceInTargetAPI(unsigned int TargetAPI) {
+  return TargetAPI == RS_DEVELOPMENT_API;  // no other API level is accepted
+}
+
+} // end anonymous namespace
+
+
+namespace slang {
+
+// Validate the parameters to a reduce kernel, and set up the
+// exportable object if the kernel is valid.
+//
+// This checks that the passed function declaration of a reduce kernel is
+// a function which satisfies all the requirements for a reduce
+// kernel. Namely, we check for:
+//  - correct target API
+//  - correct parameter count
+//  - non void return type
+//  - return type and parameter types match
+//  - no pointer types in signature.
+//
+// We try to report useful errors to the user.
+//
+// On success, this function returns true and sets the fields mIns and
+// mType to point to the arguments and to the kernel type.
+//
+// If an error was detected, this function returns false.
+bool RSExportReduce::validateAndConstructParams(
+    RSContext *Context, const clang::FunctionDecl *FD) {
+  slangAssert(Context && FD);
+  bool Valid = true;  // cleared by each failed check; errors accumulate
+
+  clang::ASTContext &ASTCtx = FD->getASTContext();
+
+  // Validate API version.
+  if (!haveReduceInTargetAPI(Context->getTargetAPI())) {
+    Context->ReportError(FD->getLocation(),
+                         "Reduce-style kernel %0() unsupported in SDK level %1")
+      << FD->getName() << Context->getTargetAPI();
+    Valid = false;
+  }
+
+  // Validate parameter count.
+  if (FD->getNumParams() != 2) {
+    Context->ReportError(FD->getLocation(),
+                         "Reduce-style kernel %0() must take 2 parameters "
+                         "(found %1).")
+      << FD->getName() << FD->getNumParams();
+    Valid = false;
+  }
+
+  // Validate return type.
+  const clang::QualType ReturnTy = FD->getReturnType().getCanonicalType();
+
+  if (ReturnTy->isVoidType()) {
+    Context->ReportError(FD->getLocation(),
+                         "Reduce-style kernel %0() cannot return void")
+      << FD->getName();
+    Valid = false;
+  } else if (ReturnTy->isPointerType()) {
+    Context->ReportError(FD->getLocation(),
+                         "Reduce-style kernel %0() cannot return a pointer "
+                         "type: %1")
+      << FD->getName() << ReturnTy.getAsString();
+    Valid = false;
+  }
+
+  // Validate parameter types (a zero-parameter kernel was reported above).
+  if (FD->getNumParams() == 0) {
+    return false;
+  }
+
+  const clang::ParmVarDecl &FirstParam = *FD->getParamDecl(0);
+  const clang::QualType FirstParamTy = FirstParam.getType().getCanonicalType();
+
+  for (auto PVD = FD->param_begin(), PE = FD->param_end(); PVD != PE; ++PVD) {
+    const clang::ParmVarDecl &Param = **PVD;
+    const clang::QualType ParamTy = Param.getType().getCanonicalType();
+
+    // Check that the parameter is not a pointer.
+    if (ParamTy->isPointerType()) {
+      Context->ReportError(Param.getLocation(),
+                           "Reduce-style kernel %0() cannot have "
+                           "parameter '%1' of pointer type: '%2'")
+        << FD->getName() << Param.getName() << ParamTy.getAsString();
+      Valid = false;
+    }
+
+    // Check for type mismatch between this parameter and the return type.
+    if (!ASTCtx.hasSameUnqualifiedType(ReturnTy, ParamTy)) {
+      Context->ReportError(FD->getLocation(),
+                           "Reduce-style kernel %0() return type '%1' is not "
+                           "the same type as parameter '%2' (type '%3')")
+        << FD->getName() << ReturnTy.getAsString() << Param.getName()
+        << ParamTy.getAsString();
+      Valid = false;
+    }
+
+    // Check for type mismatch between parameters by comparing each with the
+    // first. NOTE(review): compares QualTypes exactly, qualifiers included.
+    if (ParamTy != FirstParamTy) {
+      Context->ReportError(FirstParam.getLocation(),
+                           "In reduce-style kernel %0(): parameter '%1' "
+                           "(type '%2') does not have the same type as "
+                           "parameter '%3' (type '%4')")
+        << FD->getName() << FirstParam.getName() << FirstParamTy.getAsString()
+        << Param.getName() << ParamTy.getAsString();
+      Valid = false;
+    }
+  }
+
+  if (Valid) {
+    // Validation passed: export the kernel type, then copy both parameters
+    // into mIns (the assert below checks it already holds two slots).
+    if (!(mType = RSExportType::Create(Context, ReturnTy.getTypePtr()))) {
+      // There was an error exporting the type for the reduce kernel.
+      return false;
+    }
+
+    slangAssert(mIns.size() == 2 && FD->param_end() - FD->param_begin() == 2);
+    std::copy(FD->param_begin(), FD->param_end(), mIns.begin());
+  }
+
+  return Valid;
+}
+
+RSExportReduce *RSExportReduce::Create(RSContext *Context,
+                                       const clang::FunctionDecl *FD) {
+  slangAssert(Context && FD);
+  llvm::StringRef Name = FD->getName();
+
+  slangAssert(!Name.empty() && "Function must have a name");
+
+  RSExportReduce *RE = new RSExportReduce(Context, Name);
+
+  if (!RE->validateAndConstructParams(Context, FD)) {
+    // Do not delete RE: Context owns it (presumably via RSExportable's ctor).
+    return nullptr;
+  }
+
+  return RE;
+}
+
+bool RSExportReduce::isRSReduceFunc(unsigned int /* targetAPI (unused) */,
+                                    const clang::FunctionDecl *FD) {
+  slangAssert(FD);  // FD is dereferenced unconditionally below
+  clang::KernelAttr *KernelAttrOrNull = FD->getAttr<clang::KernelAttr>();
+  return KernelAttrOrNull && KernelAttrOrNull->getKernelKind().equals("reduce");
+}
+
+}  // namespace slang
diff --git a/slang_rs_export_reduce.h b/slang_rs_export_reduce.h
new file mode 100644
index 0000000..9df27ae
--- /dev/null
+++ b/slang_rs_export_reduce.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2015, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _FRAMEWORKS_COMPILE_SLANG_SLANG_RS_EXPORT_REDUCE_H_  // NOLINT
+#define _FRAMEWORKS_COMPILE_SLANG_SLANG_RS_EXPORT_REDUCE_H_
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/SmallVector.h"
+
+#include "slang_rs_context.h"
+#include "slang_rs_exportable.h"
+#include "slang_rs_export_type.h"
+
+namespace clang {
+  class FunctionDecl;
+}  // namespace clang
+
+namespace slang {
+
+// Exportable (kind EX_REDUCE) describing one reduce-style kernel
+class RSExportReduce : public RSExportable {
+ public:
+  typedef llvm::SmallVectorImpl<const clang::ParmVarDecl*> InVec;
+  typedef InVec::const_iterator InIter;
+
+ private:
+  // Name of the kernel function
+  std::string mName;
+  // Exported type shared by the kernel's return value and parameters
+  RSExportType *mType;
+  // The kernel's two parameter declarations
+  llvm::SmallVector<const clang::ParmVarDecl *, 2> mIns;
+
+  RSExportReduce(RSContext *Context, const llvm::StringRef &Name)
+    : RSExportable(Context, RSExportable::EX_REDUCE),
+      mName(Name.data(), Name.size()), mType(nullptr), mIns(2) {
+  }
+
+  RSExportReduce(const RSExportReduce &) = delete;
+  RSExportReduce &operator=(const RSExportReduce &) = delete;
+
+  // Validate FD as a reduce kernel; on success fill in mType and mIns.
+  // Returns false if any check fails.
+  bool validateAndConstructParams(RSContext *Context,
+                                  const clang::FunctionDecl *FD);
+
+ public:
+  static RSExportReduce *Create(RSContext *Context,
+                                const clang::FunctionDecl *FD);
+
+  const std::string &getName() const {
+    return mName;
+  }
+
+  const InVec &getIns() const {
+    return mIns;
+  }
+
+  const RSExportType *getType() const {
+    return mType;
+  }
+
+  static bool isRSReduceFunc(unsigned int targetAPI,
+                             const clang::FunctionDecl *FD);
+
+};  // RSExportReduce
+
+}  // namespace slang
+
+#endif  // _FRAMEWORKS_COMPILE_SLANG_SLANG_RS_EXPORT_REDUCE_H_  NOLINT
diff --git a/slang_rs_export_type.cpp b/slang_rs_export_type.cpp
index d0b6094..efe989c 100644
--- a/slang_rs_export_type.cpp
+++ b/slang_rs_export_type.cpp
@@ -41,49 +41,59 @@
 
 namespace {
 
-/* For the data types we support, their category, names, and size (in bits).
- *
- * IMPORTANT: The data types in this table should be at the same index
- * as specified by the corresponding DataType enum.
- */
+// For the data types we support:
+//  Category      - data type category
+//  RsType        - element name in RenderScript
+//  RsShortType   - short element name in RenderScript
+//  SizeInBits    - size in bits
+//  CName         - reflected C name
+//  JavaName      - reflected Java name
+//  JavaArrayElementName - reflected name in Java arrays
+//  CVecName      - prefix for C vector types
+//  JavaVecName   - prefix for Java vector type
+//  JavaPromotion - unsigned type undergoing Java promotion
+//
+// IMPORTANT: The data types in this table should be at the same index as
+// specified by the corresponding DataType enum.
+//
+// TODO: Pull this information out into a separate file.
 static RSReflectionType gReflectionTypes[] = {
-    {PrimitiveDataType, "FLOAT_16", "F16", 16, "half", "short", "Half", "Half", false},
-    {PrimitiveDataType, "FLOAT_32", "F32", 32, "float", "float", "Float", "Float", false},
-    {PrimitiveDataType, "FLOAT_64", "F64", 64, "double", "double", "Double", "Double",false},
-    {PrimitiveDataType, "SIGNED_8", "I8", 8, "int8_t", "byte", "Byte", "Byte", false},
-    {PrimitiveDataType, "SIGNED_16", "I16", 16, "int16_t", "short", "Short", "Short", false},
-    {PrimitiveDataType, "SIGNED_32", "I32", 32, "int32_t", "int", "Int", "Int", false},
-    {PrimitiveDataType, "SIGNED_64", "I64", 64, "int64_t", "long", "Long", "Long", false},
-    {PrimitiveDataType, "UNSIGNED_8", "U8", 8, "uint8_t", "short", "UByte", "Short", true},
-    {PrimitiveDataType, "UNSIGNED_16", "U16", 16, "uint16_t", "int", "UShort", "Int", true},
-    {PrimitiveDataType, "UNSIGNED_32", "U32", 32, "uint32_t", "long", "UInt", "Long", true},
-    {PrimitiveDataType, "UNSIGNED_64", "U64", 64, "uint64_t", "long", "ULong", "Long", false},
+#define _ nullptr
+{PrimitiveDataType,         "FLOAT_16",     "F16", 16,     "half",   "short",        _,   "Half",   "Half", false},
+{PrimitiveDataType,         "FLOAT_32",     "F32", 32,    "float",   "float",  "float",  "Float",  "Float", false},
+{PrimitiveDataType,         "FLOAT_64",     "F64", 64,   "double",  "double", "double", "Double", "Double", false},
+{PrimitiveDataType,         "SIGNED_8",      "I8",  8,   "int8_t",    "byte",   "byte",   "Byte",   "Byte", false},
+{PrimitiveDataType,        "SIGNED_16",     "I16", 16,  "int16_t",   "short",  "short",  "Short",  "Short", false},
+{PrimitiveDataType,        "SIGNED_32",     "I32", 32,  "int32_t",     "int",    "int",    "Int",    "Int", false},
+{PrimitiveDataType,        "SIGNED_64",     "I64", 64,  "int64_t",    "long",   "long",   "Long",   "Long", false},
+{PrimitiveDataType,       "UNSIGNED_8",      "U8",  8,  "uint8_t",   "short",   "byte",  "UByte",  "Short",  true},
+{PrimitiveDataType,      "UNSIGNED_16",     "U16", 16, "uint16_t",     "int",  "short", "UShort",    "Int",  true},
+{PrimitiveDataType,      "UNSIGNED_32",     "U32", 32, "uint32_t",    "long",    "int",   "UInt",   "Long",  true},
+{PrimitiveDataType,      "UNSIGNED_64",     "U64", 64, "uint64_t",    "long",   "long",  "ULong",   "Long", false},
+{PrimitiveDataType,          "BOOLEAN", "BOOLEAN",  8,     "bool", "boolean",   "byte",        _,        _, false},
+{PrimitiveDataType,   "UNSIGNED_5_6_5",         _, 16,          _,         _,        _,        _,        _, false},
+{PrimitiveDataType, "UNSIGNED_5_5_5_1",         _, 16,          _,         _,        _,        _,        _, false},
+{PrimitiveDataType, "UNSIGNED_4_4_4_4",         _, 16,          _,         _,        _,        _,        _, false},
 
-    {PrimitiveDataType, "BOOLEAN", "BOOLEAN", 8, "bool", "boolean", nullptr, nullptr, false},
+{MatrixDataType, "MATRIX_2X2", _,  4*32, "rsMatrix_2x2", "Matrix2f", _, _, _, false},
+{MatrixDataType, "MATRIX_3X3", _,  9*32, "rsMatrix_3x3", "Matrix3f", _, _, _, false},
+{MatrixDataType, "MATRIX_4X4", _, 16*32, "rsMatrix_4x4", "Matrix4f", _, _, _, false},
 
-    {PrimitiveDataType, "UNSIGNED_5_6_5", nullptr, 16, nullptr, nullptr, nullptr, nullptr, false},
-    {PrimitiveDataType, "UNSIGNED_5_5_5_1", nullptr, 16, nullptr, nullptr, nullptr, nullptr, false},
-    {PrimitiveDataType, "UNSIGNED_4_4_4_4", nullptr, 16, nullptr, nullptr, nullptr, nullptr, false},
-
-    {MatrixDataType, "MATRIX_2X2", nullptr, 4*32, "rsMatrix_2x2", "Matrix2f", nullptr, nullptr, false},
-    {MatrixDataType, "MATRIX_3X3", nullptr, 9*32, "rsMatrix_3x3", "Matrix3f", nullptr, nullptr, false},
-    {MatrixDataType, "MATRIX_4X4", nullptr, 16*32, "rsMatrix_4x4", "Matrix4f", nullptr, nullptr, false},
-
-    // RS object types are 32 bits in 32-bit RS, but 256 bits in 64-bit RS.
-    // This is handled specially by the GetSizeInBits() method.
-    {ObjectDataType, "RS_ELEMENT", "ELEMENT", 32, "Element", "Element", nullptr, nullptr, false},
-    {ObjectDataType, "RS_TYPE", "TYPE", 32, "Type", "Type", nullptr, nullptr, false},
-    {ObjectDataType, "RS_ALLOCATION", "ALLOCATION", 32, "Allocation", "Allocation", nullptr, nullptr, false},
-    {ObjectDataType, "RS_SAMPLER", "SAMPLER", 32, "Sampler", "Sampler", nullptr, nullptr, false},
-    {ObjectDataType, "RS_SCRIPT", "SCRIPT", 32, "Script", "Script", nullptr, nullptr, false},
-    {ObjectDataType, "RS_MESH", "MESH", 32, "Mesh", "Mesh", nullptr, nullptr, false},
-    {ObjectDataType, "RS_PATH", "PATH", 32, "Path", "Path", nullptr, nullptr, false},
-
-    {ObjectDataType, "RS_PROGRAM_FRAGMENT", "PROGRAM_FRAGMENT", 32, "ProgramFragment", "ProgramFragment", nullptr, nullptr, false},
-    {ObjectDataType, "RS_PROGRAM_VERTEX", "PROGRAM_VERTEX", 32, "ProgramVertex", "ProgramVertex", nullptr, nullptr, false},
-    {ObjectDataType, "RS_PROGRAM_RASTER", "PROGRAM_RASTER", 32, "ProgramRaster", "ProgramRaster", nullptr, nullptr, false},
-    {ObjectDataType, "RS_PROGRAM_STORE", "PROGRAM_STORE", 32, "ProgramStore", "ProgramStore", nullptr, nullptr, false},
-    {ObjectDataType, "RS_FONT", "FONT", 32, "Font", "Font", nullptr, nullptr, false}
+// RS object types are 32 bits in 32-bit RS, but 256 bits in 64-bit RS.
+// This is handled specially by the GetSizeInBits() method.
+{ObjectDataType,          "RS_ELEMENT",          "ELEMENT", 32,         "Element",         "Element", _, _, _, false},
+{ObjectDataType,             "RS_TYPE",             "TYPE", 32,            "Type",            "Type", _, _, _, false},
+{ObjectDataType,       "RS_ALLOCATION",       "ALLOCATION", 32,      "Allocation",      "Allocation", _, _, _, false},
+{ObjectDataType,          "RS_SAMPLER",          "SAMPLER", 32,         "Sampler",         "Sampler", _, _, _, false},
+{ObjectDataType,           "RS_SCRIPT",           "SCRIPT", 32,          "Script",          "Script", _, _, _, false},
+{ObjectDataType,             "RS_MESH",             "MESH", 32,            "Mesh",            "Mesh", _, _, _, false},
+{ObjectDataType,             "RS_PATH",             "PATH", 32,            "Path",            "Path", _, _, _, false},
+{ObjectDataType, "RS_PROGRAM_FRAGMENT", "PROGRAM_FRAGMENT", 32, "ProgramFragment", "ProgramFragment", _, _, _, false},
+{ObjectDataType,   "RS_PROGRAM_VERTEX",   "PROGRAM_VERTEX", 32,   "ProgramVertex",   "ProgramVertex", _, _, _, false},
+{ObjectDataType,   "RS_PROGRAM_RASTER",   "PROGRAM_RASTER", 32,   "ProgramRaster",   "ProgramRaster", _, _, _, false},
+{ObjectDataType,    "RS_PROGRAM_STORE",    "PROGRAM_STORE", 32,    "ProgramStore",    "ProgramStore", _, _, _, false},
+{ObjectDataType,             "RS_FONT",             "FONT", 32,            "Font",            "Font", _, _, _, false},
+#undef _
 };
 
 const int kMaxVectorSize = 4;
diff --git a/slang_rs_export_type.h b/slang_rs_export_type.h
index 63ce388..2b2a2eb 100644
--- a/slang_rs_export_type.h
+++ b/slang_rs_export_type.h
@@ -136,14 +136,26 @@
 };
 
 typedef struct {
+    // The data type category
     DataTypeCategory category;
+    // The element name in RenderScript
     const char * rs_type;
+    // The short element name in RenderScript
     const char * rs_short_type;
+    // The size of the type in bits
     uint32_t size_in_bits;
+    // The reflected name in C code
     const char * c_name;
+    // The reflected name in Java code
     const char * java_name;
+    // The Java element type of arrays that are compatible with Allocations
+    // of our type, for use with copyTo(), copyFrom()
+    const char * java_array_element_name;
+    // The prefix for C vector types
     const char * rs_c_vector_prefix;
+    // The prefix for Java vector types
     const char * rs_java_vector_prefix;
+    // Indicates an unsigned type undergoing Java promotion
     bool java_promotion;
 } RSReflectionType;
 
diff --git a/slang_rs_exportable.h b/slang_rs_exportable.h
index 0871be3..e8fc11e 100644
--- a/slang_rs_exportable.h
+++ b/slang_rs_exportable.h
@@ -27,7 +27,8 @@
     EX_FUNC,
     EX_TYPE,
     EX_VAR,
-    EX_FOREACH
+    EX_FOREACH,
+    EX_REDUCE
   };
 
  private:
diff --git a/slang_rs_metadata.h b/slang_rs_metadata.h
index 63e7e0f..b84a8cd 100644
--- a/slang_rs_metadata.h
+++ b/slang_rs_metadata.h
@@ -33,4 +33,6 @@
 
 #define RS_EXPORT_FOREACH_MN "#rs_export_foreach"
 
+#define RS_EXPORT_REDUCE_MN "#rs_export_reduce"
+
 #endif  // _FRAMEWORKS_COMPILE_SLANG_SLANG_RS_METADATA_H_  NOLINT
diff --git a/slang_rs_reflection.cpp b/slang_rs_reflection.cpp
index 16b8e09..685cfc4 100644
--- a/slang_rs_reflection.cpp
+++ b/slang_rs_reflection.cpp
@@ -1,4 +1,3 @@
-
 /*
  * Copyright 2010-2014, The Android Open Source Project
  *
@@ -35,6 +34,7 @@
 #include "slang_rs_export_var.h"
 #include "slang_rs_export_foreach.h"
 #include "slang_rs_export_func.h"
+#include "slang_rs_export_reduce.h"
 #include "slang_rs_reflect_utils.h"
 #include "slang_version.h"
 
@@ -66,10 +66,13 @@
 
 #define RS_EXPORT_FUNC_INDEX_PREFIX "mExportFuncIdx_"
 #define RS_EXPORT_FOREACH_INDEX_PREFIX "mExportForEachIdx_"
+#define RS_EXPORT_REDUCE_INDEX_PREFIX "mExportReduceIdx_"
 
 #define RS_EXPORT_VAR_ALLOCATION_PREFIX "mAlloction_"
 #define RS_EXPORT_VAR_DATA_STORAGE_PREFIX "mData_"
 
+#define SAVED_RS_REFERENCE "mRSLocal"
+
 namespace slang {
 
 class RSReflectionJavaElementBuilder {
@@ -278,6 +281,57 @@
   return "";
 }
 
+// If FromIntegerType == DestIntegerType, then Value is returned.
+// Otherwise, return a Java expression that zero-extends the value
+// Value, assumed to be of type FromIntegerType, to the integer type
+// DestIntegerType.
+//
+// Intended operations:
+//  byte  -> {byte,int,short,long}
+//  short -> {short,int,long}
+//  int   -> {int,long}
+//  long  -> long
+static std::string ZeroExtendValue(const std::string &Value,
+                                   const std::string &FromIntegerType,
+                                   const std::string &DestIntegerType) {
+#ifndef __DISABLE_ASSERTS
+  // Integer types arranged in increasing order by width
+  const std::vector<std::string> ValidTypes{"byte", "short", "int", "long"};
+  auto FromTypeLoc = std::find(ValidTypes.begin(), ValidTypes.end(), FromIntegerType);
+  auto DestTypeLoc = std::find(ValidTypes.begin(), ValidTypes.end(), DestIntegerType);
+  // Check that both types are valid.
+  slangAssert(FromTypeLoc != ValidTypes.end());
+  slangAssert(DestTypeLoc != ValidTypes.end());
+  // Check that DestIntegerType is at least as wide as FromIntegerType.
+  slangAssert(FromTypeLoc - ValidTypes.begin() <= DestTypeLoc - ValidTypes.begin());
+#endif
+
+  if (FromIntegerType == DestIntegerType) {
+    return Value;
+  }
+
+  std::string Mask, MaskLiteralType;
+  if (FromIntegerType == "byte") {
+    Mask = "0xff";
+    MaskLiteralType = "int";
+  } else if (FromIntegerType == "short") {
+    Mask = "0xffff";
+    MaskLiteralType = "int";
+  } else if (FromIntegerType == "int") {
+    Mask = "0xffffffffL";
+    MaskLiteralType = "long";
+  } else {
+    // long -> long casts should have already been handled.
+    slangAssert(false && "Unknown integer type");
+  }
+
+  // Cast the mask to the appropriate type.
+  if (MaskLiteralType != DestIntegerType) {
+    Mask = "(" + DestIntegerType + ") " + Mask;
+  }
+  return "((" + DestIntegerType + ") ((" + Value + ") & " + Mask + "))";
+}
+
 /********************** Methods to generate script class **********************/
 RSReflectionJava::RSReflectionJava(const RSContext *Context,
                                    std::vector<std::string> *GeneratedFileNames,
@@ -295,7 +349,8 @@
                        RSSlangReflectUtils::JavaClassNameFromRSFileName(
                            mRSSourceFileName.c_str())),
       mEmbedBitcodeInJava(EmbedBitcodeInJava), mNextExportVarSlot(0),
-      mNextExportFuncSlot(0), mNextExportForEachSlot(0), mLastError(""),
+      mNextExportFuncSlot(0), mNextExportForEachSlot(0),
+      mNextExportReduceSlot(0), mLastError(""),
       mGeneratedFileNames(GeneratedFileNames), mFieldIndex(0) {
   slangAssert(mGeneratedFileNames && "Must supply GeneratedFileNames");
   slangAssert(!mPackageName.empty() && mPackageName != "-");
@@ -320,26 +375,31 @@
 
   genScriptClassConstructor();
 
-  // Reflect export variable
-  for (RSContext::const_export_var_iterator I = mRSContext->export_vars_begin(),
-                                            E = mRSContext->export_vars_end();
+  // Reflect exported variables
+  for (auto I = mRSContext->export_vars_begin(),
+            E = mRSContext->export_vars_end();
        I != E; I++)
     genExportVariable(*I);
 
-  // Reflect export for each functions (only available on ICS+)
+  // Reflect exported forEach functions (only available on ICS+)
   if (mRSContext->getTargetAPI() >= SLANG_ICS_TARGET_API) {
-    for (RSContext::const_export_foreach_iterator
-             I = mRSContext->export_foreach_begin(),
-             E = mRSContext->export_foreach_end();
-         I != E; I++)
+    for (auto I = mRSContext->export_foreach_begin(),
+              E = mRSContext->export_foreach_end();
+         I != E; I++) {
       genExportForEach(*I);
+    }
   }
 
-  // Reflect export function
-  for (RSContext::const_export_func_iterator
-           I = mRSContext->export_funcs_begin(),
-           E = mRSContext->export_funcs_end();
-       I != E; I++)
+  // Reflect exported reduce functions
+  for (auto I = mRSContext->export_reduce_begin(),
+            E = mRSContext->export_reduce_end();
+       I != E; ++I)
+    genExportReduce(*I);
+
+  // Reflect exported functions (invokable)
+  for (auto I = mRSContext->export_funcs_begin(),
+            E = mRSContext->export_funcs_end();
+       I != E; ++I)
     genExportFunction(*I);
 
   endClass();
@@ -360,6 +420,9 @@
   startFunction(AM_Public, false, nullptr, getClassName(), 1, "RenderScript",
                 "rs");
 
+  const bool haveReduceExportables =
+    mRSContext->export_reduce_begin() != mRSContext->export_reduce_end();
+
   if (getEmbedBitcodeInJava()) {
     // Call new single argument Java-only constructor
     mOut.indent() << "super(rs,\n";
@@ -387,8 +450,8 @@
 
   // If an exported variable has initial value, reflect it
 
-  for (RSContext::const_export_var_iterator I = mRSContext->export_vars_begin(),
-                                            E = mRSContext->export_vars_end();
+  for (auto I = mRSContext->export_vars_begin(),
+            E = mRSContext->export_vars_end();
        I != E; I++) {
     const RSExportVar *EV = *I;
     if (!EV->getInit().isUninit()) {
@@ -414,9 +477,14 @@
     genFieldPackerInstance(EV->getType());
   }
 
-  for (RSContext::const_export_foreach_iterator
-           I = mRSContext->export_foreach_begin(),
-           E = mRSContext->export_foreach_end();
+  if (haveReduceExportables) {
+    mOut.indent() << SAVED_RS_REFERENCE << " = rs;\n";
+  }
+
+  // Reflect argument / return types in kernels
+
+  for (auto I = mRSContext->export_foreach_begin(),
+            E = mRSContext->export_foreach_end();
        I != E; I++) {
     const RSExportForEach *EF = *I;
 
@@ -435,6 +503,13 @@
     }
   }
 
+  for (auto I = mRSContext->export_reduce_begin(),
+            E = mRSContext->export_reduce_end();
+       I != E; I++) {
+    const RSExportReduce *ER = *I;
+    genTypeInstance(ER->getType());
+  }
+
   endFunction();
 
   for (std::set<std::string>::iterator I = mTypesToCheck.begin(),
@@ -448,6 +523,12 @@
        I != E; I++) {
     mOut.indent() << "private FieldPacker " RS_FP_PREFIX << *I << ";\n";
   }
+
+  if (haveReduceExportables) {
+    // We save a private copy of rs in order to create temporary
+    // allocations in the reduce_* entry points.
+    mOut.indent() << "private RenderScript " << SAVED_RS_REFERENCE << ";\n";
+  }
 }
 
 void RSReflectionJava::genInitBoolExportVariable(const std::string &VarName,
@@ -676,6 +757,43 @@
   mOut.indent() << "}\n\n";
 }
 
+void RSReflectionJava::genNullOrEmptyArrayCheck(const std::string &ArrayName) {
+  mOut.indent() << "// Verify that \"" << ArrayName << "\" is non-null.\n";
+  mOut.indent() << "if (" << ArrayName << " == null) {\n";
+  mOut.indent() << "    throw new RSIllegalArgumentException(\"Array \\\""
+                << ArrayName << "\\\" is null!\");\n";
+  mOut.indent() << "}\n";
+  mOut.indent() << "// Verify that \"" << ArrayName << "\" is non-empty.\n";
+  mOut.indent() << "if (" << ArrayName << ".length == 0) {\n";
+  mOut.indent() << "    throw new RSIllegalArgumentException(\"Array \\\""
+                << ArrayName << "\\\" is zero-length!\");\n";
+  mOut.indent() << "}\n";
+}
+
+void RSReflectionJava::genVectorLengthCompatibilityCheck(const std::string &ArrayName,
+                                                         unsigned VecSize) {
+  mOut.indent() << "// Verify that the array length is a multiple of the vector size.\n";
+  mOut.indent() << "if (" << ArrayName << ".length % " << std::to_string(VecSize)
+                << " != 0) {\n";
+  mOut.indent() << "    throw new RSIllegalArgumentException(\"Array \\\"" << ArrayName
+                << "\\\" is not a multiple of " << std::to_string(VecSize)
+                << " in length!\");\n";
+  mOut.indent() << "}\n";
+}
+
+void RSReflectionJava::gen1DCheck(const std::string &Name) {
+  // TODO: Check that t0.getArrayCount() == 0, when / if this API is
+  // un-hidden.
+  mOut.indent() << "Type t0 = " << Name << ".getType();\n";
+  mOut.indent() << "// Verify " << Name << " is 1D\n";
+  mOut.indent() << "if (t0.getY() != 0  ||\n";
+  mOut.indent() << "    t0.hasFaces()   ||\n";
+  mOut.indent() << "    t0.hasMipmaps()) {\n";
+  mOut.indent() << "    throw new RSIllegalArgumentException(\"Parameter "
+                << Name << " is not 1D!\");\n";
+  mOut.indent() << "}\n\n";
+}
+
 void RSReflectionJava::genExportForEach(const RSExportForEach *EF) {
   if (EF->isDummyRoot()) {
     // Skip reflection for dummy root() kernels. Note that we have to
@@ -858,6 +976,209 @@
   endFunction();
 }
 
+void RSReflectionJava::genExportReduce(const RSExportReduce *ER) {
+  // Generate the reflected function index.
+  mOut.indent() << "private final static int " << RS_EXPORT_REDUCE_INDEX_PREFIX
+                << ER->getName() << " = " << getNextExportReduceSlot()
+                << ";\n";
+
+  // Two variants of reduce_* entry points get generated:
+  // Array variant:
+  //   ty' reduce_foo(ty[] input)
+  //   ty' reduce_foo(ty[] input, int x1, int x2)
+  // Allocation variant:
+  //   void reduce_foo(Allocation ain, Allocation aout)
+  //   void reduce_foo(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+
+  const RSExportType *Type = ER->getType();
+  const std::string Name = ER->getName();
+
+  genExportReduceArrayVariant(Type, Name);
+  genExportReduceAllocationVariant(Type, Name);
+}
+
+void RSReflectionJava::genExportReduceAllocationVariant(const RSExportType *Type,
+                                                        const std::string &KernelName) {
+  const std::string FuncName = "reduce_" + KernelName;
+
+  // void reduce_foo(Allocation ain, Allocation aout)
+  startFunction(AM_Public, false, "void", FuncName, 2,
+                "Allocation", "ain",
+                "Allocation", "aout");
+  mOut.indent() << FuncName << "(ain, aout, null);\n";
+  endFunction();
+
+  // void reduce_foo(Allocation ain, Allocation aout, Script.LaunchOptions sc)
+  startFunction(AM_Public, false, "void", FuncName, 3,
+                "Allocation", "ain",
+                "Allocation", "aout",
+                "Script.LaunchOptions", "sc");
+
+  // Type checking
+  genTypeCheck(Type, "ain");
+  genTypeCheck(Type, "aout");
+
+  // Check that the input is 1D
+  gen1DCheck("ain");
+
+  // Call backend
+
+  // Script.reduce has the signature
+  //
+  // protected void
+  // reduce(int slot, Allocation ain, Allocation aout, Script.LaunchOptions sc)
+  mOut.indent() << "reduce("
+                << RS_EXPORT_REDUCE_INDEX_PREFIX << KernelName
+                << ", ain, aout, sc);\n";
+
+  endFunction();
+}
+
+void RSReflectionJava::genExportReduceArrayVariant(const RSExportType *Type,
+                                                   const std::string &KernelName) {
+  // Determine if the array variant can be generated. Some type
+  // classes cannot be reflected in Java.
+  auto Class = Type->getClass();
+  if (Class != RSExportType::ExportClassPrimitive &&
+      Class != RSExportType::ExportClassVector) {
+    return;
+  }
+
+  RSReflectionTypeData TypeData;
+  Type->convertToRTD(&TypeData);
+
+  // Check if the type supports reading back from an Allocation and
+  // returning as a first class Java type. If not, the helper cannot
+  // be generated.
+  if (!TypeData.type->java_name || !TypeData.type->java_array_element_name ||
+      (TypeData.vecSize > 1 && !TypeData.type->rs_java_vector_prefix)) {
+    return;
+  }
+
+  const std::string FuncName = "reduce_" + KernelName;
+  const std::string TypeName = GetTypeName(Type);
+  const std::string ReflectedScalarType = TypeData.type->java_name;
+  const std::string ArrayElementType = TypeData.type->java_array_element_name;
+  const std::string ArrayType = ArrayElementType + "[]";
+  const std::string ElementName = Type->getElementName();
+
+  const uint32_t VecSize = TypeData.vecSize;
+
+  std::string InLength = "in.length";
+  // Adjust the length so that it corresponds to the number of
+  // elements in the allocation.
+  if (VecSize > 1) {
+    InLength += " / " + std::to_string(VecSize);
+  }
+
+  // TypeName reduce_foo(ArrayElementType[] in)
+  startFunction(AM_Public, false, TypeName.c_str(), FuncName, 1,
+                ArrayType.c_str(), "in");
+  genNullOrEmptyArrayCheck("in");
+  if (VecSize > 1) {
+    genVectorLengthCompatibilityCheck("in", VecSize);
+  }
+  mOut.indent() << "return " << FuncName << "(in, 0, "
+                << InLength << ");\n";
+  endFunction();
+
+  // TypeName reduce_foo(ArrayElementType[] in, int x1, int x2)
+
+  startFunction(AM_Public, false, TypeName.c_str(), FuncName, 3,
+                ArrayType.c_str(), "in",
+                "int", "x1",
+                "int", "x2");
+
+  genNullOrEmptyArrayCheck("in");
+  if (VecSize > 1) {
+    genVectorLengthCompatibilityCheck("in", VecSize);
+  }
+  // Check that 0 <= x1 and x1 < x2 and x2 <= InLength
+  mOut.indent() << "// Bounds check passed x1 and x2\n";
+  mOut.indent() << "if (x1 < 0 || x1 >= x2 || x2 > " << InLength << ") {\n";
+  mOut.indent() << "    throw new RSRuntimeException("
+                << "\"Input bounds are invalid!\");\n";
+  mOut.indent() << "}\n";
+
+  // Create a temporary input allocation.
+  mOut.indent() << "Allocation ain = Allocation.createSized("
+                << SAVED_RS_REFERENCE << ", "
+                << RS_ELEM_PREFIX << ElementName << ", "
+                << "x2 - x1);\n";
+  mOut.indent() << "ain.setAutoPadding(true);\n";
+  mOut.indent() << "ain.copy1DRangeFrom(x1, x2 - x1, in);\n";
+
+  // Create a temporary output allocation.
+  mOut.indent() << "Allocation aout = Allocation.createSized("
+                << SAVED_RS_REFERENCE << ", "
+                << RS_ELEM_PREFIX << ElementName << ", "
+                << "1);\n";
+  mOut.indent() << "aout.setAutoPadding(true);\n";
+
+  mOut.indent() << FuncName << "(ain, aout, null);\n";
+
+  if (VecSize > 1) {
+    // An allocation with vector elements is represented as an array
+    // of primitives, so we have to extract the output from the
+    // element array and rebuild the vector.
+    //
+    // E.g. for int2
+    //
+    // int[] outArray = new int[2];
+    // aout.copy1DRangeTo(0, 1, outArray);
+    // int elem0 = outArray[0];
+    // int elem1 = outArray[1];
+    // return new Int2(elem0, elem1);
+
+    mOut.indent() << ArrayType << " outArray = new "
+                  << ArrayElementType << "[" << VecSize << "];\n";
+
+    mOut.indent() << "aout.copy1DRangeTo(0, 1, outArray);\n";
+
+    for (unsigned Elem = 0; Elem < VecSize; ++Elem) {
+      mOut.indent() << ReflectedScalarType << " elem" << Elem << " = ";
+      std::string Index = "outArray[" + std::to_string(Elem) + "]";
+
+      if (ReflectedScalarType == ArrayElementType) {
+        mOut << Index << ";\n";
+      } else {
+        mOut << ZeroExtendValue(Index, ArrayElementType, ReflectedScalarType) << ";\n";
+      }
+    }
+
+    mOut.indent() << "return new " << TypeName << "(";
+    for (unsigned Elem = 0; Elem < VecSize; ++Elem) {
+      if (Elem > 0) mOut << ", ";
+      mOut << "elem" << Elem;
+    }
+    mOut << ");\n";
+  } else {
+    // Scalar handling.
+    //
+    // E.g. for int
+    // int[] outArray = new int[1];
+    // aout.copyTo(outArray);
+    // return outArray[0];
+    mOut.indent() << ArrayType << " outArray = new " << ArrayElementType
+                  << "[1];\n";
+    mOut.indent() << "aout.copyTo(outArray);\n";
+
+    if (ReflectedScalarType == "boolean") {
+      mOut.indent() << "return outArray[0] != 0;\n";
+    } else if (ReflectedScalarType == ArrayElementType) {
+      mOut.indent() << "return outArray[0];\n";
+    } else {
+      mOut.indent() << "return "
+                    << ZeroExtendValue("outArray[0]",
+                                       ArrayElementType,
+                                       ReflectedScalarType)
+                    << ";\n";
+    }
+  }
+
+  endFunction();
+}
+
 void RSReflectionJava::genTypeInstanceFromPointer(const RSExportType *ET) {
   if (ET->getClass() == RSExportType::ExportClassPointer) {
     // For pointer parameters to original forEach kernels.
diff --git a/slang_rs_reflection.h b/slang_rs_reflection.h
index 097a380..98a1aba 100644
--- a/slang_rs_reflection.h
+++ b/slang_rs_reflection.h
@@ -81,6 +81,7 @@
   int mNextExportVarSlot;
   int mNextExportFuncSlot;
   int mNextExportForEachSlot;
+  int mNextExportReduceSlot;
 
   GeneratedFile mOut;
 
@@ -101,6 +102,7 @@
     mNextExportVarSlot = 0;
     mNextExportFuncSlot = 0;
     mNextExportForEachSlot = 0;
+    mNextExportReduceSlot = 0;
   }
 
 public:
@@ -127,6 +129,7 @@
   inline int getNextExportVarSlot() { return mNextExportVarSlot++; }
   inline int getNextExportFuncSlot() { return mNextExportFuncSlot++; }
   inline int getNextExportForEachSlot() { return mNextExportForEachSlot++; }
+  inline int getNextExportReduceSlot() { return mNextExportReduceSlot++; }
 
   bool startClass(AccessModifier AM, bool IsStatic,
                   const std::string &ClassName, const char *SuperClassName,
@@ -197,6 +200,12 @@
 
   void genExportForEach(const RSExportForEach *EF);
 
+  void genExportReduce(const RSExportReduce *ER);
+  void genExportReduceAllocationVariant(const RSExportType *Type,
+                                        const std::string &KernelName);
+  void genExportReduceArrayVariant(const RSExportType *Type,
+                                   const std::string &KernelName);
+
   void genTypeCheck(const RSExportType *ET, const char *VarName);
 
   void genTypeInstanceFromPointer(const RSExportType *ET);
@@ -234,6 +243,9 @@
   void genNewItemBufferPackerIfNull();
 
   void genPairwiseDimCheck(std::string name0, std::string name1);
+  void genVectorLengthCompatibilityCheck(const std::string &ArrayName, unsigned VecSize);
+  void genNullOrEmptyArrayCheck(const std::string &ArrayName);
+  void gen1DCheck(const std::string &Name);
 
 public:
   RSReflectionJava(const RSContext *Context,
diff --git a/slang_rs_reflection_cpp.cpp b/slang_rs_reflection_cpp.cpp
index 6b40ff5..7d29627 100644
--- a/slang_rs_reflection_cpp.cpp
+++ b/slang_rs_reflection_cpp.cpp
@@ -25,13 +25,13 @@
 #include <algorithm>
 #include <sstream>
 #include <string>
-#include <utility>
 
 #include "os_sep.h"
 #include "slang_rs_context.h"
 #include "slang_rs_export_var.h"
 #include "slang_rs_export_foreach.h"
 #include "slang_rs_export_func.h"
+#include "slang_rs_export_reduce.h"
 #include "slang_rs_reflect_utils.h"
 #include "slang_version.h"
 
@@ -41,9 +41,11 @@
 
 namespace slang {
 
-#define RS_TYPE_ITEM_CLASS_NAME "Item"
-
-#define RS_ELEM_PREFIX "__rs_elem_"
+const char kRsTypeItemClassName[] = "Item";
+const char kRsElemPrefix[] = "__rs_elem_";
+// The name of the Allocation type that is reflected in C++
+const char kAllocationSp[] = "android::RSC::sp<android::RSC::Allocation>";
+const char kConstRsScriptCall[] = "const RsScriptCall";
 
 static const char *GetMatrixTypeName(const RSExportMatrixType *EMT) {
   static const char *MatrixTypeCNameMap[] = {
@@ -75,7 +77,7 @@
         static_cast<const RSExportPointerType *>(ET)->getPointeeType();
 
     if (PointeeType->getClass() != RSExportType::ExportClassRecord)
-      return "android::RSC::sp<android::RSC::Allocation>";
+      return kAllocationSp;
     else
       return PointeeType->getElementName();
   }
@@ -101,7 +103,7 @@
   }
   case RSExportType::ExportClassRecord: {
     // TODO: Fix for C structs!
-    return ET->getElementName() + "." RS_TYPE_ITEM_CLASS_NAME;
+    return ET->getElementName() + "." + kRsTypeItemClassName;
   }
   default: { slangAssert(false && "Unknown class of type"); }
   }
@@ -109,13 +111,23 @@
   return "";
 }
 
+static bool canExportReduceArrayVariant(const RSExportType *Type) {
+  // FIXME: No half types available for C++ reflection yet
+  if (Type->getElementName().find("F16") == 0) {
+    return false;
+  }
+  return Type->getClass() == RSExportType::ExportClassPrimitive ||
+    Type->getClass() == RSExportType::ExportClassVector;
+}
+
 RSReflectionCpp::RSReflectionCpp(const RSContext *Context,
                                  const string &OutputDirectory,
                                  const string &RSSourceFileName,
                                  const string &BitCodeFileName)
     : mRSContext(Context), mRSSourceFilePath(RSSourceFileName),
       mBitCodeFilePath(BitCodeFileName), mOutputDirectory(OutputDirectory),
-      mNextExportVarSlot(0), mNextExportFuncSlot(0), mNextExportForEachSlot(0) {
+      mNextExportVarSlot(0), mNextExportFuncSlot(0), mNextExportForEachSlot(0),
+      mNextExportReduceSlot(0) {
   mCleanedRSFileName = RootNameFromRSFileName(mRSSourceFilePath);
   mClassName = "ScriptC_" + mCleanedRSFileName;
 }
@@ -155,6 +167,7 @@
 
   genFieldsToStoreExportVariableValues();
   genTypeInstancesUsedInForEach();
+  genTypeInstancesUsedInReduce();
   genFieldsForAllocationTypeVerification();
 
   mOut.decreaseIndent();
@@ -167,6 +180,7 @@
 
   genExportVariablesGetterAndSetter();
   genForEachDeclarations();
+  genReduceDeclarations();
   genExportFunctionDeclarations();
 
   mOut.endBlock(true);
@@ -175,9 +189,8 @@
 }
 
 void RSReflectionCpp::genTypeInstancesUsedInForEach() {
-  for (RSContext::const_export_foreach_iterator
-           I = mRSContext->export_foreach_begin(),
-           E = mRSContext->export_foreach_end();
+  for (auto I = mRSContext->export_foreach_begin(),
+            E = mRSContext->export_foreach_end();
        I != E; I++) {
     const RSExportForEach *EF = *I;
     const RSExportType *OET = EF->getOutType();
@@ -196,6 +209,15 @@
   }
 }
 
+// Ensure that the type of the reduce kernel is reflected.
+void RSReflectionCpp::genTypeInstancesUsedInReduce() {
+  for (auto I = mRSContext->export_reduce_begin(),
+            E = mRSContext->export_reduce_end();
+       I != E; ++I) {
+    genTypeInstance((*I)->getType());
+  }
+}
+
 void RSReflectionCpp::genFieldsForAllocationTypeVerification() {
   bool CommentAdded = false;
   for (std::set<std::string>::iterator I = mTypesToCheck.begin(),
@@ -207,7 +229,7 @@
       CommentAdded = true;
     }
     mOut.indent() << "android::RSC::sp<const android::RSC::Element> "
-                  << RS_ELEM_PREFIX << *I << ";\n";
+                  << kRsElemPrefix << *I << ";\n";
   }
 }
 
@@ -266,13 +288,11 @@
     for (RSExportForEach::InIter BI = Ins.begin(), EI = Ins.end();
          BI != EI; BI++) {
 
-      Arguments.push_back(std::make_pair(
-        "android::RSC::sp<const android::RSC::Allocation>", (*BI)->getName()));
+      Arguments.push_back(Argument(kAllocationSp, (*BI)->getName()));
     }
 
     if (ForEach->hasOut() || ForEach->hasReturn()) {
-      Arguments.push_back(std::make_pair(
-          "android::RSC::sp<const android::RSC::Allocation>", "aout"));
+      Arguments.push_back(Argument(kAllocationSp, "aout"));
     }
 
     const RSExportRecordType *ERT = ForEach->getParamPacketType();
@@ -282,7 +302,7 @@
            i != e; i++) {
         RSReflectionTypeData rtd;
         (*i)->getType()->convertToRTD(&rtd);
-        Arguments.push_back(std::make_pair(rtd.type->c_name, (*i)->getName()));
+        Arguments.push_back(Argument(rtd.type->c_name, (*i)->getName()));
       }
     }
     genArguments(Arguments, FunctionStart.length());
@@ -290,6 +310,21 @@
   }
 }
 
+void RSReflectionCpp::genReduceDeclarations() {
+  bool CommentAdded = false;
+  for (auto I = mRSContext->export_reduce_begin(),
+            E = mRSContext->export_reduce_end(); I != E; I++) {
+    if (!CommentAdded) {
+      mOut.comment("For each reduce kernel of the script, there is an entry "
+                   "point to call the reduce kernel.");
+      CommentAdded = true;
+    }
+
+    makeReduceSignatureAllocationVariant(false, *I);
+    makeReduceSignatureArrayVariant(false, *I);
+  }
+}
+
 void RSReflectionCpp::genExportFunctionDeclarations() {
   for (RSContext::const_export_func_iterator
            I = mRSContext->export_funcs_begin(),
@@ -301,6 +336,218 @@
   }
 }
 
+// forEach_* implementation
+void RSReflectionCpp::genExportForEachBodies() {
+  uint32_t slot = 0;
+  for (auto I = mRSContext->export_foreach_begin(),
+            E = mRSContext->export_foreach_end();
+       I != E; I++, slot++) {
+    const RSExportForEach *ef = *I;
+    if (ef->isDummyRoot()) {
+      mOut.indent() << "// No forEach_root(...)\n";
+      continue;
+    }
+
+    ArgumentList Arguments;
+    std::string FunctionStart =
+        "void " + mClassName + "::forEach_" + ef->getName() + "(";
+    mOut.indent() << FunctionStart;
+
+    if (ef->hasIns()) {
+      // FIXME: Add support for kernels with multiple inputs.
+      slangAssert(ef->getIns().size() == 1);
+      Arguments.push_back(Argument(kAllocationSp, "ain"));
+    }
+
+    if (ef->hasOut() || ef->hasReturn()) {
+      Arguments.push_back(Argument(kAllocationSp, "aout"));
+    }
+
+    const RSExportRecordType *ERT = ef->getParamPacketType();
+    if (ERT) {
+      for (RSExportForEach::const_param_iterator i = ef->params_begin(),
+                                                 e = ef->params_end();
+           i != e; i++) {
+        RSReflectionTypeData rtd;
+        (*i)->getType()->convertToRTD(&rtd);
+        Arguments.push_back(Argument(rtd.type->c_name, (*i)->getName()));
+      }
+    }
+    genArguments(Arguments, FunctionStart.length());
+    mOut << ")";
+    mOut.startBlock();
+
+    const RSExportType *OET = ef->getOutType();
+    const RSExportForEach::InTypeVec &InTypes = ef->getInTypes();
+    if (ef->hasIns()) {
+      // FIXME: Add support for kernels with multiple inputs.
+      slangAssert(ef->getIns().size() == 1);
+      genTypeCheck(InTypes[0], "ain");
+    }
+    if (OET) {
+      genTypeCheck(OET, "aout");
+    }
+
+    // TODO Add the appropriate dimension checking code, as seen in
+    // slang_rs_reflection.cpp.
+
+    std::string FieldPackerName = ef->getName() + "_fp";
+    if (ERT) {
+      if (genCreateFieldPacker(ERT, FieldPackerName.c_str())) {
+        genPackVarOfType(ERT, nullptr, FieldPackerName.c_str());
+      }
+    }
+    mOut.indent() << "forEach(" << slot << ", ";
+
+    if (ef->hasIns()) {
+      // FIXME: Add support for kernels with multiple inputs.
+      slangAssert(ef->getIns().size() == 1);
+      mOut << "ain, ";
+    } else {
+      mOut << "NULL, ";
+    }
+
+    if (ef->hasOut() || ef->hasReturn()) {
+      mOut << "aout, ";
+    } else {
+      mOut << "NULL, ";
+    }
+
+    // FIXME (no support for usrData with C++ kernels)
+    mOut << "NULL, 0);\n";
+    mOut.endBlock();
+  }
+}
+
+// reduce_* implementation
+void RSReflectionCpp::genExportReduceBodies() {
+  for (auto I = mRSContext->export_reduce_begin(),
+            E = mRSContext->export_reduce_end();
+       I != E; ++I) {
+    const RSExportReduce &Reduce = **I;
+    const RSExportType *Type = Reduce.getType();
+
+    // Allocation variant
+    //
+    // void reduce_foo(sp<Allocation> ain, sp<Allocation> aout,
+    //                 const RsScriptCall *sc);
+    makeReduceSignatureAllocationVariant(true, &Reduce);
+    mOut.startBlock();
+
+    // Type check
+    genTypeCheck(Type, "ain");
+    genTypeCheck(Type, "aout");
+
+    // Dimension check
+    gen1DCheck("ain");
+
+    const uint32_t Slot = getNextExportReduceSlot();
+
+    // Call into RenderScript.
+    mOut.indent() << "reduce(" << Slot << ", "
+                  << "ain, aout, sc);\n";
+    mOut.endBlock();
+
+    if (!canExportReduceArrayVariant(Type)) {
+      continue;
+    }
+
+    // Array variant
+    //
+    // Ty reduce_foo(const ElemTy[] in, uint32_t x1, uint32_t x2, uint32_t inLen);
+    // "Ty" could be different from "ElemTy" in the case of vectors.
+    makeReduceSignatureArrayVariant(true, &Reduce);
+    mOut.startBlock();
+
+    const std::string ReturnType = GetTypeName(Type);
+    const std::string DefaultReturnValue = ReturnType + "()";
+
+    genNullOrEmptyArrayCheck("in", "inLen", DefaultReturnValue);
+
+    RSReflectionTypeData TypeData;
+    Type->convertToRTD(&TypeData);
+    const uint32_t VecSize = TypeData.vecSize;
+    std::string InLength = "inLen";
+    // Adjust the length so that it corresponds to the number of elements in the allocation.
+    if (VecSize > 1) {
+      InLength += " / " + std::to_string(VecSize);
+    }
+    genVectorLengthCompatibilityCheck("inLen", VecSize, DefaultReturnValue);
+
+    mOut.indent() << "if (x1 >= x2 || x2 > " << InLength << ")";
+    mOut.startBlock();
+    mOut.indent() << "mRS->throwError(RS_ERROR_RUNTIME_ERROR, "
+                  << "\"Input bounds are invalid\");\n";
+    mOut.indent() << "return " << DefaultReturnValue << ";\n";
+    mOut.endBlock();
+
+    mOut.indent() << kAllocationSp
+                  << " ain = android::RSC::Allocation::createSized(mRS, "
+                  << kRsElemPrefix << Type->getElementName() << ", "
+                  << "x2 - x1);\n";
+
+    mOut.indent() << "ain->setAutoPadding(true);\n";
+
+    mOut.indent() << kAllocationSp
+                  << " aout = android::RSC::Allocation::createSized(mRS, "
+                  << kRsElemPrefix << Type->getElementName() << ", 1);\n";
+
+    mOut.indent() << "aout->setAutoPadding(true);\n";
+
+    const std::string ArrayElementType = TypeData.type->c_name;
+
+    std::string StartOffset = "x1";
+    if (VecSize > 1) {
+      StartOffset += " * " + std::to_string(VecSize);
+    }
+    mOut.indent() << "ain->copy1DRangeFrom(0, x2 - x1, &in[" << StartOffset << "]);\n";
+    mOut.indent() << "reduce_" << Reduce.getName() << "(ain, aout);\n";
+    mOut.indent() << ArrayElementType << " outArray[" << VecSize << "];\n";
+
+    mOut.indent() << "aout->copy1DRangeTo(0, 1, &outArray[0]);\n";
+
+    mOut.indent() << "return " << ReturnType << "(";
+    for (uint32_t VecElem = 0; VecElem < VecSize; ++VecElem) {
+      if (VecElem > 0) mOut << ", ";
+      mOut << "outArray[" << VecElem << "]";
+    }
+    mOut << ");\n";
+    mOut.endBlock();
+  }
+}
+
+// invoke_* implementation
+void RSReflectionCpp::genExportFunctionBodies() {
+  uint32_t slot = 0;
+  // Reflect export function
+  for (auto I = mRSContext->export_funcs_begin(),
+            E = mRSContext->export_funcs_end();
+       I != E; I++) {
+    const RSExportFunc *ef = *I;
+
+    makeFunctionSignature(true, ef);
+    mOut.startBlock();
+    const RSExportRecordType *params = ef->getParamPacketType();
+    size_t param_len = 0;
+    if (params) {
+      param_len = params->getAllocSize();
+      if (genCreateFieldPacker(params, "__fp")) {
+        genPackVarOfType(params, nullptr, "__fp");
+      }
+    }
+
+    mOut.indent() << "invoke(" << slot;
+    if (params) {
+      mOut << ", __fp.getData(), " << param_len << ");\n";
+    } else {
+      mOut << ", NULL, 0);\n";
+    }
+    mOut.endBlock();
+
+    slot++;
+  }
+}
+
 bool RSReflectionCpp::genEncodedBitCode() {
   FILE *pfin = fopen(mBitCodeFilePath.c_str(), "rb");
   if (pfin == nullptr) {
@@ -334,11 +581,13 @@
     return false;
   }
 
+  // Front matter
   mOut.indent() << "#include \"" << mClassName << ".h\"\n\n";
 
   genEncodedBitCode();
   mOut.indent() << "\n\n";
 
+  // Constructor
   const std::string &packageName = mRSContext->getReflectJavaPackageName();
   mOut.indent() << mClassName << "::" << mClassName
                 << "(android::RSC::sp<android::RSC::RS> rs):\n"
@@ -350,7 +599,7 @@
   for (std::set<std::string>::iterator I = mTypesToCheck.begin(),
                                        E = mTypesToCheck.end();
        I != E; I++) {
-    mOut.indent() << RS_ELEM_PREFIX << *I << " = android::RSC::Element::" << *I
+    mOut.indent() << kRsElemPrefix << *I << " = android::RSC::Element::" << *I
                   << "(mRS);\n";
   }
 
@@ -366,123 +615,15 @@
   }
   mOut.endBlock();
 
+  // Destructor
   mOut.indent() << mClassName << "::~" << mClassName << "()";
   mOut.startBlock();
   mOut.endBlock();
 
-  // Reflect export for each functions
-  uint32_t slot = 0;
-  for (RSContext::const_export_foreach_iterator
-           I = mRSContext->export_foreach_begin(),
-           E = mRSContext->export_foreach_end();
-       I != E; I++, slot++) {
-    const RSExportForEach *ef = *I;
-    if (ef->isDummyRoot()) {
-      mOut.indent() << "// No forEach_root(...)\n";
-      continue;
-    }
-
-    ArgumentList Arguments;
-    std::string FunctionStart =
-        "void " + mClassName + "::forEach_" + ef->getName() + "(";
-    mOut.indent() << FunctionStart;
-
-    if (ef->hasIns()) {
-      // FIXME: Add support for kernels with multiple inputs.
-      assert(ef->getIns().size() == 1);
-      Arguments.push_back(std::make_pair(
-          "android::RSC::sp<const android::RSC::Allocation>", "ain"));
-    }
-
-    if (ef->hasOut() || ef->hasReturn()) {
-      Arguments.push_back(std::make_pair(
-          "android::RSC::sp<const android::RSC::Allocation>", "aout"));
-    }
-
-    const RSExportRecordType *ERT = ef->getParamPacketType();
-    if (ERT) {
-      for (RSExportForEach::const_param_iterator i = ef->params_begin(),
-                                                 e = ef->params_end();
-           i != e; i++) {
-        RSReflectionTypeData rtd;
-        (*i)->getType()->convertToRTD(&rtd);
-        Arguments.push_back(std::make_pair(rtd.type->c_name, (*i)->getName()));
-      }
-    }
-    genArguments(Arguments, FunctionStart.length());
-    mOut << ")";
-    mOut.startBlock();
-
-    const RSExportType *OET = ef->getOutType();
-    const RSExportForEach::InTypeVec &InTypes = ef->getInTypes();
-    if (ef->hasIns()) {
-      // FIXME: Add support for kernels with multiple inputs.
-      assert(ef->getIns().size() == 1);
-      genTypeCheck(InTypes[0], "ain");
-    }
-    if (OET) {
-      genTypeCheck(OET, "aout");
-    }
-
-    // TODO Add the appropriate dimension checking code, as seen in
-    // slang_rs_reflection.cpp.
-
-    std::string FieldPackerName = ef->getName() + "_fp";
-    if (ERT) {
-      if (genCreateFieldPacker(ERT, FieldPackerName.c_str())) {
-        genPackVarOfType(ERT, nullptr, FieldPackerName.c_str());
-      }
-    }
-    mOut.indent() << "forEach(" << slot << ", ";
-
-    if (ef->hasIns()) {
-      // FIXME: Add support for kernels with multiple inputs.
-      assert(ef->getIns().size() == 1);
-      mOut << "ain, ";
-    } else {
-      mOut << "NULL, ";
-    }
-
-    if (ef->hasOut() || ef->hasReturn()) {
-      mOut << "aout, ";
-    } else {
-      mOut << "NULL, ";
-    }
-
-    // FIXME (no support for usrData with C++ kernels)
-    mOut << "NULL, 0);\n";
-    mOut.endBlock();
-  }
-
-  slot = 0;
-  // Reflect export function
-  for (RSContext::const_export_func_iterator
-           I = mRSContext->export_funcs_begin(),
-           E = mRSContext->export_funcs_end();
-       I != E; I++) {
-    const RSExportFunc *ef = *I;
-
-    makeFunctionSignature(true, ef);
-    mOut.startBlock();
-    const RSExportRecordType *params = ef->getParamPacketType();
-    size_t param_len = 0;
-    if (params) {
-      param_len = params->getAllocSize();
-      if (genCreateFieldPacker(params, "__fp")) {
-        genPackVarOfType(params, nullptr, "__fp");
-      }
-    }
-
-    mOut.indent() << "invoke(" << slot;
-    if (params) {
-      mOut << ", __fp.getData(), " << param_len << ");\n";
-    } else {
-      mOut << ", NULL, 0);\n";
-    }
-    mOut.endBlock();
-
-    slot++;
-  }
+  // Function bodies
+  genExportForEachBodies();
+  genExportReduceBodies();
+  genExportFunctionBodies();
 
   mOut.closeFile();
   return true;
@@ -546,7 +687,7 @@
     mOut.indent() << "setVar(" << getNextExportVarSlot() << ", ";
     if (EPT->isRSObjectType()) {
       mOut << "v";
-    } else {
+   } else {
       mOut << "&v, sizeof(v)";
     }
     mOut << ");\n";
@@ -677,6 +818,131 @@
   }
 }
 
+void RSReflectionCpp::makeReduceSignatureAllocationVariant(bool IsDefinition,
+                                                           const RSExportReduce *ER) {
+  // void reduce_foo(sp<Allocation> ain, sp<Allocation> aout,
+  //                 const RsScriptCall *sc = nullptr);
+  std::string FunctionStart = "void ";
+  if (IsDefinition) {  // definitions are qualified with the class name
+    FunctionStart += mClassName +  "::";
+  }
+  FunctionStart += "reduce_" + ER->getName() + "(";
+
+  ArgumentList Arguments{  // "*sc" carries its default only in the declaration
+    Argument(kAllocationSp, "ain"),
+    Argument(kAllocationSp, "aout"),
+    Argument(kConstRsScriptCall, "*sc", IsDefinition ? "" : "nullptr")
+  };
+
+  mOut.indent() << FunctionStart;
+
+  genArguments(Arguments, FunctionStart.length());  // length is genArguments' Offset
+
+  if (IsDefinition) {
+    mOut << ")";  // left open; no terminator is written here
+  } else {
+    mOut << ");\n\n";  // declaration: close the prototype
+  }
+}
+
+void RSReflectionCpp::makeReduceSignatureArrayVariant(bool IsDefinition,
+                                                      const RSExportReduce *ER) {
+  // Ty reduce_foo(const ElemTy[] in, uint32_t x1, uint32_t x2, size_t inLen);
+  // "Ty" could be different from "ElemTy" in the case of vectors.
+
+  const RSExportType *Type = ER->getType();
+  if (!canExportReduceArrayVariant(Type)) {
+      return;  // nothing is emitted for types without an array variant
+  }
+
+  RSReflectionTypeData TypeData;
+  Type->convertToRTD(&TypeData);
+
+  const std::string ReturnType = GetTypeName(Type);
+  std::string FunctionStart = ReturnType + " ";
+  if (IsDefinition) {  // definitions are qualified with the class name
+    FunctionStart += mClassName +  "::";
+  }
+  FunctionStart += "reduce_" + ER->getName() + "(";
+
+  const std::string ArrayElementType = TypeData.type->c_name;  // type of the "in" array
+
+  ArgumentList Arguments{
+    Argument("const " + ArrayElementType, "in[]"),
+    Argument("uint32_t", "x1"),
+    Argument("uint32_t", "x2"),
+    Argument("size_t", "inLen")
+  };
+
+  mOut.indent() << FunctionStart;
+  genArguments(Arguments, FunctionStart.size());
+
+  if (IsDefinition) {
+    mOut << ")";  // left open; no terminator is written here
+  } else {
+    mOut << ");\n\n";  // declaration: close the prototype
+  }
+
+  if (!IsDefinition) {
+    // We reflect three more variants in the header. First, there is
+    //
+    //   Ty reduce_foo(const ElemTy[] in, size_t inLen);
+    //
+    // Note the inLen is the number of primitive elements in the array, as opposed to the
+    // bounds whose units are allocation elements. The other variants use templates to infer
+    // the array length statically:
+    //
+    //   template<size_t inLen> Ty reduce_foo(const ElemTy (&in)[inLen]);
+    //   template<size_t inLen> Ty reduce_foo(const ElemTy (&in)[inLen], uint32_t x1, uint32_t x2);
+
+    // Generate inLen variant
+    const uint32_t VecSize = TypeData.vecSize;
+    std::string X2 = "inLen";  // upper bound; divided by VecSize below for vectors
+
+    const std::string FunctionName = ER->getName();
+
+    auto ForwardReduce = [this, &FunctionName](const std::string &x1,
+                                               const std::string &x2,
+                                               const std::string &inLen) {
+      this->mOut.indent() << "    return reduce_" << FunctionName << "(in, "
+                          << x1 << ", " << x2 << ", " << inLen << ");\n";
+      this->mOut.indent() << "}\n\n";  // also closes the generated inline body
+    };
+
+    const std::string DefaultValue = ReturnType + "()";  // returned when the check fails
+
+    ArgumentList InLenVariantArguments{
+      Argument("const " + ArrayElementType, "in[]"), Argument("size_t", "inLen")
+    };
+    mOut.indent() << FunctionStart;
+    genArguments(InLenVariantArguments, FunctionStart.size());
+    mOut << ") {\n";
+    if (VecSize > 1) {
+      genVectorLengthCompatibilityCheck("inLen", VecSize, DefaultValue, 2);  // 2 indent levels
+      X2 += " / " + std::to_string(VecSize);
+    }
+    ForwardReduce("0", X2, "inLen");  // whole array: x1 is 0
+
+    // Generate template variants
+    ArgumentList TemplateVariantArguments{
+      Argument("const " + ArrayElementType, "(&in)[inLen]")
+    };
+
+    mOut.indent() << "template<size_t inLen>\n";
+    mOut.indent() << FunctionStart;
+    genArguments(TemplateVariantArguments, FunctionStart.size());
+    mOut << ") {\n        return reduce_" << FunctionName << "(in, inLen);\n    }\n\n";
+
+    TemplateVariantArguments.push_back(Argument("uint32_t", "x1"));
+    TemplateVariantArguments.push_back(Argument("uint32_t", "x2"));
+    mOut.indent() << "template<size_t inLen>\n";
+    mOut.indent() << FunctionStart;
+    genArguments(TemplateVariantArguments, FunctionStart.size());
+    mOut << ") {\n";
+    ForwardReduce("x1", "x2", "inLen");  // inLen is the template parameter here
+  }
+}
+
 void RSReflectionCpp::genArguments(const ArgumentList &Arguments, int Offset) {
   bool FirstArg = true;
 
@@ -689,7 +955,10 @@
       FirstArg = false;
     }
 
-    mOut << I->first << " " << I->second;
+    mOut << I->Type << " " << I->Name;
+    if (!I->DefaultValue.empty()) {
+      mOut << " = " << I->DefaultValue;
+    }
   }
 }
 
@@ -822,7 +1091,7 @@
   if (!TypeName.empty()) {
     mOut.indent() << "if (!" << VarName
                   << "->getType()->getElement()->isCompatible("
-                  << RS_ELEM_PREFIX << TypeName << "))";
+                  << kRsElemPrefix << TypeName << "))";
     mOut.startBlock();
     mOut.indent() << "mRS->throwError(RS_ERROR_RUNTIME_ERROR, "
                      "\"Incompatible type\");\n";
@@ -831,6 +1100,60 @@
   }
 }
 
+// Generates a runtime check that VarName is 1 dimensional (no Y, faces or mipmaps).
+void RSReflectionCpp::gen1DCheck(const std::string &VarName) {
+  mOut.indent() << "// check that " << VarName << " is 1d\n";
+  mOut.indent() << "sp<const Type> t0 = " << VarName << "->getType();\n";
+  mOut.indent() << "if (t0->getY() != 0 ||\n";
+  mOut.indent() << "    t0->hasFaces()  ||\n";
+  mOut.indent() << "    t0->hasMipmaps())";
+  mOut.startBlock();
+  mOut.indent() << "mRS->throwError(RS_ERROR_INVALID_PARAMETER, "
+                << "\"" << VarName << " is not 1D!\");\n";
+  mOut.indent() << "return;\n";  // bare return: the generated check fits void functions only
+  mOut.endBlock();
+}
+
+// Generates code to ensure that the supplied array length is a multiple of the vector size.
+void RSReflectionCpp::genVectorLengthCompatibilityCheck(const std::string &Length,
+                                                        unsigned VecSize,
+                                                        const std::string &ValueToReturn,
+                                                        unsigned IndentLevels) {
+  auto Indenter = [this, IndentLevels]() -> std::ofstream& {
+    GeneratedFile &Out = this->mOut;
+    for (unsigned Level = 0; Level < IndentLevels; ++Level) {
+      Out.indent();  // one indent() call per requested level
+    }
+    return Out;  // stream handed back so the caller can chain <<
+  };
+
+  Indenter() << "// Verify that the array length is a multiple of the vector size.\n";
+  Indenter() << "if (" << Length << " % " << std::to_string(VecSize) << " != 0) {\n";
+  Indenter() << "    mRS->throwError(RS_ERROR_INVALID_PARAMETER, "
+             << "\"Input array length is not a multiple of "
+             << std::to_string(VecSize) << "\");\n";
+  Indenter() << "    return " << ValueToReturn << ";\n";  // generated code bails out here
+  Indenter() << "}\n\n";
+}
+
+// Generates code to ensure that the supplied array is non-null and nonzero in length.
+void RSReflectionCpp::genNullOrEmptyArrayCheck(const std::string &ArrayName,
+                                               const std::string &Length,
+                                               const std::string &ValueToReturn) {
+  mOut.indent() << "// Verify that the array is non-null and non-empty.\n";
+  mOut.indent() << "if (" << ArrayName << " == nullptr) {\n";  // null test is emitted first
+  mOut.indent() << "    mRS->throwError(RS_ERROR_INVALID_PARAMETER, "
+                << "\"Input array is null\");\n";
+  mOut.indent() << "    return " << ValueToReturn << ";\n";  // generated code bails out here
+  mOut.indent() << "}\n\n";
+
+  mOut.indent() << "if (" << Length << " == 0) {\n";  // then the zero-length test
+  mOut.indent() << "    mRS->throwError(RS_ERROR_INVALID_PARAMETER, "
+                << "\"Input array is zero-length\");\n";
+  mOut.indent() << "    return " << ValueToReturn << ";\n";  // generated code bails out here
+  mOut.indent() << "}\n\n";
+}
+
 void RSReflectionCpp::genTypeInstanceFromPointer(const RSExportType *ET) {
   if (ET->getClass() == RSExportType::ExportClassPointer) {
     // For pointer parameters to original forEach kernels.
diff --git a/slang_rs_reflection_cpp.h b/slang_rs_reflection_cpp.h
index f451ce6..9c55ad8 100644
--- a/slang_rs_reflection_cpp.h
+++ b/slang_rs_reflection_cpp.h
@@ -36,8 +36,14 @@
   bool reflect();
 
  private:
-  // List of of (type, name) pairs.
-  typedef std::vector<std::pair<std::string, std::string> > ArgumentList;
+  struct Argument {  // one parameter of a generated function signature
+    std::string Type;  // written first, followed by a space and Name
+    std::string Name;
+    std::string DefaultValue;  // written as " = <value>" when non-empty
+    Argument(std::string Type, std::string Name, std::string DefaultValue = "")
+      : Type(Type), Name(Name), DefaultValue(DefaultValue) {}
+  };
+  typedef std::vector<Argument> ArgumentList;  // consumed by genArguments()
 
   // Information coming from the compiler about the code we're reflecting.
   const RSContext *mRSContext;
@@ -58,6 +64,7 @@
   unsigned int mNextExportVarSlot;
   unsigned int mNextExportFuncSlot;
   unsigned int mNextExportForEachSlot;
+  unsigned int mNextExportReduceSlot;
 
   // Generated RS Elements for type-checking code.
   std::set<std::string> mTypesToCheck;
@@ -66,6 +73,7 @@
     mNextExportVarSlot = 0;
     mNextExportFuncSlot = 0;
     mNextExportForEachSlot = 0;
+    mNextExportReduceSlot = 0;
     mTypesToCheck.clear();
   }
 
@@ -84,17 +92,37 @@
     return mNextExportForEachSlot++;
   }
 
+  inline unsigned int getNextExportReduceSlot() {
+    return mNextExportReduceSlot++;  // post-increment: yields the current slot, then advances
+  }
+
   bool writeHeaderFile();
   bool writeImplementationFile();
+
+  // Write out signatures both in the header and implementation.
   void makeFunctionSignature(bool isDefinition, const RSExportFunc *ef);
+  void makeReduceSignatureAllocationVariant(bool isDefinition, const RSExportReduce *er);
+  void makeReduceSignatureArrayVariant(bool isDefinition, const RSExportReduce *er);
+
   bool genEncodedBitCode();
   void genFieldsToStoreExportVariableValues();
   void genTypeInstancesUsedInForEach();
+  void genTypeInstancesUsedInReduce();
   void genFieldsForAllocationTypeVerification();
+
+  // Write out the code for the getters and setters.
   void genExportVariablesGetterAndSetter();
+
+  // Write out the code for the declaration of the kernel entry points.
   void genForEachDeclarations();
+  void genReduceDeclarations();
   void genExportFunctionDeclarations();
 
+  // Write out code for the definitions of the kernel entry points.
+  void genExportForEachBodies();
+  void genExportReduceBodies();
+  void genExportFunctionBodies();
+
   bool startScriptHeader();
 
   // Write out code for an export variable initialization.
@@ -128,7 +156,20 @@
   // Generate a runtime type check for VarName.
   void genTypeCheck(const RSExportType *ET, const char *VarName);
 
-  // Generate a type instance for a given forEach argument type.
+  // Generate a runtime check that VarName is 1-dimensional.
+  void gen1DCheck(const std::string &VarName);
+
+  // Generate a runtime check that ArrayName is non-null and Length is nonzero.
+  void genNullOrEmptyArrayCheck(const std::string &ArrayName, const std::string &Length,
+                                const std::string &ValueToReturn);
+
+  // Generate a runtime check that Length is a multiple of VecSize; the
+  // generated code returns ValueToReturn when the check fails.
+  void genVectorLengthCompatibilityCheck(const std::string &Length, unsigned VecSize,
+                                         const std::string &ValueToReturn,
+                                         unsigned IndentLevels = 1);
+
+  // Generate a type instance for a given type.
   void genTypeInstanceFromPointer(const RSExportType *ET);
   void genTypeInstance(const RSExportType *ET);
 
diff --git a/slang_rs_special_func.cpp b/slang_rs_special_func.cpp
new file mode 100644
index 0000000..56ae590
--- /dev/null
+++ b/slang_rs_special_func.cpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright 2015, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "slang_rs_special_func.h"
+
+#include "clang/AST/ASTContext.h"
+#include "clang/AST/Attr.h"
+
+#include "slang_assert.h"
+#include "slang_version.h"
+
+namespace slang {
+
+bool RSSpecialFunc::isGraphicsRootRSFunc(unsigned int targetAPI,
+                                         const clang::FunctionDecl *FD) {
+  if (FD->hasAttr<clang::KernelAttr>()) {  // FD is used unchecked; kernels never qualify
+    return false;
+  }
+
+  if (!FD->getName().equals("root")) {  // only a function named "root" qualifies
+    return false;
+  }
+
+  if (FD->getNumParams() == 0) {
+    // Graphics root function
+    return true;
+  }
+
+  // Check for legacy graphics root function (with single parameter).
+  if ((targetAPI < SLANG_ICS_TARGET_API) && (FD->getNumParams() == 1)) {
+    const clang::QualType &IntType = FD->getASTContext().IntTy;
+    if (FD->getReturnType().getCanonicalType() == IntType) {  // only the return type is tested
+      return true;
+    }
+  }
+
+  return false;
+}
+
+bool
+RSSpecialFunc::validateSpecialFuncDecl(unsigned int targetAPI,
+                                       slang::RSContext *Context,
+                                       clang::FunctionDecl const *FD) {
+  slangAssert(Context && FD);
+  bool valid = true;  // cleared by each reported error; checking continues afterwards
+  const clang::ASTContext &C = FD->getASTContext();
+  const clang::QualType &IntType = FD->getASTContext().IntTy;
+
+  if (isGraphicsRootRSFunc(targetAPI, FD)) {
+    if ((targetAPI < SLANG_ICS_TARGET_API) && (FD->getNumParams() == 1)) {
+      // Legacy graphics root function: its single parameter must be int
+      const clang::ParmVarDecl *PVD = FD->getParamDecl(0);
+      clang::QualType QT = PVD->getType().getCanonicalType();
+      if (QT != IntType) {
+        Context->ReportError(PVD->getLocation(),
+                             "invalid parameter type for legacy "
+                             "graphics root() function: %0")
+            << PVD->getType();
+        valid = false;
+      }
+    }
+
+    // Graphics root function, so verify that it returns an int
+    if (FD->getReturnType().getCanonicalType() != IntType) {
+      Context->ReportError(FD->getLocation(),
+                           "root() is required to return "
+                           "an int for graphics usage");
+      valid = false;
+    }
+  } else if (isInitRSFunc(FD) || isDtorRSFunc(FD)) {  // must be void f(void)
+    if (FD->getNumParams() != 0) {
+      Context->ReportError(FD->getLocation(),
+                           "%0(void) is required to have no "
+                           "parameters")
+          << FD->getName();
+      valid = false;
+    }
+
+    if (FD->getReturnType().getCanonicalType() != C.VoidTy) {
+      Context->ReportError(FD->getLocation(),
+                           "%0(void) is required to have a void "
+                           "return type")
+          << FD->getName();
+      valid = false;
+    }
+  } else {  // any other function is a caller error
+    slangAssert(false && "must be called on root, init or .rs.dtor function!");
+  }
+
+  return valid;  // true only if no error was reported above
+}
+
+}  // namespace slang
diff --git a/slang_rs_special_func.h b/slang_rs_special_func.h
new file mode 100644
index 0000000..7390871
--- /dev/null
+++ b/slang_rs_special_func.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2015, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _FRAMEWORKS_COMPILE_SLANG_SLANG_RS_SPECIAL_FUNC_H_
+#define _FRAMEWORKS_COMPILE_SLANG_SLANG_RS_SPECIAL_FUNC_H_
+
+#include "llvm/ADT/StringRef.h"
+
+#include "clang/AST/Decl.h"
+
+#include "slang_rs_context.h"
+
+namespace slang {
+
+namespace RSSpecialFunc {
+
+inline bool isInitRSFunc(const clang::FunctionDecl *FD) {
+  if (!FD) {  // a null declaration is never the init function
+    return false;
+  }
+  const llvm::StringRef Name = FD->getName();
+  static llvm::StringRef FuncInit("init");
+  return Name.equals(FuncInit);  // true only when the name is "init"
+}
+
+inline bool isDtorRSFunc(const clang::FunctionDecl *FD) {
+  if (!FD) {  // a null declaration is never the destructor function
+    return false;
+  }
+  const llvm::StringRef Name = FD->getName();
+  static llvm::StringRef FuncDtor(".rs.dtor");
+  return Name.equals(FuncDtor);  // true only when the name is ".rs.dtor"
+}
+
+bool isGraphicsRootRSFunc(unsigned int targetAPI,
+                          const clang::FunctionDecl *FD);
+
+inline bool isSpecialRSFunc(unsigned int targetAPI,  // true for graphics root, init or .rs.dtor
+                                   const clang::FunctionDecl *FD) {
+  return isGraphicsRootRSFunc(targetAPI, FD) || isInitRSFunc(FD) ||
+         isDtorRSFunc(FD);
+}
+
+bool validateSpecialFuncDecl(unsigned int targetAPI,
+                             slang::RSContext *Context,
+                             const clang::FunctionDecl *FD);
+
+} // namespace RSSpecialFunc
+
+} // namespace slang
+
+#endif  // _FRAMEWORKS_COMPILE_SLANG_SLANG_RS_SPECIAL_FUNC_H_
diff --git a/tests/F_anon_struct_kernel_sig/anon_struct_kernel_sig.rs b/tests/F_anon_struct_kernel_sig/anon_struct_kernel_sig.rs
new file mode 100644
index 0000000..028a328
--- /dev/null
+++ b/tests/F_anon_struct_kernel_sig/anon_struct_kernel_sig.rs
@@ -0,0 +1,21 @@
+#pragma version(1)
+#pragma rs java_package_name(foo)
+
+typedef struct {
+  int i;
+} myStruct;
+
+/* Test old-style kernel */
+void root(const myStruct *in, int *out) {
+  *out = in->i;
+}
+
+/* Test new-style kernel */
+myStruct RS_KERNEL kernel_returning_myStruct(int in) {
+  myStruct out = { in };
+  return out;
+}
+
+int RS_KERNEL kernel_with_myStruct_param(myStruct in) {
+  return in.i;
+}
diff --git a/tests/F_anon_struct_kernel_sig/stderr.txt.expect b/tests/F_anon_struct_kernel_sig/stderr.txt.expect
new file mode 100644
index 0000000..276aaaf
--- /dev/null
+++ b/tests/F_anon_struct_kernel_sig/stderr.txt.expect
@@ -0,0 +1 @@
+anon_struct_kernel_sig.rs:4:9: error: anonymous structures cannot be exported
diff --git a/tests/F_anon_struct_kernel_sig/stdout.txt.expect b/tests/F_anon_struct_kernel_sig/stdout.txt.expect
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/tests/F_anon_struct_kernel_sig/stdout.txt.expect
diff --git a/tests/F_kernel_badattr/kernel_badattr.rs b/tests/F_kernel_badattr/kernel_badattr.rs
new file mode 100644
index 0000000..a719bd9
--- /dev/null
+++ b/tests/F_kernel_badattr/kernel_badattr.rs
@@ -0,0 +1,14 @@
+#pragma version(1)
+#pragma rs java_package_name(foo)
+
+int __attribute__((kernel("unimplemented"))) kernel(int arg) {
+  return 0;
+}
+
+int __attribute__((kernel(7))) kernel2(int arg) {
+  return 0;
+}
+
+int __attribute__((kernel("reduce", 1))) kernel3(int arg) {
+  return 0;
+}
diff --git a/tests/F_kernel_badattr/stderr.txt.expect b/tests/F_kernel_badattr/stderr.txt.expect
new file mode 100644
index 0000000..6cd1c74
--- /dev/null
+++ b/tests/F_kernel_badattr/stderr.txt.expect
@@ -0,0 +1,3 @@
+kernel_badattr.rs:8:27: error: 'kernel' attribute requires a string
+kernel_badattr.rs:12:20: error: 'kernel' attribute takes no more than 1 argument
+kernel_badattr.rs:4:46: error: Unknown kernel attribute argument 'unimplemented' in declaration of function 'kernel'
diff --git a/tests/F_kernel_badattr/stdout.txt.expect b/tests/F_kernel_badattr/stdout.txt.expect
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/tests/F_kernel_badattr/stdout.txt.expect
diff --git a/tests/F_reduce_api_unsupported/reduce_api_unsupported.rs b/tests/F_reduce_api_unsupported/reduce_api_unsupported.rs
new file mode 100644
index 0000000..8dfedfb
--- /dev/null
+++ b/tests/F_reduce_api_unsupported/reduce_api_unsupported.rs
@@ -0,0 +1,12 @@
+// -target-api 23
+#pragma version(1)
+#pragma rs java_package_name(foo)
+
+typedef struct foo {
+   int x;
+} foo;
+
+foo __attribute__((kernel("reduce"))) addFoo(foo a, foo b) {
+  foo result = { a.x + b.x };
+  return result;
+}
diff --git a/tests/F_reduce_api_unsupported/stderr.txt.expect b/tests/F_reduce_api_unsupported/stderr.txt.expect
new file mode 100644
index 0000000..0e389a4
--- /dev/null
+++ b/tests/F_reduce_api_unsupported/stderr.txt.expect
@@ -0,0 +1 @@
+reduce_api_unsupported.rs:9:39: error: Reduce-style kernel addFoo() unsupported in SDK level 23
diff --git a/tests/F_reduce_api_unsupported/stdout.txt.expect b/tests/F_reduce_api_unsupported/stdout.txt.expect
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/tests/F_reduce_api_unsupported/stdout.txt.expect
diff --git a/tests/F_reduce_non_binary/reduce_non_binary.rs b/tests/F_reduce_non_binary/reduce_non_binary.rs
new file mode 100644
index 0000000..77fdaa1
--- /dev/null
+++ b/tests/F_reduce_non_binary/reduce_non_binary.rs
@@ -0,0 +1,27 @@
+// -target-api 0
+#pragma version(1)
+#pragma rs java_package_name(foo)
+
+/* 0 arguments */
+
+int __attribute__((kernel("reduce"))) kernel0(void) {
+  return 0;
+}
+
+/* 1 argument */
+
+int __attribute__((kernel("reduce"))) kernel1(int arg1) {
+  return 0;
+}
+
+/* 3 arguments */
+
+int __attribute__((kernel("reduce"))) kernel3(int arg1, int arg2, int arg3) {
+  return 0;
+}
+
+/* 4 arguments */
+
+int __attribute__((kernel("reduce"))) kernel4(int arg1, int arg2, int arg3, int arg4) {
+  return 0;
+}
diff --git a/tests/F_reduce_non_binary/stderr.txt.expect b/tests/F_reduce_non_binary/stderr.txt.expect
new file mode 100644
index 0000000..1463a21
--- /dev/null
+++ b/tests/F_reduce_non_binary/stderr.txt.expect
@@ -0,0 +1,4 @@
+reduce_non_binary.rs:7:39: error: Reduce-style kernel kernel0() must take 2 parameters (found 0).
+reduce_non_binary.rs:13:39: error: Reduce-style kernel kernel1() must take 2 parameters (found 1).
+reduce_non_binary.rs:19:39: error: Reduce-style kernel kernel3() must take 2 parameters (found 3).
+reduce_non_binary.rs:25:39: error: Reduce-style kernel kernel4() must take 2 parameters (found 4).
diff --git a/tests/F_reduce_non_binary/stdout.txt.expect b/tests/F_reduce_non_binary/stdout.txt.expect
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/tests/F_reduce_non_binary/stdout.txt.expect
diff --git a/tests/F_reduce_param_type_mismatch/reduce_param_type_mismatch.rs b/tests/F_reduce_param_type_mismatch/reduce_param_type_mismatch.rs
new file mode 100644
index 0000000..fd8e1aa
--- /dev/null
+++ b/tests/F_reduce_param_type_mismatch/reduce_param_type_mismatch.rs
@@ -0,0 +1,7 @@
+// -target-api 0
+#pragma version(1)
+#pragma rs java_package_name(foo)
+
+int __attribute__((kernel("reduce"))) kernel1(int arg1, float arg2) {
+  return 0;
+}
diff --git a/tests/F_reduce_param_type_mismatch/stderr.txt.expect b/tests/F_reduce_param_type_mismatch/stderr.txt.expect
new file mode 100644
index 0000000..cc55737
--- /dev/null
+++ b/tests/F_reduce_param_type_mismatch/stderr.txt.expect
@@ -0,0 +1,2 @@
+reduce_param_type_mismatch.rs:5:39: error: Reduce-style kernel kernel1() return type 'int' is not the same type as parameter 'arg2' (type 'float')
+reduce_param_type_mismatch.rs:5:51: error: In reduce-style kernel kernel1(): parameter 'arg1' (type 'int') does not have the same type as parameter 'arg2' (type 'float')
diff --git a/tests/F_reduce_param_type_mismatch/stdout.txt.expect b/tests/F_reduce_param_type_mismatch/stdout.txt.expect
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/tests/F_reduce_param_type_mismatch/stdout.txt.expect
diff --git a/tests/F_reduce_ptr_param/reduce_ptr_param.rs b/tests/F_reduce_ptr_param/reduce_ptr_param.rs
new file mode 100644
index 0000000..f099e19
--- /dev/null
+++ b/tests/F_reduce_ptr_param/reduce_ptr_param.rs
@@ -0,0 +1,7 @@
+// -target-api 0
+#pragma version(1)
+#pragma rs java_package_name(foo)
+
+int __attribute__((kernel("reduce"))) kernel(int *arg1, int *arg2) {
+  return 0;
+}
diff --git a/tests/F_reduce_ptr_param/stderr.txt.expect b/tests/F_reduce_ptr_param/stderr.txt.expect
new file mode 100644
index 0000000..3dc971e
--- /dev/null
+++ b/tests/F_reduce_ptr_param/stderr.txt.expect
@@ -0,0 +1,4 @@
+reduce_ptr_param.rs:5:51: error: Reduce-style kernel kernel() cannot have parameter 'arg1' of pointer type: 'int *'
+reduce_ptr_param.rs:5:39: error: Reduce-style kernel kernel() return type 'int' is not the same type as parameter 'arg1' (type 'int *')
+reduce_ptr_param.rs:5:62: error: Reduce-style kernel kernel() cannot have parameter 'arg2' of pointer type: 'int *'
+reduce_ptr_param.rs:5:39: error: Reduce-style kernel kernel() return type 'int' is not the same type as parameter 'arg2' (type 'int *')
diff --git a/tests/F_reduce_ptr_param/stdout.txt.expect b/tests/F_reduce_ptr_param/stdout.txt.expect
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/tests/F_reduce_ptr_param/stdout.txt.expect
diff --git a/tests/F_reduce_ptr_ret_val/reduce_ptr_ret_val.rs b/tests/F_reduce_ptr_ret_val/reduce_ptr_ret_val.rs
new file mode 100644
index 0000000..b1ee9c9
--- /dev/null
+++ b/tests/F_reduce_ptr_ret_val/reduce_ptr_ret_val.rs
@@ -0,0 +1,7 @@
+// -target-api 0
+#pragma version(1)
+#pragma rs java_package_name(foo)
+
+int *__attribute__((kernel("reduce"))) kernel(int arg1, int arg2) {
+  return 0;
+}
diff --git a/tests/F_reduce_ptr_ret_val/stderr.txt.expect b/tests/F_reduce_ptr_ret_val/stderr.txt.expect
new file mode 100644
index 0000000..5e5852e
--- /dev/null
+++ b/tests/F_reduce_ptr_ret_val/stderr.txt.expect
@@ -0,0 +1,3 @@
+reduce_ptr_ret_val.rs:5:40: error: Reduce-style kernel kernel() cannot return a pointer type: int *
+reduce_ptr_ret_val.rs:5:40: error: Reduce-style kernel kernel() return type 'int *' is not the same type as parameter 'arg1' (type 'int')
+reduce_ptr_ret_val.rs:5:40: error: Reduce-style kernel kernel() return type 'int *' is not the same type as parameter 'arg2' (type 'int')
diff --git a/tests/F_reduce_ptr_ret_val/stdout.txt.expect b/tests/F_reduce_ptr_ret_val/stdout.txt.expect
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/tests/F_reduce_ptr_ret_val/stdout.txt.expect
diff --git a/tests/F_reduce_ret_type_mismatch/reduce_ret_type_mismatch.rs b/tests/F_reduce_ret_type_mismatch/reduce_ret_type_mismatch.rs
new file mode 100644
index 0000000..e43ca89
--- /dev/null
+++ b/tests/F_reduce_ret_type_mismatch/reduce_ret_type_mismatch.rs
@@ -0,0 +1,7 @@
+// -target-api 0
+#pragma version(1)
+#pragma rs java_package_name(foo)
+
+double __attribute__((kernel("reduce"))) kernel(float arg1, float arg2) {
+  return arg1 + arg2;
+}
diff --git a/tests/F_reduce_ret_type_mismatch/stderr.txt.expect b/tests/F_reduce_ret_type_mismatch/stderr.txt.expect
new file mode 100644
index 0000000..e616c94
--- /dev/null
+++ b/tests/F_reduce_ret_type_mismatch/stderr.txt.expect
@@ -0,0 +1,2 @@
+reduce_ret_type_mismatch.rs:5:42: error: Reduce-style kernel kernel() return type 'double' is not the same type as parameter 'arg1' (type 'float')
+reduce_ret_type_mismatch.rs:5:42: error: Reduce-style kernel kernel() return type 'double' is not the same type as parameter 'arg2' (type 'float')
diff --git a/tests/F_reduce_ret_type_mismatch/stdout.txt.expect b/tests/F_reduce_ret_type_mismatch/stdout.txt.expect
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/tests/F_reduce_ret_type_mismatch/stdout.txt.expect
diff --git a/tests/F_reduce_void_ret/reduce_void_ret.rs b/tests/F_reduce_void_ret/reduce_void_ret.rs
new file mode 100644
index 0000000..8fecfc2
--- /dev/null
+++ b/tests/F_reduce_void_ret/reduce_void_ret.rs
@@ -0,0 +1,7 @@
+// -target-api 0
+#pragma version(1)
+#pragma rs java_package_name(foo)
+
+void __attribute__((kernel("reduce"))) kernel(int arg1, int arg2) {
+  return;
+}
diff --git a/tests/F_reduce_void_ret/stderr.txt.expect b/tests/F_reduce_void_ret/stderr.txt.expect
new file mode 100644
index 0000000..ecb3dd1
--- /dev/null
+++ b/tests/F_reduce_void_ret/stderr.txt.expect
@@ -0,0 +1,3 @@
+reduce_void_ret.rs:5:40: error: Reduce-style kernel kernel() cannot return void
+reduce_void_ret.rs:5:40: error: Reduce-style kernel kernel() return type 'void' is not the same type as parameter 'arg1' (type 'int')
+reduce_void_ret.rs:5:40: error: Reduce-style kernel kernel() return type 'void' is not the same type as parameter 'arg2' (type 'int')
diff --git a/tests/F_reduce_void_ret/stdout.txt.expect b/tests/F_reduce_void_ret/stdout.txt.expect
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/tests/F_reduce_void_ret/stdout.txt.expect
diff --git a/tests/P_kernel/kernel.rs b/tests/P_kernel/kernel.rs
index fbf9285..fa962dd 100644
--- a/tests/P_kernel/kernel.rs
+++ b/tests/P_kernel/kernel.rs
@@ -1,18 +1,7 @@
+// -target-api 0
 #pragma version(1)
 #pragma rs java_package_name(foo)
 
-int RS_KERNEL root(uint32_t ain) {
-  return 0;
+int RS_KERNEL add_foreach(int a, int b) {
+  return a + b;
 }
-
-void RS_KERNEL in_only(uint32_t ain) {
-}
-
-int RS_KERNEL out_only() {
-  return 0;
-}
-
-int RS_KERNEL everything(uint32_t ain, uint32_t x, uint32_t y) {
-  return 0;
-}
-
diff --git a/tests/P_reduce/reduce.rs b/tests/P_reduce/reduce.rs
new file mode 100644
index 0000000..4017f79
--- /dev/null
+++ b/tests/P_reduce/reduce.rs
@@ -0,0 +1,298 @@
+// -target-api 0
+#pragma version(1)
+#pragma rs java_package_name(foo)
+
+bool __attribute__((kernel("reduce")))
+mul_bool(bool lhs, bool rhs) {
+  return lhs * rhs;
+}
+
+char __attribute__((kernel("reduce")))
+mul_char(char lhs, char rhs) {
+  return lhs * rhs;
+}
+
+char2 __attribute__((kernel("reduce")))
+mul_char2(char2 lhs, char2 rhs) {
+  return lhs * rhs;
+}
+
+char3 __attribute__((kernel("reduce")))
+mul_char3(char3 lhs, char3 rhs) {
+  return lhs * rhs;
+}
+
+char4 __attribute__((kernel("reduce")))
+mul_char4(char4 lhs, char4 rhs) {
+  return lhs * rhs;
+}
+
+double __attribute__((kernel("reduce")))
+mul_double(double lhs, double rhs) {
+  return lhs * rhs;
+}
+
+double2 __attribute__((kernel("reduce")))
+mul_double2(double2 lhs, double2 rhs) {
+  return lhs * rhs;
+}
+
+double3 __attribute__((kernel("reduce")))
+mul_double3(double3 lhs, double3 rhs) {
+  return lhs * rhs;
+}
+
+double4 __attribute__((kernel("reduce")))
+mul_double4(double4 lhs, double4 rhs) {
+  return lhs * rhs;
+}
+
+float __attribute__((kernel("reduce")))
+mul_float(float lhs, float rhs) {
+  return lhs * rhs;
+}
+
+float2 __attribute__((kernel("reduce")))
+mul_float2(float2 lhs, float2 rhs) {
+  return lhs * rhs;
+}
+
+float3 __attribute__((kernel("reduce")))
+mul_float3(float3 lhs, float3 rhs) {
+  return lhs * rhs;
+}
+
+float4 __attribute__((kernel("reduce")))
+mul_float4(float4 lhs, float4 rhs) {
+  return lhs * rhs;
+}
+
+int __attribute__((kernel("reduce")))
+mul_int(int lhs, int rhs) {
+  return lhs * rhs;
+}
+
+int2 __attribute__((kernel("reduce")))
+mul_int2(int2 lhs, int2 rhs) {
+  return lhs * rhs;
+}
+
+int3 __attribute__((kernel("reduce")))
+mul_int3(int3 lhs, int3 rhs) {
+  return lhs * rhs;
+}
+
+int4 __attribute__((kernel("reduce")))
+mul_int4(int4 lhs, int4 rhs) {
+  return lhs * rhs;
+}
+
+long __attribute__((kernel("reduce")))
+mul_long(long lhs, long rhs) {
+  return lhs * rhs;
+}
+
+long2 __attribute__((kernel("reduce")))
+mul_long2(long2 lhs, long2 rhs) {
+  return lhs * rhs;
+}
+
+long3 __attribute__((kernel("reduce")))
+mul_long3(long3 lhs, long3 rhs) {
+  return lhs * rhs;
+}
+
+long4 __attribute__((kernel("reduce")))
+mul_long4(long4 lhs, long4 rhs) {
+  return lhs * rhs;
+}
+
+short __attribute__((kernel("reduce")))
+mul_short(short lhs, short rhs) {
+  return lhs * rhs;
+}
+
+short2 __attribute__((kernel("reduce")))
+mul_short2(short2 lhs, short2 rhs) {
+  return lhs * rhs;
+}
+
+short3 __attribute__((kernel("reduce")))
+mul_short3(short3 lhs, short3 rhs) {
+  return lhs * rhs;
+}
+
+short4 __attribute__((kernel("reduce")))
+mul_short4(short4 lhs, short4 rhs) {
+  return lhs * rhs;
+}
+
+uchar __attribute__((kernel("reduce")))
+mul_uchar(uchar lhs, uchar rhs) {
+  return lhs * rhs;
+}
+
+uchar2 __attribute__((kernel("reduce")))
+mul_uchar2(uchar2 lhs, uchar2 rhs) {
+  return lhs * rhs;
+}
+
+uchar3 __attribute__((kernel("reduce")))
+mul_uchar3(uchar3 lhs, uchar3 rhs) {
+  return lhs * rhs;
+}
+
+uchar4 __attribute__((kernel("reduce")))
+mul_uchar4(uchar4 lhs, uchar4 rhs) {
+  return lhs * rhs;
+}
+
+uint __attribute__((kernel("reduce")))
+mul_uint(uint lhs, uint rhs) {
+  return lhs * rhs;
+}
+
+uint2 __attribute__((kernel("reduce")))
+mul_uint2(uint2 lhs, uint2 rhs) {
+  return lhs * rhs;
+}
+
+uint3 __attribute__((kernel("reduce")))
+mul_uint3(uint3 lhs, uint3 rhs) {
+  return lhs * rhs;
+}
+
+uint4 __attribute__((kernel("reduce")))
+mul_uint4(uint4 lhs, uint4 rhs) {
+  return lhs * rhs;
+}
+
+ulong __attribute__((kernel("reduce")))
+mul_ulong(ulong lhs, ulong rhs) {
+  return lhs * rhs;
+}
+
+ulong2 __attribute__((kernel("reduce")))
+mul_ulong2(ulong2 lhs, ulong2 rhs) {
+  return lhs * rhs;
+}
+
+ulong3 __attribute__((kernel("reduce")))
+mul_ulong3(ulong3 lhs, ulong3 rhs) {
+  return lhs * rhs;
+}
+
+ulong4 __attribute__((kernel("reduce")))
+mul_ulong4(ulong4 lhs, ulong4 rhs) {
+  return lhs * rhs;
+}
+
+ushort __attribute__((kernel("reduce")))
+mul_ushort(ushort lhs, ushort rhs) {
+  return lhs * rhs;
+}
+
+ushort2 __attribute__((kernel("reduce")))
+mul_ushort2(ushort2 lhs, ushort2 rhs) {
+  return lhs * rhs;
+}
+
+ushort3 __attribute__((kernel("reduce")))
+mul_ushort3(ushort3 lhs, ushort3 rhs) {
+  return lhs * rhs;
+}
+
+ushort4 __attribute__((kernel("reduce")))
+mul_ushort4(ushort4 lhs, ushort4 rhs) {
+  return lhs * rhs;
+}
+
+struct indirect {
+  bool elem_bool;
+  char elem_char;
+  char2 elem_char2;
+  char3 elem_char3;
+  char4 elem_char4;
+  double elem_double;
+  double2 elem_double2;
+  double3 elem_double3;
+  double4 elem_double4;
+  float elem_float;
+  float2 elem_float2;
+  float3 elem_float3;
+  float4 elem_float4;
+  int elem_int;
+  int2 elem_int2;
+  int3 elem_int3;
+  int4 elem_int4;
+  long elem_long;
+  long2 elem_long2;
+  long3 elem_long3;
+  long4 elem_long4;
+  short elem_short;
+  short2 elem_short2;
+  short3 elem_short3;
+  short4 elem_short4;
+  uchar elem_uchar;
+  uchar2 elem_uchar2;
+  uchar3 elem_uchar3;
+  uchar4 elem_uchar4;
+  uint elem_uint;
+  uint2 elem_uint2;
+  uint3 elem_uint3;
+  uint4 elem_uint4;
+  ulong elem_ulong;
+  ulong2 elem_ulong2;
+  ulong3 elem_ulong3;
+  ulong4 elem_ulong4;
+  ushort elem_ushort;
+  ushort2 elem_ushort2;
+  ushort3 elem_ushort3;
+  ushort4 elem_ushort4;
+};
+
+struct indirect __attribute__((kernel("reduce")))
+mul_indirect(struct indirect lhs, struct indirect rhs) {
+  lhs.elem_bool *= rhs.elem_bool;
+  lhs.elem_char *= rhs.elem_char;
+  lhs.elem_char2 *= rhs.elem_char2;
+  lhs.elem_char3 *= rhs.elem_char3;
+  lhs.elem_char4 *= rhs.elem_char4;
+  lhs.elem_double *= rhs.elem_double;
+  lhs.elem_double2 *= rhs.elem_double2;
+  lhs.elem_double3 *= rhs.elem_double3;
+  lhs.elem_double4 *= rhs.elem_double4;
+  lhs.elem_float *= rhs.elem_float;
+  lhs.elem_float2 *= rhs.elem_float2;
+  lhs.elem_float3 *= rhs.elem_float3;
+  lhs.elem_float4 *= rhs.elem_float4;
+  lhs.elem_int *= rhs.elem_int;
+  lhs.elem_int2 *= rhs.elem_int2;
+  lhs.elem_int3 *= rhs.elem_int3;
+  lhs.elem_int4 *= rhs.elem_int4;
+  lhs.elem_long *= rhs.elem_long;
+  lhs.elem_long2 *= rhs.elem_long2;
+  lhs.elem_long3 *= rhs.elem_long3;
+  lhs.elem_long4 *= rhs.elem_long4;
+  lhs.elem_short *= rhs.elem_short;
+  lhs.elem_short2 *= rhs.elem_short2;
+  lhs.elem_short3 *= rhs.elem_short3;
+  lhs.elem_short4 *= rhs.elem_short4;
+  lhs.elem_uchar *= rhs.elem_uchar;
+  lhs.elem_uchar2 *= rhs.elem_uchar2;
+  lhs.elem_uchar3 *= rhs.elem_uchar3;
+  lhs.elem_uchar4 *= rhs.elem_uchar4;
+  lhs.elem_uint *= rhs.elem_uint;
+  lhs.elem_uint2 *= rhs.elem_uint2;
+  lhs.elem_uint3 *= rhs.elem_uint3;
+  lhs.elem_uint4 *= rhs.elem_uint4;
+  lhs.elem_ulong *= rhs.elem_ulong;
+  lhs.elem_ulong2 *= rhs.elem_ulong2;
+  lhs.elem_ulong3 *= rhs.elem_ulong3;
+  lhs.elem_ulong4 *= rhs.elem_ulong4;
+  lhs.elem_ushort *= rhs.elem_ushort;
+  lhs.elem_ushort2 *= rhs.elem_ushort2;
+  lhs.elem_ushort3 *= rhs.elem_ushort3;
+  lhs.elem_ushort4 *= rhs.elem_ushort4;
+  return lhs;
+}
diff --git a/tests/P_reduce/stderr.txt.expect b/tests/P_reduce/stderr.txt.expect
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/tests/P_reduce/stderr.txt.expect
diff --git a/tests/P_reduce/stdout.txt.expect b/tests/P_reduce/stdout.txt.expect
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/tests/P_reduce/stdout.txt.expect
diff --git a/tests/P_reduce_cpp/reduce_cpp.rs b/tests/P_reduce_cpp/reduce_cpp.rs
new file mode 100644
index 0000000..a4ddeb5
--- /dev/null
+++ b/tests/P_reduce_cpp/reduce_cpp.rs
@@ -0,0 +1,8 @@
+// -target-api 0 -reflect-c++
+
+#pragma version(1)
+#pragma rs java_package_name(foo)
+
+int __attribute__((kernel("reduce"))) add(int a, int b) {
+  return a + b;
+}
diff --git a/tests/P_reduce_cpp/stderr.txt.expect b/tests/P_reduce_cpp/stderr.txt.expect
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/tests/P_reduce_cpp/stderr.txt.expect
diff --git a/tests/P_reduce_cpp/stdout.txt.expect b/tests/P_reduce_cpp/stdout.txt.expect
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/tests/P_reduce_cpp/stdout.txt.expect
diff --git a/tests/P_reduce_ignore_cv/reduce_ignore_cv.rs b/tests/P_reduce_ignore_cv/reduce_ignore_cv.rs
new file mode 100644
index 0000000..0a055ca
--- /dev/null
+++ b/tests/P_reduce_ignore_cv/reduce_ignore_cv.rs
@@ -0,0 +1,15 @@
+// -target-api 0
+#pragma version(1)
+#pragma rs java_package_name(foo)
+
+typedef struct foo {
+  int x[10];
+} foo;
+
+foo __attribute__((kernel("reduce"))) add_foo_a(const foo a, const foo b) {
+  foo tmp = a;
+  for (int i = 0; i < 10; ++i)
+    tmp.x[i] += b.x[i];
+  return tmp;
+}
+
diff --git a/tests/P_reduce_ignore_cv/stderr.txt.expect b/tests/P_reduce_ignore_cv/stderr.txt.expect
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/tests/P_reduce_ignore_cv/stderr.txt.expect
diff --git a/tests/P_reduce_ignore_cv/stdout.txt.expect b/tests/P_reduce_ignore_cv/stdout.txt.expect
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/tests/P_reduce_ignore_cv/stdout.txt.expect
diff --git a/tests/P_root_void/root_void.rs b/tests/P_root_void/root_void.rs
new file mode 100644
index 0000000..13bed22
--- /dev/null
+++ b/tests/P_root_void/root_void.rs
@@ -0,0 +1,83 @@
+#pragma version(1)
+#pragma rs java_package_name(foo)
+
+void root(const void *ain, void *aout, const void *usrData,
+          uint32_t x, uint32_t y) {
+}
+
+void in_only(const void *ain) {
+}
+
+void in_x_only(const void *ain, uint32_t x) {
+}
+
+void in_y_only(const void *ain, uint32_t y) {
+}
+
+void in_x_y_only(const void *ain, uint32_t x, uint32_t y) {
+}
+
+void in_usrdata_only(const void *ain, const void *usrData) {
+}
+
+void in_usrdata_x_only(const void *ain, const void *usrData, uint32_t x) {
+}
+
+void in_usrdata_y_only(const void *ain, const void *usrData, uint32_t y) {
+}
+
+void in_usrdata_x_y_only(const void *ain, const void *usrData, uint32_t x,
+                         uint32_t y) {
+}
+
+void out_only(void *aout) {
+}
+
+void out_x_only(void *aout, uint32_t x) {
+}
+
+void out_y_only(void *aout, uint32_t y) {
+}
+
+void out_x_y_only(void *aout, uint32_t x, uint32_t y) {
+}
+
+void out_usrdata_only(void *aout, const void *usrData) {
+}
+
+void out_usrdata_x_only(void *aout, const void *usrData, uint32_t x) {
+}
+
+void out_usrdata_y_only(void *aout, const void *usrData, uint32_t y) {
+}
+
+void out_usrdata_x_y_only(void *aout, const void *usrData, uint32_t x,
+                         uint32_t y) {
+}
+
+void in_out_only(const void *ain, void *aout) {
+}
+
+void in_out_x_only(const void *ain, void *aout, uint32_t x) {
+}
+
+void in_out_y_only(const void *ain, void *aout, uint32_t y) {
+}
+
+void in_out_x_y_only(const void *ain, void *aout, uint32_t x, uint32_t y) {
+}
+
+void in_out_usrdata_only(const void *ain, void *aout, const void *usrData) {
+}
+
+void in_out_usrdata_x_only(const void *ain, void *aout, const void *usrData,
+                           uint32_t x) {
+}
+
+void in_out_usrdata_y_only(const void *ain, void *aout, const void *usrData,
+                           uint32_t y) {
+}
+
+void in_out_usrdata_x_y_only(const void *ain, void *aout, const void *usrData,
+                             uint32_t x, uint32_t y) {
+}
diff --git a/tests/P_root_void/stderr.txt.expect b/tests/P_root_void/stderr.txt.expect
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/tests/P_root_void/stderr.txt.expect
diff --git a/tests/P_root_void/stdout.txt.expect b/tests/P_root_void/stdout.txt.expect
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/tests/P_root_void/stdout.txt.expect
diff --git a/tests/P_warnings_deprecated/stderr.txt.expect b/tests/P_warnings_deprecated/stderr.txt.expect
index 32f00d6..94ff955 100644
--- a/tests/P_warnings_deprecated/stderr.txt.expect
+++ b/tests/P_warnings_deprecated/stderr.txt.expect
@@ -1,4 +1,4 @@
 deprecated.rs:9:9: warning: 'rsClamp' is deprecated: Use clamp() instead.
-../../../../../frameworks/rs/scriptc/rs_math.rsh:4091:5: note: 'rsClamp' has been explicitly marked deprecated here
+../../../../../frameworks/rs/scriptc/rs_math.rsh:4109:5: note: 'rsClamp' has been explicitly marked deprecated here
 deprecated.rs:10:8: warning: 'rsGetAllocation' is deprecated: This function is deprecated and will be removed from the SDK in a future release.
 ../../../../../frameworks/rs/scriptc/rs_object_info.rsh:381:5: note: 'rsGetAllocation' has been explicitly marked deprecated here