am 5f23bccc: (-s ours) am c7aac9dd: am 20328131: Merge "If the host supports it, build libbcc as an LLVM loadable module." * commit '5f23bcccf58fc33f84f3daef86f852f8c7268892':

commit: 2f041b66ae4833317b7f2d9782c3d3a3fb4938bf [log] [tgz]
author: Stephen Hines <srhines@google.com> Thu Aug 27 10:49:41 2015 +0000
committer: Android Git Automerger <android-git-automerger@android.com> Thu Aug 27 10:49:41 2015 +0000
tree: 31930a291c99fe616059e735ef57023eff11a9c7
parent: b4544c32f3ddb3a6a6494b266b00adf12c74d47a [diff]
parent: 5f23bcccf58fc33f84f3daef86f852f8c7268892 [diff]
diff --git a/bcinfo/BitReader_2_7/Android.mk b/bcinfo/BitReader_2_7/Android.mk
index 5cd3b7b..181c731 100644
--- a/bcinfo/BitReader_2_7/Android.mk
+++ b/bcinfo/BitReader_2_7/Android.mk

@@ -1,6 +1,6 @@
 LOCAL_PATH:= $(call my-dir)
 
-LLVM_ROOT_PATH := $(LOCAL_PATH)/../../../../../external/llvm
+LLVM_ROOT_PATH := external/llvm
 include $(LLVM_ROOT_PATH)/llvm.mk
 
 bitcode_reader_2_7_SRC_FILES := \

diff --git a/bcinfo/BitReader_2_7/BitcodeReader.cpp b/bcinfo/BitReader_2_7/BitcodeReader.cpp
index ea910ee..894b801 100644
--- a/bcinfo/BitReader_2_7/BitcodeReader.cpp
+++ b/bcinfo/BitReader_2_7/BitcodeReader.cpp

@@ -262,9 +262,9 @@
 
   bool isDematerializable(const GlobalValue *GV) const override;
   std::error_code materialize(GlobalValue *GV) override;
-  std::error_code MaterializeModule(Module *M) override;
+  std::error_code materializeModule(Module *M) override;
   std::vector<StructType *> getIdentifiedStructTypes() const override;
-  void Dematerialize(GlobalValue *GV) override;
+  void dematerialize(GlobalValue *GV) override;
 
   /// @brief Main interface to parsing a bitcode buffer.
   /// @returns true if an error occurred.
@@ -2302,8 +2302,7 @@
         return Error("Invalid type for value");
 
       auto *NewGA =
-          GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(),
-                              getDecodedLinkage(Record[2]), "", TheModule);
+          GlobalAlias::create(PTy, getDecodedLinkage(Record[2]), "", TheModule);
       // Old bitcode files didn't have visibility field.
       if (Record.size() > 3)
         NewGA->setVisibility(GetDecodedVisibility(Record[3]));
@@ -3141,7 +3140,7 @@
       InstructionList.push_back(I);
       break;
     }
-    case bitc::FUNC_CODE_INST_STORE: { // STORE2:[ptrty, ptr, val, align, vol]
+    case bitc::FUNC_CODE_INST_STORE_OLD: { // STORE2:[ptrty, ptr, val, align, vol]
       unsigned OpNum = 0;
       Value *Val, *Ptr;
       if (getValueTypePair(Record, OpNum, NextValueNo, Ptr) ||
@@ -3349,7 +3348,7 @@
   return DeferredFunctionInfo.count(const_cast<Function*>(F));
 }
 
-void BitcodeReader::Dematerialize(GlobalValue *GV) {
+void BitcodeReader::dematerialize(GlobalValue *GV) {
   Function *F = dyn_cast<Function>(GV);
   // If this function isn't dematerializable, this is a noop.
   if (!F || !isDematerializable(F))
@@ -3362,7 +3361,7 @@
   F->setIsMaterializable(true);
 }
 
-std::error_code BitcodeReader::MaterializeModule(Module *M) {
+std::error_code BitcodeReader::materializeModule(Module *M) {
   assert(M == TheModule &&
          "Can only Materialize the Module this BitcodeReader is attached to.");
   // Iterate over the module, deserializing any functions that are still on

diff --git a/bcinfo/BitReader_3_0/Android.mk b/bcinfo/BitReader_3_0/Android.mk
index b425475..95ccd40 100644
--- a/bcinfo/BitReader_3_0/Android.mk
+++ b/bcinfo/BitReader_3_0/Android.mk

@@ -1,6 +1,6 @@
 LOCAL_PATH:= $(call my-dir)
 
-LLVM_ROOT_PATH := $(LOCAL_PATH)/../../../../../external/llvm
+LLVM_ROOT_PATH := external/llvm
 include $(LLVM_ROOT_PATH)/llvm.mk
 
 bitcode_reader_3_0_SRC_FILES := \

diff --git a/bcinfo/BitReader_3_0/BitcodeReader.cpp b/bcinfo/BitReader_3_0/BitcodeReader.cpp
index 0c99f3b..0d1262c 100644
--- a/bcinfo/BitReader_3_0/BitcodeReader.cpp
+++ b/bcinfo/BitReader_3_0/BitcodeReader.cpp

@@ -500,9 +500,9 @@
 
   bool isDematerializable(const GlobalValue *GV) const override;
   std::error_code materialize(GlobalValue *GV) override;
-  std::error_code MaterializeModule(Module *M) override;
+  std::error_code materializeModule(Module *M) override;
   std::vector<StructType *> getIdentifiedStructTypes() const override;
-  void Dematerialize(GlobalValue *GV) override;
+  void dematerialize(GlobalValue *GV) override;
 
   /// @brief Main interface to parsing a bitcode buffer.
   /// @returns true if an error occurred.
@@ -2570,8 +2570,7 @@
         return Error("Invalid type for value");
 
       auto *NewGA =
-          GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(),
-                              getDecodedLinkage(Record[2]), "", TheModule);
+          GlobalAlias::create(PTy, getDecodedLinkage(Record[2]), "", TheModule);
       // Old bitcode files didn't have visibility field.
       if (Record.size() > 3)
         NewGA->setVisibility(GetDecodedVisibility(Record[3]));
@@ -3420,7 +3419,7 @@
       InstructionList.push_back(I);
       break;
     }
-    case bitc::FUNC_CODE_INST_STORE: { // STORE2:[ptrty, ptr, val, align, vol]
+    case bitc::FUNC_CODE_INST_STORE_OLD: { // STORE2:[ptrty, ptr, val, align, vol]
       unsigned OpNum = 0;
       Value *Val, *Ptr;
       if (getValueTypePair(Record, OpNum, NextValueNo, Ptr) ||
@@ -3684,7 +3683,7 @@
   return DeferredFunctionInfo.count(const_cast<Function*>(F));
 }
 
-void BitcodeReader::Dematerialize(GlobalValue *GV) {
+void BitcodeReader::dematerialize(GlobalValue *GV) {
   Function *F = dyn_cast<Function>(GV);
   // If this function isn't dematerializable, this is a noop.
   if (!F || !isDematerializable(F))
@@ -3697,7 +3696,7 @@
   F->setIsMaterializable(true);
 }
 
-std::error_code BitcodeReader::MaterializeModule(Module *M) {
+std::error_code BitcodeReader::materializeModule(Module *M) {
   assert(M == TheModule &&
          "Can only Materialize the Module this BitcodeReader is attached to.");
   // Iterate over the module, deserializing any functions that are still on

diff --git a/bcinfo/MetadataExtractor.cpp b/bcinfo/MetadataExtractor.cpp
index 23d97fe..add1ab1 100644
--- a/bcinfo/MetadataExtractor.cpp
+++ b/bcinfo/MetadataExtractor.cpp

@@ -21,7 +21,7 @@
 
 #define LOG_TAG "bcinfo"
 #include <cutils/log.h>
-#ifdef HAVE_ANDROID_OS
+#ifdef __ANDROID__
 #include <cutils/properties.h>
 #endif
 
@@ -340,7 +340,7 @@
   }
   mRSFloatPrecision = RelaxedPragmaSeen ? RS_FP_Relaxed : RS_FP_Full;
 
-#ifdef HAVE_ANDROID_OS
+#ifdef __ANDROID__
   // Provide an override for precsiion via adb shell setprop
   // adb shell setprop debug.rs.precision rs_fp_full
   // adb shell setprop debug.rs.precision rs_fp_relaxed

diff --git a/bcinfo/Wrap/Android.mk b/bcinfo/Wrap/Android.mk
index 7da8b3f..1b5db36 100644
--- a/bcinfo/Wrap/Android.mk
+++ b/bcinfo/Wrap/Android.mk

@@ -16,7 +16,7 @@
 
 LOCAL_PATH:= $(call my-dir)
 
-LLVM_ROOT_PATH := $(LOCAL_PATH)/../../../../../external/llvm
+LLVM_ROOT_PATH := external/llvm
 include $(LLVM_ROOT_PATH)/llvm.mk
 
 llvm_wrap_SRC_FILES := \

diff --git a/include/bcc/Compiler.h b/include/bcc/Compiler.h
index 75cde37..8a30c38 100644
--- a/include/bcc/Compiler.h
+++ b/include/bcc/Compiler.h

@@ -80,13 +80,11 @@
 
   enum ErrorCode runPasses(Script &pScript, llvm::raw_pwrite_stream &pResult);
 
-  bool addCustomPasses(Script &pScript, llvm::legacy::PassManager &pPM);
   bool addInternalizeSymbolsPass(Script &pScript, llvm::legacy::PassManager &pPM);
-  bool addExpandForEachPass(Script &pScript, llvm::legacy::PassManager &pPM);
-  bool addGlobalInfoPass(Script &pScript, llvm::legacy::PassManager &pPM);
-  bool addInvariantPass(llvm::legacy::PassManager &pPM);
-  bool addInvokeHelperPass(llvm::legacy::PassManager &pPM);
-  bool addPostLTOCustomPasses(llvm::legacy::PassManager &pPM);
+  void addExpandKernelPass(llvm::legacy::PassManager &pPM);
+  void addGlobalInfoPass(Script &pScript, llvm::legacy::PassManager &pPM);
+  void addInvariantPass(llvm::legacy::PassManager &pPM);
+  void addInvokeHelperPass(llvm::legacy::PassManager &pPM);
 
 public:
   Compiler();

diff --git a/include/bcc/Renderscript/RSTransforms.h b/include/bcc/Renderscript/RSTransforms.h
index d5830ca..6dcfedd 100644
--- a/include/bcc/Renderscript/RSTransforms.h
+++ b/include/bcc/Renderscript/RSTransforms.h

@@ -25,7 +25,7 @@
 namespace bcc {
 
 llvm::ModulePass *
-createRSForEachExpandPass(bool pEnableStepOpt);
+createRSKernelExpandPass(bool pEnableStepOpt);
 
 llvm::FunctionPass *
 createRSInvariantPass();

diff --git a/include/bcc/Support/Properties.h b/include/bcc/Support/Properties.h
index c82901c..4c3c404 100644
--- a/include/bcc/Support/Properties.h
+++ b/include/bcc/Support/Properties.h

@@ -20,12 +20,12 @@
 #include <stdint.h>
 #include <stdlib.h>
 
-#if !defined(RS_SERVER) && defined(HAVE_ANDROID_OS)
+#if !defined(RS_SERVER) && defined(__ANDROID__)
 #include <cutils/properties.h>
 #endif
 
 static inline uint32_t getProperty(const char *str) {
-#if !defined(RS_SERVER) && defined(HAVE_ANDROID_OS)
+#if !defined(RS_SERVER) && defined(__ANDROID__)
     char buf[PROPERTY_VALUE_MAX];
     property_get(str, buf, "0");
     return atoi(buf);

diff --git a/lib/Core/Compiler.cpp b/lib/Core/Compiler.cpp
index c314b6c..5c769b4 100644
--- a/lib/Core/Compiler.cpp
+++ b/lib/Core/Compiler.cpp

@@ -143,6 +143,8 @@
 }
 
 
+// This function has complete responsibility for creating and executing the
+// exact list of compiler passes.
 enum Compiler::ErrorCode Compiler::runPasses(Script &pScript,
                                              llvm::raw_pwrite_stream &pResult) {
   // Pass manager for link-time optimization
@@ -153,10 +155,13 @@
 
   passes.add(createTargetTransformInfoWrapperPass(mTarget->getTargetIRAnalysis()));
 
-  // Add our custom passes.
-  if (!addCustomPasses(pScript, passes)) {
+  // Add some initial custom passes.
+  addInvokeHelperPass(passes);
+  addExpandKernelPass(passes);
+  addInvariantPass(passes);
+  if (!addInternalizeSymbolsPass(pScript, passes))
     return kErrCustomPasses;
-  }
+  addGlobalInfoPass(pScript, passes);
 
   if (mTarget->getOptLevel() == llvm::CodeGenOpt::None) {
     passes.add(llvm::createGlobalOptimizerPass());
@@ -187,9 +192,9 @@
 
   // These passes have to come after LTO, since we don't want to examine
   // functions that are never actually called.
-  if (!addPostLTOCustomPasses(passes)) {
-    return kErrCustomPasses;
-  }
+  if (llvm::Triple(getTargetMachine().getTargetTriple()).getArch() == llvm::Triple::x86_64)
+    passes.add(createRSX86_64CallConvPass());  // Add pass to correct calling convention for X86-64.
+  passes.add(createRSIsThreadablePass());      // Add pass to mark script as threadable.
 
   // RSEmbedInfoPass needs to come after we have scanned for non-threadable
   // functions.
@@ -324,9 +329,11 @@
   size_t exportVarCount = me.getExportVarCount();
   size_t exportFuncCount = me.getExportFuncCount();
   size_t exportForEachCount = me.getExportForEachSignatureCount();
+  size_t exportReduceCount = me.getExportReduceCount();
   const char **exportVarNameList = me.getExportVarNameList();
   const char **exportFuncNameList = me.getExportFuncNameList();
   const char **exportForEachNameList = me.getExportForEachNameList();
+  const char **exportReduceNameList = me.getExportReduceNameList();
   size_t i;
 
   for (i = 0; i < exportVarCount; ++i) {
@@ -337,18 +344,22 @@
     export_symbols.push_back(exportFuncNameList[i]);
   }
 
-  // Expanded foreach functions should not be internalized, too.
-  // expanded_foreach_funcs keeps the .expand version of the kernel names
-  // around until createInternalizePass() is finished making its own
-  // copy of the visible symbols.
-  std::vector<std::string> expanded_foreach_funcs;
+  // Expanded foreach and reduce functions should not be
+  // internalized. expanded_funcs keeps the names of the expanded
+  // functions around until createInternalizePass() is finished making
+  // its own copy of the visible symbols.
+  std::vector<std::string> expanded_funcs;
+  expanded_funcs.reserve(exportForEachCount + exportReduceCount);
+
   for (i = 0; i < exportForEachCount; ++i) {
-    expanded_foreach_funcs.push_back(
-        std::string(exportForEachNameList[i]) + ".expand");
+    expanded_funcs.push_back(std::string(exportForEachNameList[i]) + ".expand");
+  }
+  for (i = 0; i < exportReduceCount; ++i) {
+    expanded_funcs.push_back(std::string(exportReduceNameList[i]) + ".expand");
   }
 
-  for (i = 0; i < exportForEachCount; i++) {
-      export_symbols.push_back(expanded_foreach_funcs[i].c_str());
+  for (auto &symbol_name : expanded_funcs) {
+    export_symbols.push_back(symbol_name.c_str());
   }
 
   pPM.add(llvm::createInternalizePass(export_symbols));
@@ -356,69 +367,31 @@
   return true;
 }
 
-bool Compiler::addInvokeHelperPass(llvm::legacy::PassManager &pPM) {
+void Compiler::addInvokeHelperPass(llvm::legacy::PassManager &pPM) {
   llvm::Triple arch(getTargetMachine().getTargetTriple());
   if (arch.isArch64Bit()) {
     pPM.add(createRSInvokeHelperPass());
   }
-  return true;
 }
 
-bool Compiler::addExpandForEachPass(Script &pScript, llvm::legacy::PassManager &pPM) {
-  // Expand ForEach on CPU path to reduce launch overhead.
+void Compiler::addExpandKernelPass(llvm::legacy::PassManager &pPM) {
+  // Expand ForEach and reduce on CPU path to reduce launch overhead.
   bool pEnableStepOpt = true;
-  pPM.add(createRSForEachExpandPass(pEnableStepOpt));
-
-  return true;
+  pPM.add(createRSKernelExpandPass(pEnableStepOpt));
 }
 
-bool Compiler::addGlobalInfoPass(Script &pScript, llvm::legacy::PassManager &pPM) {
+void Compiler::addGlobalInfoPass(Script &pScript, llvm::legacy::PassManager &pPM) {
   // Add additional information about RS global variables inside the Module.
   RSScript &script = static_cast<RSScript &>(pScript);
   if (script.getEmbedGlobalInfo()) {
     pPM.add(createRSGlobalInfoPass(script.getEmbedGlobalInfoSkipConstant()));
   }
-
-  return true;
 }
 
-bool Compiler::addInvariantPass(llvm::legacy::PassManager &pPM) {
+void Compiler::addInvariantPass(llvm::legacy::PassManager &pPM) {
   // Mark Loads from RsExpandKernelDriverInfo as "load.invariant".
   // Should run after ExpandForEach and before inlining.
   pPM.add(createRSInvariantPass());
-
-  return true;
-}
-
-bool Compiler::addCustomPasses(Script &pScript, llvm::legacy::PassManager &pPM) {
-  if (!addInvokeHelperPass(pPM))
-    return false;
-
-  if (!addExpandForEachPass(pScript, pPM))
-    return false;
-
-  if (!addInvariantPass(pPM))
-    return false;
-
-  if (!addInternalizeSymbolsPass(pScript, pPM))
-    return false;
-
-  if (!addGlobalInfoPass(pScript, pPM))
-    return false;
-
-  return true;
-}
-
-bool Compiler::addPostLTOCustomPasses(llvm::legacy::PassManager &pPM) {
-  // Add pass to correct calling convention for X86-64.
-  llvm::Triple arch(getTargetMachine().getTargetTriple());
-  if (arch.getArch() == llvm::Triple::x86_64)
-    pPM.add(createRSX86_64CallConvPass());
-
-  // Add pass to mark script as threadable.
-  pPM.add(createRSIsThreadablePass());
-
-  return true;
 }
 
 enum Compiler::ErrorCode Compiler::screenGlobalFunctions(Script &pScript) {

diff --git a/lib/Renderscript/Android.mk b/lib/Renderscript/Android.mk
index 56cae16..4b18eda 100644
--- a/lib/Renderscript/Android.mk
+++ b/lib/Renderscript/Android.mk

@@ -24,7 +24,7 @@
 libbcc_renderscript_SRC_FILES := \
   RSCompilerDriver.cpp \
   RSEmbedInfo.cpp \
-  RSForEachExpand.cpp \
+  RSKernelExpand.cpp \
   RSGlobalInfoPass.cpp \
   RSInvariant.cpp \
   RSScript.cpp \

diff --git a/lib/Renderscript/RSCompilerDriver.cpp b/lib/Renderscript/RSCompilerDriver.cpp
index b9a32c1..7cc4ffb 100644
--- a/lib/Renderscript/RSCompilerDriver.cpp
+++ b/lib/Renderscript/RSCompilerDriver.cpp

@@ -42,7 +42,7 @@
 #include <sstream>
 #include <string>
 
-#ifdef HAVE_ANDROID_OS
+#ifdef __ANDROID__
 #include <cutils/properties.h>
 #endif
 #include <utils/StopWatch.h>

diff --git a/lib/Renderscript/RSEmbedInfo.cpp b/lib/Renderscript/RSEmbedInfo.cpp
index dc1033c..b0c2767 100644
--- a/lib/Renderscript/RSEmbedInfo.cpp
+++ b/lib/Renderscript/RSEmbedInfo.cpp

@@ -73,11 +73,13 @@
     size_t exportVarCount = me.getExportVarCount();
     size_t exportFuncCount = me.getExportFuncCount();
     size_t exportForEachCount = me.getExportForEachSignatureCount();
+    size_t exportReduceCount = me.getExportReduceCount();
     size_t objectSlotCount = me.getObjectSlotCount();
     size_t pragmaCount = me.getPragmaCount();
     const char **exportVarNameList = me.getExportVarNameList();
     const char **exportFuncNameList = me.getExportFuncNameList();
     const char **exportForEachNameList = me.getExportForEachNameList();
+    const char **exportReduceNameList = me.getExportReduceNameList();
     const uint32_t *exportForEachSignatureList =
         me.getExportForEachSignatureList();
     const uint32_t *objectSlotList = me.getObjectSlotList();
@@ -111,6 +113,11 @@
         << exportForEachNameList[i] << "\n";
     }
 
+    s << "exportReduceCount: " << exportReduceCount << "\n";
+    for (i = 0; i < exportReduceCount; ++i) {
+      s << exportReduceNameList[i] << "\n";
+    }
+
     s << "objectSlotCount: " << objectSlotCount << "\n";
     for (i = 0; i < objectSlotCount; ++i) {
       s << objectSlotList[i] << "\n";

diff --git a/lib/Renderscript/RSForEachExpand.cpp b/lib/Renderscript/RSKernelExpand.cpp
similarity index 69%
rename from lib/Renderscript/RSForEachExpand.cpp
rename to lib/Renderscript/RSKernelExpand.cpp
index ce1fb58..34611d7 100644
--- a/lib/Renderscript/RSForEachExpand.cpp
+++ b/lib/Renderscript/RSKernelExpand.cpp

@@ -38,7 +38,14 @@
 
 #include "bcinfo/MetadataExtractor.h"
 
-#define NUM_EXPANDED_FUNCTION_PARAMS 4
+#ifndef __DISABLE_ASSERTS
+// Only used in bccAssert()
+const int kNumExpandedForeachParams = 4;
+const int kNumExpandedReduceParams = 3;
+#endif
+
+const char kRenderScriptTBAARootName[] = "RenderScript Distinct TBAA";
+const char kRenderScriptTBAANodeName[] = "RenderScript TBAA";
 
 using namespace bcc;
 
@@ -46,15 +53,17 @@
 
 static const bool gEnableRsTbaa = true;
 
-/* RSForEachExpandPass - This pass operates on functions that are able to be
- * called via rsForEach() or "foreach_<NAME>". We create an inner loop for the
- * ForEach-able function to be invoked over the appropriate data cells of the
- * input/output allocations (adjusting other relevant parameters as we go). We
- * support doing this for any ForEach-able compute kernels. The new function
- * name is the original function name followed by ".expand". Note that we
- * still generate code for the original function.
+/* RSKernelExpandPass - This pass operates on functions that are able
+ * to be called via rsForEach(), "foreach_<NAME>", or
+ * "reduce_<NAME>". We create an inner loop for the function to be
+ * invoked over the appropriate data cells of the input/output
+ * allocations (adjusting other relevant parameters as we go). We
+ * support doing this for any forEach or reduce style compute
+ * kernels. The new function name is the original function name
+ * followed by ".expand". Note that we still generate code for the
+ * original function.
  */
-class RSForEachExpandPass : public llvm::ModulePass {
+class RSKernelExpandPass : public llvm::ModulePass {
 public:
   static char ID;
 
@@ -91,16 +100,19 @@
   llvm::LLVMContext *Context;
 
   /*
-   * Pointer to LLVM type information for the the function signature
-   * for expanded kernels.  This must be re-calculated for each
-   * module the pass is run on.
+   * Pointers to LLVM type information for the function signatures
+   * for expanded functions. These must be re-calculated for each module
+   * the pass is run on.
    */
-  llvm::FunctionType *ExpandedFunctionType;
+  llvm::FunctionType *ExpandedForEachType, *ExpandedReduceType;
 
   uint32_t mExportForEachCount;
   const char **mExportForEachNameList;
   const uint32_t *mExportForEachSignatureList;
 
+  uint32_t mExportReduceCount;
+  const char **mExportReduceNameList;
+
   // Turns on optimization of allocation stride values.
   bool mEnableStepOpt;
 
@@ -286,41 +298,68 @@
         llvm::StructType::create(RsExpandKernelDriverInfoPfxTypes, "RsExpandKernelDriverInfoPfx");
 
     // Create the function type for expanded kernels.
+    llvm::Type *VoidTy = llvm::Type::getVoidTy(*Context);
 
     llvm::Type *RsExpandKernelDriverInfoPfxPtrTy = RsExpandKernelDriverInfoPfxTy->getPointerTo();
+    // void (const RsExpandKernelDriverInfoPfxTy *p, uint32_t x1, uint32_t x2, uint32_t outstep)
+    ExpandedForEachType = llvm::FunctionType::get(VoidTy,
+        {RsExpandKernelDriverInfoPfxPtrTy, Int32Ty, Int32Ty, Int32Ty}, false);
 
-    llvm::SmallVector<llvm::Type*, 8> ParamTypes;
-    ParamTypes.push_back(RsExpandKernelDriverInfoPfxPtrTy); // const RsExpandKernelDriverInfoPfx *p
-    ParamTypes.push_back(Int32Ty);                          // uint32_t x1
-    ParamTypes.push_back(Int32Ty);                          // uint32_t x2
-    ParamTypes.push_back(Int32Ty);                          // uint32_t outstep
-
-    ExpandedFunctionType =
-        llvm::FunctionType::get(llvm::Type::getVoidTy(*Context), ParamTypes,
-                                false);
+    // void (void *inBuf, void *outBuf, uint32_t len)
+    ExpandedReduceType = llvm::FunctionType::get(VoidTy, {VoidPtrTy, VoidPtrTy, Int32Ty}, false);
   }
 
-  /// @brief Create skeleton of the expanded function.
+  /// @brief Create skeleton of the expanded foreach kernel.
   ///
   /// This creates a function with the following signature:
   ///
   ///   void (const RsForEachStubParamStruct *p, uint32_t x1, uint32_t x2,
   ///         uint32_t outstep)
   ///
-  llvm::Function *createEmptyExpandedFunction(llvm::StringRef OldName) {
+  llvm::Function *createEmptyExpandedForEachKernel(llvm::StringRef OldName) {
     llvm::Function *ExpandedFunction =
-      llvm::Function::Create(ExpandedFunctionType,
+      llvm::Function::Create(ExpandedForEachType,
                              llvm::GlobalValue::ExternalLinkage,
                              OldName + ".expand", Module);
-
-    bccAssert(ExpandedFunction->arg_size() == NUM_EXPANDED_FUNCTION_PARAMS);
-
+    bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);
     llvm::Function::arg_iterator AI = ExpandedFunction->arg_begin();
-
     (AI++)->setName("p");
     (AI++)->setName("x1");
     (AI++)->setName("x2");
     (AI++)->setName("arg_outstep");
+    llvm::BasicBlock *Begin = llvm::BasicBlock::Create(*Context, "Begin",
+                                                       ExpandedFunction);
+    llvm::IRBuilder<> Builder(Begin);
+    Builder.CreateRetVoid();
+    return ExpandedFunction;
+  }
+
+  // Create skeleton of the expanded reduce kernel.
+  //
+  // This creates a function with the following signature:
+  //
+  //   void @func.expand(i8* nocapture %inBuf, i8* nocapture %outBuf, i32 len)
+  //
+  llvm::Function *createEmptyExpandedReduceKernel(llvm::StringRef OldName) {
+    llvm::Function *ExpandedFunction =
+      llvm::Function::Create(ExpandedReduceType,
+                             llvm::GlobalValue::ExternalLinkage,
+                             OldName + ".expand", Module);
+    bccAssert(ExpandedFunction->arg_size() == kNumExpandedReduceParams);
+
+    llvm::Function::arg_iterator AI = ExpandedFunction->arg_begin();
+
+    using llvm::Attribute;
+
+    llvm::Argument *InBuf = &(*AI++);
+    InBuf->setName("inBuf");
+    InBuf->addAttr(llvm::AttributeSet::get(*Context, InBuf->getArgNo() + 1, llvm::makeArrayRef(Attribute::NoCapture)));
+
+    llvm::Argument *OutBuf = &(*AI++);
+    OutBuf->setName("outBuf");
+    OutBuf->addAttr(llvm::AttributeSet::get(*Context, OutBuf->getArgNo() + 1, llvm::makeArrayRef(Attribute::NoCapture)));
+
+    (AI++)->setName("len");
 
     llvm::BasicBlock *Begin = llvm::BasicBlock::Create(*Context, "Begin",
                                                        ExpandedFunction);
@@ -444,7 +483,7 @@
   //
   // Returns:
   //   Returns a SmallVector of ConstantInts.
-  SmallGEPIndices GEPHelper(std::initializer_list<int32_t> I32Args) {
+  SmallGEPIndices GEPHelper(const std::initializer_list<int32_t> I32Args) {
     SmallGEPIndices Out(I32Args.size());
     llvm::IntegerType *I32Ty = llvm::Type::getInt32Ty(*Context);
     std::transform(I32Args.begin(), I32Args.end(), Out.begin(),
@@ -453,7 +492,7 @@
   }
 
 public:
-  RSForEachExpandPass(bool pEnableStepOpt = true)
+  RSKernelExpandPass(bool pEnableStepOpt = true)
       : ModulePass(ID), Module(nullptr), Context(nullptr),
         mEnableStepOpt(pEnableStepOpt) {
 
@@ -536,7 +575,7 @@
    * Module will contain a new function of the name "<NAME>.expand" that
    * invokes <NAME>() in a loop with the appropriate parameters.
    */
-  bool ExpandFunction(llvm::Function *Function, uint32_t Signature) {
+  bool ExpandOldStyleForEach(llvm::Function *Function, uint32_t Signature) {
     ALOGV("Expanding ForEach-able Function %s",
           Function->getName().str().c_str());
 
@@ -552,14 +591,14 @@
     llvm::DataLayout DL(Module);
 
     llvm::Function *ExpandedFunction =
-      createEmptyExpandedFunction(Function->getName());
+      createEmptyExpandedForEachKernel(Function->getName());
 
     /*
      * Extract the expanded function's parameters.  It is guaranteed by
-     * createEmptyExpandedFunction that there will be five parameters.
+     * createEmptyExpandedForEachKernel that there will be four parameters.
      */
 
-    bccAssert(ExpandedFunction->arg_size() == NUM_EXPANDED_FUNCTION_PARAMS);
+    bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);
 
     llvm::Function::arg_iterator ExpandedFunctionArgIter =
       ExpandedFunction->arg_begin();
@@ -672,24 +711,24 @@
     return true;
   }
 
-  /* Expand a pass-by-value kernel.
+  /* Expand a pass-by-value foreach kernel.
    */
-  bool ExpandKernel(llvm::Function *Function, uint32_t Signature) {
+  bool ExpandForEach(llvm::Function *Function, uint32_t Signature) {
     bccAssert(bcinfo::MetadataExtractor::hasForEachSignatureKernel(Signature));
     ALOGV("Expanding kernel Function %s", Function->getName().str().c_str());
 
-    // TODO: Refactor this to share functionality with ExpandFunction.
+    // TODO: Refactor this to share functionality with ExpandOldStyleForEach.
     llvm::DataLayout DL(Module);
 
     llvm::Function *ExpandedFunction =
-      createEmptyExpandedFunction(Function->getName());
+      createEmptyExpandedForEachKernel(Function->getName());
 
     /*
      * Extract the expanded function's parameters.  It is guaranteed by
-     * createEmptyExpandedFunction that there will be five parameters.
+     * createEmptyExpandedForEachKernel that there will be four parameters.
      */
 
-    bccAssert(ExpandedFunction->arg_size() == NUM_EXPANDED_FUNCTION_PARAMS);
+    bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);
 
     llvm::Function::arg_iterator ExpandedFunctionArgIter =
       ExpandedFunction->arg_begin();
@@ -697,7 +736,7 @@
     llvm::Value *Arg_p       = &*(ExpandedFunctionArgIter++);
     llvm::Value *Arg_x1      = &*(ExpandedFunctionArgIter++);
     llvm::Value *Arg_x2      = &*(ExpandedFunctionArgIter++);
-    llvm::Value *Arg_outstep = &*(ExpandedFunctionArgIter);
+    // Arg_outstep is not used by expanded new-style forEach kernels.
 
     // Construct the actual function body.
     llvm::IRBuilder<> Builder(ExpandedFunction->getEntryBlock().begin());
@@ -708,8 +747,8 @@
     llvm::MDBuilder MDHelper(*Context);
 
     TBAARenderScriptDistinct =
-      MDHelper.createTBAARoot("RenderScript Distinct TBAA");
-    TBAARenderScript = MDHelper.createTBAANode("RenderScript TBAA",
+      MDHelper.createTBAARoot(kRenderScriptTBAARootName);
+    TBAARenderScript = MDHelper.createTBAANode(kRenderScriptTBAANodeName,
         TBAARenderScriptDistinct);
     TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation",
                                                        TBAARenderScript);
@@ -719,10 +758,6 @@
                                                     TBAARenderScript);
     TBAAPointer = MDHelper.createTBAAStructTagNode(TBAAPointer, TBAAPointer, 0);
 
-    llvm::MDNode *AliasingDomain, *AliasingScope;
-    AliasingDomain = MDHelper.createAnonymousAliasScopeDomain("RS argument scope domain");
-    AliasingScope = MDHelper.createAnonymousAliasScope(AliasingDomain, "RS argument scope");
-
     /*
      * Collect and construct the arguments for the kernel().
      *
@@ -738,7 +773,6 @@
 
     // Check the return type
     llvm::Type     *OutTy            = nullptr;
-    llvm::Value    *OutStep          = nullptr;
     llvm::LoadInst *OutBasePtr       = nullptr;
     llvm::Value    *CastedOutBasePtr = nullptr;
 
@@ -758,8 +792,6 @@
         OutTy = OutBaseTy->getPointerTo();
       }
 
-      OutStep = getStepValue(&DL, OutTy, Arg_outstep);
-      OutStep->setName("outstep");
       SmallGEPIndices OutBaseGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldOutPtr, 0}));
       OutBasePtr = Builder.CreateLoad(Builder.CreateInBoundsGEP(Arg_p, OutBaseGEP, "out_buf.gep"));
 
@@ -767,13 +799,10 @@
         OutBasePtr->setMetadata("tbaa", TBAAPointer);
       }
 
-      OutBasePtr->setMetadata("alias.scope", AliasingScope);
-
       CastedOutBasePtr = Builder.CreatePointerCast(OutBasePtr, OutTy, "casted_out");
     }
 
     llvm::SmallVector<llvm::Type*,  8> InTypes;
-    llvm::SmallVector<llvm::Value*, 8> InSteps;
     llvm::SmallVector<llvm::Value*, 8> InBufPtrs;
     llvm::SmallVector<llvm::Value*, 8> InStructTempSlots;
 
@@ -803,11 +832,6 @@
       Builder.SetInsertPoint(LoopHeader->getTerminator());
 
       for (size_t InputIndex = 0; InputIndex < NumInPtrArguments; ++InputIndex, ArgIter++) {
-        SmallGEPIndices InStepGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInStride,
-          static_cast<int32_t>(InputIndex)}));
-        llvm::Value *InStepAddr = Builder.CreateInBoundsGEP(Arg_p, InStepGEP, "instep_addr.gep");
-        llvm::LoadInst *InStepArg = Builder.CreateLoad(InStepAddr, "instep_addr");
-
         llvm::Type *InType = ArgIter->getType();
 
         /*
@@ -829,10 +853,6 @@
           InStructTempSlots.push_back(nullptr);
         }
 
-        llvm::Value *InStep = getStepValue(&DL, InType, InStepArg);
-
-        InStep->setName("instep");
-
         SmallGEPIndices InBufPtrGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInPtr,
           static_cast<int32_t>(InputIndex)}));
         llvm::Value    *InBufPtrAddr = Builder.CreateInBoundsGEP(Arg_p, InBufPtrGEP, "input_buf.gep");
@@ -842,10 +862,7 @@
           InBufPtr->setMetadata("tbaa", TBAAPointer);
         }
 
-        InBufPtr->setMetadata("alias.scope", AliasingScope);
-
         InTypes.push_back(InType);
-        InSteps.push_back(InStep);
         InBufPtrs.push_back(CastInBufPtr);
       }
 
@@ -855,23 +872,13 @@
     // Populate the actual call to kernel().
     llvm::SmallVector<llvm::Value*, 8> RootArgs;
 
-    // Calculate the current input and output pointers
-    //
-    //
-    // We always calculate the input/output pointers with a GEP operating on i8
-    // values combined with a multiplication and only cast at the very end to
-    // OutTy.  This is to account for dynamic stepping sizes when the value
-    // isn't apparent at compile time.  In the (very common) case when we know
-    // the step size at compile time, due to haveing complete type information
-    // this multiplication will optmized out and produces code equivalent to a
-    // a GEP on a pointer of the correct type.
+    // Calculate the current input and output pointers.
 
     // Output
 
     llvm::Value *OutPtr = nullptr;
     if (CastedOutBasePtr) {
       llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1);
-
       OutPtr = Builder.CreateInBoundsGEP(CastedOutBasePtr, OutOffset);
 
       if (PassOutByPointer) {
@@ -888,31 +895,22 @@
         llvm::Value *InPtr = Builder.CreateInBoundsGEP(InBufPtrs[Index], Offset);
         llvm::Value *Input;
 
+        llvm::LoadInst *InputLoad = Builder.CreateLoad(InPtr, "input");
+
+        if (gEnableRsTbaa) {
+          InputLoad->setMetadata("tbaa", TBAAAllocation);
+        }
+
         if (llvm::Value *TemporarySlot = InStructTempSlots[Index]) {
           // Pass a pointer to a temporary on the stack, rather than
           // passing a pointer to the original value. We do not want
           // the kernel to potentially modify the input data.
 
-          llvm::Type *ElementType = llvm::cast<llvm::PointerType>(
-                                        InPtr->getType())->getElementType();
-          uint64_t StoreSize = DL.getTypeStoreSize(ElementType);
-          uint64_t Alignment = DL.getABITypeAlignment(ElementType);
-
-          Builder.CreateMemCpy(TemporarySlot, InPtr, StoreSize, Alignment,
-                               /* isVolatile = */ false,
-                               /* !tbaa = */ gEnableRsTbaa ? TBAAAllocation : nullptr,
-                               /* !tbaa.struct = */ nullptr,
-                               /* !alias.scope = */ AliasingScope);
+          // Note: don't annotate with TBAA, since the kernel might
+          // have its own TBAA annotations for the pointer argument.
+          Builder.CreateStore(InputLoad, TemporarySlot);
           Input = TemporarySlot;
         } else {
-          llvm::LoadInst *InputLoad = Builder.CreateLoad(InPtr, "input");
-
-          if (gEnableRsTbaa) {
-            InputLoad->setMetadata("tbaa", TBAAAllocation);
-          }
-
-          InputLoad->setMetadata("alias.scope", AliasingScope);
-
           Input = InputLoad;
         }
 
@@ -925,11 +923,274 @@
     llvm::Value *RetVal = Builder.CreateCall(Function, RootArgs);
 
     if (OutPtr && !PassOutByPointer) {
+      RetVal->setName("call.result");
       llvm::StoreInst *Store = Builder.CreateStore(RetVal, OutPtr);
       if (gEnableRsTbaa) {
         Store->setMetadata("tbaa", TBAAAllocation);
       }
-      Store->setMetadata("alias.scope", AliasingScope);
+    }
+
+    return true;
+  }
+
+  // Expand a reduce-style kernel function.
+  //
+  // The input is a kernel which represents a binary operation,
+  // of the form
+  //
+  //   define foo @func(foo %a, foo %b),
+  //
+  // (More generally, it can be of the forms
+  //
+  //   define void @func(foo* %ret, foo* %a, foo* %b)
+  //   define void @func(foo* %ret, foo1 %a, foo1 %b)
+  //   define foo1 @func(foo2 %a, foo2 %b)
+  //
+  // as a result of argument / return value conversions. Here, "foo1"
+  // and "foo2" refer to possibly coerced types, and the coerced
+  // argument type may be different from the coerced return type. See
+  // "Note on coercion" below.)
+  //
+  // Note also, we do not expect to encounter any case when the
+  // arguments are promoted to pointers but the return value is
+  // not promoted to a pointer, e.g.
+  //
+  //   define foo1 @func(foo* %a, foo* %b)
+  //
+  // and we will throw an assertion in this case.
+  //
+  // The input kernel gets expanded into a kernel of the form
+  //
+  //   define void @func.expand(i8* %inBuf, i8* %outBuf, i32 %len)
+  //
+  // which performs a serial reduction of `len` elements from `inBuf`,
+  // and stores the result into `outBuf`. In pseudocode, @func.expand
+  // does:
+  //
+  //   inArr := (foo *)inBuf;
+  //   accum := inArr[0];
+  //   for (i := 1; i < len; ++i) {
+  //     accum := func(accum, inArr[i]);
+  //   }
+  //   *(foo *)outBuf := accum;
+  //
+  // Note on coercion
+  //
+  // Both the return value and the argument types may undergo internal
+  // coercion in clang as part of call lowering. As a result, the
+  // return value type may differ from the argument type even if the
+  // types in the RenderScript signature are the same. For instance, the
+  // kernel
+  //
+  //   int3 add(int3 a, int3 b) { return a + b; }
+  //
+  // gets lowered by clang as
+  //
+  //   define <3 x i32> @add(<4 x i32> %a.coerce, <4 x i32> %b.coerce)
+  //
+  // under AArch64. The details of this process are found in clang,
+  // lib/CodeGen/TargetInfo.cpp, under classifyArgumentType() and
+  // classifyReturnType() in ARMABIInfo, AArch64ABIInfo. If the value
+  // is passed by pointer, then the pointed-to type is not coerced.
+  //
+  // Since we lack the original type information, this code does loads
+  // and stores of allocation data by way of pointers to the coerced
+  // type.
+  bool ExpandReduce(llvm::Function *Function) {
+    bccAssert(Function);
+
+    ALOGV("Expanding reduce kernel %s", Function->getName().str().c_str());
+
+    llvm::DataLayout DL(Module);
+
+    // TBAA Metadata
+    llvm::MDNode *TBAARenderScriptDistinct, *TBAARenderScript, *TBAAAllocation;
+    llvm::MDBuilder MDHelper(*Context);
+
+    TBAARenderScriptDistinct =
+      MDHelper.createTBAARoot(kRenderScriptTBAARootName);
+    TBAARenderScript = MDHelper.createTBAANode(kRenderScriptTBAANodeName,
+        TBAARenderScriptDistinct);
+    TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation",
+                                                       TBAARenderScript);
+    TBAAAllocation = MDHelper.createTBAAStructTagNode(TBAAAllocation,
+                                                      TBAAAllocation, 0);
+
+    llvm::Function *ExpandedFunction =
+      createEmptyExpandedReduceKernel(Function->getName());
+
+    // Extract the expanded kernel's parameters.  It is guaranteed by
+    // createEmptyExpandedReduceKernel that there will be 3 parameters.
+    auto ExpandedFunctionArgIter = ExpandedFunction->arg_begin();
+
+    llvm::Value *Arg_inBuf  = &*(ExpandedFunctionArgIter++);
+    llvm::Value *Arg_outBuf = &*(ExpandedFunctionArgIter++);
+    llvm::Value *Arg_len    = &*(ExpandedFunctionArgIter++);
+
+    bccAssert(Function->arg_size() == 2 || Function->arg_size() == 3);
+
+    // Check if, instead of returning a value, the original kernel has
+    // a pointer parameter which points to a temporary buffer into
+    // which the return value gets written.
+    const bool ReturnValuePointerStyle = (Function->arg_size() == 3);
+    bccAssert(Function->getReturnType()->isVoidTy() == ReturnValuePointerStyle);
+
+    // Check if, instead of being passed by value, the inputs to the
+    // original kernel are passed by pointer.
+    auto FirstArgIter = Function->arg_begin();
+    // The second argument is always an input to the original kernel.
+    auto SecondArgIter = std::next(FirstArgIter);
+    const bool InputsPointerStyle = SecondArgIter->getType()->isPointerTy();
+
+    // Get the output type (i.e. return type of the original kernel).
+    llvm::PointerType *OutPtrTy = nullptr;
+    llvm::Type *OutTy = nullptr;
+    if (ReturnValuePointerStyle) {
+      OutPtrTy = llvm::dyn_cast<llvm::PointerType>(FirstArgIter->getType());
+      bccAssert(OutPtrTy && "Expected a pointer parameter to kernel");
+      OutTy = OutPtrTy->getElementType();
+    } else {
+      OutTy = Function->getReturnType();
+      bccAssert(!OutTy->isVoidTy());
+      OutPtrTy = OutTy->getPointerTo();
+    }
+
+    // Get the input type (type of the arguments to the original
+    // kernel). Some input types are different from the output type,
+    // due to explicit coercion that the compiler performs when
+    // lowering the parameters. See "Note on coercion" above.
+    llvm::PointerType *InPtrTy;
+    llvm::Type *InTy;
+    if (InputsPointerStyle) {
+      InPtrTy = llvm::dyn_cast<llvm::PointerType>(SecondArgIter->getType());
+      bccAssert(InPtrTy && "Expected a pointer parameter to kernel");
+      bccAssert(ReturnValuePointerStyle);
+      bccAssert(std::next(SecondArgIter)->getType() == InPtrTy &&
+                "Input type mismatch");
+      InTy = InPtrTy->getElementType();
+    } else {
+      InTy = SecondArgIter->getType();
+      InPtrTy = InTy->getPointerTo();
+      if (!ReturnValuePointerStyle) {
+        bccAssert(InTy == FirstArgIter->getType() && "Input type mismatch");
+      } else {
+        bccAssert(InTy == std::next(SecondArgIter)->getType() &&
+                  "Input type mismatch");
+      }
+    }
+
+    // The input type should take up the same amount of space in
+    // memory as the output type.
+    bccAssert(DL.getTypeAllocSize(InTy) == DL.getTypeAllocSize(OutTy));
+
+    // Construct the actual function body.
+    llvm::IRBuilder<> Builder(ExpandedFunction->getEntryBlock().begin());
+
+    // Cast input and output buffers to appropriate types.
+    llvm::Value *InBuf = Builder.CreatePointerCast(Arg_inBuf, InPtrTy);
+    llvm::Value *OutBuf = Builder.CreatePointerCast(Arg_outBuf, OutPtrTy);
+
+    // Create a slot to pass temporary results back. This needs to be
+    // separate from the accumulator slot because the kernel may mark
+    // the return value slot as noalias.
+    llvm::Value *ReturnBuf = nullptr;
+    if (ReturnValuePointerStyle) {
+      ReturnBuf = Builder.CreateAlloca(OutTy, nullptr, "ret.tmp");
+    }
+
+    // Create a slot to hold the second input if the inputs are passed
+    // by pointer to the original kernel. We cannot directly pass a
+    // pointer to the input buffer, because the kernel may modify its
+    // inputs.
+    llvm::Value *SecondInputTempBuf = nullptr;
+    if (InputsPointerStyle) {
+      SecondInputTempBuf = Builder.CreateAlloca(InTy, nullptr, "in.tmp");
+    }
+
+    // Create a slot to accumulate temporary results, and fill it with
+    // the first value.
+    llvm::Value *AccumBuf = Builder.CreateAlloca(OutTy, nullptr, "accum");
+    // Cast to OutPtrTy before loading, since AccumBuf has type OutPtrTy.
+    llvm::LoadInst *FirstElementLoad = Builder.CreateLoad(
+      Builder.CreatePointerCast(InBuf, OutPtrTy));
+    if (gEnableRsTbaa) {
+      FirstElementLoad->setMetadata("tbaa", TBAAAllocation);
+    }
+    // Memory operations with AccumBuf shouldn't be marked with
+    // RenderScript TBAA, since this might conflict with TBAA metadata
+    // in the kernel function when AccumBuf is passed by pointer.
+    Builder.CreateStore(FirstElementLoad, AccumBuf);
+
+    // Loop body
+
+    // Create the loop structure. Note that the first input in the input buffer
+    // has already been accumulated, so that we start at index 1.
+    llvm::PHINode *IndVar;
+    llvm::Value *Start = llvm::ConstantInt::get(Arg_len->getType(), 1);
+    llvm::BasicBlock *Exit = createLoop(Builder, Start, Arg_len, &IndVar);
+
+    llvm::Value *InputPtr = Builder.CreateInBoundsGEP(InBuf, IndVar, "next_input.gep");
+
+    // Set up arguments and call the original (unexpanded) kernel.
+    //
+    // The original kernel can have at most 3 arguments, which is
+    // achieved when the signature looks like:
+    //
+    //    define void @func(foo* %ret, bar %a, bar %b)
+    //
+    // (bar can be one of foo/foo.coerce/foo*).
+    llvm::SmallVector<llvm::Value *, 3> KernelArgs;
+
+    if (ReturnValuePointerStyle) {
+      KernelArgs.push_back(ReturnBuf);
+    }
+
+    if (InputsPointerStyle) {
+      bccAssert(ReturnValuePointerStyle);
+      // Because the return buffer is copied back into the
+      // accumulator, it's okay if the accumulator is overwritten.
+      KernelArgs.push_back(AccumBuf);
+
+      llvm::LoadInst *InputLoad = Builder.CreateLoad(InputPtr);
+      if (gEnableRsTbaa) {
+        InputLoad->setMetadata("tbaa", TBAAAllocation);
+      }
+      Builder.CreateStore(InputLoad, SecondInputTempBuf);
+
+      KernelArgs.push_back(SecondInputTempBuf);
+    } else {
+      // InPtrTy may be different from OutPtrTy (the type of
+      // AccumBuf), so first cast the accumulator buffer to the
+      // pointer type corresponding to the input argument type.
+      KernelArgs.push_back(
+        Builder.CreateLoad(Builder.CreatePointerCast(AccumBuf, InPtrTy)));
+
+      llvm::LoadInst *LoadedArg = Builder.CreateLoad(InputPtr);
+      if (gEnableRsTbaa) {
+        LoadedArg->setMetadata("tbaa", TBAAAllocation);
+      }
+      KernelArgs.push_back(LoadedArg);
+    }
+
+    llvm::Value *RetVal = Builder.CreateCall(Function, KernelArgs);
+
+    const uint64_t ElementSize = DL.getTypeStoreSize(OutTy);
+    const uint64_t ElementAlign = DL.getABITypeAlignment(OutTy);
+
+    // Store the output in the accumulator.
+    if (ReturnValuePointerStyle) {
+      Builder.CreateMemCpy(AccumBuf, ReturnBuf, ElementSize, ElementAlign);
+    } else {
+      Builder.CreateStore(RetVal, AccumBuf);
+    }
+
+    // Loop exit
+    Builder.SetInsertPoint(Exit, Exit->begin());
+
+    llvm::LoadInst *OutputLoad = Builder.CreateLoad(AccumBuf);
+    llvm::StoreInst *OutputStore = Builder.CreateStore(OutputLoad, OutBuf);
+    if (gEnableRsTbaa) {
+      OutputStore->setMetadata("tbaa", TBAAAllocation);
     }
 
     return true;
@@ -1020,31 +1281,31 @@
   virtual bool runOnModule(llvm::Module &Module) {
     bool Changed  = false;
     this->Module  = &Module;
-    this->Context = &Module.getContext();
+    Context = &Module.getContext();
 
-    this->buildTypes();
+    buildTypes();
 
     bcinfo::MetadataExtractor me(&Module);
     if (!me.extract()) {
       ALOGE("Could not extract metadata from module!");
       return false;
     }
+
+    // Expand forEach_* style kernels.
     mExportForEachCount = me.getExportForEachSignatureCount();
     mExportForEachNameList = me.getExportForEachNameList();
     mExportForEachSignatureList = me.getExportForEachSignatureList();
 
-    bool AllocsExposed = allocPointersExposed(Module);
-
     for (size_t i = 0; i < mExportForEachCount; ++i) {
       const char *name = mExportForEachNameList[i];
       uint32_t signature = mExportForEachSignatureList[i];
       llvm::Function *kernel = Module.getFunction(name);
       if (kernel) {
         if (bcinfo::MetadataExtractor::hasForEachSignatureKernel(signature)) {
-          Changed |= ExpandKernel(kernel, signature);
+          Changed |= ExpandForEach(kernel, signature);
           kernel->setLinkage(llvm::GlobalValue::InternalLinkage);
         } else if (kernel->getReturnType()->isVoidTy()) {
-          Changed |= ExpandFunction(kernel, signature);
+          Changed |= ExpandOldStyleForEach(kernel, signature);
           kernel->setLinkage(llvm::GlobalValue::InternalLinkage);
         } else {
           // There are some graphics root functions that are not
@@ -1054,7 +1315,18 @@
       }
     }
 
-    if (gEnableRsTbaa && !AllocsExposed) {
+    // Expand reduce_* style kernels.
+    mExportReduceCount = me.getExportReduceCount();
+    mExportReduceNameList = me.getExportReduceNameList();
+
+    for (size_t i = 0; i < mExportReduceCount; ++i) {
+      llvm::Function *kernel = Module.getFunction(mExportReduceNameList[i]);
+      if (kernel) {
+        Changed |= ExpandReduce(kernel);
+      }
+    }
+
+    if (gEnableRsTbaa && !allocPointersExposed(Module)) {
       connectRenderScriptTBAAMetadata(Module);
     }
 
@@ -1062,21 +1334,21 @@
   }
 
   virtual const char *getPassName() const {
-    return "ForEach-able Function Expansion";
+    return "forEach_* and reduce_* function expansion";
   }
 
-}; // end RSForEachExpandPass
+}; // end RSKernelExpandPass
 
 } // end anonymous namespace
 
-char RSForEachExpandPass::ID = 0;
-static llvm::RegisterPass<RSForEachExpandPass> X("foreachexp", "ForEach Expand Pass");
+char RSKernelExpandPass::ID = 0;
+static llvm::RegisterPass<RSKernelExpandPass> X("kernelexp", "Kernel Expand Pass");
 
 namespace bcc {
 
 llvm::ModulePass *
-createRSForEachExpandPass(bool pEnableStepOpt){
-  return new RSForEachExpandPass(pEnableStepOpt);
+createRSKernelExpandPass(bool pEnableStepOpt) {
+  return new RSKernelExpandPass(pEnableStepOpt);
 }
 
 } // end namespace bcc

diff --git a/lib/Renderscript/RSStubsWhiteList.cpp b/lib/Renderscript/RSStubsWhiteList.cpp
index b69681d..426fb43 100644
--- a/lib/Renderscript/RSStubsWhiteList.cpp
+++ b/lib/Renderscript/RSStubsWhiteList.cpp

@@ -1235,6 +1235,7 @@
 "_Z3madfff",
 "_Z3maxDv2_cS_",
 "_Z3maxDv2_fS_",
+"_Z3maxDv2_ff",
 "_Z3maxDv2_hS_",
 "_Z3maxDv2_iS_",
 "_Z3maxDv2_jS_",
@@ -1244,6 +1245,7 @@
 "_Z3maxDv2_tS_",
 "_Z3maxDv3_cS_",
 "_Z3maxDv3_fS_",
+"_Z3maxDv3_ff",
 "_Z3maxDv3_hS_",
 "_Z3maxDv3_iS_",
 "_Z3maxDv3_jS_",
@@ -1253,6 +1255,7 @@
 "_Z3maxDv3_tS_",
 "_Z3maxDv4_cS_",
 "_Z3maxDv4_fS_",
+"_Z3maxDv4_ff",
 "_Z3maxDv4_hS_",
 "_Z3maxDv4_iS_",
 "_Z3maxDv4_jS_",
@@ -1271,6 +1274,7 @@
 "_Z3maxtt",
 "_Z3minDv2_cS_",
 "_Z3minDv2_fS_",
+"_Z3minDv2_ff",
 "_Z3minDv2_hS_",
 "_Z3minDv2_iS_",
 "_Z3minDv2_jS_",
@@ -1280,6 +1284,7 @@
 "_Z3minDv2_tS_",
 "_Z3minDv3_cS_",
 "_Z3minDv3_fS_",
+"_Z3minDv3_ff",
 "_Z3minDv3_hS_",
 "_Z3minDv3_iS_",
 "_Z3minDv3_jS_",
@@ -1289,6 +1294,7 @@
 "_Z3minDv3_tS_",
 "_Z3minDv4_cS_",
 "_Z3minDv4_fS_",
+"_Z3minDv4_ff",
 "_Z3minDv4_hS_",
 "_Z3minDv4_iS_",
 "_Z3minDv4_jS_",

diff --git a/lib/Support/CompilerConfig.cpp b/lib/Support/CompilerConfig.cpp
index eac26aa..71cd7cc 100644
--- a/lib/Support/CompilerConfig.cpp
+++ b/lib/Support/CompilerConfig.cpp

@@ -155,7 +155,9 @@
 #if defined(TARGET_BUILD)
     if (!getProperty("debug.rs.arm-no-tune-for-cpu")) {
 #ifndef FORCE_CPU_VARIANT_32
+#ifdef DEFAULT_ARM_CODEGEN
       setCPU(llvm::sys::getHostCPUName());
+#endif
 #else
 #define XSTR(S) #S
 #define STR(S) XSTR(S)
@@ -175,7 +177,9 @@
 #if defined(TARGET_BUILD)
     if (!getProperty("debug.rs.arm-no-tune-for-cpu")) {
 #ifndef FORCE_CPU_VARIANT_64
+#ifdef DEFAULT_ARM64_CODEGEN
       setCPU(llvm::sys::getHostCPUName());
+#endif
 #else
 #define XSTR(S) #S
 #define STR(S) XSTR(S)

diff --git a/tests/libbcc/getelementptr.ll b/tests/libbcc/getelementptr.ll
index 6f3e175..1cf201a 100644
--- a/tests/libbcc/getelementptr.ll
+++ b/tests/libbcc/getelementptr.ll

@@ -3,7 +3,7 @@
 ; that they index into the right positions of the structure and that
 ; the instructions that are generated are in the loop header.
 
-; RUN: opt -load libbcc.so -foreachexp -S < %s | FileCheck %s
+; RUN: opt -load libbcc.so -kernelexp -S < %s | FileCheck %s
 
 ; ModuleID = 'test_getelementptr.bc'
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
@@ -41,7 +41,7 @@
 ; New style kernel with multiple inputs
 define i32 @foo(i32 %in0, i32 %in1, i32 %x, i32 %y, i32 %z) {
   ret i32 0
-; CHECK: define void @foo.expand(%RsExpandKernelDriverInfoPfx* %p, i32 %x1, i32 %x2, i32 %outstep)
+; CHECK: define void @foo.expand(%RsExpandKernelDriverInfoPfx* %p, i32 %x1, i32 %x2, i32 %arg_outstep)
 ; CHECK: Begin:
 ; CHECK: %out_buf.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 3, i32 0
 ; CHECK: load i8*, i8** %out_buf.gep
@@ -49,14 +49,10 @@
 ; CHECK: load i32, i32* %Y.gep
 ; CHECK: %Z.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 7, i32 2
 ; CHECK: load i32, i32* %Z.gep
-; CHECK: %instep_addr.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 1, i32 0
-; CHECK: load i32, i32* %instep_addr.gep
 ; CHECK: %input_buf.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 0, i32 0
 ; CHECK: load i8*, i8** %input_buf.gep
-; CHECK: %instep_addr.gep1 = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 1, i32 1
-; CHECK: load i32, i32* %instep_addr.gep1
-; CHECK: %input_buf.gep3 = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 0, i32 1
-; CHECK: load i8*, i8** %input_buf.gep3
+; CHECK: %input_buf.gep1 = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 0, i32 1
+; CHECK: load i8*, i8** %input_buf.gep1
 ; CHECK: Loop:
 }
 

diff --git a/tests/libbcc/tbaa-through-alloca.ll b/tests/libbcc/tbaa-through-alloca.ll
new file mode 100644
index 0000000..5b0a270
--- /dev/null
+++ b/tests/libbcc/tbaa-through-alloca.ll

@@ -0,0 +1,71 @@
+; This test checks that the code doesn't aggressively apply TBAA
+; metadata to temporaries that are passed by pointer to kernels.
+
+; RUN: opt -load libbcc.so -kernelexp -inline -tbaa -aa-eval -print-may-aliases -evaluate-aa-metadata < %s -S -o - 2>&1 | FileCheck %s
+
+; ModuleID = 'kernel.bc'
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-linux-gnueabi"
+
+; Declarations expected by the expansion pass.
+declare void @_Z14rsGetElementAt13rs_allocationj()
+declare void @_Z14rsGetElementAt13rs_allocationjj()
+declare void @_Z14rsGetElementAt13rs_allocationjjj()
+declare void @_Z14rsSetElementAt13rs_allocationPvj()
+declare void @_Z14rsSetElementAt13rs_allocationPvjj()
+declare void @_Z14rsSetElementAt13rs_allocationPvjjj()
+declare void @_Z25rsGetElementAtYuv_uchar_Y13rs_allocationjj()
+declare void @_Z25rsGetElementAtYuv_uchar_U13rs_allocationjj()
+declare void @_Z25rsGetElementAtYuv_uchar_V13rs_allocationjj()
+
+%struct.int5.0 = type { [5 x i32] }
+
+; Function Attrs: nounwind
+define void @add1_int5(%struct.int5.0* noalias nocapture sret %agg.result, %struct.int5.0* nocapture %in) #0 {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds %struct.int5.0, %struct.int5.0* %in, i64 0, i32 0, i64 %indvars.iv
+; CHECK: MayAlias: %load_from_input{{.*}} <-> store %struct.int5.0 %input, %struct.int5.0* %input_struct_slot
+  %load_from_input = load i32, i32* %2, align 4, !tbaa !9
+  %3 = add nsw i32 %load_from_input, 1
+  store i32 %3, i32* %2, align 4, !tbaa !9
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 5
+  br i1 %exitcond, label %4, label %1
+
+; <label>:4                                       ; preds = %1
+  %5 = bitcast %struct.int5.0* %agg.result to i8*
+  %6 = bitcast %struct.int5.0* %in to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %5, i8* %6, i64 20, i32 4, i1 false), !tbaa.struct !13
+  ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #0
+
+attributes #0 = { nounwind }
+
+!llvm.ident = !{!0}
+!\23pragma = !{!1, !2}
+!\23rs_export_foreach_name = !{!3, !4}
+!\23rs_export_foreach = !{!5, !6}
+!\23rs_export_type = !{!7}
+!\25int5 = !{!8}
+
+!0 = !{!"clang version 3.6 "}
+!1 = !{!"version", !"1"}
+!2 = !{!"java_package_name", !"foo"}
+!3 = !{!"root"}
+!4 = !{!"add1_int5"}
+!5 = !{!"0"}
+!6 = !{!"35"}
+!7 = !{!"int5"}
+!8 = !{!"data", !"<ConstantArray>"}
+!9 = !{!10, !10, i64 0}
+!10 = !{!"int", !11, i64 0}
+!11 = !{!"omnipotent char", !12, i64 0}
+!12 = !{!"Simple C/C++ TBAA"}
+!13 = !{i64 0, i64 20, !14}
+!14 = !{!11, !11, i64 0}

diff --git a/tests/libbcc/tbaa.ll b/tests/libbcc/tbaa.ll
new file mode 100644
index 0000000..6d8cb48
--- /dev/null
+++ b/tests/libbcc/tbaa.ll

@@ -0,0 +1,43 @@
+; Basic test of TBAA that should report that pointer loads do not
+; alias with stores to allocations.
+
+; RUN: opt -load libbcc.so -kernelexp -tbaa -aa-eval -print-no-aliases -evaluate-aa-metadata < %s -S -o - 2>&1 | FileCheck %s
+
+; ModuleID = 'kernel.bc'
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-linux-gnueabi"
+
+; Declarations expected by the expansion pass.
+declare void @_Z14rsGetElementAt13rs_allocationj()
+declare void @_Z14rsGetElementAt13rs_allocationjj()
+declare void @_Z14rsGetElementAt13rs_allocationjjj()
+declare void @_Z14rsSetElementAt13rs_allocationPvj()
+declare void @_Z14rsSetElementAt13rs_allocationPvjj()
+declare void @_Z14rsSetElementAt13rs_allocationPvjjj()
+declare void @_Z25rsGetElementAtYuv_uchar_Y13rs_allocationjj()
+declare void @_Z25rsGetElementAtYuv_uchar_U13rs_allocationjj()
+declare void @_Z25rsGetElementAtYuv_uchar_V13rs_allocationjj()
+
+; CHECK:   NoAlias:   %0 = load {{.*}}, i8** %out_buf.gep, !tbaa {{.*}} <->   store i32 %call.result, i32* {{.*}}, !tbaa {{.*}}
+; CHECK:   NoAlias:   %input_buf = load i8*, i8** %input_buf.gep, !tbaa {{.*}} <->   store i32 %call.result, i32* {{.*}}, !tbaa {{.*}}
+
+; Function Attrs: nounwind readnone
+define i32 @add1(i32 %in) #0 {
+  %1 = add nsw i32 %in, 1
+  ret i32 %1
+}
+
+attributes #0 = { nounwind readnone }
+
+!llvm.ident = !{!0}
+!\23pragma = !{!1, !2}
+!\23rs_export_foreach_name = !{!3, !4}
+!\23rs_export_foreach = !{!5, !6}
+
+!0 = !{!"clang version 3.6 "}
+!1 = !{!"version", !"1"}
+!2 = !{!"java_package_name", !"foo"}
+!3 = !{!"root"}
+!4 = !{!"add1"}
+!5 = !{!"0"}
+!6 = !{!"35"}
commit	2f041b66ae4833317b7f2d9782c3d3a3fb4938bf	[log] [tgz]
author	Stephen Hines <srhines@google.com>	Thu Aug 27 10:49:41 2015 +0000
committer	Android Git Automerger <android-git-automerger@android.com>	Thu Aug 27 10:49:41 2015 +0000
tree	31930a291c99fe616059e735ef57023eff11a9c7
parent	b4544c32f3ddb3a6a6494b266b00adf12c74d47a [diff]
parent	5f23bcccf58fc33f84f3daef86f852f8c7268892 [diff]