am 4d7108a1: (-s ours) am 0248f898: am 335e0bb9: Merge "Add libLLVM to LOCAL_LDLIBS so that the mac build can link correctly."

* commit '4d7108a1b28142ea600f301b919dfb8c057b97d4':
diff --git a/Android.mk b/Android.mk
index d1bf790..ad21289 100644
--- a/Android.mk
+++ b/Android.mk
@@ -100,20 +100,17 @@
 
 LOCAL_SHARED_LIBRARIES := libbcinfo
 
-ifdef USE_MINGW
-# Windows needs libLLVM, since DLLs must really have fully resolved their
-# symbolic dependencies.
-LOCAL_SHARED_LIBRARIES += libLLVM
-else
-LOCAL_LDLIBS := -ldl -lpthread -lLLVM
-ifeq (true,$(FORCE_BUILD_LLVM_COMPONENTS))
-# This line allows libbcc to be used as an LLVM loadable module with
-# opt. We don't build unless we have libLLVMLinker, which is not
-# provided as a prebuilt. libLLVMLinker is needed because it is not
-# pulled into opt.
+ifndef USE_MINGW
+LOCAL_LDLIBS := -ldl -lpthread
+endif
+
+include $(LIBBCC_ROOT_PATH)/llvm-loadable-libbcc.mk
+
+ifeq ($(CAN_BUILD_HOST_LLVM_LOADABLE_MODULE),true)
 LOCAL_STATIC_LIBRARIES += libLLVMLinker
-endif  # FORCE_BUILD_LLVM_COMPONENTS
-endif  # USE_MINGW
+else
+LOCAL_SHARED_LIBRARIES += libLLVM
+endif
 
 include $(LIBBCC_HOST_BUILD_MK)
 include $(LLVM_HOST_BUILD_MK)
diff --git a/bcinfo/Android.mk b/bcinfo/Android.mk
index 573aa82..3da0d34 100644
--- a/bcinfo/Android.mk
+++ b/bcinfo/Android.mk
@@ -81,12 +81,14 @@
 LOCAL_STATIC_LIBRARIES += $(libbcinfo_STATIC_LIBRARIES)
 LOCAL_STATIC_LIBRARIES += libcutils liblog
 
-ifdef USE_MINGW
-# Windows needs libLLVM, since DLLs must really have fully resolved their
-# symbolic dependencies.
+ifndef USE_MINGW
+LOCAL_LDLIBS := -ldl -lpthread
+endif
+
+include $(LOCAL_PATH)/../llvm-loadable-libbcc.mk
+
+ifneq ($(CAN_BUILD_HOST_LLVM_LOADABLE_MODULE),true)
 LOCAL_SHARED_LIBRARIES += libLLVM
-else
-LOCAL_LDLIBS := -ldl -lpthread -lLLVM
 endif
 
 include $(LLVM_ROOT_PATH)/llvm-host-build.mk
diff --git a/bcinfo/BitReader_2_7/Android.mk b/bcinfo/BitReader_2_7/Android.mk
index 5cd3b7b..181c731 100644
--- a/bcinfo/BitReader_2_7/Android.mk
+++ b/bcinfo/BitReader_2_7/Android.mk
@@ -1,6 +1,6 @@
 LOCAL_PATH:= $(call my-dir)
 
-LLVM_ROOT_PATH := $(LOCAL_PATH)/../../../../../external/llvm
+LLVM_ROOT_PATH := external/llvm
 include $(LLVM_ROOT_PATH)/llvm.mk
 
 bitcode_reader_2_7_SRC_FILES := \
diff --git a/bcinfo/BitReader_2_7/BitcodeReader.cpp b/bcinfo/BitReader_2_7/BitcodeReader.cpp
index ea910ee..894b801 100644
--- a/bcinfo/BitReader_2_7/BitcodeReader.cpp
+++ b/bcinfo/BitReader_2_7/BitcodeReader.cpp
@@ -262,9 +262,9 @@
 
   bool isDematerializable(const GlobalValue *GV) const override;
   std::error_code materialize(GlobalValue *GV) override;
-  std::error_code MaterializeModule(Module *M) override;
+  std::error_code materializeModule(Module *M) override;
   std::vector<StructType *> getIdentifiedStructTypes() const override;
-  void Dematerialize(GlobalValue *GV) override;
+  void dematerialize(GlobalValue *GV) override;
 
   /// @brief Main interface to parsing a bitcode buffer.
   /// @returns true if an error occurred.
@@ -2302,8 +2302,7 @@
         return Error("Invalid type for value");
 
       auto *NewGA =
-          GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(),
-                              getDecodedLinkage(Record[2]), "", TheModule);
+          GlobalAlias::create(PTy, getDecodedLinkage(Record[2]), "", TheModule);
       // Old bitcode files didn't have visibility field.
       if (Record.size() > 3)
         NewGA->setVisibility(GetDecodedVisibility(Record[3]));
@@ -3141,7 +3140,7 @@
       InstructionList.push_back(I);
       break;
     }
-    case bitc::FUNC_CODE_INST_STORE: { // STORE2:[ptrty, ptr, val, align, vol]
+    case bitc::FUNC_CODE_INST_STORE_OLD: { // STORE2:[ptrty, ptr, val, align, vol]
       unsigned OpNum = 0;
       Value *Val, *Ptr;
       if (getValueTypePair(Record, OpNum, NextValueNo, Ptr) ||
@@ -3349,7 +3348,7 @@
   return DeferredFunctionInfo.count(const_cast<Function*>(F));
 }
 
-void BitcodeReader::Dematerialize(GlobalValue *GV) {
+void BitcodeReader::dematerialize(GlobalValue *GV) {
   Function *F = dyn_cast<Function>(GV);
   // If this function isn't dematerializable, this is a noop.
   if (!F || !isDematerializable(F))
@@ -3362,7 +3361,7 @@
   F->setIsMaterializable(true);
 }
 
-std::error_code BitcodeReader::MaterializeModule(Module *M) {
+std::error_code BitcodeReader::materializeModule(Module *M) {
   assert(M == TheModule &&
          "Can only Materialize the Module this BitcodeReader is attached to.");
   // Iterate over the module, deserializing any functions that are still on
diff --git a/bcinfo/BitReader_3_0/Android.mk b/bcinfo/BitReader_3_0/Android.mk
index b425475..95ccd40 100644
--- a/bcinfo/BitReader_3_0/Android.mk
+++ b/bcinfo/BitReader_3_0/Android.mk
@@ -1,6 +1,6 @@
 LOCAL_PATH:= $(call my-dir)
 
-LLVM_ROOT_PATH := $(LOCAL_PATH)/../../../../../external/llvm
+LLVM_ROOT_PATH := external/llvm
 include $(LLVM_ROOT_PATH)/llvm.mk
 
 bitcode_reader_3_0_SRC_FILES := \
diff --git a/bcinfo/BitReader_3_0/BitcodeReader.cpp b/bcinfo/BitReader_3_0/BitcodeReader.cpp
index 0c99f3b..0d1262c 100644
--- a/bcinfo/BitReader_3_0/BitcodeReader.cpp
+++ b/bcinfo/BitReader_3_0/BitcodeReader.cpp
@@ -500,9 +500,9 @@
 
   bool isDematerializable(const GlobalValue *GV) const override;
   std::error_code materialize(GlobalValue *GV) override;
-  std::error_code MaterializeModule(Module *M) override;
+  std::error_code materializeModule(Module *M) override;
   std::vector<StructType *> getIdentifiedStructTypes() const override;
-  void Dematerialize(GlobalValue *GV) override;
+  void dematerialize(GlobalValue *GV) override;
 
   /// @brief Main interface to parsing a bitcode buffer.
   /// @returns true if an error occurred.
@@ -2570,8 +2570,7 @@
         return Error("Invalid type for value");
 
       auto *NewGA =
-          GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(),
-                              getDecodedLinkage(Record[2]), "", TheModule);
+          GlobalAlias::create(PTy, getDecodedLinkage(Record[2]), "", TheModule);
       // Old bitcode files didn't have visibility field.
       if (Record.size() > 3)
         NewGA->setVisibility(GetDecodedVisibility(Record[3]));
@@ -3420,7 +3419,7 @@
       InstructionList.push_back(I);
       break;
     }
-    case bitc::FUNC_CODE_INST_STORE: { // STORE2:[ptrty, ptr, val, align, vol]
+    case bitc::FUNC_CODE_INST_STORE_OLD: { // STORE2:[ptrty, ptr, val, align, vol]
       unsigned OpNum = 0;
       Value *Val, *Ptr;
       if (getValueTypePair(Record, OpNum, NextValueNo, Ptr) ||
@@ -3684,7 +3683,7 @@
   return DeferredFunctionInfo.count(const_cast<Function*>(F));
 }
 
-void BitcodeReader::Dematerialize(GlobalValue *GV) {
+void BitcodeReader::dematerialize(GlobalValue *GV) {
   Function *F = dyn_cast<Function>(GV);
   // If this function isn't dematerializable, this is a noop.
   if (!F || !isDematerializable(F))
@@ -3697,7 +3696,7 @@
   F->setIsMaterializable(true);
 }
 
-std::error_code BitcodeReader::MaterializeModule(Module *M) {
+std::error_code BitcodeReader::materializeModule(Module *M) {
   assert(M == TheModule &&
          "Can only Materialize the Module this BitcodeReader is attached to.");
   // Iterate over the module, deserializing any functions that are still on
diff --git a/bcinfo/MetadataExtractor.cpp b/bcinfo/MetadataExtractor.cpp
index 23d97fe..add1ab1 100644
--- a/bcinfo/MetadataExtractor.cpp
+++ b/bcinfo/MetadataExtractor.cpp
@@ -21,7 +21,7 @@
 
 #define LOG_TAG "bcinfo"
 #include <cutils/log.h>
-#ifdef HAVE_ANDROID_OS
+#ifdef __ANDROID__
 #include <cutils/properties.h>
 #endif
 
@@ -340,7 +340,7 @@
   }
   mRSFloatPrecision = RelaxedPragmaSeen ? RS_FP_Relaxed : RS_FP_Full;
 
-#ifdef HAVE_ANDROID_OS
+#ifdef __ANDROID__
   // Provide an override for precsiion via adb shell setprop
   // adb shell setprop debug.rs.precision rs_fp_full
   // adb shell setprop debug.rs.precision rs_fp_relaxed
diff --git a/bcinfo/Wrap/Android.mk b/bcinfo/Wrap/Android.mk
index 7da8b3f..1b5db36 100644
--- a/bcinfo/Wrap/Android.mk
+++ b/bcinfo/Wrap/Android.mk
@@ -16,7 +16,7 @@
 
 LOCAL_PATH:= $(call my-dir)
 
-LLVM_ROOT_PATH := $(LOCAL_PATH)/../../../../../external/llvm
+LLVM_ROOT_PATH := external/llvm
 include $(LLVM_ROOT_PATH)/llvm.mk
 
 llvm_wrap_SRC_FILES := \
diff --git a/include/bcc/Compiler.h b/include/bcc/Compiler.h
index 75cde37..8a30c38 100644
--- a/include/bcc/Compiler.h
+++ b/include/bcc/Compiler.h
@@ -80,13 +80,11 @@
 
   enum ErrorCode runPasses(Script &pScript, llvm::raw_pwrite_stream &pResult);
 
-  bool addCustomPasses(Script &pScript, llvm::legacy::PassManager &pPM);
   bool addInternalizeSymbolsPass(Script &pScript, llvm::legacy::PassManager &pPM);
-  bool addExpandForEachPass(Script &pScript, llvm::legacy::PassManager &pPM);
-  bool addGlobalInfoPass(Script &pScript, llvm::legacy::PassManager &pPM);
-  bool addInvariantPass(llvm::legacy::PassManager &pPM);
-  bool addInvokeHelperPass(llvm::legacy::PassManager &pPM);
-  bool addPostLTOCustomPasses(llvm::legacy::PassManager &pPM);
+  void addExpandKernelPass(llvm::legacy::PassManager &pPM);
+  void addGlobalInfoPass(Script &pScript, llvm::legacy::PassManager &pPM);
+  void addInvariantPass(llvm::legacy::PassManager &pPM);
+  void addInvokeHelperPass(llvm::legacy::PassManager &pPM);
 
 public:
   Compiler();
diff --git a/include/bcc/Renderscript/RSTransforms.h b/include/bcc/Renderscript/RSTransforms.h
index d5830ca..6dcfedd 100644
--- a/include/bcc/Renderscript/RSTransforms.h
+++ b/include/bcc/Renderscript/RSTransforms.h
@@ -25,7 +25,7 @@
 namespace bcc {
 
 llvm::ModulePass *
-createRSForEachExpandPass(bool pEnableStepOpt);
+createRSKernelExpandPass(bool pEnableStepOpt);
 
 llvm::FunctionPass *
 createRSInvariantPass();
diff --git a/include/bcc/Support/Properties.h b/include/bcc/Support/Properties.h
index c82901c..4c3c404 100644
--- a/include/bcc/Support/Properties.h
+++ b/include/bcc/Support/Properties.h
@@ -20,12 +20,12 @@
 #include <stdint.h>
 #include <stdlib.h>
 
-#if !defined(RS_SERVER) && defined(HAVE_ANDROID_OS)
+#if !defined(RS_SERVER) && defined(__ANDROID__)
 #include <cutils/properties.h>
 #endif
 
 static inline uint32_t getProperty(const char *str) {
-#if !defined(RS_SERVER) && defined(HAVE_ANDROID_OS)
+#if !defined(RS_SERVER) && defined(__ANDROID__)
     char buf[PROPERTY_VALUE_MAX];
     property_get(str, buf, "0");
     return atoi(buf);
diff --git a/lib/Core/Compiler.cpp b/lib/Core/Compiler.cpp
index c314b6c..5c769b4 100644
--- a/lib/Core/Compiler.cpp
+++ b/lib/Core/Compiler.cpp
@@ -143,6 +143,8 @@
 }
 
 
+// This function has complete responsibility for creating and executing the
+// exact list of compiler passes.
 enum Compiler::ErrorCode Compiler::runPasses(Script &pScript,
                                              llvm::raw_pwrite_stream &pResult) {
   // Pass manager for link-time optimization
@@ -153,10 +155,13 @@
 
   passes.add(createTargetTransformInfoWrapperPass(mTarget->getTargetIRAnalysis()));
 
-  // Add our custom passes.
-  if (!addCustomPasses(pScript, passes)) {
+  // Add some initial custom passes.
+  addInvokeHelperPass(passes);
+  addExpandKernelPass(passes);
+  addInvariantPass(passes);
+  if (!addInternalizeSymbolsPass(pScript, passes))
     return kErrCustomPasses;
-  }
+  addGlobalInfoPass(pScript, passes);
 
   if (mTarget->getOptLevel() == llvm::CodeGenOpt::None) {
     passes.add(llvm::createGlobalOptimizerPass());
@@ -187,9 +192,9 @@
 
   // These passes have to come after LTO, since we don't want to examine
   // functions that are never actually called.
-  if (!addPostLTOCustomPasses(passes)) {
-    return kErrCustomPasses;
-  }
+  if (llvm::Triple(getTargetMachine().getTargetTriple()).getArch() == llvm::Triple::x86_64)
+    passes.add(createRSX86_64CallConvPass());  // Add pass to correct calling convention for X86-64.
+  passes.add(createRSIsThreadablePass());      // Add pass to mark script as threadable.
 
   // RSEmbedInfoPass needs to come after we have scanned for non-threadable
   // functions.
@@ -324,9 +329,11 @@
   size_t exportVarCount = me.getExportVarCount();
   size_t exportFuncCount = me.getExportFuncCount();
   size_t exportForEachCount = me.getExportForEachSignatureCount();
+  size_t exportReduceCount = me.getExportReduceCount();
   const char **exportVarNameList = me.getExportVarNameList();
   const char **exportFuncNameList = me.getExportFuncNameList();
   const char **exportForEachNameList = me.getExportForEachNameList();
+  const char **exportReduceNameList = me.getExportReduceNameList();
   size_t i;
 
   for (i = 0; i < exportVarCount; ++i) {
@@ -337,18 +344,22 @@
     export_symbols.push_back(exportFuncNameList[i]);
   }
 
-  // Expanded foreach functions should not be internalized, too.
-  // expanded_foreach_funcs keeps the .expand version of the kernel names
-  // around until createInternalizePass() is finished making its own
-  // copy of the visible symbols.
-  std::vector<std::string> expanded_foreach_funcs;
+  // Expanded foreach and reduce functions should not be
+  // internalized. expanded_funcs keeps the names of the expanded
+  // functions around until createInternalizePass() is finished making
+  // its own copy of the visible symbols.
+  std::vector<std::string> expanded_funcs;
+  expanded_funcs.reserve(exportForEachCount + exportReduceCount);
+
   for (i = 0; i < exportForEachCount; ++i) {
-    expanded_foreach_funcs.push_back(
-        std::string(exportForEachNameList[i]) + ".expand");
+    expanded_funcs.push_back(std::string(exportForEachNameList[i]) + ".expand");
+  }
+  for (i = 0; i < exportReduceCount; ++i) {
+    expanded_funcs.push_back(std::string(exportReduceNameList[i]) + ".expand");
   }
 
-  for (i = 0; i < exportForEachCount; i++) {
-      export_symbols.push_back(expanded_foreach_funcs[i].c_str());
+  for (auto &symbol_name : expanded_funcs) {
+    export_symbols.push_back(symbol_name.c_str());
   }
 
   pPM.add(llvm::createInternalizePass(export_symbols));
@@ -356,69 +367,31 @@
   return true;
 }
 
-bool Compiler::addInvokeHelperPass(llvm::legacy::PassManager &pPM) {
+void Compiler::addInvokeHelperPass(llvm::legacy::PassManager &pPM) {
   llvm::Triple arch(getTargetMachine().getTargetTriple());
   if (arch.isArch64Bit()) {
     pPM.add(createRSInvokeHelperPass());
   }
-  return true;
 }
 
-bool Compiler::addExpandForEachPass(Script &pScript, llvm::legacy::PassManager &pPM) {
-  // Expand ForEach on CPU path to reduce launch overhead.
+void Compiler::addExpandKernelPass(llvm::legacy::PassManager &pPM) {
+  // Expand ForEach and reduce on CPU path to reduce launch overhead.
   bool pEnableStepOpt = true;
-  pPM.add(createRSForEachExpandPass(pEnableStepOpt));
-
-  return true;
+  pPM.add(createRSKernelExpandPass(pEnableStepOpt));
 }
 
-bool Compiler::addGlobalInfoPass(Script &pScript, llvm::legacy::PassManager &pPM) {
+void Compiler::addGlobalInfoPass(Script &pScript, llvm::legacy::PassManager &pPM) {
   // Add additional information about RS global variables inside the Module.
   RSScript &script = static_cast<RSScript &>(pScript);
   if (script.getEmbedGlobalInfo()) {
     pPM.add(createRSGlobalInfoPass(script.getEmbedGlobalInfoSkipConstant()));
   }
-
-  return true;
 }
 
-bool Compiler::addInvariantPass(llvm::legacy::PassManager &pPM) {
+void Compiler::addInvariantPass(llvm::legacy::PassManager &pPM) {
   // Mark Loads from RsExpandKernelDriverInfo as "load.invariant".
   // Should run after ExpandForEach and before inlining.
   pPM.add(createRSInvariantPass());
-
-  return true;
-}
-
-bool Compiler::addCustomPasses(Script &pScript, llvm::legacy::PassManager &pPM) {
-  if (!addInvokeHelperPass(pPM))
-    return false;
-
-  if (!addExpandForEachPass(pScript, pPM))
-    return false;
-
-  if (!addInvariantPass(pPM))
-    return false;
-
-  if (!addInternalizeSymbolsPass(pScript, pPM))
-    return false;
-
-  if (!addGlobalInfoPass(pScript, pPM))
-    return false;
-
-  return true;
-}
-
-bool Compiler::addPostLTOCustomPasses(llvm::legacy::PassManager &pPM) {
-  // Add pass to correct calling convention for X86-64.
-  llvm::Triple arch(getTargetMachine().getTargetTriple());
-  if (arch.getArch() == llvm::Triple::x86_64)
-    pPM.add(createRSX86_64CallConvPass());
-
-  // Add pass to mark script as threadable.
-  pPM.add(createRSIsThreadablePass());
-
-  return true;
 }
 
 enum Compiler::ErrorCode Compiler::screenGlobalFunctions(Script &pScript) {
diff --git a/lib/Renderscript/Android.mk b/lib/Renderscript/Android.mk
index 56cae16..4b18eda 100644
--- a/lib/Renderscript/Android.mk
+++ b/lib/Renderscript/Android.mk
@@ -24,7 +24,7 @@
 libbcc_renderscript_SRC_FILES := \
   RSCompilerDriver.cpp \
   RSEmbedInfo.cpp \
-  RSForEachExpand.cpp \
+  RSKernelExpand.cpp \
   RSGlobalInfoPass.cpp \
   RSInvariant.cpp \
   RSScript.cpp \
diff --git a/lib/Renderscript/RSCompilerDriver.cpp b/lib/Renderscript/RSCompilerDriver.cpp
index 77478d9..7cc4ffb 100644
--- a/lib/Renderscript/RSCompilerDriver.cpp
+++ b/lib/Renderscript/RSCompilerDriver.cpp
@@ -42,7 +42,7 @@
 #include <sstream>
 #include <string>
 
-#ifdef HAVE_ANDROID_OS
+#ifdef __ANDROID__
 #include <cutils/properties.h>
 #endif
 #include <utils/StopWatch.h>
@@ -379,6 +379,7 @@
 
   pScript.setEmbedGlobalInfo(mEmbedGlobalInfo);
   pScript.setEmbedGlobalInfoSkipConstant(mEmbedGlobalInfoSkipConstant);
+  pScript.setLinkRuntimeCallback(getLinkRuntimeCallback());
 
   Compiler::ErrorCode status = compileScript(pScript, pOut, pOut, pRuntimePath,
                                              pBuildChecksum, pDumpIR);
diff --git a/lib/Renderscript/RSEmbedInfo.cpp b/lib/Renderscript/RSEmbedInfo.cpp
index dc1033c..b0c2767 100644
--- a/lib/Renderscript/RSEmbedInfo.cpp
+++ b/lib/Renderscript/RSEmbedInfo.cpp
@@ -73,11 +73,13 @@
     size_t exportVarCount = me.getExportVarCount();
     size_t exportFuncCount = me.getExportFuncCount();
     size_t exportForEachCount = me.getExportForEachSignatureCount();
+    size_t exportReduceCount = me.getExportReduceCount();
     size_t objectSlotCount = me.getObjectSlotCount();
     size_t pragmaCount = me.getPragmaCount();
     const char **exportVarNameList = me.getExportVarNameList();
     const char **exportFuncNameList = me.getExportFuncNameList();
     const char **exportForEachNameList = me.getExportForEachNameList();
+    const char **exportReduceNameList = me.getExportReduceNameList();
     const uint32_t *exportForEachSignatureList =
         me.getExportForEachSignatureList();
     const uint32_t *objectSlotList = me.getObjectSlotList();
@@ -111,6 +113,11 @@
         << exportForEachNameList[i] << "\n";
     }
 
+    s << "exportReduceCount: " << exportReduceCount << "\n";
+    for (i = 0; i < exportReduceCount; ++i) {
+      s << exportReduceNameList[i] << "\n";
+    }
+
     s << "objectSlotCount: " << objectSlotCount << "\n";
     for (i = 0; i < objectSlotCount; ++i) {
       s << objectSlotList[i] << "\n";
diff --git a/lib/Renderscript/RSForEachExpand.cpp b/lib/Renderscript/RSForEachExpand.cpp
deleted file mode 100644
index 3e70b1d..0000000
--- a/lib/Renderscript/RSForEachExpand.cpp
+++ /dev/null
@@ -1,1046 +0,0 @@
-/*
- * Copyright 2012, The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "bcc/Assert.h"
-#include "bcc/Renderscript/RSTransforms.h"
-
-#include <cstdlib>
-#include <functional>
-
-#include <llvm/IR/DerivedTypes.h>
-#include <llvm/IR/Function.h>
-#include <llvm/IR/Instructions.h>
-#include <llvm/IR/IRBuilder.h>
-#include <llvm/IR/MDBuilder.h>
-#include <llvm/IR/Module.h>
-#include <llvm/Pass.h>
-#include <llvm/Support/raw_ostream.h>
-#include <llvm/IR/DataLayout.h>
-#include <llvm/IR/Function.h>
-#include <llvm/IR/Type.h>
-#include <llvm/Transforms/Utils/BasicBlockUtils.h>
-
-#include "bcc/Config/Config.h"
-#include "bcc/Support/Log.h"
-
-#include "bcinfo/MetadataExtractor.h"
-
-#define NUM_EXPANDED_FUNCTION_PARAMS 4
-
-using namespace bcc;
-
-namespace {
-
-static const bool gEnableRsTbaa = true;
-
-/* RSForEachExpandPass - This pass operates on functions that are able to be
- * called via rsForEach() or "foreach_<NAME>". We create an inner loop for the
- * ForEach-able function to be invoked over the appropriate data cells of the
- * input/output allocations (adjusting other relevant parameters as we go). We
- * support doing this for any ForEach-able compute kernels. The new function
- * name is the original function name followed by ".expand". Note that we
- * still generate code for the original function.
- */
-class RSForEachExpandPass : public llvm::ModulePass {
-public:
-  static char ID;
-
-private:
-  static const size_t RS_KERNEL_INPUT_LIMIT = 8; // see frameworks/base/libs/rs/cpu_ref/rsCpuCoreRuntime.h
-
-  enum RsLaunchDimensionsField {
-    RsLaunchDimensionsFieldX,
-    RsLaunchDimensionsFieldY,
-    RsLaunchDimensionsFieldZ,
-    RsLaunchDimensionsFieldLod,
-    RsLaunchDimensionsFieldFace,
-    RsLaunchDimensionsFieldArray,
-
-    RsLaunchDimensionsFieldCount
-  };
-
-  enum RsExpandKernelDriverInfoPfxField {
-    RsExpandKernelDriverInfoPfxFieldInPtr,
-    RsExpandKernelDriverInfoPfxFieldInStride,
-    RsExpandKernelDriverInfoPfxFieldInLen,
-    RsExpandKernelDriverInfoPfxFieldOutPtr,
-    RsExpandKernelDriverInfoPfxFieldOutStride,
-    RsExpandKernelDriverInfoPfxFieldOutLen,
-    RsExpandKernelDriverInfoPfxFieldDim,
-    RsExpandKernelDriverInfoPfxFieldCurrent,
-    RsExpandKernelDriverInfoPfxFieldUsr,
-    RsExpandKernelDriverInfoPfxFieldUsLenr,
-
-    RsExpandKernelDriverInfoPfxFieldCount
-  };
-
-  llvm::Module *Module;
-  llvm::LLVMContext *Context;
-
-  /*
-   * Pointer to LLVM type information for the the function signature
-   * for expanded kernels.  This must be re-calculated for each
-   * module the pass is run on.
-   */
-  llvm::FunctionType *ExpandedFunctionType;
-
-  uint32_t mExportForEachCount;
-  const char **mExportForEachNameList;
-  const uint32_t *mExportForEachSignatureList;
-
-  // Turns on optimization of allocation stride values.
-  bool mEnableStepOpt;
-
-  uint32_t getRootSignature(llvm::Function *Function) {
-    const llvm::NamedMDNode *ExportForEachMetadata =
-        Module->getNamedMetadata("#rs_export_foreach");
-
-    if (!ExportForEachMetadata) {
-      llvm::SmallVector<llvm::Type*, 8> RootArgTys;
-      for (llvm::Function::arg_iterator B = Function->arg_begin(),
-                                        E = Function->arg_end();
-           B != E;
-           ++B) {
-        RootArgTys.push_back(B->getType());
-      }
-
-      // For pre-ICS bitcode, we may not have signature information. In that
-      // case, we use the size of the RootArgTys to select the number of
-      // arguments.
-      return (1 << RootArgTys.size()) - 1;
-    }
-
-    if (ExportForEachMetadata->getNumOperands() == 0) {
-      return 0;
-    }
-
-    bccAssert(ExportForEachMetadata->getNumOperands() > 0);
-
-    // We only handle the case for legacy root() functions here, so this is
-    // hard-coded to look at only the first such function.
-    llvm::MDNode *SigNode = ExportForEachMetadata->getOperand(0);
-    if (SigNode != nullptr && SigNode->getNumOperands() == 1) {
-      llvm::Metadata *SigMD = SigNode->getOperand(0);
-      if (llvm::MDString *SigS = llvm::dyn_cast<llvm::MDString>(SigMD)) {
-        llvm::StringRef SigString = SigS->getString();
-        uint32_t Signature = 0;
-        if (SigString.getAsInteger(10, Signature)) {
-          ALOGE("Non-integer signature value '%s'", SigString.str().c_str());
-          return 0;
-        }
-        return Signature;
-      }
-    }
-
-    return 0;
-  }
-
-  bool isStepOptSupported(llvm::Type *AllocType) {
-
-    llvm::PointerType *PT = llvm::dyn_cast<llvm::PointerType>(AllocType);
-    llvm::Type *VoidPtrTy = llvm::Type::getInt8PtrTy(*Context);
-
-    if (mEnableStepOpt) {
-      return false;
-    }
-
-    if (AllocType == VoidPtrTy) {
-      return false;
-    }
-
-    if (!PT) {
-      return false;
-    }
-
-    // remaining conditions are 64-bit only
-    if (VoidPtrTy->getPrimitiveSizeInBits() == 32) {
-      return true;
-    }
-
-    // coerce suggests an upconverted struct type, which we can't support
-    if (AllocType->getStructName().find("coerce") != llvm::StringRef::npos) {
-      return false;
-    }
-
-    // 2xi64 and i128 suggest an upconverted struct type, which are also unsupported
-    llvm::Type *V2xi64Ty = llvm::VectorType::get(llvm::Type::getInt64Ty(*Context), 2);
-    llvm::Type *Int128Ty = llvm::Type::getIntNTy(*Context, 128);
-    if (AllocType == V2xi64Ty || AllocType == Int128Ty) {
-      return false;
-    }
-
-    return true;
-  }
-
-  // Get the actual value we should use to step through an allocation.
-  //
-  // Normally the value we use to step through an allocation is given to us by
-  // the driver. However, for certain primitive data types, we can derive an
-  // integer constant for the step value. We use this integer constant whenever
-  // possible to allow further compiler optimizations to take place.
-  //
-  // DL - Target Data size/layout information.
-  // T - Type of allocation (should be a pointer).
-  // OrigStep - Original step increment (root.expand() input from driver).
-  llvm::Value *getStepValue(llvm::DataLayout *DL, llvm::Type *AllocType,
-                            llvm::Value *OrigStep) {
-    bccAssert(DL);
-    bccAssert(AllocType);
-    bccAssert(OrigStep);
-    llvm::PointerType *PT = llvm::dyn_cast<llvm::PointerType>(AllocType);
-    if (isStepOptSupported(AllocType)) {
-      llvm::Type *ET = PT->getElementType();
-      uint64_t ETSize = DL->getTypeAllocSize(ET);
-      llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context);
-      return llvm::ConstantInt::get(Int32Ty, ETSize);
-    } else {
-      return OrigStep;
-    }
-  }
-
-  /// Builds the types required by the pass for the given context.
-  void buildTypes(void) {
-    // Create the RsLaunchDimensionsTy and RsExpandKernelDriverInfoPfxTy structs.
-
-    llvm::Type *Int8Ty                   = llvm::Type::getInt8Ty(*Context);
-    llvm::Type *Int8PtrTy                = Int8Ty->getPointerTo();
-    llvm::Type *Int8PtrArrayInputLimitTy = llvm::ArrayType::get(Int8PtrTy, RS_KERNEL_INPUT_LIMIT);
-    llvm::Type *Int32Ty                  = llvm::Type::getInt32Ty(*Context);
-    llvm::Type *Int32ArrayInputLimitTy   = llvm::ArrayType::get(Int32Ty, RS_KERNEL_INPUT_LIMIT);
-    llvm::Type *VoidPtrTy                = llvm::Type::getInt8PtrTy(*Context);
-    llvm::Type *Int32Array4Ty            = llvm::ArrayType::get(Int32Ty, 4);
-
-    /* Defined in frameworks/base/libs/rs/cpu_ref/rsCpuCore.h:
-     *
-     * struct RsLaunchDimensions {
-     *   uint32_t x;
-     *   uint32_t y;
-     *   uint32_t z;
-     *   uint32_t lod;
-     *   uint32_t face;
-     *   uint32_t array[4];
-     * };
-     */
-    llvm::SmallVector<llvm::Type*, RsLaunchDimensionsFieldCount> RsLaunchDimensionsTypes;
-    RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t x
-    RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t y
-    RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t z
-    RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t lod
-    RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t face
-    RsLaunchDimensionsTypes.push_back(Int32Array4Ty); // uint32_t array[4]
-    llvm::StructType *RsLaunchDimensionsTy =
-        llvm::StructType::create(RsLaunchDimensionsTypes, "RsLaunchDimensions");
-
-    /* Defined as the beginning of RsExpandKernelDriverInfo in frameworks/base/libs/rs/cpu_ref/rsCpuCoreRuntime.h:
-     *
-     * struct RsExpandKernelDriverInfoPfx {
-     *     const uint8_t *inPtr[RS_KERNEL_INPUT_LIMIT];
-     *     uint32_t inStride[RS_KERNEL_INPUT_LIMIT];
-     *     uint32_t inLen;
-     *
-     *     uint8_t *outPtr[RS_KERNEL_INPUT_LIMIT];
-     *     uint32_t outStride[RS_KERNEL_INPUT_LIMIT];
-     *     uint32_t outLen;
-     *
-     *     // Dimension of the launch
-     *     RsLaunchDimensions dim;
-     *
-     *     // The walking iterator of the launch
-     *     RsLaunchDimensions current;
-     *
-     *     const void *usr;
-     *     uint32_t usrLen;
-     *
-     *     // Items below this line are not used by the compiler and can be change in the driver.
-     *     // So the compiler must assume there are an unknown number of fields of unknown type
-     *     // beginning here.
-     * };
-     *
-     * The name "RsExpandKernelDriverInfoPfx" is known to RSInvariantPass (RSInvariant.cpp).
-     */
-    llvm::SmallVector<llvm::Type*, RsExpandKernelDriverInfoPfxFieldCount> RsExpandKernelDriverInfoPfxTypes;
-    RsExpandKernelDriverInfoPfxTypes.push_back(Int8PtrArrayInputLimitTy); // const uint8_t *inPtr[RS_KERNEL_INPUT_LIMIT]
-    RsExpandKernelDriverInfoPfxTypes.push_back(Int32ArrayInputLimitTy);   // uint32_t inStride[RS_KERNEL_INPUT_LIMIT]
-    RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty);                  // uint32_t inLen
-    RsExpandKernelDriverInfoPfxTypes.push_back(Int8PtrArrayInputLimitTy); // uint8_t *outPtr[RS_KERNEL_INPUT_LIMIT]
-    RsExpandKernelDriverInfoPfxTypes.push_back(Int32ArrayInputLimitTy);   // uint32_t outStride[RS_KERNEL_INPUT_LIMIT]
-    RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty);                  // uint32_t outLen
-    RsExpandKernelDriverInfoPfxTypes.push_back(RsLaunchDimensionsTy);     // RsLaunchDimensions dim
-    RsExpandKernelDriverInfoPfxTypes.push_back(RsLaunchDimensionsTy);     // RsLaunchDimensions current
-    RsExpandKernelDriverInfoPfxTypes.push_back(VoidPtrTy);                // const void *usr
-    RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty);                  // uint32_t usrLen
-    llvm::StructType *RsExpandKernelDriverInfoPfxTy =
-        llvm::StructType::create(RsExpandKernelDriverInfoPfxTypes, "RsExpandKernelDriverInfoPfx");
-
-    // Create the function type for expanded kernels.
-
-    llvm::Type *RsExpandKernelDriverInfoPfxPtrTy = RsExpandKernelDriverInfoPfxTy->getPointerTo();
-
-    llvm::SmallVector<llvm::Type*, 8> ParamTypes;
-    ParamTypes.push_back(RsExpandKernelDriverInfoPfxPtrTy); // const RsExpandKernelDriverInfoPfx *p
-    ParamTypes.push_back(Int32Ty);                          // uint32_t x1
-    ParamTypes.push_back(Int32Ty);                          // uint32_t x2
-    ParamTypes.push_back(Int32Ty);                          // uint32_t outstep
-
-    ExpandedFunctionType =
-        llvm::FunctionType::get(llvm::Type::getVoidTy(*Context), ParamTypes,
-                                false);
-  }
-
-  /// @brief Create skeleton of the expanded function.
-  ///
-  /// This creates a function with the following signature:
-  ///
-  ///   void (const RsForEachStubParamStruct *p, uint32_t x1, uint32_t x2,
-  ///         uint32_t outstep)
-  ///
-  llvm::Function *createEmptyExpandedFunction(llvm::StringRef OldName) {
-    llvm::Function *ExpandedFunction =
-      llvm::Function::Create(ExpandedFunctionType,
-                             llvm::GlobalValue::ExternalLinkage,
-                             OldName + ".expand", Module);
-
-    bccAssert(ExpandedFunction->arg_size() == NUM_EXPANDED_FUNCTION_PARAMS);
-
-    llvm::Function::arg_iterator AI = ExpandedFunction->arg_begin();
-
-    (AI++)->setName("p");
-    (AI++)->setName("x1");
-    (AI++)->setName("x2");
-    (AI++)->setName("arg_outstep");
-
-    llvm::BasicBlock *Begin = llvm::BasicBlock::Create(*Context, "Begin",
-                                                       ExpandedFunction);
-    llvm::IRBuilder<> Builder(Begin);
-    Builder.CreateRetVoid();
-
-    return ExpandedFunction;
-  }
-
-  /// @brief Create an empty loop
-  ///
-  /// Create a loop of the form:
-  ///
-  /// for (i = LowerBound; i < UpperBound; i++)
-  ///   ;
-  ///
-  /// After the loop has been created, the builder is set such that
-  /// instructions can be added to the loop body.
-  ///
-  /// @param Builder The builder to use to build this loop. The current
-  ///                position of the builder is the position the loop
-  ///                will be inserted.
-  /// @param LowerBound The first value of the loop iterator
-  /// @param UpperBound The exclusive upper bound of the loop iterator
-  /// @param LoopIV Out-pointer that receives the loop iterator PHI node.
-  /// @return The BasicBlock that will be executed after the loop.
-  llvm::BasicBlock *createLoop(llvm::IRBuilder<> &Builder,
-                               llvm::Value *LowerBound,
-                               llvm::Value *UpperBound,
-                               llvm::PHINode **LoopIV) {
-    bccAssert(LowerBound->getType() == UpperBound->getType());
-
-    llvm::BasicBlock *CondBB, *AfterBB, *HeaderBB;
-    llvm::Value *Cond, *IVNext;
-    llvm::PHINode *IV;
-
-    CondBB = Builder.GetInsertBlock();
-    AfterBB = llvm::SplitBlock(CondBB, Builder.GetInsertPoint(), nullptr, nullptr);
-    HeaderBB = llvm::BasicBlock::Create(*Context, "Loop", CondBB->getParent());
-
-    // if (LowerBound < UpperBound)
-    //   goto LoopHeader
-    // else
-    //   goto AfterBB
-    CondBB->getTerminator()->eraseFromParent();
-    Builder.SetInsertPoint(CondBB);
-    Cond = Builder.CreateICmpULT(LowerBound, UpperBound);
-    Builder.CreateCondBr(Cond, HeaderBB, AfterBB);
-
-    // iv = PHI [CondBB -> LowerBound], [LoopHeader -> iv.next]
-    // iv.next = iv + 1
-    // if (iv.next < UpperBound)
-    //   goto LoopHeader
-    // else
-    //   goto AfterBB
-    Builder.SetInsertPoint(HeaderBB);
-    IV = Builder.CreatePHI(LowerBound->getType(), 2, "X");
-    IV->addIncoming(LowerBound, CondBB);
-    IVNext = Builder.CreateNUWAdd(IV, Builder.getInt32(1));
-    IV->addIncoming(IVNext, HeaderBB);
-    Cond = Builder.CreateICmpULT(IVNext, UpperBound);
-    Builder.CreateCondBr(Cond, HeaderBB, AfterBB);
-    AfterBB->setName("Exit");
-    Builder.SetInsertPoint(HeaderBB->getFirstNonPHI());
-    *LoopIV = IV;
-    return AfterBB;
-  }
-
-  // Finish building the outgoing argument list for calling a ForEach-able function.
-  //
-  // ArgVector - on input, the non-special arguments
-  //             on output, the non-special arguments combined with the special arguments
-  //               from SpecialArgVector
-  // SpecialArgVector - special arguments (from ExpandSpecialArguments())
-  // SpecialArgContextIdx - return value of ExpandSpecialArguments()
-  //                          (position of context argument in SpecialArgVector)
-  // CalleeFunction - the ForEach-able function being called
-  // CallerBuilder - for inserting code into the caller function
-  template<unsigned int ArgVectorLen, unsigned int SpecialArgVectorLen>
-  void finishArgList(      llvm::SmallVector<llvm::Value *, ArgVectorLen>        &ArgVector,
-                     const llvm::SmallVector<llvm::Value *, SpecialArgVectorLen> &SpecialArgVector,
-                     const int SpecialArgContextIdx,
-                     const llvm::Function &CalleeFunction,
-                     llvm::IRBuilder<> &CallerBuilder) {
-    /* The context argument (if any) is a pointer to an opaque user-visible type that differs from
-     * the RsExpandKernelDriverInfoPfx type used in the function we are generating (although the
-     * two types represent the same thing).  Therefore, we must introduce a pointer cast when
-     * generating a call to the kernel function.
-     */
-    const int ArgContextIdx =
-        SpecialArgContextIdx >= 0 ? (ArgVector.size() + SpecialArgContextIdx) : SpecialArgContextIdx;
-    ArgVector.append(SpecialArgVector.begin(), SpecialArgVector.end());
-    if (ArgContextIdx >= 0) {
-      llvm::Type *ContextArgType = nullptr;
-      int ArgIdx = ArgContextIdx;
-      for (const auto &Arg : CalleeFunction.getArgumentList()) {
-        if (!ArgIdx--) {
-          ContextArgType = Arg.getType();
-          break;
-        }
-      }
-      bccAssert(ContextArgType);
-      ArgVector[ArgContextIdx] = CallerBuilder.CreatePointerCast(ArgVector[ArgContextIdx], ContextArgType);
-    }
-  }
-
-public:
-  RSForEachExpandPass(bool pEnableStepOpt = true)
-      : ModulePass(ID), Module(nullptr), Context(nullptr),
-        mEnableStepOpt(pEnableStepOpt) {
-
-  }
-
-  virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
-    // This pass does not use any other analysis passes, but it does
-    // add/wrap the existing functions in the module (thus altering the CFG).
-  }
-
-  // Build contribution to outgoing argument list for calling a
-  // ForEach-able function, based on the special parameters of that
-  // function.
-  //
-  // Signature - metadata bits for the signature of the ForEach-able function
-  // X, Arg_p - values derived directly from expanded function,
-  //            suitable for computing arguments for the ForEach-able function
-  // CalleeArgs - contribution is accumulated here
-  // Bump - invoked once for each contributed outgoing argument
-  //
-  // Return value is the (zero-based) position of the context (Arg_p)
-  // argument in the CalleeArgs vector, or a negative value if the
-  // context argument is not placed in the CalleeArgs vector.
-  int ExpandSpecialArguments(uint32_t Signature,
-                             llvm::Value *X,
-                             llvm::Value *Arg_p,
-                             llvm::IRBuilder<> &Builder,
-                             llvm::SmallVector<llvm::Value*, 8> &CalleeArgs,
-                             std::function<void ()> Bump) {
-
-    bccAssert(CalleeArgs.empty());
-
-    int Return = -1;
-    if (bcinfo::MetadataExtractor::hasForEachSignatureCtxt(Signature)) {
-      CalleeArgs.push_back(Arg_p);
-      Bump();
-      Return = CalleeArgs.size() - 1;
-    }
-
-    if (bcinfo::MetadataExtractor::hasForEachSignatureX(Signature)) {
-      CalleeArgs.push_back(X);
-      Bump();
-    }
-
-    if (bcinfo::MetadataExtractor::hasForEachSignatureY(Signature) ||
-        bcinfo::MetadataExtractor::hasForEachSignatureZ(Signature)) {
-
-      llvm::Value *Current = Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldCurrent);
-
-      if (bcinfo::MetadataExtractor::hasForEachSignatureY(Signature)) {
-        llvm::Value *Y = Builder.CreateLoad(
-            Builder.CreateStructGEP(nullptr, Current, RsLaunchDimensionsFieldY), "Y");
-
-        CalleeArgs.push_back(Y);
-        Bump();
-      }
-
-      if (bcinfo::MetadataExtractor::hasForEachSignatureZ(Signature)) {
-        llvm::Value *Z = Builder.CreateLoad(
-            Builder.CreateStructGEP(nullptr, Current, RsLaunchDimensionsFieldZ), "Z");
-        CalleeArgs.push_back(Z);
-        Bump();
-      }
-    }
-
-    return Return;
-  }
-
-  /* Performs the actual optimization on a selected function. On success, the
-   * Module will contain a new function of the name "<NAME>.expand" that
-   * invokes <NAME>() in a loop with the appropriate parameters.
-   */
-  bool ExpandFunction(llvm::Function *Function, uint32_t Signature) {
-    ALOGV("Expanding ForEach-able Function %s",
-          Function->getName().str().c_str());
-
-    if (!Signature) {
-      Signature = getRootSignature(Function);
-      if (!Signature) {
-        // We couldn't determine how to expand this function based on its
-        // function signature.
-        return false;
-      }
-    }
-
-    llvm::DataLayout DL(Module);
-
-    llvm::Function *ExpandedFunction =
-      createEmptyExpandedFunction(Function->getName());
-
-    /*
-     * Extract the expanded function's parameters.  It is guaranteed by
-     * createEmptyExpandedFunction that there will be four parameters.
-     */
-
-    bccAssert(ExpandedFunction->arg_size() == NUM_EXPANDED_FUNCTION_PARAMS);
-
-    llvm::Function::arg_iterator ExpandedFunctionArgIter =
-      ExpandedFunction->arg_begin();
-
-    llvm::Value *Arg_p       = &*(ExpandedFunctionArgIter++);
-    llvm::Value *Arg_x1      = &*(ExpandedFunctionArgIter++);
-    llvm::Value *Arg_x2      = &*(ExpandedFunctionArgIter++);
-    llvm::Value *Arg_outstep = &*(ExpandedFunctionArgIter);
-
-    llvm::Value *InStep  = nullptr;
-    llvm::Value *OutStep = nullptr;
-
-    // Construct the actual function body.
-    llvm::IRBuilder<> Builder(ExpandedFunction->getEntryBlock().begin());
-
-    // Collect and construct the arguments for the kernel().
-    // Note that we load any loop-invariant arguments before entering the Loop.
-    llvm::Function::arg_iterator FunctionArgIter = Function->arg_begin();
-
-    llvm::Type  *InTy      = nullptr;
-    llvm::Value *InBasePtr = nullptr;
-    if (bcinfo::MetadataExtractor::hasForEachSignatureIn(Signature)) {
-      llvm::Value *InsBasePtr  = Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldInPtr, "inputs_base");
-
-      llvm::Value *InStepsBase = Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldInStride, "insteps_base");
-
-      llvm::Value    *InStepAddr = Builder.CreateConstInBoundsGEP2_32(nullptr, InStepsBase, 0, 0);
-      llvm::LoadInst *InStepArg  = Builder.CreateLoad(InStepAddr,
-                                                      "instep_addr");
-
-      InTy = (FunctionArgIter++)->getType();
-      InStep = getStepValue(&DL, InTy, InStepArg);
-
-      InStep->setName("instep");
-
-      llvm::Value *InputAddr = Builder.CreateConstInBoundsGEP2_32(nullptr, InsBasePtr, 0, 0);
-      InBasePtr = Builder.CreateLoad(InputAddr, "input_base");
-    }
-
-    llvm::Type *OutTy = nullptr;
-    llvm::Value *OutBasePtr = nullptr;
-    if (bcinfo::MetadataExtractor::hasForEachSignatureOut(Signature)) {
-      OutTy = (FunctionArgIter++)->getType();
-      OutStep = getStepValue(&DL, OutTy, Arg_outstep);
-      OutStep->setName("outstep");
-      OutBasePtr = Builder.CreateLoad(
-                     Builder.CreateConstInBoundsGEP2_32(nullptr,
-                         Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldOutPtr),
-                         0, 0));
-    }
-
-    llvm::Value *UsrData = nullptr;
-    if (bcinfo::MetadataExtractor::hasForEachSignatureUsrData(Signature)) {
-      llvm::Type *UsrDataTy = (FunctionArgIter++)->getType();
-      UsrData = Builder.CreatePointerCast(Builder.CreateLoad(
-          Builder.CreateStructGEP(nullptr, Arg_p,  RsExpandKernelDriverInfoPfxFieldUsr)), UsrDataTy);
-      UsrData->setName("UsrData");
-    }
-
-    llvm::PHINode *IV;
-    createLoop(Builder, Arg_x1, Arg_x2, &IV);
-
-    llvm::SmallVector<llvm::Value*, 8> CalleeArgs;
-    const int CalleeArgsContextIdx = ExpandSpecialArguments(Signature, IV, Arg_p, Builder, CalleeArgs,
-                                                            [&FunctionArgIter]() { FunctionArgIter++; });
-
-    bccAssert(FunctionArgIter == Function->arg_end());
-
-    // Populate the actual call to kernel().
-    llvm::SmallVector<llvm::Value*, 8> RootArgs;
-
-    llvm::Value *InPtr  = nullptr;
-    llvm::Value *OutPtr = nullptr;
-
-    // Calculate the current input and output pointers
-    //
-    // We always calculate the input/output pointers with a GEP operating on i8
-    // values and only cast at the very end to OutTy. This is because the step
-    // between two values is given in bytes.
-    //
-    // TODO: We could further optimize the output by using a GEP operation of
-    // type 'OutTy' in cases where the element type of the allocation allows.
-    if (OutBasePtr) {
-      llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1);
-      OutOffset = Builder.CreateMul(OutOffset, OutStep);
-      OutPtr = Builder.CreateGEP(OutBasePtr, OutOffset);
-      OutPtr = Builder.CreatePointerCast(OutPtr, OutTy);
-    }
-
-    if (InBasePtr) {
-      llvm::Value *InOffset = Builder.CreateSub(IV, Arg_x1);
-      InOffset = Builder.CreateMul(InOffset, InStep);
-      InPtr = Builder.CreateGEP(InBasePtr, InOffset);
-      InPtr = Builder.CreatePointerCast(InPtr, InTy);
-    }
-
-    if (InPtr) {
-      RootArgs.push_back(InPtr);
-    }
-
-    if (OutPtr) {
-      RootArgs.push_back(OutPtr);
-    }
-
-    if (UsrData) {
-      RootArgs.push_back(UsrData);
-    }
-
-    finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *Function, Builder);
-
-    Builder.CreateCall(Function, RootArgs);
-
-    return true;
-  }
-
-  /* Expand a pass-by-value kernel.
-   */
-  bool ExpandKernel(llvm::Function *Function, uint32_t Signature) {
-    bccAssert(bcinfo::MetadataExtractor::hasForEachSignatureKernel(Signature));
-    ALOGV("Expanding kernel Function %s", Function->getName().str().c_str());
-
-    // TODO: Refactor this to share functionality with ExpandFunction.
-    llvm::DataLayout DL(Module);
-
-    llvm::Function *ExpandedFunction =
-      createEmptyExpandedFunction(Function->getName());
-
-    /*
-     * Extract the expanded function's parameters.  It is guaranteed by
-     * createEmptyExpandedFunction that there will be four parameters.
-     */
-
-    bccAssert(ExpandedFunction->arg_size() == NUM_EXPANDED_FUNCTION_PARAMS);
-
-    llvm::Function::arg_iterator ExpandedFunctionArgIter =
-      ExpandedFunction->arg_begin();
-
-    llvm::Value *Arg_p       = &*(ExpandedFunctionArgIter++);
-    llvm::Value *Arg_x1      = &*(ExpandedFunctionArgIter++);
-    llvm::Value *Arg_x2      = &*(ExpandedFunctionArgIter++);
-    llvm::Value *Arg_outstep = &*(ExpandedFunctionArgIter);
-
-    // Construct the actual function body.
-    llvm::IRBuilder<> Builder(ExpandedFunction->getEntryBlock().begin());
-
-    // Create TBAA meta-data.
-    llvm::MDNode *TBAARenderScriptDistinct, *TBAARenderScript,
-                 *TBAAAllocation, *TBAAPointer;
-    llvm::MDBuilder MDHelper(*Context);
-
-    TBAARenderScriptDistinct =
-      MDHelper.createTBAARoot("RenderScript Distinct TBAA");
-    TBAARenderScript = MDHelper.createTBAANode("RenderScript TBAA",
-        TBAARenderScriptDistinct);
-    TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation",
-                                                       TBAARenderScript);
-    TBAAAllocation = MDHelper.createTBAAStructTagNode(TBAAAllocation,
-                                                      TBAAAllocation, 0);
-    TBAAPointer = MDHelper.createTBAAScalarTypeNode("pointer",
-                                                    TBAARenderScript);
-    TBAAPointer = MDHelper.createTBAAStructTagNode(TBAAPointer, TBAAPointer, 0);
-
-    llvm::MDNode *AliasingDomain, *AliasingScope;
-    AliasingDomain = MDHelper.createAnonymousAliasScopeDomain("RS argument scope domain");
-    AliasingScope = MDHelper.createAnonymousAliasScope(AliasingDomain, "RS argument scope");
-
-    /*
-     * Collect and construct the arguments for the kernel().
-     *
-     * Note that we load any loop-invariant arguments before entering the Loop.
-     */
-    size_t NumInputs = Function->arg_size();
-
-    // No usrData parameter on kernels.
-    bccAssert(
-        !bcinfo::MetadataExtractor::hasForEachSignatureUsrData(Signature));
-
-    llvm::Function::arg_iterator ArgIter = Function->arg_begin();
-
-    // Check the return type
-    llvm::Type     *OutTy            = nullptr;
-    llvm::Value    *OutStep          = nullptr;
-    llvm::LoadInst *OutBasePtr       = nullptr;
-    llvm::Value    *CastedOutBasePtr = nullptr;
-
-    bool PassOutByPointer = false;
-
-    if (bcinfo::MetadataExtractor::hasForEachSignatureOut(Signature)) {
-      llvm::Type *OutBaseTy = Function->getReturnType();
-
-      if (OutBaseTy->isVoidTy()) {
-        PassOutByPointer = true;
-        OutTy = ArgIter->getType();
-
-        ArgIter++;
-        --NumInputs;
-      } else {
-        // We don't advance ArgIter, since we are using the actual return type.
-        OutTy = OutBaseTy->getPointerTo();
-      }
-
-      OutStep = getStepValue(&DL, OutTy, Arg_outstep);
-      OutStep->setName("outstep");
-      OutBasePtr = Builder.CreateLoad(
-                     Builder.CreateConstInBoundsGEP2_32(nullptr,
-                         Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldOutPtr),
-                         0, 0));
-
-      if (gEnableRsTbaa) {
-        OutBasePtr->setMetadata("tbaa", TBAAPointer);
-      }
-
-      OutBasePtr->setMetadata("alias.scope", AliasingScope);
-
-      CastedOutBasePtr = Builder.CreatePointerCast(OutBasePtr, OutTy, "casted_out");
-    }
-
-    llvm::PHINode *IV;
-    createLoop(Builder, Arg_x1, Arg_x2, &IV);
-
-    llvm::SmallVector<llvm::Value*, 8> CalleeArgs;
-    const int CalleeArgsContextIdx = ExpandSpecialArguments(Signature, IV, Arg_p, Builder, CalleeArgs,
-                                                            [&NumInputs]() { --NumInputs; });
-
-    llvm::SmallVector<llvm::Type*,  8> InTypes;
-    llvm::SmallVector<llvm::Value*, 8> InSteps;
-    llvm::SmallVector<llvm::Value*, 8> InBasePtrs;
-    llvm::SmallVector<llvm::Value*, 8> InStructTempSlots;
-
-    bccAssert(NumInputs <= RS_KERNEL_INPUT_LIMIT);
-
-    if (NumInputs > 0) {
-      llvm::Value *InsBasePtr  = Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldInPtr, "inputs_base");
-
-      llvm::Value *InStepsBase = Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldInStride, "insteps_base");
-
-      llvm::Instruction *AllocaInsertionPoint = &*ExpandedFunction->getEntryBlock().begin();
-      for (size_t InputIndex = 0; InputIndex < NumInputs;
-           ++InputIndex, ArgIter++) {
-
-        llvm::Value    *InStepAddr = Builder.CreateConstInBoundsGEP2_32(nullptr, InStepsBase, 0, InputIndex);
-        llvm::LoadInst *InStepArg  = Builder.CreateLoad(InStepAddr,
-                                                          "instep_addr");
-
-        llvm::Type *InType = ArgIter->getType();
-
-        /*
-         * AArch64 calling conventions dictate that structs of sufficient size
-         * get passed by pointer instead of passed by value.  This, combined
-         * with the fact that we don't allow kernels to operate on pointer
-         * data means that if we see a kernel with a pointer parameter we know
-         * that it is struct input that has been promoted.  As such we don't
-         * need to convert its type to a pointer.  Later we will need to know
-         * to create a temporary copy on the stack, so we save this information
-         * in InStructTempSlots.
-         */
-        if (auto PtrType = llvm::dyn_cast<llvm::PointerType>(InType)) {
-          llvm::Type *ElementType = PtrType->getElementType();
-          uint64_t Alignment = DL.getABITypeAlignment(ElementType);
-          llvm::Value *Slot = new llvm::AllocaInst(ElementType,
-                                                   nullptr,
-                                                   Alignment,
-                                                   "input_struct_slot",
-                                                   AllocaInsertionPoint);
-          InStructTempSlots.push_back(Slot);
-        } else {
-          InType = InType->getPointerTo();
-          InStructTempSlots.push_back(nullptr);
-        }
-
-        llvm::Value *InStep = getStepValue(&DL, InType, InStepArg);
-
-        InStep->setName("instep");
-
-        llvm::Value    *InputAddr = Builder.CreateConstInBoundsGEP2_32(nullptr, InsBasePtr, 0, InputIndex);
-        llvm::LoadInst *InBasePtr = Builder.CreateLoad(InputAddr,
-                                                         "input_base");
-        llvm::Value    *CastInBasePtr = Builder.CreatePointerCast(InBasePtr,
-                                                                    InType, "casted_in");
-        if (gEnableRsTbaa) {
-          InBasePtr->setMetadata("tbaa", TBAAPointer);
-        }
-
-        InBasePtr->setMetadata("alias.scope", AliasingScope);
-
-        InTypes.push_back(InType);
-        InSteps.push_back(InStep);
-        InBasePtrs.push_back(CastInBasePtr);
-      }
-    }
-
-    // Populate the actual call to kernel().
-    llvm::SmallVector<llvm::Value*, 8> RootArgs;
-
-    // Calculate the current input and output pointers
-    //
-    //
-    // We always calculate the input/output pointers with a GEP operating on i8
-    // values combined with a multiplication and only cast at the very end to
-    // OutTy.  This is to account for dynamic stepping sizes when the value
-    // isn't apparent at compile time.  In the (very common) case when we know
-    // the step size at compile time, due to having complete type information,
-    // this multiplication will be optimized out and produce code equivalent to
-    // a GEP on a pointer of the correct type.
-
-    // Output
-
-    llvm::Value *OutPtr = nullptr;
-    if (CastedOutBasePtr) {
-      llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1);
-
-      OutPtr    = Builder.CreateGEP(CastedOutBasePtr, OutOffset);
-
-      if (PassOutByPointer) {
-        RootArgs.push_back(OutPtr);
-      }
-    }
-
-    // Inputs
-
-    if (NumInputs > 0) {
-      llvm::Value *Offset = Builder.CreateSub(IV, Arg_x1);
-
-      for (size_t Index = 0; Index < NumInputs; ++Index) {
-        llvm::Value *InPtr    = Builder.CreateGEP(InBasePtrs[Index], Offset);
-        llvm::Value *Input;
-
-        if (llvm::Value *TemporarySlot = InStructTempSlots[Index]) {
-          // Pass a pointer to a temporary on the stack, rather than
-          // passing a pointer to the original value. We do not want
-          // the kernel to potentially modify the input data.
-
-          llvm::Type *ElementType = llvm::cast<llvm::PointerType>(
-                                        InPtr->getType())->getElementType();
-          uint64_t StoreSize = DL.getTypeStoreSize(ElementType);
-          uint64_t Alignment = DL.getABITypeAlignment(ElementType);
-
-          Builder.CreateMemCpy(TemporarySlot, InPtr, StoreSize, Alignment,
-                               /* isVolatile = */ false,
-                               /* !tbaa = */ gEnableRsTbaa ? TBAAAllocation : nullptr,
-                               /* !tbaa.struct = */ nullptr,
-                               /* !alias.scope = */ AliasingScope);
-          Input = TemporarySlot;
-        } else {
-          llvm::LoadInst *InputLoad = Builder.CreateLoad(InPtr, "input");
-
-          if (gEnableRsTbaa) {
-            InputLoad->setMetadata("tbaa", TBAAAllocation);
-          }
-
-          InputLoad->setMetadata("alias.scope", AliasingScope);
-
-          Input = InputLoad;
-        }
-
-        RootArgs.push_back(Input);
-      }
-    }
-
-    finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *Function, Builder);
-
-    llvm::Value *RetVal = Builder.CreateCall(Function, RootArgs);
-
-    if (OutPtr && !PassOutByPointer) {
-      llvm::StoreInst *Store = Builder.CreateStore(RetVal, OutPtr);
-      if (gEnableRsTbaa) {
-        Store->setMetadata("tbaa", TBAAAllocation);
-      }
-      Store->setMetadata("alias.scope", AliasingScope);
-    }
-
-    return true;
-  }
-
-  /// @brief Checks if pointers to allocation internals are exposed
-  ///
-  /// This function verifies if through the parameters passed to the kernel
-  /// or through calls to the runtime library the script gains access to
-  /// pointers pointing to data within a RenderScript Allocation.
-  /// If we know we control all loads from and stores to data within
-  /// RenderScript allocations and if we know the run-time internal accesses
-  /// are all annotated with RenderScript TBAA metadata, only then we
-  /// can safely use TBAA to distinguish between generic and from-allocation
-  /// pointers.
-  bool allocPointersExposed(llvm::Module &Module) {
-    // Old style kernel function can expose pointers to elements within
-    // allocations.
-    // TODO: Extend analysis to allow simple cases of old-style kernels.
-    for (size_t i = 0; i < mExportForEachCount; ++i) {
-      const char *Name = mExportForEachNameList[i];
-      uint32_t Signature = mExportForEachSignatureList[i];
-      if (Module.getFunction(Name) &&
-          !bcinfo::MetadataExtractor::hasForEachSignatureKernel(Signature)) {
-        return true;
-      }
-    }
-
-    // Check for library functions that expose a pointer to an Allocation or
-    // that are not yet annotated with RenderScript-specific tbaa information.
-    static const std::vector<const char *> Funcs{
-      // rsGetElementAt(...)
-      "_Z14rsGetElementAt13rs_allocationj",
-      "_Z14rsGetElementAt13rs_allocationjj",
-      "_Z14rsGetElementAt13rs_allocationjjj",
-
-      // rsSetElementAt()
-      "_Z14rsSetElementAt13rs_allocationPvj",
-      "_Z14rsSetElementAt13rs_allocationPvjj",
-      "_Z14rsSetElementAt13rs_allocationPvjjj",
-
-      // rsGetElementAtYuv_uchar_Y()
-      "_Z25rsGetElementAtYuv_uchar_Y13rs_allocationjj",
-
-      // rsGetElementAtYuv_uchar_U()
-      "_Z25rsGetElementAtYuv_uchar_U13rs_allocationjj",
-
-      // rsGetElementAtYuv_uchar_V()
-      "_Z25rsGetElementAtYuv_uchar_V13rs_allocationjj",
-    };
-
-    for (auto FI : Funcs) {
-      llvm::Function *Function = Module.getFunction(FI);
-
-      if (!Function) {
-        ALOGE("Missing run-time function '%s'", FI);
-        return true;
-      }
-
-      if (Function->getNumUses() > 0) {
-        return true;
-      }
-    }
-
-    return false;
-  }
-
-  /// @brief Connect RenderScript TBAA metadata to C/C++ metadata
-  ///
-  /// The TBAA metadata used to annotate loads/stores from RenderScript
-  /// Allocations is generated in a separate TBAA tree with a
-  /// "RenderScript Distinct TBAA" root node. LLVM does assume may-alias for
-  /// all nodes in unrelated alias analysis trees. This function makes the
-  /// "RenderScript TBAA" node (which is parented by the Distinct TBAA root),
-  /// a subtree of the normal C/C++ TBAA tree alongside the normal C/C++ types.
-  /// With the connected trees every access to an Allocation is resolved to
-  /// no-alias when compared to a normal C/C++ access.
-  void connectRenderScriptTBAAMetadata(llvm::Module &Module) {
-    llvm::MDBuilder MDHelper(*Context);
-    llvm::MDNode *TBAARenderScriptDistinct =
-      MDHelper.createTBAARoot("RenderScript Distinct TBAA");
-    llvm::MDNode *TBAARenderScript = MDHelper.createTBAANode(
-        "RenderScript TBAA", TBAARenderScriptDistinct);
-    llvm::MDNode *TBAARoot     = MDHelper.createTBAARoot("Simple C/C++ TBAA");
-    TBAARenderScript->replaceOperandWith(1, TBAARoot);
-  }
-
-  virtual bool runOnModule(llvm::Module &Module) {
-    bool Changed  = false;
-    this->Module  = &Module;
-    this->Context = &Module.getContext();
-
-    this->buildTypes();
-
-    bcinfo::MetadataExtractor me(&Module);
-    if (!me.extract()) {
-      ALOGE("Could not extract metadata from module!");
-      return false;
-    }
-    mExportForEachCount = me.getExportForEachSignatureCount();
-    mExportForEachNameList = me.getExportForEachNameList();
-    mExportForEachSignatureList = me.getExportForEachSignatureList();
-
-    bool AllocsExposed = allocPointersExposed(Module);
-
-    for (size_t i = 0; i < mExportForEachCount; ++i) {
-      const char *name = mExportForEachNameList[i];
-      uint32_t signature = mExportForEachSignatureList[i];
-      llvm::Function *kernel = Module.getFunction(name);
-      if (kernel) {
-        if (bcinfo::MetadataExtractor::hasForEachSignatureKernel(signature)) {
-          Changed |= ExpandKernel(kernel, signature);
-          kernel->setLinkage(llvm::GlobalValue::InternalLinkage);
-        } else if (kernel->getReturnType()->isVoidTy()) {
-          Changed |= ExpandFunction(kernel, signature);
-          kernel->setLinkage(llvm::GlobalValue::InternalLinkage);
-        } else {
-          // There are some graphics root functions that are not
-          // expanded, but that will be called directly. For those
-          // functions, we can not set the linkage to internal.
-        }
-      }
-    }
-
-    if (gEnableRsTbaa && !AllocsExposed) {
-      connectRenderScriptTBAAMetadata(Module);
-    }
-
-    return Changed;
-  }
-
-  virtual const char *getPassName() const {
-    return "ForEach-able Function Expansion";
-  }
-
-}; // end RSForEachExpandPass
-
-} // end anonymous namespace
-
-char RSForEachExpandPass::ID = 0;
-static llvm::RegisterPass<RSForEachExpandPass> X("foreachexp", "ForEach Expand Pass");
-
-namespace bcc {
-
-llvm::ModulePass *
-createRSForEachExpandPass(bool pEnableStepOpt){
-  return new RSForEachExpandPass(pEnableStepOpt);
-}
-
-} // end namespace bcc
diff --git a/lib/Renderscript/RSKernelExpand.cpp b/lib/Renderscript/RSKernelExpand.cpp
new file mode 100644
index 0000000..34611d7
--- /dev/null
+++ b/lib/Renderscript/RSKernelExpand.cpp
@@ -0,0 +1,1354 @@
+/*
+ * Copyright 2012, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "bcc/Assert.h"
+#include "bcc/Renderscript/RSTransforms.h"
+
+#include <cstdlib>
+#include <functional>
+
+#include <llvm/IR/DerivedTypes.h>
+#include <llvm/IR/Function.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/MDBuilder.h>
+#include <llvm/IR/Module.h>
+#include <llvm/Pass.h>
+#include <llvm/Support/raw_ostream.h>
+#include <llvm/IR/DataLayout.h>
+#include <llvm/IR/Function.h>
+#include <llvm/IR/Type.h>
+#include <llvm/Transforms/Utils/BasicBlockUtils.h>
+
+#include "bcc/Config/Config.h"
+#include "bcc/Support/Log.h"
+
+#include "bcinfo/MetadataExtractor.h"
+
+#ifndef __DISABLE_ASSERTS
+// Expected parameter counts of expanded kernels; only used in bccAssert().
+const int kNumExpandedForeachParams = 4;
+const int kNumExpandedReduceParams = 3;
+#endif
+// Names given to the TBAA root and TBAA node metadata built by this pass.
+const char kRenderScriptTBAARootName[] = "RenderScript Distinct TBAA";
+const char kRenderScriptTBAANodeName[] = "RenderScript TBAA";
+
+using namespace bcc;
+
+namespace {
+
+static const bool gEnableRsTbaa = true;
+
+/* RSKernelExpandPass - This pass operates on functions that are able
+ * to be called via rsForEach(), "foreach_<NAME>", or
+ * "reduce_<NAME>". We create an inner loop for the function to be
+ * invoked over the appropriate data cells of the input/output
+ * allocations (adjusting other relevant parameters as we go). We
+ * support doing this for any forEach or reduce style compute
+ * kernels. The new function name is the original function name
+ * followed by ".expand". Note that we still generate code for the
+ * original function.
+ */
+class RSKernelExpandPass : public llvm::ModulePass {
+public:
+  static char ID;
+
+private:
+  static const size_t RS_KERNEL_INPUT_LIMIT = 8; // see frameworks/base/libs/rs/cpu_ref/rsCpuCoreRuntime.h
+
+  enum RsLaunchDimensionsField {
+    RsLaunchDimensionsFieldX,
+    RsLaunchDimensionsFieldY,
+    RsLaunchDimensionsFieldZ,
+    RsLaunchDimensionsFieldLod,
+    RsLaunchDimensionsFieldFace,
+    RsLaunchDimensionsFieldArray,
+    // Not a field: the number of enumerators above (sizes the field-type vector).
+    RsLaunchDimensionsFieldCount
+  };
+
+  enum RsExpandKernelDriverInfoPfxField {
+    RsExpandKernelDriverInfoPfxFieldInPtr,
+    RsExpandKernelDriverInfoPfxFieldInStride,
+    RsExpandKernelDriverInfoPfxFieldInLen,
+    RsExpandKernelDriverInfoPfxFieldOutPtr,
+    RsExpandKernelDriverInfoPfxFieldOutStride,
+    RsExpandKernelDriverInfoPfxFieldOutLen,
+    RsExpandKernelDriverInfoPfxFieldDim,
+    RsExpandKernelDriverInfoPfxFieldCurrent,
+    RsExpandKernelDriverInfoPfxFieldUsr,
+    RsExpandKernelDriverInfoPfxFieldUsLenr,
+    // NOTE(review): "UsLenr" above looks like a typo of "UsrLen" (cf. usrLen) - confirm before renaming.
+    RsExpandKernelDriverInfoPfxFieldCount
+  };
+
+  llvm::Module *Module;
+  llvm::LLVMContext *Context;
+
+  /*
+   * Pointers to LLVM type information for the function signatures
+   * for expanded functions. These must be re-calculated for each module
+   * the pass is run on.
+   */
+  llvm::FunctionType *ExpandedForEachType, *ExpandedReduceType;
+
+  uint32_t mExportForEachCount;
+  const char **mExportForEachNameList;
+  const uint32_t *mExportForEachSignatureList;
+
+  uint32_t mExportReduceCount;
+  const char **mExportReduceNameList;
+
+  // Turns on optimization of allocation stride values.
+  bool mEnableStepOpt;
+
+  // Determine the foreach signature bitmask for a legacy root() function.
+  // Without "#rs_export_foreach" metadata (pre-ICS bitcode) the signature is
+  // synthesized from the argument count: the low arg_size() bits are set.
+  // With the metadata present, only its first operand is consulted, and it
+  // must hold a single MDString containing a decimal integer.
+  // Returns 0 when no signature can be determined.
+  uint32_t getRootSignature(llvm::Function *Function) {
+    const llvm::NamedMDNode *ExportForEachMetadata =
+        Module->getNamedMetadata("#rs_export_foreach");
+
+    if (!ExportForEachMetadata) {
+      // For pre-ICS bitcode, we may not have signature information. In that
+      // case, we use the number of arguments of the function to select the
+      // signature bits.
+      return (1 << Function->arg_size()) - 1;
+    }
+
+    if (ExportForEachMetadata->getNumOperands() == 0) {
+      return 0;
+    }
+
+    // We only handle the case for legacy root() functions here, so this is
+    // hard-coded to look at only the first such function.
+    llvm::MDNode *SigNode = ExportForEachMetadata->getOperand(0);
+    if (SigNode == nullptr || SigNode->getNumOperands() != 1) {
+      return 0;
+    }
+
+    llvm::Metadata *SigMD = SigNode->getOperand(0);
+    llvm::MDString *SigS = llvm::dyn_cast<llvm::MDString>(SigMD);
+    if (!SigS) {
+      return 0;
+    }
+
+    llvm::StringRef SigString = SigS->getString();
+    uint32_t Signature = 0;
+    // getAsInteger() returns true when the string is not a valid integer.
+    if (SigString.getAsInteger(10, Signature)) {
+      ALOGE("Non-integer signature value '%s'", SigString.str().c_str());
+      return 0;
+    }
+    return Signature;
+  }
+
+  bool isStepOptSupported(llvm::Type *AllocType) {
+    // Tells getStepValue() whether a constant step may replace the driver's.
+    llvm::PointerType *PT = llvm::dyn_cast<llvm::PointerType>(AllocType);
+    llvm::Type *VoidPtrTy = llvm::Type::getInt8PtrTy(*Context);
+    // NOTE(review): bails out when mEnableStepOpt is TRUE; looks inverted - confirm.
+    if (mEnableStepOpt) {
+      return false;
+    }
+
+    if (AllocType == VoidPtrTy) {
+      return false;
+    }
+
+    if (!PT) {
+      return false;
+    }
+    // NOTE(review): confirm getPrimitiveSizeInBits() reports a pointer's width here.
+    // remaining conditions are 64-bit only
+    if (VoidPtrTy->getPrimitiveSizeInBits() == 32) {
+      return true;
+    }
+    // NOTE(review): AllocType is a pointer at this point; confirm getStructName() is valid on it.
+    // coerce suggests an upconverted struct type, which we can't support
+    if (AllocType->getStructName().find("coerce") != llvm::StringRef::npos) {
+      return false;
+    }
+
+    // 2xi64 and i128 suggest an upconverted struct type, which are also unsupported
+    llvm::Type *V2xi64Ty = llvm::VectorType::get(llvm::Type::getInt64Ty(*Context), 2);
+    llvm::Type *Int128Ty = llvm::Type::getIntNTy(*Context, 128);
+    if (AllocType == V2xi64Ty || AllocType == Int128Ty) {
+      return false;
+    }
+
+    return true;
+  }
+
+  // Pick the value used to step through an allocation.
+  //
+  // The step normally comes from the driver. When isStepOptSupported()
+  // accepts the allocation type, an i32 constant holding the alloc size of
+  // the pointee type is returned instead, which allows further compiler
+  // optimizations to take place.
+  //
+  // DL - Target Data size/layout information.
+  // AllocType - Type of allocation (should be a pointer).
+  // OrigStep - Original step increment (root.expand() input from driver).
+  // Returns OrigStep itself whenever the constant is not used.
+  llvm::Value *getStepValue(llvm::DataLayout *DL, llvm::Type *AllocType,
+                            llvm::Value *OrigStep) {
+    bccAssert(DL);
+    bccAssert(AllocType);
+    bccAssert(OrigStep);
+    if (!isStepOptSupported(AllocType)) {
+      return OrigStep;
+    }
+
+    // isStepOptSupported() rejects non-pointer types, so the cast is safe.
+    llvm::PointerType *PtrTy = llvm::cast<llvm::PointerType>(AllocType);
+    const uint64_t ElemSize = DL->getTypeAllocSize(PtrTy->getElementType());
+    return llvm::ConstantInt::get(llvm::Type::getInt32Ty(*Context), ElemSize);
+  }
+
+  /// Builds the LLVM types this pass needs in *Context and sets ExpandedForEachType/ExpandedReduceType.
+  void buildTypes(void) {
+    // Create the RsLaunchDimensionsTy and RsExpandKernelDriverInfoPfxTy structs.
+
+    llvm::Type *Int8Ty                   = llvm::Type::getInt8Ty(*Context);
+    llvm::Type *Int8PtrTy                = Int8Ty->getPointerTo();
+    llvm::Type *Int8PtrArrayInputLimitTy = llvm::ArrayType::get(Int8PtrTy, RS_KERNEL_INPUT_LIMIT);
+    llvm::Type *Int32Ty                  = llvm::Type::getInt32Ty(*Context);
+    llvm::Type *Int32ArrayInputLimitTy   = llvm::ArrayType::get(Int32Ty, RS_KERNEL_INPUT_LIMIT);
+    llvm::Type *VoidPtrTy                = llvm::Type::getInt8PtrTy(*Context);
+    llvm::Type *Int32Array4Ty            = llvm::ArrayType::get(Int32Ty, 4);
+
+    /* Defined in frameworks/base/libs/rs/cpu_ref/rsCpuCore.h:
+     *
+     * struct RsLaunchDimensions {
+     *   uint32_t x;
+     *   uint32_t y;
+     *   uint32_t z;
+     *   uint32_t lod;
+     *   uint32_t face;
+     *   uint32_t array[4];
+     * };
+     */
+    llvm::SmallVector<llvm::Type*, RsLaunchDimensionsFieldCount> RsLaunchDimensionsTypes;
+    RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t x
+    RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t y
+    RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t z
+    RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t lod
+    RsLaunchDimensionsTypes.push_back(Int32Ty);       // uint32_t face
+    RsLaunchDimensionsTypes.push_back(Int32Array4Ty); // uint32_t array[4]
+    llvm::StructType *RsLaunchDimensionsTy =
+        llvm::StructType::create(RsLaunchDimensionsTypes, "RsLaunchDimensions");
+
+    /* Defined as the beginning of RsExpandKernelDriverInfo in frameworks/base/libs/rs/cpu_ref/rsCpuCoreRuntime.h:
+     *
+     * struct RsExpandKernelDriverInfoPfx {
+     *     const uint8_t *inPtr[RS_KERNEL_INPUT_LIMIT];
+     *     uint32_t inStride[RS_KERNEL_INPUT_LIMIT];
+     *     uint32_t inLen;
+     *
+     *     uint8_t *outPtr[RS_KERNEL_INPUT_LIMIT];
+     *     uint32_t outStride[RS_KERNEL_INPUT_LIMIT];
+     *     uint32_t outLen;
+     *
+     *     // Dimension of the launch
+     *     RsLaunchDimensions dim;
+     *
+     *     // The walking iterator of the launch
+     *     RsLaunchDimensions current;
+     *
+     *     const void *usr;
+     *     uint32_t usrLen;
+     *
+     *     // Items below this line are not used by the compiler and can be changed in the driver.
+     *     // So the compiler must assume there are an unknown number of fields of unknown type
+     *     // beginning here.
+     * };
+     *
+     * The name "RsExpandKernelDriverInfoPfx" is known to RSInvariantPass (RSInvariant.cpp).
+     */
+    llvm::SmallVector<llvm::Type*, RsExpandKernelDriverInfoPfxFieldCount> RsExpandKernelDriverInfoPfxTypes;
+    RsExpandKernelDriverInfoPfxTypes.push_back(Int8PtrArrayInputLimitTy); // const uint8_t *inPtr[RS_KERNEL_INPUT_LIMIT]
+    RsExpandKernelDriverInfoPfxTypes.push_back(Int32ArrayInputLimitTy);   // uint32_t inStride[RS_KERNEL_INPUT_LIMIT]
+    RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty);                  // uint32_t inLen
+    RsExpandKernelDriverInfoPfxTypes.push_back(Int8PtrArrayInputLimitTy); // uint8_t *outPtr[RS_KERNEL_INPUT_LIMIT]
+    RsExpandKernelDriverInfoPfxTypes.push_back(Int32ArrayInputLimitTy);   // uint32_t outStride[RS_KERNEL_INPUT_LIMIT]
+    RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty);                  // uint32_t outLen
+    RsExpandKernelDriverInfoPfxTypes.push_back(RsLaunchDimensionsTy);     // RsLaunchDimensions dim
+    RsExpandKernelDriverInfoPfxTypes.push_back(RsLaunchDimensionsTy);     // RsLaunchDimensions current
+    RsExpandKernelDriverInfoPfxTypes.push_back(VoidPtrTy);                // const void *usr
+    RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty);                  // uint32_t usrLen
+    llvm::StructType *RsExpandKernelDriverInfoPfxTy =
+        llvm::StructType::create(RsExpandKernelDriverInfoPfxTypes, "RsExpandKernelDriverInfoPfx");
+
+    // Create the function type for expanded kernels.
+    llvm::Type *VoidTy = llvm::Type::getVoidTy(*Context);
+
+    llvm::Type *RsExpandKernelDriverInfoPfxPtrTy = RsExpandKernelDriverInfoPfxTy->getPointerTo();
+    // void (const RsExpandKernelDriverInfoPfxTy *p, uint32_t x1, uint32_t x2, uint32_t outstep)
+    ExpandedForEachType = llvm::FunctionType::get(VoidTy,
+        {RsExpandKernelDriverInfoPfxPtrTy, Int32Ty, Int32Ty, Int32Ty}, false);
+
+    // void (void *inBuf, void *outBuf, uint32_t len)
+    ExpandedReduceType = llvm::FunctionType::get(VoidTy, {VoidPtrTy, VoidPtrTy, Int32Ty}, false);
+  }
+
+  /// @brief Create skeleton of the expanded foreach kernel.
+  ///
+  /// The new function is named "<OldName>.expand", has external linkage, and
+  /// has the ExpandedForEachType signature:
+  ///
+  ///   void (const RsExpandKernelDriverInfoPfx *p, uint32_t x1, uint32_t x2,
+  ///         uint32_t outstep)
+  ///
+  /// Its body is a single "Begin" block that only returns.
+  llvm::Function *createEmptyExpandedForEachKernel(llvm::StringRef OldName) {
+    llvm::Function *Expanded = llvm::Function::Create(
+        ExpandedForEachType, llvm::GlobalValue::ExternalLinkage,
+        OldName + ".expand", Module);
+    bccAssert(Expanded->arg_size() == kNumExpandedForeachParams);
+
+    llvm::Function::arg_iterator ArgIt = Expanded->arg_begin();
+    for (const char *ArgName : {"p", "x1", "x2", "arg_outstep"}) {
+      (ArgIt++)->setName(ArgName);
+    }
+
+    llvm::IRBuilder<> Builder(llvm::BasicBlock::Create(*Context, "Begin", Expanded));
+    Builder.CreateRetVoid();
+    return Expanded;
+  }
+
+  // Create skeleton of the expanded reduce kernel.
+  //
+  // This creates a function with the following signature:
+  //
+  //   void @func.expand(i8* nocapture %inBuf, i8* nocapture %outBuf, i32 len)
+  //
+  // The function is named "<OldName>.expand", has external linkage, and its
+  // body is a single "Begin" block that only returns.
+  llvm::Function *createEmptyExpandedReduceKernel(llvm::StringRef OldName) {
+    llvm::Function *Expanded = llvm::Function::Create(
+        ExpandedReduceType, llvm::GlobalValue::ExternalLinkage,
+        OldName + ".expand", Module);
+    bccAssert(Expanded->arg_size() == kNumExpandedReduceParams);
+
+    llvm::Function::arg_iterator ArgIt = Expanded->arg_begin();
+
+    // The first two parameters are the buffers: each one is named and given
+    // the NoCapture attribute. The attribute index passed to
+    // AttributeSet::get() is the argument number plus one.
+    for (const char *BufName : {"inBuf", "outBuf"}) {
+      llvm::Argument *Buf = &(*ArgIt++);
+      Buf->setName(BufName);
+      Buf->addAttr(llvm::AttributeSet::get(
+          *Context, Buf->getArgNo() + 1,
+          llvm::makeArrayRef(llvm::Attribute::NoCapture)));
+    }
+
+    (ArgIt++)->setName("len");
+
+    llvm::IRBuilder<> Builder(llvm::BasicBlock::Create(*Context, "Begin", Expanded));
+    Builder.CreateRetVoid();
+
+    return Expanded;
+  }
+
+  /// @brief Create an empty loop
+  ///
+  /// Create a loop of the form:
+  ///
+  /// for (i = LowerBound; i < UpperBound; i++)
+  ///   ;
+  ///
+  /// After the loop has been created, the builder is set such that
+  /// instructions can be added to the loop body.
+  ///
+  /// @param Builder The builder to use to build this loop. The current
+  ///                position of the builder is the position the loop
+  ///                will be inserted.
+  /// @param LowerBound The first value of the loop iterator
+  /// @param UpperBound The exclusive upper bound of the loop iterator (unsigned compare)
+  /// @param LoopIV Output pointer; *LoopIV is set to the PHI node of the loop iterator.
+  /// @return The BasicBlock that will be executed after the loop.
+  llvm::BasicBlock *createLoop(llvm::IRBuilder<> &Builder,
+                               llvm::Value *LowerBound,
+                               llvm::Value *UpperBound,
+                               llvm::PHINode **LoopIV) {
+    bccAssert(LowerBound->getType() == UpperBound->getType());
+
+    llvm::BasicBlock *CondBB, *AfterBB, *HeaderBB;
+    llvm::Value *Cond, *IVNext;
+    llvm::PHINode *IV;
+
+    CondBB = Builder.GetInsertBlock();
+    AfterBB = llvm::SplitBlock(CondBB, Builder.GetInsertPoint(), nullptr, nullptr);
+    HeaderBB = llvm::BasicBlock::Create(*Context, "Loop", CondBB->getParent());
+
+    // if (LowerBound < Upperbound)
+    //   goto LoopHeader
+    // else
+    //   goto AfterBB
+    CondBB->getTerminator()->eraseFromParent();
+    Builder.SetInsertPoint(CondBB);
+    Cond = Builder.CreateICmpULT(LowerBound, UpperBound);
+    Builder.CreateCondBr(Cond, HeaderBB, AfterBB);
+
+    // iv = PHI [CondBB -> LowerBound], [LoopHeader -> NextIV ]
+    // iv.next = iv + 1
+    // if (iv.next < Upperbound)
+    //   goto LoopHeader
+    // else
+    //   goto AfterBB
+    Builder.SetInsertPoint(HeaderBB);
+    IV = Builder.CreatePHI(LowerBound->getType(), 2, "X");
+    IV->addIncoming(LowerBound, CondBB);
+    IVNext = Builder.CreateNUWAdd(IV, Builder.getInt32(1));
+    IV->addIncoming(IVNext, HeaderBB);
+    Cond = Builder.CreateICmpULT(IVNext, UpperBound);
+    Builder.CreateCondBr(Cond, HeaderBB, AfterBB);
+    AfterBB->setName("Exit");
+    Builder.SetInsertPoint(HeaderBB->getFirstNonPHI());
+    *LoopIV = IV;
+    return AfterBB;
+  }
+
+  // Complete the outgoing argument list for a call to a ForEach-able function.
+  //
+  // ArgVector - on input, the non-special arguments
+  //             on output, the non-special arguments followed by the special
+  //               arguments from SpecialArgVector
+  // SpecialArgVector - special arguments (from ExpandSpecialArguments())
+  // SpecialArgContextIdx - return value of ExpandSpecialArguments()
+  //                          (position of context argument in SpecialArgVector)
+  // CalleeFunction - the ForEach-able function being called
+  // CallerBuilder - for inserting code into the caller function
+  template<unsigned int ArgVectorLen, unsigned int SpecialArgVectorLen>
+  void finishArgList(      llvm::SmallVector<llvm::Value *, ArgVectorLen>        &ArgVector,
+                     const llvm::SmallVector<llvm::Value *, SpecialArgVectorLen> &SpecialArgVector,
+                     const int SpecialArgContextIdx,
+                     const llvm::Function &CalleeFunction,
+                     llvm::IRBuilder<> &CallerBuilder) {
+    const size_t NumPlainArgs = ArgVector.size();
+    ArgVector.append(SpecialArgVector.begin(), SpecialArgVector.end());
+    if (SpecialArgContextIdx < 0) {
+      return;  // no context argument, so nothing needs casting
+    }
+    // The callee declares its context parameter as a pointer to an opaque
+    // user-visible type, whereas the value we hold points to
+    // RsExpandKernelDriverInfoPfx. Both describe the same object, so the
+    // argument is pointer-cast to the type of the matching callee parameter.
+    const int ArgContextIdx = NumPlainArgs + SpecialArgContextIdx;
+    llvm::Type *ContextArgType = nullptr;
+    int Countdown = ArgContextIdx;
+    for (const auto &Arg : CalleeFunction.getArgumentList()) {
+      if (Countdown-- == 0) {
+        ContextArgType = Arg.getType();
+        break;
+      }
+    }
+    bccAssert(ContextArgType);
+    ArgVector[ArgContextIdx] = CallerBuilder.CreatePointerCast(ArgVector[ArgContextIdx], ContextArgType);
+  }
+
+  // SmallGEPIndices is the data type returned by GEPHelper(): a SmallVector
+  // of values suitable for passing to IRBuilder::CreateGEP(). It is sized
+  // so that the SmallVector returned by GEPHelper() never needs to do a
+  // heap allocation for any list of GEP indices it encounters in the code.
+  using SmallGEPIndices = llvm::SmallVector<llvm::Value *, 3>;
+
+  // Turn a list of constant integer GEP indices into a SmallVector of
+  // llvm::Value*, suitable for passing to a GetElementPtrInst constructor
+  // or IRBuilder::CreateGEP().
+  //
+  // Inputs:
+  //   I32Args - integers which represent the index arguments to a GEP
+  //   instruction.
+  // Returns:
+  //   A SmallVector holding one i32 ConstantInt per input, in input order.
+  SmallGEPIndices GEPHelper(const std::initializer_list<int32_t> I32Args) {
+    llvm::IntegerType *I32Ty = llvm::Type::getInt32Ty(*Context);
+    SmallGEPIndices Indices;
+    Indices.reserve(I32Args.size());
+    for (const int32_t Arg : I32Args) {
+      Indices.push_back(llvm::ConstantInt::get(I32Ty, Arg));
+    }
+    return Indices;
+  }
+
+public:
+  // Stores pEnableStepOpt; every other member starts out as null/zero.
+  RSKernelExpandPass(bool pEnableStepOpt = true)
+      : ModulePass(ID), Module(nullptr), Context(nullptr), ExpandedForEachType(nullptr), ExpandedReduceType(nullptr),
+        mExportForEachCount(0), mExportForEachNameList(nullptr), mExportForEachSignatureList(nullptr),
+        mExportReduceCount(0), mExportReduceNameList(nullptr), mEnableStepOpt(pEnableStepOpt) {}
+
+  // Intentionally empty: this pass does not use any other analysis passes,
+  // and it adds/wraps functions in the module (thus altering the CFG).
+  void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
+  }
+
+  // Compute the special-parameter part of the outgoing argument list for a
+  // call to a ForEach-able function.
+  //
+  // Signature - metadata bits for the signature of the ForEach-able function
+  // X, Arg_p - values derived directly from expanded function,
+  //            suitable for computing arguments for the ForEach-able function
+  // Builder - used to emit the loads of Y/Z; its insertion point is saved
+  //           and restored around them
+  // CalleeArgs - must be empty on entry; the contribution is accumulated here
+  // Bump - invoked once for each contributed outgoing argument
+  // LoopHeaderInsertionPoint - an Instruction in the loop header, before which
+  //                            this function can insert loop-invariant loads
+  //
+  // Arguments are contributed in the order context, X, Y, Z. The return value
+  // is the (zero-based) position of the context (Arg_p) argument in the
+  // CalleeArgs vector, or -1 if the context argument is not placed there.
+  int ExpandSpecialArguments(uint32_t Signature,
+                             llvm::Value *X,
+                             llvm::Value *Arg_p,
+                             llvm::IRBuilder<> &Builder,
+                             llvm::SmallVector<llvm::Value*, 8> &CalleeArgs,
+                             std::function<void ()> Bump,
+                             llvm::Instruction *LoopHeaderInsertionPoint) {
+    using bcinfo::MetadataExtractor;
+
+    bccAssert(CalleeArgs.empty());
+
+    // Appends one outgoing argument and reports it through Bump.
+    auto Contribute = [&CalleeArgs, &Bump](llvm::Value *Arg) {
+      CalleeArgs.push_back(Arg);
+      Bump();
+    };
+
+    int ContextIdx = -1;
+    if (MetadataExtractor::hasForEachSignatureCtxt(Signature)) {
+      Contribute(Arg_p);
+      ContextIdx = CalleeArgs.size() - 1;
+    }
+
+    if (MetadataExtractor::hasForEachSignatureX(Signature)) {
+      Contribute(X);
+    }
+
+    const bool WantsY = MetadataExtractor::hasForEachSignatureY(Signature);
+    const bool WantsZ = MetadataExtractor::hasForEachSignatureZ(Signature);
+    if (!WantsY && !WantsZ) {
+      return ContextIdx;
+    }
+
+    bccAssert(LoopHeaderInsertionPoint);
+
+    // Y and Z are loop invariant, so their loads are emitted at the loop
+    // header insertion point rather than at the builder's current position.
+    auto SavedIP = Builder.saveIP();
+    Builder.SetInsertPoint(LoopHeaderInsertionPoint);
+
+    // Loads field Field of p->current and contributes the loaded value.
+    auto ContributeCurrent = [&](int32_t Field, const char *AddrName, const char *ValueName) {
+      SmallGEPIndices Indices(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldCurrent, Field}));
+      llvm::Value *Addr = Builder.CreateInBoundsGEP(Arg_p, Indices, AddrName);
+      Contribute(Builder.CreateLoad(Addr, ValueName));
+    };
+    if (WantsY) ContributeCurrent(RsLaunchDimensionsFieldY, "Y.gep", "Y");
+    if (WantsZ) ContributeCurrent(RsLaunchDimensionsFieldZ, "Z.gep", "Z");
+    Builder.restoreIP(SavedIP);
+    return ContextIdx;
+  }
+
+  /* Expands an old-style ForEach-able function. On success (true), Module
+   * holds a new function "<NAME>.expand" that invokes <NAME>() in a loop with
+   * the appropriate parameters; false means no signature could be determined.
+   */
+  bool ExpandOldStyleForEach(llvm::Function *Function, uint32_t Signature) {
+    ALOGV("Expanding ForEach-able Function %s",
+          Function->getName().str().c_str());
+
+    if (!Signature) {
+      Signature = getRootSignature(Function);
+      if (!Signature) {
+        // We couldn't determine how to expand this function based on its
+        // function signature.
+        return false;
+      }
+    }
+
+    llvm::DataLayout DL(Module);
+
+    llvm::Function *ExpandedFunction =
+      createEmptyExpandedForEachKernel(Function->getName());
+
+    /*
+     * Extract the expanded function's parameters.  It is guaranteed by
+     * createEmptyExpandedForEachKernel that there will be four parameters.
+     */
+
+    bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);
+
+    llvm::Function::arg_iterator ExpandedFunctionArgIter =
+      ExpandedFunction->arg_begin();
+
+    llvm::Value *Arg_p       = &*(ExpandedFunctionArgIter++);
+    llvm::Value *Arg_x1      = &*(ExpandedFunctionArgIter++);
+    llvm::Value *Arg_x2      = &*(ExpandedFunctionArgIter++);
+    llvm::Value *Arg_outstep = &*(ExpandedFunctionArgIter);
+
+    llvm::Value *InStep  = nullptr;
+    llvm::Value *OutStep = nullptr;
+
+    // Construct the actual function body.
+    llvm::IRBuilder<> Builder(ExpandedFunction->getEntryBlock().begin());
+
+    // Collect and construct the arguments for the kernel().
+    // Note that we load any loop-invariant arguments before entering the Loop.
+    llvm::Function::arg_iterator FunctionArgIter = Function->arg_begin();
+
+    llvm::Type  *InTy      = nullptr;
+    llvm::Value *InBufPtr = nullptr;
+    if (bcinfo::MetadataExtractor::hasForEachSignatureIn(Signature)) {
+      SmallGEPIndices InStepGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInStride, 0}));
+      llvm::LoadInst *InStepArg  = Builder.CreateLoad(
+        Builder.CreateInBoundsGEP(Arg_p, InStepGEP, "instep_addr.gep"), "instep_addr");
+
+      InTy = (FunctionArgIter++)->getType();
+      InStep = getStepValue(&DL, InTy, InStepArg);
+
+      InStep->setName("instep");
+
+      SmallGEPIndices InputAddrGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInPtr, 0}));
+      InBufPtr = Builder.CreateLoad(
+        Builder.CreateInBoundsGEP(Arg_p, InputAddrGEP, "input_buf.gep"), "input_buf");
+    }
+
+    llvm::Type *OutTy = nullptr;
+    llvm::Value *OutBasePtr = nullptr;
+    if (bcinfo::MetadataExtractor::hasForEachSignatureOut(Signature)) {
+      OutTy = (FunctionArgIter++)->getType();
+      OutStep = getStepValue(&DL, OutTy, Arg_outstep);
+      OutStep->setName("outstep");
+      SmallGEPIndices OutBaseGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldOutPtr, 0}));
+      OutBasePtr = Builder.CreateLoad(Builder.CreateInBoundsGEP(Arg_p, OutBaseGEP, "out_buf.gep"));
+    }
+
+    llvm::Value *UsrData = nullptr;
+    if (bcinfo::MetadataExtractor::hasForEachSignatureUsrData(Signature)) {
+      llvm::Type *UsrDataTy = (FunctionArgIter++)->getType();
+      llvm::Value *UsrDataPointerAddr = Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldUsr);
+      UsrData = Builder.CreatePointerCast(Builder.CreateLoad(UsrDataPointerAddr), UsrDataTy);
+      UsrData->setName("UsrData");
+    }
+
+    llvm::BasicBlock *LoopHeader = Builder.GetInsertBlock();
+    llvm::PHINode *IV;
+    createLoop(Builder, Arg_x1, Arg_x2, &IV);
+
+    llvm::SmallVector<llvm::Value*, 8> CalleeArgs;
+    const int CalleeArgsContextIdx = ExpandSpecialArguments(Signature, IV, Arg_p, Builder, CalleeArgs,
+                                                            [&FunctionArgIter]() { FunctionArgIter++; },
+                                                            LoopHeader->getTerminator());
+
+    bccAssert(FunctionArgIter == Function->arg_end());
+
+    // Populate the actual call to kernel().
+    llvm::SmallVector<llvm::Value*, 8> RootArgs;
+
+    llvm::Value *InPtr  = nullptr;
+    llvm::Value *OutPtr = nullptr;
+
+    // Calculate the current input and output pointers
+    //
+    // We always calculate the input/output pointers with a GEP operating on i8
+    // values and only cast at the very end to OutTy. This is because the step
+    // between two values is given in bytes.
+    //
+    // TODO: We could further optimize the output by using a GEP operation of
+    // type 'OutTy' in cases where the element type of the allocation allows.
+    if (OutBasePtr) {
+      llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1);
+      OutOffset = Builder.CreateMul(OutOffset, OutStep);
+      OutPtr = Builder.CreateInBoundsGEP(OutBasePtr, OutOffset);
+      OutPtr = Builder.CreatePointerCast(OutPtr, OutTy);
+    }
+
+    if (InBufPtr) {
+      llvm::Value *InOffset = Builder.CreateSub(IV, Arg_x1);
+      InOffset = Builder.CreateMul(InOffset, InStep);
+      InPtr = Builder.CreateInBoundsGEP(InBufPtr, InOffset);
+      InPtr = Builder.CreatePointerCast(InPtr, InTy);
+    }
+
+    if (InPtr) {
+      RootArgs.push_back(InPtr);
+    }
+
+    if (OutPtr) {
+      RootArgs.push_back(OutPtr);
+    }
+
+    if (UsrData) {
+      RootArgs.push_back(UsrData);
+    }
+
+    finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *Function, Builder);
+
+    Builder.CreateCall(Function, RootArgs);
+
+    return true;
+  }
+
+  /* Expand a pass-by-value foreach kernel.
+   */
+  bool ExpandForEach(llvm::Function *Function, uint32_t Signature) {
+    bccAssert(bcinfo::MetadataExtractor::hasForEachSignatureKernel(Signature));
+    ALOGV("Expanding kernel Function %s", Function->getName().str().c_str());
+
+    // TODO: Refactor this to share functionality with ExpandOldStyleForEach.
+    llvm::DataLayout DL(Module);
+
+    llvm::Function *ExpandedFunction =
+      createEmptyExpandedForEachKernel(Function->getName());
+
+    /*
+     * Extract the expanded function's parameters.  It is guaranteed by
+     * createEmptyExpandedFunction that there will be four parameters.
+     */
+
+    bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);
+
+    llvm::Function::arg_iterator ExpandedFunctionArgIter =
+      ExpandedFunction->arg_begin();
+
+    llvm::Value *Arg_p       = &*(ExpandedFunctionArgIter++);
+    llvm::Value *Arg_x1      = &*(ExpandedFunctionArgIter++);
+    llvm::Value *Arg_x2      = &*(ExpandedFunctionArgIter++);
+    // Arg_outstep is not used by expanded new-style forEach kernels.
+
+    // Construct the actual function body.
+    llvm::IRBuilder<> Builder(ExpandedFunction->getEntryBlock().begin());
+
+    // Create TBAA meta-data.
+    llvm::MDNode *TBAARenderScriptDistinct, *TBAARenderScript,
+                 *TBAAAllocation, *TBAAPointer;
+    llvm::MDBuilder MDHelper(*Context);
+
+    TBAARenderScriptDistinct =
+      MDHelper.createTBAARoot(kRenderScriptTBAARootName);
+    TBAARenderScript = MDHelper.createTBAANode(kRenderScriptTBAANodeName,
+        TBAARenderScriptDistinct);
+    TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation",
+                                                       TBAARenderScript);
+    TBAAAllocation = MDHelper.createTBAAStructTagNode(TBAAAllocation,
+                                                      TBAAAllocation, 0);
+    TBAAPointer = MDHelper.createTBAAScalarTypeNode("pointer",
+                                                    TBAARenderScript);
+    TBAAPointer = MDHelper.createTBAAStructTagNode(TBAAPointer, TBAAPointer, 0);
+
+    /*
+     * Collect and construct the arguments for the kernel().
+     *
+     * Note that we load any loop-invariant arguments before entering the Loop.
+     */
+    size_t NumRemainingInputs = Function->arg_size();
+
+    // No usrData parameter on kernels.
+    bccAssert(
+        !bcinfo::MetadataExtractor::hasForEachSignatureUsrData(Signature));
+
+    llvm::Function::arg_iterator ArgIter = Function->arg_begin();
+
+    // Check the return type
+    llvm::Type     *OutTy            = nullptr;
+    llvm::LoadInst *OutBasePtr       = nullptr;
+    llvm::Value    *CastedOutBasePtr = nullptr;
+
+    bool PassOutByPointer = false;
+
+    if (bcinfo::MetadataExtractor::hasForEachSignatureOut(Signature)) {
+      llvm::Type *OutBaseTy = Function->getReturnType();
+
+      if (OutBaseTy->isVoidTy()) {
+        PassOutByPointer = true;
+        OutTy = ArgIter->getType();
+
+        ArgIter++;
+        --NumRemainingInputs;
+      } else {
+        // We don't increment Args, since we are using the actual return type.
+        OutTy = OutBaseTy->getPointerTo();
+      }
+
+      SmallGEPIndices OutBaseGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldOutPtr, 0}));
+      OutBasePtr = Builder.CreateLoad(Builder.CreateInBoundsGEP(Arg_p, OutBaseGEP, "out_buf.gep"));
+
+      if (gEnableRsTbaa) {
+        OutBasePtr->setMetadata("tbaa", TBAAPointer);
+      }
+
+      CastedOutBasePtr = Builder.CreatePointerCast(OutBasePtr, OutTy, "casted_out");
+    }
+
+    llvm::SmallVector<llvm::Type*,  8> InTypes;
+    llvm::SmallVector<llvm::Value*, 8> InBufPtrs;
+    llvm::SmallVector<llvm::Value*, 8> InStructTempSlots;
+
+    bccAssert(NumRemainingInputs <= RS_KERNEL_INPUT_LIMIT);
+
+    // Create the loop structure.
+    llvm::BasicBlock *LoopHeader = Builder.GetInsertBlock();
+    llvm::PHINode *IV;
+    createLoop(Builder, Arg_x1, Arg_x2, &IV);
+
+    llvm::SmallVector<llvm::Value*, 8> CalleeArgs;
+    const int CalleeArgsContextIdx =
+      ExpandSpecialArguments(Signature, IV, Arg_p, Builder, CalleeArgs,
+                             [&NumRemainingInputs]() { --NumRemainingInputs; },
+                             LoopHeader->getTerminator());
+
+    // After ExpandSpecialArguments() gets called, NumRemainingInputs
+    // counts the number of arguments to the kernel that correspond to
+    // an array entry from the InPtr field of the DriverInfo
+    // structure.
+    const size_t NumInPtrArguments = NumRemainingInputs;
+
+    if (NumInPtrArguments > 0) {
+      // Extract information about input slots and step sizes. The work done
+      // here is loop-invariant, so we can hoist the operations out of the loop.
+      auto OldInsertionPoint = Builder.saveIP();
+      Builder.SetInsertPoint(LoopHeader->getTerminator());
+
+      for (size_t InputIndex = 0; InputIndex < NumInPtrArguments; ++InputIndex, ArgIter++) {
+        llvm::Type *InType = ArgIter->getType();
+
+        /*
+         * AArch64 calling conventions dictate that structs of sufficient size
+         * get passed by pointer instead of passed by value.  This, combined
+         * with the fact that we don't allow kernels to operate on pointer
+         * data means that if we see a kernel with a pointer parameter we know
+         * that it is a struct input that has been promoted.  As such we don't
+         * need to convert its type to a pointer.  Later we will need to know
+         * to create a temporary copy on the stack, so we save this information
+         * in InStructTempSlots.
+         */
+        if (auto PtrType = llvm::dyn_cast<llvm::PointerType>(InType)) {
+          llvm::Type *ElementType = PtrType->getElementType();
+          InStructTempSlots.push_back(Builder.CreateAlloca(ElementType, nullptr,
+                                                           "input_struct_slot"));
+        } else {
+          InType = InType->getPointerTo();
+          InStructTempSlots.push_back(nullptr);
+        }
+
+        SmallGEPIndices InBufPtrGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInPtr,
+          static_cast<int32_t>(InputIndex)}));
+        llvm::Value    *InBufPtrAddr = Builder.CreateInBoundsGEP(Arg_p, InBufPtrGEP, "input_buf.gep");
+        llvm::LoadInst *InBufPtr = Builder.CreateLoad(InBufPtrAddr, "input_buf");
+        llvm::Value    *CastInBufPtr = Builder.CreatePointerCast(InBufPtr, InType, "casted_in");
+        if (gEnableRsTbaa) {
+          InBufPtr->setMetadata("tbaa", TBAAPointer);
+        }
+
+        InTypes.push_back(InType);
+        InBufPtrs.push_back(CastInBufPtr);
+      }
+
+      Builder.restoreIP(OldInsertionPoint);
+    }
+
+    // Populate the actual call to kernel().
+    llvm::SmallVector<llvm::Value*, 8> RootArgs;
+
+    // Calculate the current input and output pointers.
+
+    // Output
+
+    llvm::Value *OutPtr = nullptr;
+    if (CastedOutBasePtr) {
+      llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1);
+      OutPtr = Builder.CreateInBoundsGEP(CastedOutBasePtr, OutOffset);
+
+      if (PassOutByPointer) {
+        RootArgs.push_back(OutPtr);
+      }
+    }
+
+    // Inputs
+
+    if (NumInPtrArguments > 0) {
+      llvm::Value *Offset = Builder.CreateSub(IV, Arg_x1);
+
+      for (size_t Index = 0; Index < NumInPtrArguments; ++Index) {
+        llvm::Value *InPtr = Builder.CreateInBoundsGEP(InBufPtrs[Index], Offset);
+        llvm::Value *Input;
+
+        llvm::LoadInst *InputLoad = Builder.CreateLoad(InPtr, "input");
+
+        if (gEnableRsTbaa) {
+          InputLoad->setMetadata("tbaa", TBAAAllocation);
+        }
+
+        if (llvm::Value *TemporarySlot = InStructTempSlots[Index]) {
+          // Pass a pointer to a temporary on the stack, rather than
+          // passing a pointer to the original value. We do not want
+          // the kernel to potentially modify the input data.
+
+          // Note: don't annotate with TBAA, since the kernel might
+          // have its own TBAA annotations for the pointer argument.
+          Builder.CreateStore(InputLoad, TemporarySlot);
+          Input = TemporarySlot;
+        } else {
+          Input = InputLoad;
+        }
+
+        RootArgs.push_back(Input);
+      }
+    }
+
+    finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *Function, Builder);
+
+    llvm::Value *RetVal = Builder.CreateCall(Function, RootArgs);
+
+    if (OutPtr && !PassOutByPointer) {
+      RetVal->setName("call.result");
+      llvm::StoreInst *Store = Builder.CreateStore(RetVal, OutPtr);
+      if (gEnableRsTbaa) {
+        Store->setMetadata("tbaa", TBAAAllocation);
+      }
+    }
+
+    return true;
+  }
+
+  // Expand a reduce-style kernel function.
+  //
+  // The input is a kernel which represents a binary operation,
+  // of the form
+  //
+  //   define foo @func(foo %a, foo %b),
+  //
+  // (More generally, it can be of the forms
+  //
+  //   define void @func(foo* %ret, foo* %a, foo* %b)
+  //   define void @func(foo* %ret, foo1 %a, foo1 %b)
+  //   define foo1 @func(foo2 %a, foo2 %b)
+  //
+  // as a result of argument / return value conversions. Here, "foo1"
+  // and "foo2" refer to possibly coerced types, and the coerced
+  // argument type may be different from the coerced return type. See
+  // "Note on coercion" below.)
+  //
+  // Note also that we do not expect to encounter any case where the
+  // arguments are promoted to pointers but the return value is
+  // not promoted to a pointer, e.g.
+  //
+  //   define foo1 @func(foo* %a, foo* %b)
+  //
+  // and we will trigger an assertion failure in this case.
+  //
+  // The input kernel gets expanded into a kernel of the form
+  //
+  //   define void @func.expand(i8* %inBuf, i8* %outBuf, i32 %len)
+  //
+  // which performs a serial reduction of `len` elements from `inBuf`,
+  // and stores the result into `outBuf`. In pseudocode, @func.expand
+  // does:
+  //
+  //   inArr := (foo *)inBuf;
+  //   accum := inArr[0];
+  //   for (i := 1; i < len; ++i) {
+  //     accum := foo(accum, inArr[i]);
+  //   }
+  //   *(foo *)outBuf := accum;
+  //
+  // Note on coercion
+  //
+  // Both the return value and the argument types may undergo internal
+  // coercion in clang as part of call lowering. As a result, the
+  // return value type may differ from the argument type even if the
+  // types in the RenderScript signature are the same. For instance, the
+  // kernel
+  //
+  //   int3 add(int3 a, int3 b) { return a + b; }
+  //
+  // gets lowered by clang as
+  //
+  //   define <3 x i32> @add(<4 x i32> %a.coerce, <4 x i32> %b.coerce)
+  //
+  // under AArch64. The details of this process are found in clang,
+  // lib/CodeGen/TargetInfo.cpp, under classifyArgumentType() and
+  // classifyReturnType() in ARMABIInfo, AArch64ABIInfo. If the value
+  // is passed by pointer, then the pointed-to type is not coerced.
+  //
+  // Since we lack the original type information, this code does loads
+  // and stores of allocation data by way of pointers to the coerced
+  // type.
+  bool ExpandReduce(llvm::Function *Function) {
+    bccAssert(Function);
+
+    ALOGV("Expanding reduce kernel %s", Function->getName().str().c_str());
+
+    llvm::DataLayout DL(Module);
+
+    // TBAA metadata used to tag loads/stores of allocation data below.
+    llvm::MDNode *TBAARenderScriptDistinct, *TBAARenderScript, *TBAAAllocation;
+    llvm::MDBuilder MDHelper(*Context);
+
+    TBAARenderScriptDistinct =
+      MDHelper.createTBAARoot(kRenderScriptTBAARootName);
+    TBAARenderScript = MDHelper.createTBAANode(kRenderScriptTBAANodeName,
+        TBAARenderScriptDistinct);
+    TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation",
+                                                       TBAARenderScript);
+    TBAAAllocation = MDHelper.createTBAAStructTagNode(TBAAAllocation,
+                                                      TBAAAllocation, 0);
+
+    llvm::Function *ExpandedFunction =
+      createEmptyExpandedReduceKernel(Function->getName());
+
+    // Extract the expanded kernel's parameters.  It is guaranteed by
+    // createEmptyExpandedReduceKernel that there will be 3 parameters.
+    auto ExpandedFunctionArgIter = ExpandedFunction->arg_begin();
+
+    llvm::Value *Arg_inBuf  = &*(ExpandedFunctionArgIter++);
+    llvm::Value *Arg_outBuf = &*(ExpandedFunctionArgIter++);
+    llvm::Value *Arg_len    = &*(ExpandedFunctionArgIter++);
+
+    bccAssert(Function->arg_size() == 2 || Function->arg_size() == 3);
+
+    // Check if, instead of returning a value, the original kernel has
+    // a pointer parameter which points to a temporary buffer into
+    // which the return value gets written.
+    const bool ReturnValuePointerStyle = (Function->arg_size() == 3);
+    bccAssert(Function->getReturnType()->isVoidTy() == ReturnValuePointerStyle);
+
+    // Check if, instead of being passed by value, the inputs to the
+    // original kernel are passed by pointer.
+    auto FirstArgIter = Function->arg_begin();
+    // The second argument is always an input to the original kernel.
+    auto SecondArgIter = std::next(FirstArgIter);
+    const bool InputsPointerStyle = SecondArgIter->getType()->isPointerTy();
+
+    // Get the output type (i.e. return type of the original kernel).
+    llvm::PointerType *OutPtrTy = nullptr;
+    llvm::Type *OutTy = nullptr;
+    if (ReturnValuePointerStyle) {
+      OutPtrTy = llvm::dyn_cast<llvm::PointerType>(FirstArgIter->getType());
+      bccAssert(OutPtrTy && "Expected a pointer parameter to kernel");
+      OutTy = OutPtrTy->getElementType();
+    } else {
+      OutTy = Function->getReturnType();
+      bccAssert(!OutTy->isVoidTy());
+      OutPtrTy = OutTy->getPointerTo();
+    }
+
+    // Get the input type (type of the arguments to the original
+    // kernel). Some input types are different from the output type,
+    // due to explicit coercion that the compiler performs when
+    // lowering the parameters. See "Note on coercion" above.
+    llvm::PointerType *InPtrTy;
+    llvm::Type *InTy;
+    if (InputsPointerStyle) {
+      InPtrTy = llvm::dyn_cast<llvm::PointerType>(SecondArgIter->getType());
+      bccAssert(InPtrTy && "Expected a pointer parameter to kernel");
+      bccAssert(ReturnValuePointerStyle);
+      bccAssert(std::next(SecondArgIter)->getType() == InPtrTy &&
+                "Input type mismatch");
+      InTy = InPtrTy->getElementType();
+    } else {
+      InTy = SecondArgIter->getType();
+      InPtrTy = InTy->getPointerTo();
+      if (!ReturnValuePointerStyle) {
+        bccAssert(InTy == FirstArgIter->getType() && "Input type mismatch");
+      } else {
+        bccAssert(InTy == std::next(SecondArgIter)->getType() &&
+                  "Input type mismatch");
+      }
+    }
+
+    // The input type should take up the same amount of space in
+    // memory as the output type.
+    bccAssert(DL.getTypeAllocSize(InTy) == DL.getTypeAllocSize(OutTy));
+
+    // Construct the actual function body.
+    llvm::IRBuilder<> Builder(ExpandedFunction->getEntryBlock().begin());
+
+    // Cast input and output buffers to appropriate types.
+    llvm::Value *InBuf = Builder.CreatePointerCast(Arg_inBuf, InPtrTy);
+    llvm::Value *OutBuf = Builder.CreatePointerCast(Arg_outBuf, OutPtrTy);
+
+    // Create a slot to pass temporary results back. This needs to be
+    // separate from the accumulator slot because the kernel may mark
+    // the return value slot as noalias.
+    llvm::Value *ReturnBuf = nullptr;
+    if (ReturnValuePointerStyle) {
+      ReturnBuf = Builder.CreateAlloca(OutTy, nullptr, "ret.tmp");
+    }
+
+    // Create a slot to hold the second input if the inputs are passed
+    // by pointer to the original kernel. We cannot directly pass a
+    // pointer to the input buffer, because the kernel may modify its
+    // inputs.
+    llvm::Value *SecondInputTempBuf = nullptr;
+    if (InputsPointerStyle) {
+      SecondInputTempBuf = Builder.CreateAlloca(InTy, nullptr, "in.tmp");
+    }
+
+    // Create a slot to accumulate temporary results, and fill it with
+    // the first value.
+    llvm::Value *AccumBuf = Builder.CreateAlloca(OutTy, nullptr, "accum");
+    // Load via OutPtrTy so the loaded value matches AccumBuf's type (OutTy).
+    llvm::LoadInst *FirstElementLoad = Builder.CreateLoad(
+      Builder.CreatePointerCast(InBuf, OutPtrTy));
+    if (gEnableRsTbaa) {
+      FirstElementLoad->setMetadata("tbaa", TBAAAllocation);
+    }
+    // Memory operations with AccumBuf shouldn't be marked with
+    // RenderScript TBAA, since this might conflict with TBAA metadata
+    // in the kernel function when AccumBuf is passed by pointer.
+    Builder.CreateStore(FirstElementLoad, AccumBuf);
+
+    // Loop body
+
+    // Create the loop structure. Note that the first input in the input buffer
+    // has already been accumulated, so that we start at index 1.
+    llvm::PHINode *IndVar;
+    llvm::Value *Start = llvm::ConstantInt::get(Arg_len->getType(), 1);
+    llvm::BasicBlock *Exit = createLoop(Builder, Start, Arg_len, &IndVar);
+
+    llvm::Value *InputPtr = Builder.CreateInBoundsGEP(InBuf, IndVar, "next_input.gep");
+
+    // Set up arguments and call the original (unexpanded) kernel.
+    //
+    // The original kernel can have at most 3 arguments, which is
+    // achieved when the signature looks like:
+    //
+    //    define void @func(foo* %ret, bar %a, bar %b)
+    //
+    // (bar can be one of foo/foo.coerce/foo*).
+    llvm::SmallVector<llvm::Value *, 3> KernelArgs;
+
+    if (ReturnValuePointerStyle) {
+      KernelArgs.push_back(ReturnBuf);
+    }
+
+    if (InputsPointerStyle) {
+      bccAssert(ReturnValuePointerStyle);
+      // Because the return buffer is copied back into the
+      // accumulator, it's okay if the accumulator is overwritten.
+      KernelArgs.push_back(AccumBuf);
+
+      llvm::LoadInst *InputLoad = Builder.CreateLoad(InputPtr);
+      if (gEnableRsTbaa) {
+        InputLoad->setMetadata("tbaa", TBAAAllocation);
+      }
+      Builder.CreateStore(InputLoad, SecondInputTempBuf);
+
+      KernelArgs.push_back(SecondInputTempBuf);
+    } else {
+      // InPtrTy may be different from OutPtrTy (the type of
+      // AccumBuf), so first cast the accumulator buffer to the
+      // pointer type corresponding to the input argument type.
+      KernelArgs.push_back(
+        Builder.CreateLoad(Builder.CreatePointerCast(AccumBuf, InPtrTy)));
+
+      llvm::LoadInst *LoadedArg = Builder.CreateLoad(InputPtr);
+      if (gEnableRsTbaa) {
+        LoadedArg->setMetadata("tbaa", TBAAAllocation);
+      }
+      KernelArgs.push_back(LoadedArg);
+    }
+
+    llvm::Value *RetVal = Builder.CreateCall(Function, KernelArgs);
+
+    const uint64_t ElementSize = DL.getTypeStoreSize(OutTy);
+    const uint64_t ElementAlign = DL.getABITypeAlignment(OutTy);
+
+    // Store the output in the accumulator.
+    if (ReturnValuePointerStyle) {
+      Builder.CreateMemCpy(AccumBuf, ReturnBuf, ElementSize, ElementAlign);
+    } else {
+      Builder.CreateStore(RetVal, AccumBuf);
+    }
+
+    // Loop exit: write the final accumulator value to the output buffer.
+    Builder.SetInsertPoint(Exit, Exit->begin());
+
+    llvm::LoadInst *OutputLoad = Builder.CreateLoad(AccumBuf);
+    llvm::StoreInst *OutputStore = Builder.CreateStore(OutputLoad, OutBuf);
+    if (gEnableRsTbaa) {
+      OutputStore->setMetadata("tbaa", TBAAAllocation);
+    }
+
+    return true;
+  }
+
+  /// @brief Checks if pointers to allocation internals are exposed
+  ///
+  /// Verifies whether, through kernel parameters or calls into the runtime
+  /// library, the script can obtain pointers to data inside a RenderScript
+  /// Allocation. Only if we control all loads from and stores to allocation
+  /// data, and the run-time's internal accesses are all annotated with
+  /// RenderScript TBAA metadata, can TBAA safely distinguish between generic
+  /// and from-allocation pointers.
+  ///
+  /// @return true if such pointers may be exposed (or a check cannot be made).
+  bool allocPointersExposed(llvm::Module &Module) {
+    // Old style kernel function can expose pointers to elements within
+    // allocations.
+    // TODO: Extend analysis to allow simple cases of old-style kernels.
+    for (size_t i = 0; i < mExportForEachCount; ++i) {
+      const char *Name = mExportForEachNameList[i];
+      uint32_t Signature = mExportForEachSignatureList[i];
+      if (Module.getFunction(Name) &&
+          !bcinfo::MetadataExtractor::hasForEachSignatureKernel(Signature)) {
+        return true;
+      }
+    }
+
+    // Check for library functions that expose a pointer to an Allocation or
+    // that are not yet annotated with RenderScript-specific tbaa information.
+    static const std::vector<const char *> Funcs{
+      // rsGetElementAt(...)
+      "_Z14rsGetElementAt13rs_allocationj",
+      "_Z14rsGetElementAt13rs_allocationjj",
+      "_Z14rsGetElementAt13rs_allocationjjj",
+
+      // rsSetElementAt()
+      "_Z14rsSetElementAt13rs_allocationPvj",
+      "_Z14rsSetElementAt13rs_allocationPvjj",
+      "_Z14rsSetElementAt13rs_allocationPvjjj",
+
+      // rsGetElementAtYuv_uchar_Y()
+      "_Z25rsGetElementAtYuv_uchar_Y13rs_allocationjj",
+
+      // rsGetElementAtYuv_uchar_U()
+      "_Z25rsGetElementAtYuv_uchar_U13rs_allocationjj",
+
+      // rsGetElementAtYuv_uchar_V()
+      "_Z25rsGetElementAtYuv_uchar_V13rs_allocationjj",
+    };
+
+    for (auto FI : Funcs) {
+      llvm::Function *Function = Module.getFunction(FI);
+      // Without the declaration we cannot check for uses; be conservative.
+      if (!Function) {
+        ALOGE("Missing run-time function '%s'", FI);
+        return true;
+      }
+      // Any use exposes a pointer; use_empty() avoids walking the use list.
+      if (!Function->use_empty()) {
+        return true;
+      }
+    }
+
+    return false;
+  }
+
+  /// @brief Connect RenderScript TBAA metadata to C/C++ metadata
+  ///
+  /// The TBAA metadata used to annotate loads/stores from RenderScript
+  /// Allocations is generated in a separate TBAA tree with its own distinct
+  /// root node. LLVM does assume may-alias for all nodes in unrelated alias
+  /// analysis trees. This function re-parents the RenderScript TBAA node
+  /// (originally a child of that distinct root) under the normal C/C++ TBAA
+  /// root, next to the normal C/C++ types. With the trees connected, TBAA can
+  /// distinguish Allocation accesses from normal C/C++ accesses. The nodes are
+  /// named by the same constants the Expand* functions use when annotating.
+  void connectRenderScriptTBAAMetadata(llvm::Module &Module) {
+    llvm::MDBuilder MDHelper(*Context);
+    llvm::MDNode *TBAARenderScriptDistinct =
+      MDHelper.createTBAARoot(kRenderScriptTBAARootName);
+    llvm::MDNode *TBAARenderScript = MDHelper.createTBAANode(
+        kRenderScriptTBAANodeName, TBAARenderScriptDistinct);
+    llvm::MDNode *TBAARoot     = MDHelper.createTBAARoot("Simple C/C++ TBAA");
+    TBAARenderScript->replaceOperandWith(1, TBAARoot);
+  }
+  /// Pass entry point: expands exported forEach_* and reduce_* kernels.
+  virtual bool runOnModule(llvm::Module &Module) {
+    bool Changed  = false;
+    this->Module  = &Module;
+    Context = &Module.getContext();
+
+    buildTypes();
+
+    bcinfo::MetadataExtractor me(&Module);
+    if (!me.extract()) {
+      ALOGE("Could not extract metadata from module!");
+      return false;
+    }
+
+    // Expand every exported forEach_* kernel that this module defines.
+    mExportForEachCount = me.getExportForEachSignatureCount();
+    mExportForEachNameList = me.getExportForEachNameList();
+    mExportForEachSignatureList = me.getExportForEachSignatureList();
+
+    for (size_t i = 0; i < mExportForEachCount; ++i) {
+      const char *name = mExportForEachNameList[i];
+      uint32_t signature = mExportForEachSignatureList[i];
+      llvm::Function *kernel = Module.getFunction(name);
+      if (kernel) {
+        if (bcinfo::MetadataExtractor::hasForEachSignatureKernel(signature)) {
+          Changed |= ExpandForEach(kernel, signature);
+          kernel->setLinkage(llvm::GlobalValue::InternalLinkage);
+        } else if (kernel->getReturnType()->isVoidTy()) {
+          Changed |= ExpandOldStyleForEach(kernel, signature);
+          kernel->setLinkage(llvm::GlobalValue::InternalLinkage);
+        } else {
+          // There are some graphics root functions that are not
+          // expanded, but that will be called directly. For those
+          // functions, we cannot set the linkage to internal.
+        }
+      }
+    }
+
+    // Expand every exported reduce_* kernel that this module defines.
+    mExportReduceCount = me.getExportReduceCount();
+    mExportReduceNameList = me.getExportReduceNameList();
+
+    for (size_t i = 0; i < mExportReduceCount; ++i) {
+      llvm::Function *kernel = Module.getFunction(mExportReduceNameList[i]);
+      if (kernel) {
+        Changed |= ExpandReduce(kernel);
+      }
+    }
+    // Connect the TBAA trees only if no allocation pointers are exposed.
+    if (gEnableRsTbaa && !allocPointersExposed(Module)) {
+      connectRenderScriptTBAAMetadata(Module);
+    }
+
+    return Changed;
+  }
+
+  virtual const char *getPassName() const {
+    return "forEach_* and reduce_* function expansion";
+  }
+
+}; // end RSKernelExpandPass
+
+} // end anonymous namespace
+
+char RSKernelExpandPass::ID = 0;
+static llvm::RegisterPass<RSKernelExpandPass> X("kernelexp", "Kernel Expand Pass");
+
+namespace bcc {
+
+llvm::ModulePass *
+createRSKernelExpandPass(bool pEnableStepOpt) {
+  return new RSKernelExpandPass(pEnableStepOpt);
+}
+
+} // end namespace bcc
diff --git a/lib/Renderscript/RSStubsWhiteList.cpp b/lib/Renderscript/RSStubsWhiteList.cpp
index b69681d..426fb43 100644
--- a/lib/Renderscript/RSStubsWhiteList.cpp
+++ b/lib/Renderscript/RSStubsWhiteList.cpp
@@ -1235,6 +1235,7 @@
 "_Z3madfff",
 "_Z3maxDv2_cS_",
 "_Z3maxDv2_fS_",
+"_Z3maxDv2_ff",
 "_Z3maxDv2_hS_",
 "_Z3maxDv2_iS_",
 "_Z3maxDv2_jS_",
@@ -1244,6 +1245,7 @@
 "_Z3maxDv2_tS_",
 "_Z3maxDv3_cS_",
 "_Z3maxDv3_fS_",
+"_Z3maxDv3_ff",
 "_Z3maxDv3_hS_",
 "_Z3maxDv3_iS_",
 "_Z3maxDv3_jS_",
@@ -1253,6 +1255,7 @@
 "_Z3maxDv3_tS_",
 "_Z3maxDv4_cS_",
 "_Z3maxDv4_fS_",
+"_Z3maxDv4_ff",
 "_Z3maxDv4_hS_",
 "_Z3maxDv4_iS_",
 "_Z3maxDv4_jS_",
@@ -1271,6 +1274,7 @@
 "_Z3maxtt",
 "_Z3minDv2_cS_",
 "_Z3minDv2_fS_",
+"_Z3minDv2_ff",
 "_Z3minDv2_hS_",
 "_Z3minDv2_iS_",
 "_Z3minDv2_jS_",
@@ -1280,6 +1284,7 @@
 "_Z3minDv2_tS_",
 "_Z3minDv3_cS_",
 "_Z3minDv3_fS_",
+"_Z3minDv3_ff",
 "_Z3minDv3_hS_",
 "_Z3minDv3_iS_",
 "_Z3minDv3_jS_",
@@ -1289,6 +1294,7 @@
 "_Z3minDv3_tS_",
 "_Z3minDv4_cS_",
 "_Z3minDv4_fS_",
+"_Z3minDv4_ff",
 "_Z3minDv4_hS_",
 "_Z3minDv4_iS_",
 "_Z3minDv4_jS_",
diff --git a/lib/Support/CompilerConfig.cpp b/lib/Support/CompilerConfig.cpp
index eac26aa..71cd7cc 100644
--- a/lib/Support/CompilerConfig.cpp
+++ b/lib/Support/CompilerConfig.cpp
@@ -155,7 +155,9 @@
 #if defined(TARGET_BUILD)
     if (!getProperty("debug.rs.arm-no-tune-for-cpu")) {
 #ifndef FORCE_CPU_VARIANT_32
+#ifdef DEFAULT_ARM_CODEGEN
       setCPU(llvm::sys::getHostCPUName());
+#endif
 #else
 #define XSTR(S) #S
 #define STR(S) XSTR(S)
@@ -175,7 +177,9 @@
 #if defined(TARGET_BUILD)
     if (!getProperty("debug.rs.arm-no-tune-for-cpu")) {
 #ifndef FORCE_CPU_VARIANT_64
+#ifdef DEFAULT_ARM64_CODEGEN
       setCPU(llvm::sys::getHostCPUName());
+#endif
 #else
 #define XSTR(S) #S
 #define STR(S) XSTR(S)
diff --git a/libbcc-host-build.mk b/libbcc-host-build.mk
index 3a8839f..22f0f72 100644
--- a/libbcc-host-build.mk
+++ b/libbcc-host-build.mk
@@ -26,14 +26,6 @@
   $(RS_VERSION_DEFINE) \
   $(LOCAL_CFLAGS)
 
-ifneq ($(BOARD_OVERRIDE_RS_CPU_VARIANT_32),)
-LOCAL_CFLAGS += -DFORCE_CPU_VARIANT_32=$(BOARD_OVERRIDE_RS_CPU_VARIANT_32)
-endif
-
-ifneq ($(BOARD_OVERRIDE_RS_CPU_VARIANT_64),)
-LOCAL_CFLAGS += -DFORCE_CPU_VARIANT_64=$(BOARD_OVERRIDE_RS_CPU_VARIANT_64)
-endif
-
 ifeq ($(TARGET_BUILD_VARIANT),eng)
 libbcc_CFLAGS += -DANDROID_ENGINEERING_BUILD
 else
diff --git a/llvm-loadable-libbcc.mk b/llvm-loadable-libbcc.mk
new file mode 100644
index 0000000..7675167
--- /dev/null
+++ b/llvm-loadable-libbcc.mk
@@ -0,0 +1,31 @@
+#
+# Copyright (C) 2015 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Sets CAN_BUILD_HOST_LLVM_LOADABLE_MODULE to true only when libbcc can be
+# built as an LLVM loadable module on the host, and to false otherwise.
+CAN_BUILD_HOST_LLVM_LOADABLE_MODULE := true
+
+ifdef USE_MINGW
+CAN_BUILD_HOST_LLVM_LOADABLE_MODULE := false
+endif
+
+ifeq ($(HOST_OS),darwin)
+CAN_BUILD_HOST_LLVM_LOADABLE_MODULE := false
+endif
+
+ifneq ($(FORCE_BUILD_LLVM_COMPONENTS),true)
+CAN_BUILD_HOST_LLVM_LOADABLE_MODULE := false
+endif
diff --git a/tests/README.lit b/tests/README.lit
new file mode 100644
index 0000000..16fa305
--- /dev/null
+++ b/tests/README.lit
@@ -0,0 +1,7 @@
+To run the libbcc lit tests:
+ * Ensure `llvm-rs-as` is built, either by doing a top-level `make
+   checkbuild` or by doing `mm` from frameworks/compile/slang.
+ * Ensure that LLVM and libbcc are built with
+   `FORCE_BUILD_LLVM_COMPONENTS=true`.
+ * Ensure `opt` is built from external/llvm, either by top-level `make
+   checkbuild` or by doing `mm` from external/llvm.
diff --git a/tests/libbcc/getelementptr.ll b/tests/libbcc/getelementptr.ll
new file mode 100644
index 0000000..1cf201a
--- /dev/null
+++ b/tests/libbcc/getelementptr.ll
@@ -0,0 +1,70 @@
+; This checks that RSKernelExpandPass generates getelementptr
+; instructions into the driver info structure as expected - namely,
+; that they index into the right positions of the structure and that
+; the instructions that are generated are in the loop header.
+
+; RUN: opt -load libbcc.so -kernelexp -S < %s | FileCheck %s
+
+; ModuleID = 'test_getelementptr.bc'
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-linux-gnueabi"
+
+; Declarations expected by the expansion pass.
+declare void @_Z14rsGetElementAt13rs_allocationj()
+declare void @_Z14rsGetElementAt13rs_allocationjj()
+declare void @_Z14rsGetElementAt13rs_allocationjjj()
+declare void @_Z14rsSetElementAt13rs_allocationPvj()
+declare void @_Z14rsSetElementAt13rs_allocationPvjj()
+declare void @_Z14rsSetElementAt13rs_allocationPvjjj()
+declare void @_Z25rsGetElementAtYuv_uchar_Y13rs_allocationjj()
+declare void @_Z25rsGetElementAtYuv_uchar_U13rs_allocationjj()
+declare void @_Z25rsGetElementAtYuv_uchar_V13rs_allocationjj()
+
+; Old-style kernel
+define void @root(i32* nocapture %ain, i32* nocapture %out, i32 %x, i32 %y, i32 %z) {
+  ret void
+; CHECK: define void @root.expand(%RsExpandKernelDriverInfoPfx* %p, i32 %x1, i32 %x2, i32 %outstep)
+; CHECK: Begin:
+; CHECK: %instep_addr.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 1, i32 0
+; CHECK: load i32, i32* %instep_addr.gep
+; CHECK: %input_buf.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 0, i32 0
+; CHECK: load i8*, i8** %input_buf.gep
+; CHECK: %out_buf.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 3, i32 0
+; CHECK: load i8*, i8** %out_buf.gep
+; CHECK: %Y.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 7, i32 1
+; CHECK: load i32, i32* %Y.gep
+; CHECK: %Z.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 7, i32 2
+; CHECK: load i32, i32* %Z.gep
+; CHECK: Loop:
+}
+
+; New style kernel with multiple inputs
+define i32 @foo(i32 %in0, i32 %in1, i32 %x, i32 %y, i32 %z) {
+  ret i32 0
+; CHECK: define void @foo.expand(%RsExpandKernelDriverInfoPfx* %p, i32 %x1, i32 %x2, i32 %arg_outstep)
+; CHECK: Begin:
+; CHECK: %out_buf.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 3, i32 0
+; CHECK: load i8*, i8** %out_buf.gep
+; CHECK: %Y.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 7, i32 1
+; CHECK: load i32, i32* %Y.gep
+; CHECK: %Z.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 7, i32 2
+; CHECK: load i32, i32* %Z.gep
+; CHECK: %input_buf.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 0, i32 0
+; CHECK: load i8*, i8** %input_buf.gep
+; CHECK: %input_buf.gep1 = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 0, i32 1
+; CHECK: load i8*, i8** %input_buf.gep1
+; CHECK: Loop:
+}
+
+!llvm.ident = !{!0}
+!\23pragma = !{!1, !2}
+!\23rs_export_foreach_name = !{!3, !4}
+!\23rs_export_foreach = !{!5, !6}
+
+!0 = !{!"clang version 3.6 "}
+!1 = !{!"version", !"1"}
+!2 = !{!"java_package_name", !"foo"}
+!3 = !{!"root"}
+!4 = !{!"foo"}
+!5 = !{!"91"}
+!6 = !{!"123"}
diff --git a/tests/libbcc/lit.cfg b/tests/libbcc/lit.cfg
index 5b3c749..109a9d7 100644
--- a/tests/libbcc/lit.cfg
+++ b/tests/libbcc/lit.cfg
@@ -24,17 +24,20 @@
 # test_source_root: The path where tests are located (default is the test suite
 # root).
 config.test_source_root = None
-config.test_exec_root = os.path.join(ANDROID_HOST_OUT, 'tests', 'bcinfo')
+config.test_exec_root = os.path.join(ANDROID_HOST_OUT, 'tests', 'libbcc')
 
-tools_dir = os.path.join(ANDROID_HOST_OUT, 'bin')
+tools_dir = os.pathsep.join([os.path.join(ANDROID_HOST_OUT, 'bin'),
+                             os.path.join(ANDROID_HOST_OUT, 'lib64')])
 
 # Based on LLVM's lit.cfg: "For each occurrence of an llvm tool name
 # as its own word, replace it with the full path to the build directory
 # holding that tool."
 for pattern in [r"\bFileCheck\b",
                 r"\bllvm-rs-as\b",
-                r"\bbcinfo\b"]:
-    tool_match = re.match(r"^(\\)?((\| )?)\W+b([0-9A-Za-z-_]+)\\b\W*$",
+                r"\bbcinfo\b",
+                r"\bopt\b",
+                r"\blibbcc.so\b"]:
+    tool_match = re.match(r"^(\\)?((\| )?)\W+b([\.0-9A-Za-z-_]+)\\b\W*$",
                           pattern)
     tool_pipe = tool_match.group(2)
     tool_name = tool_match.group(4)
diff --git a/tests/libbcc/tbaa-through-alloca.ll b/tests/libbcc/tbaa-through-alloca.ll
new file mode 100644
index 0000000..5b0a270
--- /dev/null
+++ b/tests/libbcc/tbaa-through-alloca.ll
@@ -0,0 +1,71 @@
+; This test checks that the kernel expansion pass (-kernelexp) doesn't aggressively
+; apply TBAA metadata to temporaries that are passed by pointer to kernels.
+
+; RUN: opt -load libbcc.so -kernelexp -inline -tbaa -aa-eval -print-may-aliases -evaluate-aa-metadata < %s -S -o - 2>&1 | FileCheck %s
+
+; ModuleID = 'kernel.bc'
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-linux-gnueabi"
+
+; Declarations expected by the expansion pass (-kernelexp); declared as void() irrespective of their mangled parameter lists.
+declare void @_Z14rsGetElementAt13rs_allocationj()
+declare void @_Z14rsGetElementAt13rs_allocationjj()
+declare void @_Z14rsGetElementAt13rs_allocationjjj()
+declare void @_Z14rsSetElementAt13rs_allocationPvj()
+declare void @_Z14rsSetElementAt13rs_allocationPvjj()
+declare void @_Z14rsSetElementAt13rs_allocationPvjjj()
+declare void @_Z25rsGetElementAtYuv_uchar_Y13rs_allocationjj()
+declare void @_Z25rsGetElementAtYuv_uchar_U13rs_allocationjj()
+declare void @_Z25rsGetElementAtYuv_uchar_V13rs_allocationjj()
+
+%struct.int5.0 = type { [5 x i32] }
+
+; Function Attrs: nounwind. Adds 1 in place to each of the 5 i32 elements of %in, then copies the 20-byte struct into %agg.result.
+define void @add1_int5(%struct.int5.0* noalias nocapture sret %agg.result, %struct.int5.0* nocapture %in) #0 {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds %struct.int5.0, %struct.int5.0* %in, i64 0, i32 0, i64 %indvars.iv
+; CHECK: MayAlias: %load_from_input{{.*}} <-> store %struct.int5.0 %input, %struct.int5.0* %input_struct_slot
+  %load_from_input = load i32, i32* %2, align 4, !tbaa !9
+  %3 = add nsw i32 %load_from_input, 1
+  store i32 %3, i32* %2, align 4, !tbaa !9
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 5
+  br i1 %exitcond, label %4, label %1
+
+; <label>:4                                       ; preds = %1
+  %5 = bitcast %struct.int5.0* %agg.result to i8*
+  %6 = bitcast %struct.int5.0* %in to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %5, i8* %6, i64 20, i32 4, i1 false), !tbaa.struct !13
+  ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #0
+
+attributes #0 = { nounwind }
+
+!llvm.ident = !{!0}
+!\23pragma = !{!1, !2}
+!\23rs_export_foreach_name = !{!3, !4}
+!\23rs_export_foreach = !{!5, !6}
+!\23rs_export_type = !{!7}
+!\25int5 = !{!8}
+
+!0 = !{!"clang version 3.6 "}
+!1 = !{!"version", !"1"}
+!2 = !{!"java_package_name", !"foo"}
+!3 = !{!"root"}
+!4 = !{!"add1_int5"}
+!5 = !{!"0"}
+!6 = !{!"35"}
+!7 = !{!"int5"}
+!8 = !{!"data", !"<ConstantArray>"}
+!9 = !{!10, !10, i64 0}
+!10 = !{!"int", !11, i64 0}
+!11 = !{!"omnipotent char", !12, i64 0}
+!12 = !{!"Simple C/C++ TBAA"}
+!13 = !{i64 0, i64 20, !14}
+!14 = !{!11, !11, i64 0}
diff --git a/tests/libbcc/tbaa.ll b/tests/libbcc/tbaa.ll
new file mode 100644
index 0000000..6d8cb48
--- /dev/null
+++ b/tests/libbcc/tbaa.ll
@@ -0,0 +1,43 @@
+; Basic test of TBAA that should report that pointer loads do not
+; alias with stores to allocations.
+
+; RUN: opt -load libbcc.so -kernelexp -tbaa -aa-eval -print-no-aliases -evaluate-aa-metadata < %s -S -o - 2>&1 | FileCheck %s
+
+; ModuleID = 'kernel.bc'
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-linux-gnueabi"
+
+; Declarations expected by the expansion pass (-kernelexp); declared as void() irrespective of their mangled parameter lists.
+declare void @_Z14rsGetElementAt13rs_allocationj()
+declare void @_Z14rsGetElementAt13rs_allocationjj()
+declare void @_Z14rsGetElementAt13rs_allocationjjj()
+declare void @_Z14rsSetElementAt13rs_allocationPvj()
+declare void @_Z14rsSetElementAt13rs_allocationPvjj()
+declare void @_Z14rsSetElementAt13rs_allocationPvjjj()
+declare void @_Z25rsGetElementAtYuv_uchar_Y13rs_allocationjj()
+declare void @_Z25rsGetElementAtYuv_uchar_U13rs_allocationjj()
+declare void @_Z25rsGetElementAtYuv_uchar_V13rs_allocationjj()
+
+; CHECK:   NoAlias:   %0 = load {{.*}}, i8** %out_buf.gep, !tbaa {{.*}} <->   store i32 %call.result, i32* {{.*}}, !tbaa {{.*}}
+; CHECK:   NoAlias:   %input_buf = load i8*, i8** %input_buf.gep, !tbaa {{.*}} <->   store i32 %call.result, i32* {{.*}}, !tbaa {{.*}}
+
+; Function Attrs: nounwind readnone. Kernel under test: returns %in + 1 (nsw add).
+define i32 @add1(i32 %in) #0 {
+  %1 = add nsw i32 %in, 1
+  ret i32 %1
+}
+
+attributes #0 = { nounwind readnone }
+
+!llvm.ident = !{!0}
+!\23pragma = !{!1, !2}
+!\23rs_export_foreach_name = !{!3, !4}
+!\23rs_export_foreach = !{!5, !6}
+
+!0 = !{!"clang version 3.6 "}
+!1 = !{!"version", !"1"}
+!2 = !{!"java_package_name", !"foo"}
+!3 = !{!"root"}
+!4 = !{!"add1"}
+!5 = !{!"0"}
+!6 = !{!"35"}