Merge "If a general reduction kernel lacks a combiner function, synthesize one."
am: 1ca1ee8

* commit '1ca1ee856d334c7e3bdd7dad6184d10384790512':
  If a general reduction kernel lacks a combiner function, synthesize one.

Change-Id: I579c085edd93169f9cb9db8f9554d96d060dfb14
diff --git a/include/bcc/Renderscript/RSUtils.h b/include/bcc/Renderscript/RSUtils.h
index 19b45f6..fbd5ed0 100644
--- a/include/bcc/Renderscript/RSUtils.h
+++ b/include/bcc/Renderscript/RSUtils.h
@@ -21,6 +21,9 @@
 
 #include <llvm/IR/Type.h>
 #include <llvm/IR/DerivedTypes.h>
+#include <llvm/ADT/StringRef.h>
+
+#include <string>
 
 namespace {
 
@@ -102,4 +105,12 @@
 
 }  // end namespace
 
+// Returns the name given to the combiner function that is synthesized
+// for a general reduction kernel lacking one: the accumulator function
+// name |accumName| with ".combiner" appended.  Shared so that creation
+// of the combiner and later by-name references to it agree.
+static inline std::string nameReduceNewCombinerFromAccumulator(llvm::StringRef accumName) {
+  return std::string(accumName) + ".combiner";
+}
+
 #endif // BCC_RS_UTILS_H
diff --git a/lib/Core/Compiler.cpp b/lib/Core/Compiler.cpp
index 7b291f0..1988da3 100644
--- a/lib/Core/Compiler.cpp
+++ b/lib/Core/Compiler.cpp
@@ -35,6 +35,7 @@
 #include "bcc/Config/Config.h"
 #include "bcc/Renderscript/RSScript.h"
 #include "bcc/Renderscript/RSTransforms.h"
+#include "bcc/Renderscript/RSUtils.h"
 #include "bcc/Script.h"
 #include "bcc/Source.h"
 #include "bcc/Support/CompilerConfig.h"
@@ -389,7 +390,11 @@
   for (i = 0; i < exportReduceNewCount; ++i) {
     keep_funcs.push_back(std::string(exportReduceNewList[i].mAccumulatorName) + ".expand");
     keepFuncsPushBackIfPresent(exportReduceNewList[i].mInitializerName);
-    keepFuncsPushBackIfPresent(exportReduceNewList[i].mCombinerName);
+    if (exportReduceNewList[i].mCombinerName != nullptr) {
+      keep_funcs.push_back(exportReduceNewList[i].mCombinerName);
+    } else {
+      keep_funcs.push_back(nameReduceNewCombinerFromAccumulator(exportReduceNewList[i].mAccumulatorName));
+    }
     keepFuncsPushBackIfPresent(exportReduceNewList[i].mOutConverterName);
   }
 
diff --git a/lib/Renderscript/RSEmbedInfo.cpp b/lib/Renderscript/RSEmbedInfo.cpp
index 54e0acb..2d2e69f 100644
--- a/lib/Renderscript/RSEmbedInfo.cpp
+++ b/lib/Renderscript/RSEmbedInfo.cpp
@@ -17,6 +17,7 @@
 #include "bcc/Assert.h"
 #include "bcc/Config/Config.h"
 #include "bcc/Renderscript/RSTransforms.h"
+#include "bcc/Renderscript/RSUtils.h"
 #include "bcc/Support/Log.h"
 #include "bcinfo/MetadataExtractor.h"
 #include "rsDefines.h"
@@ -139,7 +140,9 @@
         << reduceNew.mReduceName << " - "
         << reduceNewFnName(reduceNew.mInitializerName) << " - "
         << reduceNewFnName(reduceNew.mAccumulatorName) << " - "
-        << reduceNewFnName(reduceNew.mCombinerName) << " - "
+        << ((reduceNew.mCombinerName != nullptr)
+            ? reduceNew.mCombinerName
+            : nameReduceNewCombinerFromAccumulator(reduceNew.mAccumulatorName)) << " - "
         << reduceNewFnName(reduceNew.mOutConverterName) << " - "
         << reduceNewFnName(reduceNew.mHalterName)
         << "\n";
diff --git a/lib/Renderscript/RSKernelExpand.cpp b/lib/Renderscript/RSKernelExpand.cpp
index 674e51f..893b186 100644
--- a/lib/Renderscript/RSKernelExpand.cpp
+++ b/lib/Renderscript/RSKernelExpand.cpp
@@ -16,6 +16,7 @@
 
 #include "bcc/Assert.h"
 #include "bcc/Renderscript/RSTransforms.h"
+#include "bcc/Renderscript/RSUtils.h"
 
 #include <cstdlib>
 #include <functional>
@@ -1513,6 +1514,80 @@
     return true;
   }
 
+  // Synthesizes a combiner for a general reduce-style kernel that lacks one, by
+  // forwarding to the accumulator function.  Always returns true.
+  //
+  // The accumulator must take exactly two arguments, the first a pointer, e.g.
+  //   define void @accumFn(accumType* %accum, accumType %in)
+  // (%in may instead be a pointer-to-copy or a frontend-coerced type; see below.)
+  //
+  // In the plain by-value case the generated combiner has the form
+  //   define void @accumFn.combiner(accumType* %accum, accumType* %other) {
+  //     %1 = load accumType, accumType* %other
+  //     call void @accumFn(accumType* %accum, accumType %1)
+  //     ret void
+  //   }
+  bool CreateReduceNewCombinerFromAccumulator(llvm::Function *FnAccumulator) {
+    ALOGV("Creating combiner from accumulator %s for general reduce kernel",
+          FnAccumulator->getName().str().c_str());
+
+    using llvm::Attribute;
+
+    bccAssert(FnAccumulator->arg_size() == 2);
+    auto AccumulatorArgIter = FnAccumulator->arg_begin();
+    llvm::Value *AccumulatorArg_accum = &*(AccumulatorArgIter++);
+    llvm::Value *AccumulatorArg_in    = &*(AccumulatorArgIter++);
+    llvm::Type *AccumulatorArgType = AccumulatorArg_accum->getType();
+    bccAssert(AccumulatorArgType->isPointerTy());
+
+    llvm::Type *VoidTy = llvm::Type::getVoidTy(*Context);
+    llvm::FunctionType *CombinerType =
+        llvm::FunctionType::get(VoidTy, { AccumulatorArgType, AccumulatorArgType }, false);
+    llvm::Function *FnCombiner =
+        llvm::Function::Create(CombinerType, llvm::GlobalValue::ExternalLinkage,
+                               nameReduceNewCombinerFromAccumulator(FnAccumulator->getName()),
+                               Module);
+
+    auto CombinerArgIter = FnCombiner->arg_begin();
+
+    llvm::Argument *CombinerArg_accum = &(*CombinerArgIter++);
+    CombinerArg_accum->setName("accum");
+    CombinerArg_accum->addAttr(llvm::AttributeSet::get(*Context, CombinerArg_accum->getArgNo() + 1,
+                                                       llvm::makeArrayRef(Attribute::NoCapture)));
+
+    llvm::Argument *CombinerArg_other = &(*CombinerArgIter++);
+    CombinerArg_other->setName("other");
+    CombinerArg_other->addAttr(llvm::AttributeSet::get(*Context, CombinerArg_other->getArgNo() + 1,
+                                                       llvm::makeArrayRef(Attribute::NoCapture)));
+
+    llvm::BasicBlock *BB = llvm::BasicBlock::Create(*Context, "BB", FnCombiner);
+    llvm::IRBuilder<> Builder(BB);
+
+    if (AccumulatorArg_in->getType()->isPointerTy()) {
+      // Types of sufficient size get passed by pointer-to-copy rather
+      // than passed by value.  An accumulator cannot take a pointer
+      // at the user level; so if we see a pointer here, we know that
+      // we have a pass-by-pointer-to-copy case.
+      llvm::Type *ElementType = AccumulatorArg_in->getType()->getPointerElementType();
+      llvm::Value *TempMem = Builder.CreateAlloca(ElementType, nullptr, "caller_copy");
+      Builder.CreateStore(Builder.CreateLoad(CombinerArg_other), TempMem);
+      Builder.CreateCall(FnAccumulator, { CombinerArg_accum, TempMem });
+    } else {
+      llvm::Value *TypeAdjustedOther = CombinerArg_other;
+      if (AccumulatorArgType->getPointerElementType() != AccumulatorArg_in->getType()) {
+        // Frontend call lowering coerced %in's type; load through a recast pointer
+        TypeAdjustedOther = Builder.CreatePointerCast(CombinerArg_other,
+                                                      AccumulatorArg_in->getType()->getPointerTo(),
+                                                      "cast");
+      }
+      llvm::Value *DerefOther = Builder.CreateLoad(TypeAdjustedOther);
+      Builder.CreateCall(FnAccumulator, { CombinerArg_accum, DerefOther });
+    }
+    Builder.CreateRetVoid();
+
+    return true;
+  }
+
   /// @brief Checks if pointers to allocation internals are exposed
   ///
   /// This function verifies if through the parameters passed to the kernel
@@ -1647,7 +1722,7 @@
     const size_t ExportReduceNewCount = me.getExportReduceNewCount();
     const bcinfo::MetadataExtractor::ReduceNew *ExportReduceNewList = me.getExportReduceNewList();
     //   Note that functions can be shared between kernels
-    FunctionSet PromotedFunctions, ExpandedAccumulators;
+    FunctionSet PromotedFunctions, ExpandedAccumulators, AccumulatorsForCombiners;
 
     for (size_t i = 0; i < ExportReduceNewCount; ++i) {
       Changed |= PromoteReduceNewFunction(ExportReduceNewList[i].mInitializerName, PromotedFunctions);
@@ -1661,6 +1736,10 @@
         Changed |= ExpandReduceNewAccumulator(accumulator,
                                               ExportReduceNewList[i].mSignature,
                                               ExportReduceNewList[i].mInputCount);
+      if (!ExportReduceNewList[i].mCombinerName) {
+        if (AccumulatorsForCombiners.insert(accumulator).second)
+          Changed |= CreateReduceNewCombinerFromAccumulator(accumulator);
+      }
     }
 
     if (gEnableRsTbaa && !allocPointersExposed(Module)) {