am 03136c21: (-s ours) am 3de81cfc: am 7648c386: Merge "Update libbcc for LLVM rebase to r239765"
* commit '03136c210173e95c42b6b4af7c1bc5b82e1cbae6':
diff --git a/include/bcc/Compiler.h b/include/bcc/Compiler.h
index 47c23c7..8a30c38 100644
--- a/include/bcc/Compiler.h
+++ b/include/bcc/Compiler.h
@@ -81,7 +81,7 @@
enum ErrorCode runPasses(Script &pScript, llvm::raw_pwrite_stream &pResult);
bool addInternalizeSymbolsPass(Script &pScript, llvm::legacy::PassManager &pPM);
- void addExpandForEachPass(llvm::legacy::PassManager &pPM);
+ void addExpandKernelPass(llvm::legacy::PassManager &pPM);
void addGlobalInfoPass(Script &pScript, llvm::legacy::PassManager &pPM);
void addInvariantPass(llvm::legacy::PassManager &pPM);
void addInvokeHelperPass(llvm::legacy::PassManager &pPM);
diff --git a/include/bcc/Renderscript/RSTransforms.h b/include/bcc/Renderscript/RSTransforms.h
index d5830ca..6dcfedd 100644
--- a/include/bcc/Renderscript/RSTransforms.h
+++ b/include/bcc/Renderscript/RSTransforms.h
@@ -25,7 +25,7 @@
namespace bcc {
llvm::ModulePass *
-createRSForEachExpandPass(bool pEnableStepOpt);
+createRSKernelExpandPass(bool pEnableStepOpt);
llvm::FunctionPass *
createRSInvariantPass();
diff --git a/lib/Core/Compiler.cpp b/lib/Core/Compiler.cpp
index 2da44b1..5c769b4 100644
--- a/lib/Core/Compiler.cpp
+++ b/lib/Core/Compiler.cpp
@@ -157,7 +157,7 @@
// Add some initial custom passes.
addInvokeHelperPass(passes);
- addExpandForEachPass(passes);
+ addExpandKernelPass(passes);
addInvariantPass(passes);
if (!addInternalizeSymbolsPass(pScript, passes))
return kErrCustomPasses;
@@ -329,9 +329,11 @@
size_t exportVarCount = me.getExportVarCount();
size_t exportFuncCount = me.getExportFuncCount();
size_t exportForEachCount = me.getExportForEachSignatureCount();
+ size_t exportReduceCount = me.getExportReduceCount();
const char **exportVarNameList = me.getExportVarNameList();
const char **exportFuncNameList = me.getExportFuncNameList();
const char **exportForEachNameList = me.getExportForEachNameList();
+ const char **exportReduceNameList = me.getExportReduceNameList();
size_t i;
for (i = 0; i < exportVarCount; ++i) {
@@ -342,18 +344,22 @@
export_symbols.push_back(exportFuncNameList[i]);
}
- // Expanded foreach functions should not be internalized, too.
- // expanded_foreach_funcs keeps the .expand version of the kernel names
- // around until createInternalizePass() is finished making its own
- // copy of the visible symbols.
- std::vector<std::string> expanded_foreach_funcs;
+ // Expanded foreach and reduce functions should not be
+ // internalized. expanded_funcs keeps the names of the expanded
+ // functions around until createInternalizePass() is finished making
+ // its own copy of the visible symbols.
+ std::vector<std::string> expanded_funcs;
+ expanded_funcs.reserve(exportForEachCount + exportReduceCount);
+
for (i = 0; i < exportForEachCount; ++i) {
- expanded_foreach_funcs.push_back(
- std::string(exportForEachNameList[i]) + ".expand");
+ expanded_funcs.push_back(std::string(exportForEachNameList[i]) + ".expand");
+ }
+ for (i = 0; i < exportReduceCount; ++i) {
+ expanded_funcs.push_back(std::string(exportReduceNameList[i]) + ".expand");
}
- for (i = 0; i < exportForEachCount; i++) {
- export_symbols.push_back(expanded_foreach_funcs[i].c_str());
+ for (auto &symbol_name : expanded_funcs) {
+ export_symbols.push_back(symbol_name.c_str());
}
pPM.add(llvm::createInternalizePass(export_symbols));
@@ -368,10 +374,10 @@
}
}
-void Compiler::addExpandForEachPass(llvm::legacy::PassManager &pPM) {
- // Expand ForEach on CPU path to reduce launch overhead.
+void Compiler::addExpandKernelPass(llvm::legacy::PassManager &pPM) {
+ // Expand ForEach and reduce on CPU path to reduce launch overhead.
bool pEnableStepOpt = true;
- pPM.add(createRSForEachExpandPass(pEnableStepOpt));
+ pPM.add(createRSKernelExpandPass(pEnableStepOpt));
}
void Compiler::addGlobalInfoPass(Script &pScript, llvm::legacy::PassManager &pPM) {
diff --git a/lib/Renderscript/Android.mk b/lib/Renderscript/Android.mk
index 56cae16..4b18eda 100644
--- a/lib/Renderscript/Android.mk
+++ b/lib/Renderscript/Android.mk
@@ -24,7 +24,7 @@
libbcc_renderscript_SRC_FILES := \
RSCompilerDriver.cpp \
RSEmbedInfo.cpp \
- RSForEachExpand.cpp \
+ RSKernelExpand.cpp \
RSGlobalInfoPass.cpp \
RSInvariant.cpp \
RSScript.cpp \
diff --git a/lib/Renderscript/RSEmbedInfo.cpp b/lib/Renderscript/RSEmbedInfo.cpp
index dc1033c..b0c2767 100644
--- a/lib/Renderscript/RSEmbedInfo.cpp
+++ b/lib/Renderscript/RSEmbedInfo.cpp
@@ -73,11 +73,13 @@
size_t exportVarCount = me.getExportVarCount();
size_t exportFuncCount = me.getExportFuncCount();
size_t exportForEachCount = me.getExportForEachSignatureCount();
+ size_t exportReduceCount = me.getExportReduceCount();
size_t objectSlotCount = me.getObjectSlotCount();
size_t pragmaCount = me.getPragmaCount();
const char **exportVarNameList = me.getExportVarNameList();
const char **exportFuncNameList = me.getExportFuncNameList();
const char **exportForEachNameList = me.getExportForEachNameList();
+ const char **exportReduceNameList = me.getExportReduceNameList();
const uint32_t *exportForEachSignatureList =
me.getExportForEachSignatureList();
const uint32_t *objectSlotList = me.getObjectSlotList();
@@ -111,6 +113,11 @@
<< exportForEachNameList[i] << "\n";
}
+ s << "exportReduceCount: " << exportReduceCount << "\n";
+ for (i = 0; i < exportReduceCount; ++i) {
+ s << exportReduceNameList[i] << "\n";
+ }
+
s << "objectSlotCount: " << objectSlotCount << "\n";
for (i = 0; i < objectSlotCount; ++i) {
s << objectSlotList[i] << "\n";
diff --git a/lib/Renderscript/RSForEachExpand.cpp b/lib/Renderscript/RSKernelExpand.cpp
similarity index 70%
rename from lib/Renderscript/RSForEachExpand.cpp
rename to lib/Renderscript/RSKernelExpand.cpp
index e91e4ed..34611d7 100644
--- a/lib/Renderscript/RSForEachExpand.cpp
+++ b/lib/Renderscript/RSKernelExpand.cpp
@@ -38,7 +38,14 @@
#include "bcinfo/MetadataExtractor.h"
-#define NUM_EXPANDED_FUNCTION_PARAMS 4
+#ifndef __DISABLE_ASSERTS
+// Only used in bccAssert()
+const int kNumExpandedForeachParams = 4;
+const int kNumExpandedReduceParams = 3;
+#endif
+
+const char kRenderScriptTBAARootName[] = "RenderScript Distinct TBAA";
+const char kRenderScriptTBAANodeName[] = "RenderScript TBAA";
using namespace bcc;
@@ -46,15 +53,17 @@
static const bool gEnableRsTbaa = true;
-/* RSForEachExpandPass - This pass operates on functions that are able to be
- * called via rsForEach() or "foreach_<NAME>". We create an inner loop for the
- * ForEach-able function to be invoked over the appropriate data cells of the
- * input/output allocations (adjusting other relevant parameters as we go). We
- * support doing this for any ForEach-able compute kernels. The new function
- * name is the original function name followed by ".expand". Note that we
- * still generate code for the original function.
+/* RSKernelExpandPass - This pass operates on functions that are able
+ * to be called via rsForEach(), "foreach_<NAME>", or
+ * "reduce_<NAME>". We create an inner loop for the function to be
+ * invoked over the appropriate data cells of the input/output
+ * allocations (adjusting other relevant parameters as we go). We
+ * support doing this for any forEach or reduce style compute
+ * kernels. The new function name is the original function name
+ * followed by ".expand". Note that we still generate code for the
+ * original function.
*/
-class RSForEachExpandPass : public llvm::ModulePass {
+class RSKernelExpandPass : public llvm::ModulePass {
public:
static char ID;
@@ -91,16 +100,19 @@
llvm::LLVMContext *Context;
/*
- * Pointer to LLVM type information for the the function signature
- * for expanded kernels. This must be re-calculated for each
- * module the pass is run on.
+ * Pointers to LLVM type information for the the function signatures
+ * for expanded functions. These must be re-calculated for each module
+ * the pass is run on.
*/
- llvm::FunctionType *ExpandedFunctionType;
+ llvm::FunctionType *ExpandedForEachType, *ExpandedReduceType;
uint32_t mExportForEachCount;
const char **mExportForEachNameList;
const uint32_t *mExportForEachSignatureList;
+ uint32_t mExportReduceCount;
+ const char **mExportReduceNameList;
+
// Turns on optimization of allocation stride values.
bool mEnableStepOpt;
@@ -286,41 +298,68 @@
llvm::StructType::create(RsExpandKernelDriverInfoPfxTypes, "RsExpandKernelDriverInfoPfx");
// Create the function type for expanded kernels.
+ llvm::Type *VoidTy = llvm::Type::getVoidTy(*Context);
llvm::Type *RsExpandKernelDriverInfoPfxPtrTy = RsExpandKernelDriverInfoPfxTy->getPointerTo();
+ // void (const RsExpandKernelDriverInfoPfxTy *p, uint32_t x1, uint32_t x2, uint32_t outstep)
+ ExpandedForEachType = llvm::FunctionType::get(VoidTy,
+ {RsExpandKernelDriverInfoPfxPtrTy, Int32Ty, Int32Ty, Int32Ty}, false);
- llvm::SmallVector<llvm::Type*, 8> ParamTypes;
- ParamTypes.push_back(RsExpandKernelDriverInfoPfxPtrTy); // const RsExpandKernelDriverInfoPfx *p
- ParamTypes.push_back(Int32Ty); // uint32_t x1
- ParamTypes.push_back(Int32Ty); // uint32_t x2
- ParamTypes.push_back(Int32Ty); // uint32_t outstep
-
- ExpandedFunctionType =
- llvm::FunctionType::get(llvm::Type::getVoidTy(*Context), ParamTypes,
- false);
+ // void (void *inBuf, void *outBuf, uint32_t len)
+ ExpandedReduceType = llvm::FunctionType::get(VoidTy, {VoidPtrTy, VoidPtrTy, Int32Ty}, false);
}
- /// @brief Create skeleton of the expanded function.
+ /// @brief Create skeleton of the expanded foreach kernel.
///
/// This creates a function with the following signature:
///
/// void (const RsForEachStubParamStruct *p, uint32_t x1, uint32_t x2,
/// uint32_t outstep)
///
- llvm::Function *createEmptyExpandedFunction(llvm::StringRef OldName) {
+ llvm::Function *createEmptyExpandedForEachKernel(llvm::StringRef OldName) {
llvm::Function *ExpandedFunction =
- llvm::Function::Create(ExpandedFunctionType,
+ llvm::Function::Create(ExpandedForEachType,
llvm::GlobalValue::ExternalLinkage,
OldName + ".expand", Module);
-
- bccAssert(ExpandedFunction->arg_size() == NUM_EXPANDED_FUNCTION_PARAMS);
-
+ bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);
llvm::Function::arg_iterator AI = ExpandedFunction->arg_begin();
-
(AI++)->setName("p");
(AI++)->setName("x1");
(AI++)->setName("x2");
(AI++)->setName("arg_outstep");
+ llvm::BasicBlock *Begin = llvm::BasicBlock::Create(*Context, "Begin",
+ ExpandedFunction);
+ llvm::IRBuilder<> Builder(Begin);
+ Builder.CreateRetVoid();
+ return ExpandedFunction;
+ }
+
+ // Create skeleton of the expanded reduce kernel.
+ //
+ // This creates a function with the following signature:
+ //
+ // void @func.expand(i8* nocapture %inBuf, i8* nocapture %outBuf, i32 len)
+ //
+ llvm::Function *createEmptyExpandedReduceKernel(llvm::StringRef OldName) {
+ llvm::Function *ExpandedFunction =
+ llvm::Function::Create(ExpandedReduceType,
+ llvm::GlobalValue::ExternalLinkage,
+ OldName + ".expand", Module);
+ bccAssert(ExpandedFunction->arg_size() == kNumExpandedReduceParams);
+
+ llvm::Function::arg_iterator AI = ExpandedFunction->arg_begin();
+
+ using llvm::Attribute;
+
+ llvm::Argument *InBuf = &(*AI++);
+ InBuf->setName("inBuf");
+ InBuf->addAttr(llvm::AttributeSet::get(*Context, InBuf->getArgNo() + 1, llvm::makeArrayRef(Attribute::NoCapture)));
+
+ llvm::Argument *OutBuf = &(*AI++);
+ OutBuf->setName("outBuf");
+ OutBuf->addAttr(llvm::AttributeSet::get(*Context, OutBuf->getArgNo() + 1, llvm::makeArrayRef(Attribute::NoCapture)));
+
+ (AI++)->setName("len");
llvm::BasicBlock *Begin = llvm::BasicBlock::Create(*Context, "Begin",
ExpandedFunction);
@@ -444,7 +483,7 @@
//
// Returns:
// Returns a SmallVector of ConstantInts.
- SmallGEPIndices GEPHelper(std::initializer_list<int32_t> I32Args) {
+ SmallGEPIndices GEPHelper(const std::initializer_list<int32_t> I32Args) {
SmallGEPIndices Out(I32Args.size());
llvm::IntegerType *I32Ty = llvm::Type::getInt32Ty(*Context);
std::transform(I32Args.begin(), I32Args.end(), Out.begin(),
@@ -453,7 +492,7 @@
}
public:
- RSForEachExpandPass(bool pEnableStepOpt = true)
+ RSKernelExpandPass(bool pEnableStepOpt = true)
: ModulePass(ID), Module(nullptr), Context(nullptr),
mEnableStepOpt(pEnableStepOpt) {
@@ -536,7 +575,7 @@
* Module will contain a new function of the name "<NAME>.expand" that
* invokes <NAME>() in a loop with the appropriate parameters.
*/
- bool ExpandFunction(llvm::Function *Function, uint32_t Signature) {
+ bool ExpandOldStyleForEach(llvm::Function *Function, uint32_t Signature) {
ALOGV("Expanding ForEach-able Function %s",
Function->getName().str().c_str());
@@ -552,14 +591,14 @@
llvm::DataLayout DL(Module);
llvm::Function *ExpandedFunction =
- createEmptyExpandedFunction(Function->getName());
+ createEmptyExpandedForEachKernel(Function->getName());
/*
* Extract the expanded function's parameters. It is guaranteed by
- * createEmptyExpandedFunction that there will be five parameters.
+ * createEmptyExpandedFunction that there will be four parameters.
*/
- bccAssert(ExpandedFunction->arg_size() == NUM_EXPANDED_FUNCTION_PARAMS);
+ bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);
llvm::Function::arg_iterator ExpandedFunctionArgIter =
ExpandedFunction->arg_begin();
@@ -672,24 +711,24 @@
return true;
}
- /* Expand a pass-by-value kernel.
+ /* Expand a pass-by-value foreach kernel.
*/
- bool ExpandKernel(llvm::Function *Function, uint32_t Signature) {
+ bool ExpandForEach(llvm::Function *Function, uint32_t Signature) {
bccAssert(bcinfo::MetadataExtractor::hasForEachSignatureKernel(Signature));
ALOGV("Expanding kernel Function %s", Function->getName().str().c_str());
- // TODO: Refactor this to share functionality with ExpandFunction.
+ // TODO: Refactor this to share functionality with ExpandOldStyleForEach.
llvm::DataLayout DL(Module);
llvm::Function *ExpandedFunction =
- createEmptyExpandedFunction(Function->getName());
+ createEmptyExpandedForEachKernel(Function->getName());
/*
* Extract the expanded function's parameters. It is guaranteed by
- * createEmptyExpandedFunction that there will be five parameters.
+ * createEmptyExpandedFunction that there will be four parameters.
*/
- bccAssert(ExpandedFunction->arg_size() == NUM_EXPANDED_FUNCTION_PARAMS);
+ bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);
llvm::Function::arg_iterator ExpandedFunctionArgIter =
ExpandedFunction->arg_begin();
@@ -697,7 +736,7 @@
llvm::Value *Arg_p = &*(ExpandedFunctionArgIter++);
llvm::Value *Arg_x1 = &*(ExpandedFunctionArgIter++);
llvm::Value *Arg_x2 = &*(ExpandedFunctionArgIter++);
- llvm::Value *Arg_outstep = &*(ExpandedFunctionArgIter);
+ // Arg_outstep is not used by expanded new-style forEach kernels.
// Construct the actual function body.
llvm::IRBuilder<> Builder(ExpandedFunction->getEntryBlock().begin());
@@ -708,8 +747,8 @@
llvm::MDBuilder MDHelper(*Context);
TBAARenderScriptDistinct =
- MDHelper.createTBAARoot("RenderScript Distinct TBAA");
- TBAARenderScript = MDHelper.createTBAANode("RenderScript TBAA",
+ MDHelper.createTBAARoot(kRenderScriptTBAARootName);
+ TBAARenderScript = MDHelper.createTBAANode(kRenderScriptTBAANodeName,
TBAARenderScriptDistinct);
TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation",
TBAARenderScript);
@@ -734,7 +773,6 @@
// Check the return type
llvm::Type *OutTy = nullptr;
- llvm::Value *OutStep = nullptr;
llvm::LoadInst *OutBasePtr = nullptr;
llvm::Value *CastedOutBasePtr = nullptr;
@@ -754,8 +792,6 @@
OutTy = OutBaseTy->getPointerTo();
}
- OutStep = getStepValue(&DL, OutTy, Arg_outstep);
- OutStep->setName("outstep");
SmallGEPIndices OutBaseGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldOutPtr, 0}));
OutBasePtr = Builder.CreateLoad(Builder.CreateInBoundsGEP(Arg_p, OutBaseGEP, "out_buf.gep"));
@@ -767,7 +803,6 @@
}
llvm::SmallVector<llvm::Type*, 8> InTypes;
- llvm::SmallVector<llvm::Value*, 8> InSteps;
llvm::SmallVector<llvm::Value*, 8> InBufPtrs;
llvm::SmallVector<llvm::Value*, 8> InStructTempSlots;
@@ -797,11 +832,6 @@
Builder.SetInsertPoint(LoopHeader->getTerminator());
for (size_t InputIndex = 0; InputIndex < NumInPtrArguments; ++InputIndex, ArgIter++) {
- SmallGEPIndices InStepGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInStride,
- static_cast<int32_t>(InputIndex)}));
- llvm::Value *InStepAddr = Builder.CreateInBoundsGEP(Arg_p, InStepGEP, "instep_addr.gep");
- llvm::LoadInst *InStepArg = Builder.CreateLoad(InStepAddr, "instep_addr");
-
llvm::Type *InType = ArgIter->getType();
/*
@@ -823,10 +853,6 @@
InStructTempSlots.push_back(nullptr);
}
- llvm::Value *InStep = getStepValue(&DL, InType, InStepArg);
-
- InStep->setName("instep");
-
SmallGEPIndices InBufPtrGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInPtr,
static_cast<int32_t>(InputIndex)}));
llvm::Value *InBufPtrAddr = Builder.CreateInBoundsGEP(Arg_p, InBufPtrGEP, "input_buf.gep");
@@ -837,7 +863,6 @@
}
InTypes.push_back(InType);
- InSteps.push_back(InStep);
InBufPtrs.push_back(CastInBufPtr);
}
@@ -854,7 +879,6 @@
llvm::Value *OutPtr = nullptr;
if (CastedOutBasePtr) {
llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1);
-
OutPtr = Builder.CreateInBoundsGEP(CastedOutBasePtr, OutOffset);
if (PassOutByPointer) {
@@ -909,6 +933,269 @@
return true;
}
+ // Expand a reduce-style kernel function.
+ //
+ // The input is a kernel which represents a binary operation,
+ // of the form
+ //
+ // define foo @func(foo %a, foo %b),
+ //
+ // (More generally, it can be of the forms
+ //
+ // define void @func(foo* %ret, foo* %a, foo* %b)
+ // define void @func(foo* %ret, foo1 %a, foo1 %b)
+ // define foo1 @func(foo2 %a, foo2 %b)
+ //
+ // as a result of argument / return value conversions. Here, "foo1"
+ // and "foo2" refer to possibly coerced types, and the coerced
+ // argument type may be different from the coerced return type. See
+ // "Note on coercion" below.)
+ //
+ // Note also, we do not expect to encounter any case when the
+ // arguments are promoted to pointers but the return value is
+ // unpromoted to pointer, e.g.
+ //
+ // define foo1 @func(foo* %a, foo* %b)
+ //
+ // and we will throw an assertion in this case.)
+ //
+ // The input kernel gets expanded into a kernel of the form
+ //
+ // define void @func.expand(i8* %inBuf, i8* outBuf, i32 len)
+ //
+ // which performs a serial reduction of `len` elements from `inBuf`,
+ // and stores the result into `outBuf`. In pseudocode, @func.expand
+ // does:
+ //
+ // inArr := (foo *)inBuf;
+ // accum := inArr[0];
+ // for (i := 1; i < len; ++i) {
+ // accum := foo(accum, inArr[i]);
+ // }
+ // *(foo *)outBuf := accum;
+ //
+ // Note on coercion
+ //
+ // Both the return value and the argument types may undergo internal
+ // coercion in clang as part of call lowering. As a result, the
+ // return value type may differ from the argument type even if the
+ // types in the RenderScript signaure are the same. For instance, the
+ // kernel
+ //
+ // int3 add(int3 a, int3 b) { return a + b; }
+ //
+ // gets lowered by clang as
+ //
+ // define <3 x i32> @add(<4 x i32> %a.coerce, <4 x i32> %b.coerce)
+ //
+ // under AArch64. The details of this process are found in clang,
+ // lib/CodeGen/TargetInfo.cpp, under classifyArgumentType() and
+ // classifyReturnType() in ARMABIInfo, AArch64ABIInfo. If the value
+ // is passed by pointer, then the pointed-to type is not coerced.
+ //
+ // Since we lack the original type information, this code does loads
+ // and stores of allocation data by way of pointers to the coerced
+ // type.
+ bool ExpandReduce(llvm::Function *Function) {
+ bccAssert(Function);
+
+ ALOGV("Expanding reduce kernel %s", Function->getName().str().c_str());
+
+ llvm::DataLayout DL(Module);
+
+ // TBAA Metadata
+ llvm::MDNode *TBAARenderScriptDistinct, *TBAARenderScript, *TBAAAllocation;
+ llvm::MDBuilder MDHelper(*Context);
+
+ TBAARenderScriptDistinct =
+ MDHelper.createTBAARoot(kRenderScriptTBAARootName);
+ TBAARenderScript = MDHelper.createTBAANode(kRenderScriptTBAANodeName,
+ TBAARenderScriptDistinct);
+ TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation",
+ TBAARenderScript);
+ TBAAAllocation = MDHelper.createTBAAStructTagNode(TBAAAllocation,
+ TBAAAllocation, 0);
+
+ llvm::Function *ExpandedFunction =
+ createEmptyExpandedReduceKernel(Function->getName());
+
+ // Extract the expanded kernel's parameters. It is guaranteed by
+ // createEmptyExpandedFunction that there will be 3 parameters.
+ auto ExpandedFunctionArgIter = ExpandedFunction->arg_begin();
+
+ llvm::Value *Arg_inBuf = &*(ExpandedFunctionArgIter++);
+ llvm::Value *Arg_outBuf = &*(ExpandedFunctionArgIter++);
+ llvm::Value *Arg_len = &*(ExpandedFunctionArgIter++);
+
+ bccAssert(Function->arg_size() == 2 || Function->arg_size() == 3);
+
+ // Check if, instead of returning a value, the original kernel has
+ // a pointer parameter which points to a temporary buffer into
+ // which the return value gets written.
+ const bool ReturnValuePointerStyle = (Function->arg_size() == 3);
+ bccAssert(Function->getReturnType()->isVoidTy() == ReturnValuePointerStyle);
+
+ // Check if, instead of being passed by value, the inputs to the
+ // original kernel are passed by pointer.
+ auto FirstArgIter = Function->arg_begin();
+ // The second argument is always an input to the original kernel.
+ auto SecondArgIter = std::next(FirstArgIter);
+ const bool InputsPointerStyle = SecondArgIter->getType()->isPointerTy();
+
+ // Get the output type (i.e. return type of the original kernel).
+ llvm::PointerType *OutPtrTy = nullptr;
+ llvm::Type *OutTy = nullptr;
+ if (ReturnValuePointerStyle) {
+ OutPtrTy = llvm::dyn_cast<llvm::PointerType>(FirstArgIter->getType());
+ bccAssert(OutPtrTy && "Expected a pointer parameter to kernel");
+ OutTy = OutPtrTy->getElementType();
+ } else {
+ OutTy = Function->getReturnType();
+ bccAssert(!OutTy->isVoidTy());
+ OutPtrTy = OutTy->getPointerTo();
+ }
+
+ // Get the input type (type of the arguments to the original
+ // kernel). Some input types are different from the output type,
+ // due to explicit coercion that the compiler performs when
+ // lowering the parameters. See "Note on coercion" above.
+ llvm::PointerType *InPtrTy;
+ llvm::Type *InTy;
+ if (InputsPointerStyle) {
+ InPtrTy = llvm::dyn_cast<llvm::PointerType>(SecondArgIter->getType());
+ bccAssert(InPtrTy && "Expected a pointer parameter to kernel");
+ bccAssert(ReturnValuePointerStyle);
+ bccAssert(std::next(SecondArgIter)->getType() == InPtrTy &&
+ "Input type mismatch");
+ InTy = InPtrTy->getElementType();
+ } else {
+ InTy = SecondArgIter->getType();
+ InPtrTy = InTy->getPointerTo();
+ if (!ReturnValuePointerStyle) {
+ bccAssert(InTy == FirstArgIter->getType() && "Input type mismatch");
+ } else {
+ bccAssert(InTy == std::next(SecondArgIter)->getType() &&
+ "Input type mismatch");
+ }
+ }
+
+ // The input type should take up the same amount of space in
+ // memory as the output type.
+ bccAssert(DL.getTypeAllocSize(InTy) == DL.getTypeAllocSize(OutTy));
+
+ // Construct the actual function body.
+ llvm::IRBuilder<> Builder(ExpandedFunction->getEntryBlock().begin());
+
+ // Cast input and output buffers to appropriate types.
+ llvm::Value *InBuf = Builder.CreatePointerCast(Arg_inBuf, InPtrTy);
+ llvm::Value *OutBuf = Builder.CreatePointerCast(Arg_outBuf, OutPtrTy);
+
+ // Create a slot to pass temporary results back. This needs to be
+ // separate from the accumulator slot because the kernel may mark
+ // the return value slot as noalias.
+ llvm::Value *ReturnBuf = nullptr;
+ if (ReturnValuePointerStyle) {
+ ReturnBuf = Builder.CreateAlloca(OutTy, nullptr, "ret.tmp");
+ }
+
+ // Create a slot to hold the second input if the inputs are passed
+ // by pointer to the original kernel. We cannot directly pass a
+ // pointer to the input buffer, because the kernel may modify its
+ // inputs.
+ llvm::Value *SecondInputTempBuf = nullptr;
+ if (InputsPointerStyle) {
+ SecondInputTempBuf = Builder.CreateAlloca(InTy, nullptr, "in.tmp");
+ }
+
+ // Create a slot to accumulate temporary results, and fill it with
+ // the first value.
+ llvm::Value *AccumBuf = Builder.CreateAlloca(OutTy, nullptr, "accum");
+ // Cast to OutPtrTy before loading, since AccumBuf has type OutPtrTy.
+ llvm::LoadInst *FirstElementLoad = Builder.CreateLoad(
+ Builder.CreatePointerCast(InBuf, OutPtrTy));
+ if (gEnableRsTbaa) {
+ FirstElementLoad->setMetadata("tbaa", TBAAAllocation);
+ }
+ // Memory operations with AccumBuf shouldn't be marked with
+ // RenderScript TBAA, since this might conflict with TBAA metadata
+ // in the kernel function when AccumBuf is passed by pointer.
+ Builder.CreateStore(FirstElementLoad, AccumBuf);
+
+ // Loop body
+
+ // Create the loop structure. Note that the first input in the input buffer
+ // has already been accumulated, so that we start at index 1.
+ llvm::PHINode *IndVar;
+ llvm::Value *Start = llvm::ConstantInt::get(Arg_len->getType(), 1);
+ llvm::BasicBlock *Exit = createLoop(Builder, Start, Arg_len, &IndVar);
+
+ llvm::Value *InputPtr = Builder.CreateInBoundsGEP(InBuf, IndVar, "next_input.gep");
+
+ // Set up arguments and call the original (unexpanded) kernel.
+ //
+ // The original kernel can have at most 3 arguments, which is
+ // achieved when the signature looks like:
+ //
+ // define void @func(foo* %ret, bar %a, bar %b)
+ //
+ // (bar can be one of foo/foo.coerce/foo*).
+ llvm::SmallVector<llvm::Value *, 3> KernelArgs;
+
+ if (ReturnValuePointerStyle) {
+ KernelArgs.push_back(ReturnBuf);
+ }
+
+ if (InputsPointerStyle) {
+ bccAssert(ReturnValuePointerStyle);
+ // Because the return buffer is copied back into the
+ // accumulator, it's okay if the accumulator is overwritten.
+ KernelArgs.push_back(AccumBuf);
+
+ llvm::LoadInst *InputLoad = Builder.CreateLoad(InputPtr);
+ if (gEnableRsTbaa) {
+ InputLoad->setMetadata("tbaa", TBAAAllocation);
+ }
+ Builder.CreateStore(InputLoad, SecondInputTempBuf);
+
+ KernelArgs.push_back(SecondInputTempBuf);
+ } else {
+ // InPtrTy may be different from OutPtrTy (the type of
+ // AccumBuf), so first cast the accumulator buffer to the
+ // pointer type corresponding to the input argument type.
+ KernelArgs.push_back(
+ Builder.CreateLoad(Builder.CreatePointerCast(AccumBuf, InPtrTy)));
+
+ llvm::LoadInst *LoadedArg = Builder.CreateLoad(InputPtr);
+ if (gEnableRsTbaa) {
+ LoadedArg->setMetadata("tbaa", TBAAAllocation);
+ }
+ KernelArgs.push_back(LoadedArg);
+ }
+
+ llvm::Value *RetVal = Builder.CreateCall(Function, KernelArgs);
+
+ const uint64_t ElementSize = DL.getTypeStoreSize(OutTy);
+ const uint64_t ElementAlign = DL.getABITypeAlignment(OutTy);
+
+ // Store the output in the accumulator.
+ if (ReturnValuePointerStyle) {
+ Builder.CreateMemCpy(AccumBuf, ReturnBuf, ElementSize, ElementAlign);
+ } else {
+ Builder.CreateStore(RetVal, AccumBuf);
+ }
+
+ // Loop exit
+ Builder.SetInsertPoint(Exit, Exit->begin());
+
+ llvm::LoadInst *OutputLoad = Builder.CreateLoad(AccumBuf);
+ llvm::StoreInst *OutputStore = Builder.CreateStore(OutputLoad, OutBuf);
+ if (gEnableRsTbaa) {
+ OutputStore->setMetadata("tbaa", TBAAAllocation);
+ }
+
+ return true;
+ }
+
/// @brief Checks if pointers to allocation internals are exposed
///
/// This function verifies if through the parameters passed to the kernel
@@ -994,31 +1281,31 @@
virtual bool runOnModule(llvm::Module &Module) {
bool Changed = false;
this->Module = &Module;
- this->Context = &Module.getContext();
+ Context = &Module.getContext();
- this->buildTypes();
+ buildTypes();
bcinfo::MetadataExtractor me(&Module);
if (!me.extract()) {
ALOGE("Could not extract metadata from module!");
return false;
}
+
+ // Expand forEach_* style kernels.
mExportForEachCount = me.getExportForEachSignatureCount();
mExportForEachNameList = me.getExportForEachNameList();
mExportForEachSignatureList = me.getExportForEachSignatureList();
- bool AllocsExposed = allocPointersExposed(Module);
-
for (size_t i = 0; i < mExportForEachCount; ++i) {
const char *name = mExportForEachNameList[i];
uint32_t signature = mExportForEachSignatureList[i];
llvm::Function *kernel = Module.getFunction(name);
if (kernel) {
if (bcinfo::MetadataExtractor::hasForEachSignatureKernel(signature)) {
- Changed |= ExpandKernel(kernel, signature);
+ Changed |= ExpandForEach(kernel, signature);
kernel->setLinkage(llvm::GlobalValue::InternalLinkage);
} else if (kernel->getReturnType()->isVoidTy()) {
- Changed |= ExpandFunction(kernel, signature);
+ Changed |= ExpandOldStyleForEach(kernel, signature);
kernel->setLinkage(llvm::GlobalValue::InternalLinkage);
} else {
// There are some graphics root functions that are not
@@ -1028,7 +1315,18 @@
}
}
- if (gEnableRsTbaa && !AllocsExposed) {
+ // Expand reduce_* style kernels.
+ mExportReduceCount = me.getExportReduceCount();
+ mExportReduceNameList = me.getExportReduceNameList();
+
+ for (size_t i = 0; i < mExportReduceCount; ++i) {
+ llvm::Function *kernel = Module.getFunction(mExportReduceNameList[i]);
+ if (kernel) {
+ Changed |= ExpandReduce(kernel);
+ }
+ }
+
+ if (gEnableRsTbaa && !allocPointersExposed(Module)) {
connectRenderScriptTBAAMetadata(Module);
}
@@ -1036,21 +1334,21 @@
}
virtual const char *getPassName() const {
- return "ForEach-able Function Expansion";
+ return "forEach_* and reduce_* function expansion";
}
-}; // end RSForEachExpandPass
+}; // end RSKernelExpandPass
} // end anonymous namespace
-char RSForEachExpandPass::ID = 0;
-static llvm::RegisterPass<RSForEachExpandPass> X("foreachexp", "ForEach Expand Pass");
+char RSKernelExpandPass::ID = 0;
+static llvm::RegisterPass<RSKernelExpandPass> X("kernelexp", "Kernel Expand Pass");
namespace bcc {
llvm::ModulePass *
-createRSForEachExpandPass(bool pEnableStepOpt){
- return new RSForEachExpandPass(pEnableStepOpt);
+createRSKernelExpandPass(bool pEnableStepOpt) {
+ return new RSKernelExpandPass(pEnableStepOpt);
}
} // end namespace bcc
diff --git a/lib/Renderscript/RSStubsWhiteList.cpp b/lib/Renderscript/RSStubsWhiteList.cpp
index b69681d..426fb43 100644
--- a/lib/Renderscript/RSStubsWhiteList.cpp
+++ b/lib/Renderscript/RSStubsWhiteList.cpp
@@ -1235,6 +1235,7 @@
"_Z3madfff",
"_Z3maxDv2_cS_",
"_Z3maxDv2_fS_",
+"_Z3maxDv2_ff",
"_Z3maxDv2_hS_",
"_Z3maxDv2_iS_",
"_Z3maxDv2_jS_",
@@ -1244,6 +1245,7 @@
"_Z3maxDv2_tS_",
"_Z3maxDv3_cS_",
"_Z3maxDv3_fS_",
+"_Z3maxDv3_ff",
"_Z3maxDv3_hS_",
"_Z3maxDv3_iS_",
"_Z3maxDv3_jS_",
@@ -1253,6 +1255,7 @@
"_Z3maxDv3_tS_",
"_Z3maxDv4_cS_",
"_Z3maxDv4_fS_",
+"_Z3maxDv4_ff",
"_Z3maxDv4_hS_",
"_Z3maxDv4_iS_",
"_Z3maxDv4_jS_",
@@ -1271,6 +1274,7 @@
"_Z3maxtt",
"_Z3minDv2_cS_",
"_Z3minDv2_fS_",
+"_Z3minDv2_ff",
"_Z3minDv2_hS_",
"_Z3minDv2_iS_",
"_Z3minDv2_jS_",
@@ -1280,6 +1284,7 @@
"_Z3minDv2_tS_",
"_Z3minDv3_cS_",
"_Z3minDv3_fS_",
+"_Z3minDv3_ff",
"_Z3minDv3_hS_",
"_Z3minDv3_iS_",
"_Z3minDv3_jS_",
@@ -1289,6 +1294,7 @@
"_Z3minDv3_tS_",
"_Z3minDv4_cS_",
"_Z3minDv4_fS_",
+"_Z3minDv4_ff",
"_Z3minDv4_hS_",
"_Z3minDv4_iS_",
"_Z3minDv4_jS_",
diff --git a/tests/libbcc/getelementptr.ll b/tests/libbcc/getelementptr.ll
index 6f3e175..1cf201a 100644
--- a/tests/libbcc/getelementptr.ll
+++ b/tests/libbcc/getelementptr.ll
@@ -3,7 +3,7 @@
; that they index into the right positions of the structure and that
; the instructions that are generated are in the loop header.
-; RUN: opt -load libbcc.so -foreachexp -S < %s | FileCheck %s
+; RUN: opt -load libbcc.so -kernelexp -S < %s | FileCheck %s
; ModuleID = 'test_getelementptr.bc'
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
@@ -41,7 +41,7 @@
; New style kernel with multiple inputs
define i32 @foo(i32 %in0, i32 %in1, i32 %x, i32 %y, i32 %z) {
ret i32 0
-; CHECK: define void @foo.expand(%RsExpandKernelDriverInfoPfx* %p, i32 %x1, i32 %x2, i32 %outstep)
+; CHECK: define void @foo.expand(%RsExpandKernelDriverInfoPfx* %p, i32 %x1, i32 %x2, i32 %arg_outstep)
; CHECK: Begin:
; CHECK: %out_buf.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 3, i32 0
; CHECK: load i8*, i8** %out_buf.gep
@@ -49,14 +49,10 @@
; CHECK: load i32, i32* %Y.gep
; CHECK: %Z.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 7, i32 2
; CHECK: load i32, i32* %Z.gep
-; CHECK: %instep_addr.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 1, i32 0
-; CHECK: load i32, i32* %instep_addr.gep
; CHECK: %input_buf.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 0, i32 0
; CHECK: load i8*, i8** %input_buf.gep
-; CHECK: %instep_addr.gep1 = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 1, i32 1
-; CHECK: load i32, i32* %instep_addr.gep1
-; CHECK: %input_buf.gep3 = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 0, i32 1
-; CHECK: load i8*, i8** %input_buf.gep3
+; CHECK: %input_buf.gep1 = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 0, i32 1
+; CHECK: load i8*, i8** %input_buf.gep1
; CHECK: Loop:
}
diff --git a/tests/libbcc/tbaa-through-alloca.ll b/tests/libbcc/tbaa-through-alloca.ll
index 3e9e3ab..5b0a270 100644
--- a/tests/libbcc/tbaa-through-alloca.ll
+++ b/tests/libbcc/tbaa-through-alloca.ll
@@ -1,7 +1,7 @@
; This test checks that the code doesn't aggressively apply TBAA
; metadata to temporaries that are passed by pointer to kernels.
-; RUN: opt -load libbcc.so -foreachexp -inline -tbaa -aa-eval -print-may-aliases -evaluate-aa-metadata < %s -S -o - 2>&1 | FileCheck %s
+; RUN: opt -load libbcc.so -kernelexp -inline -tbaa -aa-eval -print-may-aliases -evaluate-aa-metadata < %s -S -o - 2>&1 | FileCheck %s
; ModuleID = 'kernel.bc'
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
diff --git a/tests/libbcc/tbaa.ll b/tests/libbcc/tbaa.ll
index 81972f2..6d8cb48 100644
--- a/tests/libbcc/tbaa.ll
+++ b/tests/libbcc/tbaa.ll
@@ -1,7 +1,7 @@
; Basic test of TBAA that should report that pointer loads do not
; alias with stores to allocations.
-; RUN: opt -load libbcc.so -foreachexp -tbaa -aa-eval -print-no-aliases -evaluate-aa-metadata < %s -S -o - 2>&1 | FileCheck %s
+; RUN: opt -load libbcc.so -kernelexp -tbaa -aa-eval -print-no-aliases -evaluate-aa-metadata < %s -S -o - 2>&1 | FileCheck %s
; ModuleID = 'kernel.bc'
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"