am 5f23bccc: (-s ours) am c7aac9dd: am 20328131: Merge "If the host supports it, build libbcc as an LLVM loadable module."
* commit '5f23bcccf58fc33f84f3daef86f852f8c7268892':
diff --git a/bcinfo/BitReader_2_7/Android.mk b/bcinfo/BitReader_2_7/Android.mk
index 5cd3b7b..181c731 100644
--- a/bcinfo/BitReader_2_7/Android.mk
+++ b/bcinfo/BitReader_2_7/Android.mk
@@ -1,6 +1,6 @@
LOCAL_PATH:= $(call my-dir)
-LLVM_ROOT_PATH := $(LOCAL_PATH)/../../../../../external/llvm
+LLVM_ROOT_PATH := external/llvm
include $(LLVM_ROOT_PATH)/llvm.mk
bitcode_reader_2_7_SRC_FILES := \
diff --git a/bcinfo/BitReader_2_7/BitcodeReader.cpp b/bcinfo/BitReader_2_7/BitcodeReader.cpp
index ea910ee..894b801 100644
--- a/bcinfo/BitReader_2_7/BitcodeReader.cpp
+++ b/bcinfo/BitReader_2_7/BitcodeReader.cpp
@@ -262,9 +262,9 @@
bool isDematerializable(const GlobalValue *GV) const override;
std::error_code materialize(GlobalValue *GV) override;
- std::error_code MaterializeModule(Module *M) override;
+ std::error_code materializeModule(Module *M) override;
std::vector<StructType *> getIdentifiedStructTypes() const override;
- void Dematerialize(GlobalValue *GV) override;
+ void dematerialize(GlobalValue *GV) override;
/// @brief Main interface to parsing a bitcode buffer.
/// @returns true if an error occurred.
@@ -2302,8 +2302,7 @@
return Error("Invalid type for value");
auto *NewGA =
- GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(),
- getDecodedLinkage(Record[2]), "", TheModule);
+ GlobalAlias::create(PTy, getDecodedLinkage(Record[2]), "", TheModule);
// Old bitcode files didn't have visibility field.
if (Record.size() > 3)
NewGA->setVisibility(GetDecodedVisibility(Record[3]));
@@ -3141,7 +3140,7 @@
InstructionList.push_back(I);
break;
}
- case bitc::FUNC_CODE_INST_STORE: { // STORE2:[ptrty, ptr, val, align, vol]
+ case bitc::FUNC_CODE_INST_STORE_OLD: { // STORE2:[ptrty, ptr, val, align, vol]
unsigned OpNum = 0;
Value *Val, *Ptr;
if (getValueTypePair(Record, OpNum, NextValueNo, Ptr) ||
@@ -3349,7 +3348,7 @@
return DeferredFunctionInfo.count(const_cast<Function*>(F));
}
-void BitcodeReader::Dematerialize(GlobalValue *GV) {
+void BitcodeReader::dematerialize(GlobalValue *GV) {
Function *F = dyn_cast<Function>(GV);
// If this function isn't dematerializable, this is a noop.
if (!F || !isDematerializable(F))
@@ -3362,7 +3361,7 @@
F->setIsMaterializable(true);
}
-std::error_code BitcodeReader::MaterializeModule(Module *M) {
+std::error_code BitcodeReader::materializeModule(Module *M) {
assert(M == TheModule &&
"Can only Materialize the Module this BitcodeReader is attached to.");
// Iterate over the module, deserializing any functions that are still on
diff --git a/bcinfo/BitReader_3_0/Android.mk b/bcinfo/BitReader_3_0/Android.mk
index b425475..95ccd40 100644
--- a/bcinfo/BitReader_3_0/Android.mk
+++ b/bcinfo/BitReader_3_0/Android.mk
@@ -1,6 +1,6 @@
LOCAL_PATH:= $(call my-dir)
-LLVM_ROOT_PATH := $(LOCAL_PATH)/../../../../../external/llvm
+LLVM_ROOT_PATH := external/llvm
include $(LLVM_ROOT_PATH)/llvm.mk
bitcode_reader_3_0_SRC_FILES := \
diff --git a/bcinfo/BitReader_3_0/BitcodeReader.cpp b/bcinfo/BitReader_3_0/BitcodeReader.cpp
index 0c99f3b..0d1262c 100644
--- a/bcinfo/BitReader_3_0/BitcodeReader.cpp
+++ b/bcinfo/BitReader_3_0/BitcodeReader.cpp
@@ -500,9 +500,9 @@
bool isDematerializable(const GlobalValue *GV) const override;
std::error_code materialize(GlobalValue *GV) override;
- std::error_code MaterializeModule(Module *M) override;
+ std::error_code materializeModule(Module *M) override;
std::vector<StructType *> getIdentifiedStructTypes() const override;
- void Dematerialize(GlobalValue *GV) override;
+ void dematerialize(GlobalValue *GV) override;
/// @brief Main interface to parsing a bitcode buffer.
/// @returns true if an error occurred.
@@ -2570,8 +2570,7 @@
return Error("Invalid type for value");
auto *NewGA =
- GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(),
- getDecodedLinkage(Record[2]), "", TheModule);
+ GlobalAlias::create(PTy, getDecodedLinkage(Record[2]), "", TheModule);
// Old bitcode files didn't have visibility field.
if (Record.size() > 3)
NewGA->setVisibility(GetDecodedVisibility(Record[3]));
@@ -3420,7 +3419,7 @@
InstructionList.push_back(I);
break;
}
- case bitc::FUNC_CODE_INST_STORE: { // STORE2:[ptrty, ptr, val, align, vol]
+ case bitc::FUNC_CODE_INST_STORE_OLD: { // STORE2:[ptrty, ptr, val, align, vol]
unsigned OpNum = 0;
Value *Val, *Ptr;
if (getValueTypePair(Record, OpNum, NextValueNo, Ptr) ||
@@ -3684,7 +3683,7 @@
return DeferredFunctionInfo.count(const_cast<Function*>(F));
}
-void BitcodeReader::Dematerialize(GlobalValue *GV) {
+void BitcodeReader::dematerialize(GlobalValue *GV) {
Function *F = dyn_cast<Function>(GV);
// If this function isn't dematerializable, this is a noop.
if (!F || !isDematerializable(F))
@@ -3697,7 +3696,7 @@
F->setIsMaterializable(true);
}
-std::error_code BitcodeReader::MaterializeModule(Module *M) {
+std::error_code BitcodeReader::materializeModule(Module *M) {
assert(M == TheModule &&
"Can only Materialize the Module this BitcodeReader is attached to.");
// Iterate over the module, deserializing any functions that are still on
diff --git a/bcinfo/MetadataExtractor.cpp b/bcinfo/MetadataExtractor.cpp
index 23d97fe..add1ab1 100644
--- a/bcinfo/MetadataExtractor.cpp
+++ b/bcinfo/MetadataExtractor.cpp
@@ -21,7 +21,7 @@
#define LOG_TAG "bcinfo"
#include <cutils/log.h>
-#ifdef HAVE_ANDROID_OS
+#ifdef __ANDROID__
#include <cutils/properties.h>
#endif
@@ -340,7 +340,7 @@
}
mRSFloatPrecision = RelaxedPragmaSeen ? RS_FP_Relaxed : RS_FP_Full;
-#ifdef HAVE_ANDROID_OS
+#ifdef __ANDROID__
// Provide an override for precsiion via adb shell setprop
// adb shell setprop debug.rs.precision rs_fp_full
// adb shell setprop debug.rs.precision rs_fp_relaxed
diff --git a/bcinfo/Wrap/Android.mk b/bcinfo/Wrap/Android.mk
index 7da8b3f..1b5db36 100644
--- a/bcinfo/Wrap/Android.mk
+++ b/bcinfo/Wrap/Android.mk
@@ -16,7 +16,7 @@
LOCAL_PATH:= $(call my-dir)
-LLVM_ROOT_PATH := $(LOCAL_PATH)/../../../../../external/llvm
+LLVM_ROOT_PATH := external/llvm
include $(LLVM_ROOT_PATH)/llvm.mk
llvm_wrap_SRC_FILES := \
diff --git a/include/bcc/Compiler.h b/include/bcc/Compiler.h
index 75cde37..8a30c38 100644
--- a/include/bcc/Compiler.h
+++ b/include/bcc/Compiler.h
@@ -80,13 +80,11 @@
enum ErrorCode runPasses(Script &pScript, llvm::raw_pwrite_stream &pResult);
- bool addCustomPasses(Script &pScript, llvm::legacy::PassManager &pPM);
bool addInternalizeSymbolsPass(Script &pScript, llvm::legacy::PassManager &pPM);
- bool addExpandForEachPass(Script &pScript, llvm::legacy::PassManager &pPM);
- bool addGlobalInfoPass(Script &pScript, llvm::legacy::PassManager &pPM);
- bool addInvariantPass(llvm::legacy::PassManager &pPM);
- bool addInvokeHelperPass(llvm::legacy::PassManager &pPM);
- bool addPostLTOCustomPasses(llvm::legacy::PassManager &pPM);
+ void addExpandKernelPass(llvm::legacy::PassManager &pPM);
+ void addGlobalInfoPass(Script &pScript, llvm::legacy::PassManager &pPM);
+ void addInvariantPass(llvm::legacy::PassManager &pPM);
+ void addInvokeHelperPass(llvm::legacy::PassManager &pPM);
public:
Compiler();
diff --git a/include/bcc/Renderscript/RSTransforms.h b/include/bcc/Renderscript/RSTransforms.h
index d5830ca..6dcfedd 100644
--- a/include/bcc/Renderscript/RSTransforms.h
+++ b/include/bcc/Renderscript/RSTransforms.h
@@ -25,7 +25,7 @@
namespace bcc {
llvm::ModulePass *
-createRSForEachExpandPass(bool pEnableStepOpt);
+createRSKernelExpandPass(bool pEnableStepOpt);
llvm::FunctionPass *
createRSInvariantPass();
diff --git a/include/bcc/Support/Properties.h b/include/bcc/Support/Properties.h
index c82901c..4c3c404 100644
--- a/include/bcc/Support/Properties.h
+++ b/include/bcc/Support/Properties.h
@@ -20,12 +20,12 @@
#include <stdint.h>
#include <stdlib.h>
-#if !defined(RS_SERVER) && defined(HAVE_ANDROID_OS)
+#if !defined(RS_SERVER) && defined(__ANDROID__)
#include <cutils/properties.h>
#endif
static inline uint32_t getProperty(const char *str) {
-#if !defined(RS_SERVER) && defined(HAVE_ANDROID_OS)
+#if !defined(RS_SERVER) && defined(__ANDROID__)
char buf[PROPERTY_VALUE_MAX];
property_get(str, buf, "0");
return atoi(buf);
diff --git a/lib/Core/Compiler.cpp b/lib/Core/Compiler.cpp
index c314b6c..5c769b4 100644
--- a/lib/Core/Compiler.cpp
+++ b/lib/Core/Compiler.cpp
@@ -143,6 +143,8 @@
}
+// This function has complete responsibility for creating and executing the
+// exact list of compiler passes.
enum Compiler::ErrorCode Compiler::runPasses(Script &pScript,
llvm::raw_pwrite_stream &pResult) {
// Pass manager for link-time optimization
@@ -153,10 +155,13 @@
passes.add(createTargetTransformInfoWrapperPass(mTarget->getTargetIRAnalysis()));
- // Add our custom passes.
- if (!addCustomPasses(pScript, passes)) {
+ // Add some initial custom passes.
+ addInvokeHelperPass(passes);
+ addExpandKernelPass(passes);
+ addInvariantPass(passes);
+ if (!addInternalizeSymbolsPass(pScript, passes))
return kErrCustomPasses;
- }
+ addGlobalInfoPass(pScript, passes);
if (mTarget->getOptLevel() == llvm::CodeGenOpt::None) {
passes.add(llvm::createGlobalOptimizerPass());
@@ -187,9 +192,9 @@
// These passes have to come after LTO, since we don't want to examine
// functions that are never actually called.
- if (!addPostLTOCustomPasses(passes)) {
- return kErrCustomPasses;
- }
+ if (llvm::Triple(getTargetMachine().getTargetTriple()).getArch() == llvm::Triple::x86_64)
+ passes.add(createRSX86_64CallConvPass()); // Add pass to correct calling convention for X86-64.
+ passes.add(createRSIsThreadablePass()); // Add pass to mark script as threadable.
// RSEmbedInfoPass needs to come after we have scanned for non-threadable
// functions.
@@ -324,9 +329,11 @@
size_t exportVarCount = me.getExportVarCount();
size_t exportFuncCount = me.getExportFuncCount();
size_t exportForEachCount = me.getExportForEachSignatureCount();
+ size_t exportReduceCount = me.getExportReduceCount();
const char **exportVarNameList = me.getExportVarNameList();
const char **exportFuncNameList = me.getExportFuncNameList();
const char **exportForEachNameList = me.getExportForEachNameList();
+ const char **exportReduceNameList = me.getExportReduceNameList();
size_t i;
for (i = 0; i < exportVarCount; ++i) {
@@ -337,18 +344,22 @@
export_symbols.push_back(exportFuncNameList[i]);
}
- // Expanded foreach functions should not be internalized, too.
- // expanded_foreach_funcs keeps the .expand version of the kernel names
- // around until createInternalizePass() is finished making its own
- // copy of the visible symbols.
- std::vector<std::string> expanded_foreach_funcs;
+ // Expanded foreach and reduce functions should not be
+ // internalized. expanded_funcs keeps the names of the expanded
+ // functions around until createInternalizePass() is finished making
+ // its own copy of the visible symbols.
+ std::vector<std::string> expanded_funcs;
+ expanded_funcs.reserve(exportForEachCount + exportReduceCount);
+
for (i = 0; i < exportForEachCount; ++i) {
- expanded_foreach_funcs.push_back(
- std::string(exportForEachNameList[i]) + ".expand");
+ expanded_funcs.push_back(std::string(exportForEachNameList[i]) + ".expand");
+ }
+ for (i = 0; i < exportReduceCount; ++i) {
+ expanded_funcs.push_back(std::string(exportReduceNameList[i]) + ".expand");
}
- for (i = 0; i < exportForEachCount; i++) {
- export_symbols.push_back(expanded_foreach_funcs[i].c_str());
+ for (auto &symbol_name : expanded_funcs) {
+ export_symbols.push_back(symbol_name.c_str());
}
pPM.add(llvm::createInternalizePass(export_symbols));
@@ -356,69 +367,31 @@
return true;
}
-bool Compiler::addInvokeHelperPass(llvm::legacy::PassManager &pPM) {
+void Compiler::addInvokeHelperPass(llvm::legacy::PassManager &pPM) {
llvm::Triple arch(getTargetMachine().getTargetTriple());
if (arch.isArch64Bit()) {
pPM.add(createRSInvokeHelperPass());
}
- return true;
}
-bool Compiler::addExpandForEachPass(Script &pScript, llvm::legacy::PassManager &pPM) {
- // Expand ForEach on CPU path to reduce launch overhead.
+void Compiler::addExpandKernelPass(llvm::legacy::PassManager &pPM) {
+ // Expand ForEach and reduce on CPU path to reduce launch overhead.
bool pEnableStepOpt = true;
- pPM.add(createRSForEachExpandPass(pEnableStepOpt));
-
- return true;
+ pPM.add(createRSKernelExpandPass(pEnableStepOpt));
}
-bool Compiler::addGlobalInfoPass(Script &pScript, llvm::legacy::PassManager &pPM) {
+void Compiler::addGlobalInfoPass(Script &pScript, llvm::legacy::PassManager &pPM) {
// Add additional information about RS global variables inside the Module.
RSScript &script = static_cast<RSScript &>(pScript);
if (script.getEmbedGlobalInfo()) {
pPM.add(createRSGlobalInfoPass(script.getEmbedGlobalInfoSkipConstant()));
}
-
- return true;
}
-bool Compiler::addInvariantPass(llvm::legacy::PassManager &pPM) {
+void Compiler::addInvariantPass(llvm::legacy::PassManager &pPM) {
// Mark Loads from RsExpandKernelDriverInfo as "load.invariant".
// Should run after ExpandForEach and before inlining.
pPM.add(createRSInvariantPass());
-
- return true;
-}
-
-bool Compiler::addCustomPasses(Script &pScript, llvm::legacy::PassManager &pPM) {
- if (!addInvokeHelperPass(pPM))
- return false;
-
- if (!addExpandForEachPass(pScript, pPM))
- return false;
-
- if (!addInvariantPass(pPM))
- return false;
-
- if (!addInternalizeSymbolsPass(pScript, pPM))
- return false;
-
- if (!addGlobalInfoPass(pScript, pPM))
- return false;
-
- return true;
-}
-
-bool Compiler::addPostLTOCustomPasses(llvm::legacy::PassManager &pPM) {
- // Add pass to correct calling convention for X86-64.
- llvm::Triple arch(getTargetMachine().getTargetTriple());
- if (arch.getArch() == llvm::Triple::x86_64)
- pPM.add(createRSX86_64CallConvPass());
-
- // Add pass to mark script as threadable.
- pPM.add(createRSIsThreadablePass());
-
- return true;
}
enum Compiler::ErrorCode Compiler::screenGlobalFunctions(Script &pScript) {
diff --git a/lib/Renderscript/Android.mk b/lib/Renderscript/Android.mk
index 56cae16..4b18eda 100644
--- a/lib/Renderscript/Android.mk
+++ b/lib/Renderscript/Android.mk
@@ -24,7 +24,7 @@
libbcc_renderscript_SRC_FILES := \
RSCompilerDriver.cpp \
RSEmbedInfo.cpp \
- RSForEachExpand.cpp \
+ RSKernelExpand.cpp \
RSGlobalInfoPass.cpp \
RSInvariant.cpp \
RSScript.cpp \
diff --git a/lib/Renderscript/RSCompilerDriver.cpp b/lib/Renderscript/RSCompilerDriver.cpp
index b9a32c1..7cc4ffb 100644
--- a/lib/Renderscript/RSCompilerDriver.cpp
+++ b/lib/Renderscript/RSCompilerDriver.cpp
@@ -42,7 +42,7 @@
#include <sstream>
#include <string>
-#ifdef HAVE_ANDROID_OS
+#ifdef __ANDROID__
#include <cutils/properties.h>
#endif
#include <utils/StopWatch.h>
diff --git a/lib/Renderscript/RSEmbedInfo.cpp b/lib/Renderscript/RSEmbedInfo.cpp
index dc1033c..b0c2767 100644
--- a/lib/Renderscript/RSEmbedInfo.cpp
+++ b/lib/Renderscript/RSEmbedInfo.cpp
@@ -73,11 +73,13 @@
size_t exportVarCount = me.getExportVarCount();
size_t exportFuncCount = me.getExportFuncCount();
size_t exportForEachCount = me.getExportForEachSignatureCount();
+ size_t exportReduceCount = me.getExportReduceCount();
size_t objectSlotCount = me.getObjectSlotCount();
size_t pragmaCount = me.getPragmaCount();
const char **exportVarNameList = me.getExportVarNameList();
const char **exportFuncNameList = me.getExportFuncNameList();
const char **exportForEachNameList = me.getExportForEachNameList();
+ const char **exportReduceNameList = me.getExportReduceNameList();
const uint32_t *exportForEachSignatureList =
me.getExportForEachSignatureList();
const uint32_t *objectSlotList = me.getObjectSlotList();
@@ -111,6 +113,11 @@
<< exportForEachNameList[i] << "\n";
}
+ s << "exportReduceCount: " << exportReduceCount << "\n";
+ for (i = 0; i < exportReduceCount; ++i) {
+ s << exportReduceNameList[i] << "\n";
+ }
+
s << "objectSlotCount: " << objectSlotCount << "\n";
for (i = 0; i < objectSlotCount; ++i) {
s << objectSlotList[i] << "\n";
diff --git a/lib/Renderscript/RSForEachExpand.cpp b/lib/Renderscript/RSKernelExpand.cpp
similarity index 69%
rename from lib/Renderscript/RSForEachExpand.cpp
rename to lib/Renderscript/RSKernelExpand.cpp
index ce1fb58..34611d7 100644
--- a/lib/Renderscript/RSForEachExpand.cpp
+++ b/lib/Renderscript/RSKernelExpand.cpp
@@ -38,7 +38,14 @@
#include "bcinfo/MetadataExtractor.h"
-#define NUM_EXPANDED_FUNCTION_PARAMS 4
+#ifndef __DISABLE_ASSERTS
+// Only used in bccAssert()
+const int kNumExpandedForeachParams = 4;
+const int kNumExpandedReduceParams = 3;
+#endif
+
+const char kRenderScriptTBAARootName[] = "RenderScript Distinct TBAA";
+const char kRenderScriptTBAANodeName[] = "RenderScript TBAA";
using namespace bcc;
@@ -46,15 +53,17 @@
static const bool gEnableRsTbaa = true;
-/* RSForEachExpandPass - This pass operates on functions that are able to be
- * called via rsForEach() or "foreach_<NAME>". We create an inner loop for the
- * ForEach-able function to be invoked over the appropriate data cells of the
- * input/output allocations (adjusting other relevant parameters as we go). We
- * support doing this for any ForEach-able compute kernels. The new function
- * name is the original function name followed by ".expand". Note that we
- * still generate code for the original function.
+/* RSKernelExpandPass - This pass operates on functions that are able
+ * to be called via rsForEach(), "foreach_<NAME>", or
+ * "reduce_<NAME>". We create an inner loop for the function to be
+ * invoked over the appropriate data cells of the input/output
+ * allocations (adjusting other relevant parameters as we go). We
+ * support doing this for any forEach or reduce style compute
+ * kernels. The new function name is the original function name
+ * followed by ".expand". Note that we still generate code for the
+ * original function.
*/
-class RSForEachExpandPass : public llvm::ModulePass {
+class RSKernelExpandPass : public llvm::ModulePass {
public:
static char ID;
@@ -91,16 +100,19 @@
llvm::LLVMContext *Context;
/*
- * Pointer to LLVM type information for the the function signature
- * for expanded kernels. This must be re-calculated for each
- * module the pass is run on.
+   * Pointers to LLVM type information for the function signatures
+ * for expanded functions. These must be re-calculated for each module
+ * the pass is run on.
*/
- llvm::FunctionType *ExpandedFunctionType;
+ llvm::FunctionType *ExpandedForEachType, *ExpandedReduceType;
uint32_t mExportForEachCount;
const char **mExportForEachNameList;
const uint32_t *mExportForEachSignatureList;
+ uint32_t mExportReduceCount;
+ const char **mExportReduceNameList;
+
// Turns on optimization of allocation stride values.
bool mEnableStepOpt;
@@ -286,41 +298,68 @@
llvm::StructType::create(RsExpandKernelDriverInfoPfxTypes, "RsExpandKernelDriverInfoPfx");
// Create the function type for expanded kernels.
+ llvm::Type *VoidTy = llvm::Type::getVoidTy(*Context);
llvm::Type *RsExpandKernelDriverInfoPfxPtrTy = RsExpandKernelDriverInfoPfxTy->getPointerTo();
+ // void (const RsExpandKernelDriverInfoPfxTy *p, uint32_t x1, uint32_t x2, uint32_t outstep)
+ ExpandedForEachType = llvm::FunctionType::get(VoidTy,
+ {RsExpandKernelDriverInfoPfxPtrTy, Int32Ty, Int32Ty, Int32Ty}, false);
- llvm::SmallVector<llvm::Type*, 8> ParamTypes;
- ParamTypes.push_back(RsExpandKernelDriverInfoPfxPtrTy); // const RsExpandKernelDriverInfoPfx *p
- ParamTypes.push_back(Int32Ty); // uint32_t x1
- ParamTypes.push_back(Int32Ty); // uint32_t x2
- ParamTypes.push_back(Int32Ty); // uint32_t outstep
-
- ExpandedFunctionType =
- llvm::FunctionType::get(llvm::Type::getVoidTy(*Context), ParamTypes,
- false);
+ // void (void *inBuf, void *outBuf, uint32_t len)
+ ExpandedReduceType = llvm::FunctionType::get(VoidTy, {VoidPtrTy, VoidPtrTy, Int32Ty}, false);
}
- /// @brief Create skeleton of the expanded function.
+ /// @brief Create skeleton of the expanded foreach kernel.
///
/// This creates a function with the following signature:
///
/// void (const RsForEachStubParamStruct *p, uint32_t x1, uint32_t x2,
/// uint32_t outstep)
///
- llvm::Function *createEmptyExpandedFunction(llvm::StringRef OldName) {
+ llvm::Function *createEmptyExpandedForEachKernel(llvm::StringRef OldName) {
llvm::Function *ExpandedFunction =
- llvm::Function::Create(ExpandedFunctionType,
+ llvm::Function::Create(ExpandedForEachType,
llvm::GlobalValue::ExternalLinkage,
OldName + ".expand", Module);
-
- bccAssert(ExpandedFunction->arg_size() == NUM_EXPANDED_FUNCTION_PARAMS);
-
+ bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);
llvm::Function::arg_iterator AI = ExpandedFunction->arg_begin();
-
(AI++)->setName("p");
(AI++)->setName("x1");
(AI++)->setName("x2");
(AI++)->setName("arg_outstep");
+ llvm::BasicBlock *Begin = llvm::BasicBlock::Create(*Context, "Begin",
+ ExpandedFunction);
+ llvm::IRBuilder<> Builder(Begin);
+ Builder.CreateRetVoid();
+ return ExpandedFunction;
+ }
+
+ // Create skeleton of the expanded reduce kernel.
+ //
+ // This creates a function with the following signature:
+ //
+  //   void @func.expand(i8* nocapture %inBuf, i8* nocapture %outBuf, i32 %len)
+ //
+ llvm::Function *createEmptyExpandedReduceKernel(llvm::StringRef OldName) {
+ llvm::Function *ExpandedFunction =
+ llvm::Function::Create(ExpandedReduceType,
+ llvm::GlobalValue::ExternalLinkage,
+ OldName + ".expand", Module);
+ bccAssert(ExpandedFunction->arg_size() == kNumExpandedReduceParams);
+
+ llvm::Function::arg_iterator AI = ExpandedFunction->arg_begin();
+
+ using llvm::Attribute;
+
+ llvm::Argument *InBuf = &(*AI++);
+ InBuf->setName("inBuf");
+ InBuf->addAttr(llvm::AttributeSet::get(*Context, InBuf->getArgNo() + 1, llvm::makeArrayRef(Attribute::NoCapture)));
+
+ llvm::Argument *OutBuf = &(*AI++);
+ OutBuf->setName("outBuf");
+ OutBuf->addAttr(llvm::AttributeSet::get(*Context, OutBuf->getArgNo() + 1, llvm::makeArrayRef(Attribute::NoCapture)));
+
+ (AI++)->setName("len");
llvm::BasicBlock *Begin = llvm::BasicBlock::Create(*Context, "Begin",
ExpandedFunction);
@@ -444,7 +483,7 @@
//
// Returns:
// Returns a SmallVector of ConstantInts.
- SmallGEPIndices GEPHelper(std::initializer_list<int32_t> I32Args) {
+ SmallGEPIndices GEPHelper(const std::initializer_list<int32_t> I32Args) {
SmallGEPIndices Out(I32Args.size());
llvm::IntegerType *I32Ty = llvm::Type::getInt32Ty(*Context);
std::transform(I32Args.begin(), I32Args.end(), Out.begin(),
@@ -453,7 +492,7 @@
}
public:
- RSForEachExpandPass(bool pEnableStepOpt = true)
+ RSKernelExpandPass(bool pEnableStepOpt = true)
: ModulePass(ID), Module(nullptr), Context(nullptr),
mEnableStepOpt(pEnableStepOpt) {
@@ -536,7 +575,7 @@
* Module will contain a new function of the name "<NAME>.expand" that
* invokes <NAME>() in a loop with the appropriate parameters.
*/
- bool ExpandFunction(llvm::Function *Function, uint32_t Signature) {
+ bool ExpandOldStyleForEach(llvm::Function *Function, uint32_t Signature) {
ALOGV("Expanding ForEach-able Function %s",
Function->getName().str().c_str());
@@ -552,14 +591,14 @@
llvm::DataLayout DL(Module);
llvm::Function *ExpandedFunction =
- createEmptyExpandedFunction(Function->getName());
+ createEmptyExpandedForEachKernel(Function->getName());
/*
* Extract the expanded function's parameters. It is guaranteed by
- * createEmptyExpandedFunction that there will be five parameters.
+     * createEmptyExpandedForEachKernel that there will be four parameters.
*/
- bccAssert(ExpandedFunction->arg_size() == NUM_EXPANDED_FUNCTION_PARAMS);
+ bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);
llvm::Function::arg_iterator ExpandedFunctionArgIter =
ExpandedFunction->arg_begin();
@@ -672,24 +711,24 @@
return true;
}
- /* Expand a pass-by-value kernel.
+ /* Expand a pass-by-value foreach kernel.
*/
- bool ExpandKernel(llvm::Function *Function, uint32_t Signature) {
+ bool ExpandForEach(llvm::Function *Function, uint32_t Signature) {
bccAssert(bcinfo::MetadataExtractor::hasForEachSignatureKernel(Signature));
ALOGV("Expanding kernel Function %s", Function->getName().str().c_str());
- // TODO: Refactor this to share functionality with ExpandFunction.
+ // TODO: Refactor this to share functionality with ExpandOldStyleForEach.
llvm::DataLayout DL(Module);
llvm::Function *ExpandedFunction =
- createEmptyExpandedFunction(Function->getName());
+ createEmptyExpandedForEachKernel(Function->getName());
/*
* Extract the expanded function's parameters. It is guaranteed by
- * createEmptyExpandedFunction that there will be five parameters.
+     * createEmptyExpandedForEachKernel that there will be four parameters.
*/
- bccAssert(ExpandedFunction->arg_size() == NUM_EXPANDED_FUNCTION_PARAMS);
+ bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);
llvm::Function::arg_iterator ExpandedFunctionArgIter =
ExpandedFunction->arg_begin();
@@ -697,7 +736,7 @@
llvm::Value *Arg_p = &*(ExpandedFunctionArgIter++);
llvm::Value *Arg_x1 = &*(ExpandedFunctionArgIter++);
llvm::Value *Arg_x2 = &*(ExpandedFunctionArgIter++);
- llvm::Value *Arg_outstep = &*(ExpandedFunctionArgIter);
+ // Arg_outstep is not used by expanded new-style forEach kernels.
// Construct the actual function body.
llvm::IRBuilder<> Builder(ExpandedFunction->getEntryBlock().begin());
@@ -708,8 +747,8 @@
llvm::MDBuilder MDHelper(*Context);
TBAARenderScriptDistinct =
- MDHelper.createTBAARoot("RenderScript Distinct TBAA");
- TBAARenderScript = MDHelper.createTBAANode("RenderScript TBAA",
+ MDHelper.createTBAARoot(kRenderScriptTBAARootName);
+ TBAARenderScript = MDHelper.createTBAANode(kRenderScriptTBAANodeName,
TBAARenderScriptDistinct);
TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation",
TBAARenderScript);
@@ -719,10 +758,6 @@
TBAARenderScript);
TBAAPointer = MDHelper.createTBAAStructTagNode(TBAAPointer, TBAAPointer, 0);
- llvm::MDNode *AliasingDomain, *AliasingScope;
- AliasingDomain = MDHelper.createAnonymousAliasScopeDomain("RS argument scope domain");
- AliasingScope = MDHelper.createAnonymousAliasScope(AliasingDomain, "RS argument scope");
-
/*
* Collect and construct the arguments for the kernel().
*
@@ -738,7 +773,6 @@
// Check the return type
llvm::Type *OutTy = nullptr;
- llvm::Value *OutStep = nullptr;
llvm::LoadInst *OutBasePtr = nullptr;
llvm::Value *CastedOutBasePtr = nullptr;
@@ -758,8 +792,6 @@
OutTy = OutBaseTy->getPointerTo();
}
- OutStep = getStepValue(&DL, OutTy, Arg_outstep);
- OutStep->setName("outstep");
SmallGEPIndices OutBaseGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldOutPtr, 0}));
OutBasePtr = Builder.CreateLoad(Builder.CreateInBoundsGEP(Arg_p, OutBaseGEP, "out_buf.gep"));
@@ -767,13 +799,10 @@
OutBasePtr->setMetadata("tbaa", TBAAPointer);
}
- OutBasePtr->setMetadata("alias.scope", AliasingScope);
-
CastedOutBasePtr = Builder.CreatePointerCast(OutBasePtr, OutTy, "casted_out");
}
llvm::SmallVector<llvm::Type*, 8> InTypes;
- llvm::SmallVector<llvm::Value*, 8> InSteps;
llvm::SmallVector<llvm::Value*, 8> InBufPtrs;
llvm::SmallVector<llvm::Value*, 8> InStructTempSlots;
@@ -803,11 +832,6 @@
Builder.SetInsertPoint(LoopHeader->getTerminator());
for (size_t InputIndex = 0; InputIndex < NumInPtrArguments; ++InputIndex, ArgIter++) {
- SmallGEPIndices InStepGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInStride,
- static_cast<int32_t>(InputIndex)}));
- llvm::Value *InStepAddr = Builder.CreateInBoundsGEP(Arg_p, InStepGEP, "instep_addr.gep");
- llvm::LoadInst *InStepArg = Builder.CreateLoad(InStepAddr, "instep_addr");
-
llvm::Type *InType = ArgIter->getType();
/*
@@ -829,10 +853,6 @@
InStructTempSlots.push_back(nullptr);
}
- llvm::Value *InStep = getStepValue(&DL, InType, InStepArg);
-
- InStep->setName("instep");
-
SmallGEPIndices InBufPtrGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInPtr,
static_cast<int32_t>(InputIndex)}));
llvm::Value *InBufPtrAddr = Builder.CreateInBoundsGEP(Arg_p, InBufPtrGEP, "input_buf.gep");
@@ -842,10 +862,7 @@
InBufPtr->setMetadata("tbaa", TBAAPointer);
}
- InBufPtr->setMetadata("alias.scope", AliasingScope);
-
InTypes.push_back(InType);
- InSteps.push_back(InStep);
InBufPtrs.push_back(CastInBufPtr);
}
@@ -855,23 +872,13 @@
// Populate the actual call to kernel().
llvm::SmallVector<llvm::Value*, 8> RootArgs;
- // Calculate the current input and output pointers
- //
- //
- // We always calculate the input/output pointers with a GEP operating on i8
- // values combined with a multiplication and only cast at the very end to
- // OutTy. This is to account for dynamic stepping sizes when the value
- // isn't apparent at compile time. In the (very common) case when we know
- // the step size at compile time, due to haveing complete type information
- // this multiplication will optmized out and produces code equivalent to a
- // a GEP on a pointer of the correct type.
+ // Calculate the current input and output pointers.
// Output
llvm::Value *OutPtr = nullptr;
if (CastedOutBasePtr) {
llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1);
-
OutPtr = Builder.CreateInBoundsGEP(CastedOutBasePtr, OutOffset);
if (PassOutByPointer) {
@@ -888,31 +895,22 @@
llvm::Value *InPtr = Builder.CreateInBoundsGEP(InBufPtrs[Index], Offset);
llvm::Value *Input;
+ llvm::LoadInst *InputLoad = Builder.CreateLoad(InPtr, "input");
+
+ if (gEnableRsTbaa) {
+ InputLoad->setMetadata("tbaa", TBAAAllocation);
+ }
+
if (llvm::Value *TemporarySlot = InStructTempSlots[Index]) {
// Pass a pointer to a temporary on the stack, rather than
// passing a pointer to the original value. We do not want
// the kernel to potentially modify the input data.
- llvm::Type *ElementType = llvm::cast<llvm::PointerType>(
- InPtr->getType())->getElementType();
- uint64_t StoreSize = DL.getTypeStoreSize(ElementType);
- uint64_t Alignment = DL.getABITypeAlignment(ElementType);
-
- Builder.CreateMemCpy(TemporarySlot, InPtr, StoreSize, Alignment,
- /* isVolatile = */ false,
- /* !tbaa = */ gEnableRsTbaa ? TBAAAllocation : nullptr,
- /* !tbaa.struct = */ nullptr,
- /* !alias.scope = */ AliasingScope);
+ // Note: don't annotate with TBAA, since the kernel might
+ // have its own TBAA annotations for the pointer argument.
+ Builder.CreateStore(InputLoad, TemporarySlot);
Input = TemporarySlot;
} else {
- llvm::LoadInst *InputLoad = Builder.CreateLoad(InPtr, "input");
-
- if (gEnableRsTbaa) {
- InputLoad->setMetadata("tbaa", TBAAAllocation);
- }
-
- InputLoad->setMetadata("alias.scope", AliasingScope);
-
Input = InputLoad;
}
@@ -925,11 +923,274 @@
llvm::Value *RetVal = Builder.CreateCall(Function, RootArgs);
if (OutPtr && !PassOutByPointer) {
+ RetVal->setName("call.result");
llvm::StoreInst *Store = Builder.CreateStore(RetVal, OutPtr);
if (gEnableRsTbaa) {
Store->setMetadata("tbaa", TBAAAllocation);
}
- Store->setMetadata("alias.scope", AliasingScope);
+ }
+
+ return true;
+ }
+
+ // Expand a reduce-style kernel function.
+ //
+ // The input is a kernel which represents a binary operation,
+ // of the form
+ //
+ // define foo @func(foo %a, foo %b),
+ //
+ // (More generally, it can be of the forms
+ //
+ // define void @func(foo* %ret, foo* %a, foo* %b)
+ // define void @func(foo* %ret, foo1 %a, foo1 %b)
+ // define foo1 @func(foo2 %a, foo2 %b)
+ //
+ // as a result of argument / return value conversions. Here, "foo1"
+ // and "foo2" refer to possibly coerced types, and the coerced
+ // argument type may be different from the coerced return type. See
+ // "Note on coercion" below.)
+ //
+  // Note also, we do not expect to encounter any case when the
+  // arguments are promoted to pointers but the return value is
+  // not promoted to a pointer, e.g.
+ //
+ // define foo1 @func(foo* %a, foo* %b)
+ //
+  // and we will throw an assertion in this case.
+ //
+ // The input kernel gets expanded into a kernel of the form
+ //
+  //    define void @func.expand(i8* %inBuf, i8* %outBuf, i32 %len)
+ //
+ // which performs a serial reduction of `len` elements from `inBuf`,
+ // and stores the result into `outBuf`. In pseudocode, @func.expand
+ // does:
+ //
+ // inArr := (foo *)inBuf;
+ // accum := inArr[0];
+ // for (i := 1; i < len; ++i) {
+  //      accum := func(accum, inArr[i]);
+ // }
+ // *(foo *)outBuf := accum;
+ //
+ // Note on coercion
+ //
+ // Both the return value and the argument types may undergo internal
+ // coercion in clang as part of call lowering. As a result, the
+ // return value type may differ from the argument type even if the
+  // types in the RenderScript signature are the same. For instance, the
+ // kernel
+ //
+ // int3 add(int3 a, int3 b) { return a + b; }
+ //
+ // gets lowered by clang as
+ //
+ // define <3 x i32> @add(<4 x i32> %a.coerce, <4 x i32> %b.coerce)
+ //
+ // under AArch64. The details of this process are found in clang,
+ // lib/CodeGen/TargetInfo.cpp, under classifyArgumentType() and
+ // classifyReturnType() in ARMABIInfo, AArch64ABIInfo. If the value
+ // is passed by pointer, then the pointed-to type is not coerced.
+ //
+ // Since we lack the original type information, this code does loads
+ // and stores of allocation data by way of pointers to the coerced
+ // type.
+ bool ExpandReduce(llvm::Function *Function) {
+ bccAssert(Function);
+
+ ALOGV("Expanding reduce kernel %s", Function->getName().str().c_str());
+
+ llvm::DataLayout DL(Module);
+
+ // TBAA Metadata
+ llvm::MDNode *TBAARenderScriptDistinct, *TBAARenderScript, *TBAAAllocation;
+ llvm::MDBuilder MDHelper(*Context);
+
+ TBAARenderScriptDistinct =
+ MDHelper.createTBAARoot(kRenderScriptTBAARootName);
+ TBAARenderScript = MDHelper.createTBAANode(kRenderScriptTBAANodeName,
+ TBAARenderScriptDistinct);
+ TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation",
+ TBAARenderScript);
+ TBAAAllocation = MDHelper.createTBAAStructTagNode(TBAAAllocation,
+ TBAAAllocation, 0);
+
+ llvm::Function *ExpandedFunction =
+ createEmptyExpandedReduceKernel(Function->getName());
+
+ // Extract the expanded kernel's parameters. It is guaranteed by
+    // createEmptyExpandedReduceKernel that there will be 3 parameters.
+ auto ExpandedFunctionArgIter = ExpandedFunction->arg_begin();
+
+ llvm::Value *Arg_inBuf = &*(ExpandedFunctionArgIter++);
+ llvm::Value *Arg_outBuf = &*(ExpandedFunctionArgIter++);
+ llvm::Value *Arg_len = &*(ExpandedFunctionArgIter++);
+
+ bccAssert(Function->arg_size() == 2 || Function->arg_size() == 3);
+
+ // Check if, instead of returning a value, the original kernel has
+ // a pointer parameter which points to a temporary buffer into
+ // which the return value gets written.
+ const bool ReturnValuePointerStyle = (Function->arg_size() == 3);
+ bccAssert(Function->getReturnType()->isVoidTy() == ReturnValuePointerStyle);
+
+ // Check if, instead of being passed by value, the inputs to the
+ // original kernel are passed by pointer.
+ auto FirstArgIter = Function->arg_begin();
+ // The second argument is always an input to the original kernel.
+ auto SecondArgIter = std::next(FirstArgIter);
+ const bool InputsPointerStyle = SecondArgIter->getType()->isPointerTy();
+
+ // Get the output type (i.e. return type of the original kernel).
+ llvm::PointerType *OutPtrTy = nullptr;
+ llvm::Type *OutTy = nullptr;
+ if (ReturnValuePointerStyle) {
+ OutPtrTy = llvm::dyn_cast<llvm::PointerType>(FirstArgIter->getType());
+ bccAssert(OutPtrTy && "Expected a pointer parameter to kernel");
+ OutTy = OutPtrTy->getElementType();
+ } else {
+ OutTy = Function->getReturnType();
+ bccAssert(!OutTy->isVoidTy());
+ OutPtrTy = OutTy->getPointerTo();
+ }
+
+ // Get the input type (type of the arguments to the original
+ // kernel). Some input types are different from the output type,
+ // due to explicit coercion that the compiler performs when
+ // lowering the parameters. See "Note on coercion" above.
+ llvm::PointerType *InPtrTy;
+ llvm::Type *InTy;
+ if (InputsPointerStyle) {
+ InPtrTy = llvm::dyn_cast<llvm::PointerType>(SecondArgIter->getType());
+ bccAssert(InPtrTy && "Expected a pointer parameter to kernel");
+ bccAssert(ReturnValuePointerStyle);
+ bccAssert(std::next(SecondArgIter)->getType() == InPtrTy &&
+ "Input type mismatch");
+ InTy = InPtrTy->getElementType();
+ } else {
+ InTy = SecondArgIter->getType();
+ InPtrTy = InTy->getPointerTo();
+ if (!ReturnValuePointerStyle) {
+ bccAssert(InTy == FirstArgIter->getType() && "Input type mismatch");
+ } else {
+ bccAssert(InTy == std::next(SecondArgIter)->getType() &&
+ "Input type mismatch");
+ }
+ }
+
+ // The input type should take up the same amount of space in
+ // memory as the output type.
+ bccAssert(DL.getTypeAllocSize(InTy) == DL.getTypeAllocSize(OutTy));
+
+ // Construct the actual function body.
+ llvm::IRBuilder<> Builder(ExpandedFunction->getEntryBlock().begin());
+
+ // Cast input and output buffers to appropriate types.
+ llvm::Value *InBuf = Builder.CreatePointerCast(Arg_inBuf, InPtrTy);
+ llvm::Value *OutBuf = Builder.CreatePointerCast(Arg_outBuf, OutPtrTy);
+
+ // Create a slot to pass temporary results back. This needs to be
+ // separate from the accumulator slot because the kernel may mark
+ // the return value slot as noalias.
+ llvm::Value *ReturnBuf = nullptr;
+ if (ReturnValuePointerStyle) {
+ ReturnBuf = Builder.CreateAlloca(OutTy, nullptr, "ret.tmp");
+ }
+
+ // Create a slot to hold the second input if the inputs are passed
+ // by pointer to the original kernel. We cannot directly pass a
+ // pointer to the input buffer, because the kernel may modify its
+ // inputs.
+ llvm::Value *SecondInputTempBuf = nullptr;
+ if (InputsPointerStyle) {
+ SecondInputTempBuf = Builder.CreateAlloca(InTy, nullptr, "in.tmp");
+ }
+
+ // Create a slot to accumulate temporary results, and fill it with
+ // the first value.
+ llvm::Value *AccumBuf = Builder.CreateAlloca(OutTy, nullptr, "accum");
+ // Cast to OutPtrTy before loading, since AccumBuf has type OutPtrTy.
+ llvm::LoadInst *FirstElementLoad = Builder.CreateLoad(
+ Builder.CreatePointerCast(InBuf, OutPtrTy));
+ if (gEnableRsTbaa) {
+ FirstElementLoad->setMetadata("tbaa", TBAAAllocation);
+ }
+ // Memory operations with AccumBuf shouldn't be marked with
+ // RenderScript TBAA, since this might conflict with TBAA metadata
+ // in the kernel function when AccumBuf is passed by pointer.
+ Builder.CreateStore(FirstElementLoad, AccumBuf);
+
+ // Loop body
+
+ // Create the loop structure. Note that the first input in the input buffer
+ // has already been accumulated, so that we start at index 1.
+ llvm::PHINode *IndVar;
+ llvm::Value *Start = llvm::ConstantInt::get(Arg_len->getType(), 1);
+ llvm::BasicBlock *Exit = createLoop(Builder, Start, Arg_len, &IndVar);
+
+ llvm::Value *InputPtr = Builder.CreateInBoundsGEP(InBuf, IndVar, "next_input.gep");
+
+ // Set up arguments and call the original (unexpanded) kernel.
+ //
+ // The original kernel can have at most 3 arguments, which is
+ // achieved when the signature looks like:
+ //
+ // define void @func(foo* %ret, bar %a, bar %b)
+ //
+ // (bar can be one of foo/foo.coerce/foo*).
+ llvm::SmallVector<llvm::Value *, 3> KernelArgs;
+
+ if (ReturnValuePointerStyle) {
+ KernelArgs.push_back(ReturnBuf);
+ }
+
+ if (InputsPointerStyle) {
+ bccAssert(ReturnValuePointerStyle);
+ // Because the return buffer is copied back into the
+ // accumulator, it's okay if the accumulator is overwritten.
+ KernelArgs.push_back(AccumBuf);
+
+ llvm::LoadInst *InputLoad = Builder.CreateLoad(InputPtr);
+ if (gEnableRsTbaa) {
+ InputLoad->setMetadata("tbaa", TBAAAllocation);
+ }
+ Builder.CreateStore(InputLoad, SecondInputTempBuf);
+
+ KernelArgs.push_back(SecondInputTempBuf);
+ } else {
+ // InPtrTy may be different from OutPtrTy (the type of
+ // AccumBuf), so first cast the accumulator buffer to the
+ // pointer type corresponding to the input argument type.
+ KernelArgs.push_back(
+ Builder.CreateLoad(Builder.CreatePointerCast(AccumBuf, InPtrTy)));
+
+ llvm::LoadInst *LoadedArg = Builder.CreateLoad(InputPtr);
+ if (gEnableRsTbaa) {
+ LoadedArg->setMetadata("tbaa", TBAAAllocation);
+ }
+ KernelArgs.push_back(LoadedArg);
+ }
+
+ llvm::Value *RetVal = Builder.CreateCall(Function, KernelArgs);
+
+ const uint64_t ElementSize = DL.getTypeStoreSize(OutTy);
+ const uint64_t ElementAlign = DL.getABITypeAlignment(OutTy);
+
+ // Store the output in the accumulator.
+ if (ReturnValuePointerStyle) {
+ Builder.CreateMemCpy(AccumBuf, ReturnBuf, ElementSize, ElementAlign);
+ } else {
+ Builder.CreateStore(RetVal, AccumBuf);
+ }
+
+ // Loop exit
+ Builder.SetInsertPoint(Exit, Exit->begin());
+
+ llvm::LoadInst *OutputLoad = Builder.CreateLoad(AccumBuf);
+ llvm::StoreInst *OutputStore = Builder.CreateStore(OutputLoad, OutBuf);
+ if (gEnableRsTbaa) {
+ OutputStore->setMetadata("tbaa", TBAAAllocation);
}
return true;
@@ -1020,31 +1281,31 @@
virtual bool runOnModule(llvm::Module &Module) {
bool Changed = false;
this->Module = &Module;
- this->Context = &Module.getContext();
+ Context = &Module.getContext();
- this->buildTypes();
+ buildTypes();
bcinfo::MetadataExtractor me(&Module);
if (!me.extract()) {
ALOGE("Could not extract metadata from module!");
return false;
}
+
+ // Expand forEach_* style kernels.
mExportForEachCount = me.getExportForEachSignatureCount();
mExportForEachNameList = me.getExportForEachNameList();
mExportForEachSignatureList = me.getExportForEachSignatureList();
- bool AllocsExposed = allocPointersExposed(Module);
-
for (size_t i = 0; i < mExportForEachCount; ++i) {
const char *name = mExportForEachNameList[i];
uint32_t signature = mExportForEachSignatureList[i];
llvm::Function *kernel = Module.getFunction(name);
if (kernel) {
if (bcinfo::MetadataExtractor::hasForEachSignatureKernel(signature)) {
- Changed |= ExpandKernel(kernel, signature);
+ Changed |= ExpandForEach(kernel, signature);
kernel->setLinkage(llvm::GlobalValue::InternalLinkage);
} else if (kernel->getReturnType()->isVoidTy()) {
- Changed |= ExpandFunction(kernel, signature);
+ Changed |= ExpandOldStyleForEach(kernel, signature);
kernel->setLinkage(llvm::GlobalValue::InternalLinkage);
} else {
// There are some graphics root functions that are not
@@ -1054,7 +1315,18 @@
}
}
- if (gEnableRsTbaa && !AllocsExposed) {
+ // Expand reduce_* style kernels.
+ mExportReduceCount = me.getExportReduceCount();
+ mExportReduceNameList = me.getExportReduceNameList();
+
+ for (size_t i = 0; i < mExportReduceCount; ++i) {
+ llvm::Function *kernel = Module.getFunction(mExportReduceNameList[i]);
+ if (kernel) {
+ Changed |= ExpandReduce(kernel);
+ }
+ }
+
+ if (gEnableRsTbaa && !allocPointersExposed(Module)) {
connectRenderScriptTBAAMetadata(Module);
}
@@ -1062,21 +1334,21 @@
}
virtual const char *getPassName() const {
- return "ForEach-able Function Expansion";
+ return "forEach_* and reduce_* function expansion";
}
-}; // end RSForEachExpandPass
+}; // end RSKernelExpandPass
} // end anonymous namespace
-char RSForEachExpandPass::ID = 0;
-static llvm::RegisterPass<RSForEachExpandPass> X("foreachexp", "ForEach Expand Pass");
+char RSKernelExpandPass::ID = 0;
+static llvm::RegisterPass<RSKernelExpandPass> X("kernelexp", "Kernel Expand Pass");
namespace bcc {
llvm::ModulePass *
-createRSForEachExpandPass(bool pEnableStepOpt){
- return new RSForEachExpandPass(pEnableStepOpt);
+createRSKernelExpandPass(bool pEnableStepOpt) {
+ return new RSKernelExpandPass(pEnableStepOpt);
}
} // end namespace bcc
diff --git a/lib/Renderscript/RSStubsWhiteList.cpp b/lib/Renderscript/RSStubsWhiteList.cpp
index b69681d..426fb43 100644
--- a/lib/Renderscript/RSStubsWhiteList.cpp
+++ b/lib/Renderscript/RSStubsWhiteList.cpp
@@ -1235,6 +1235,7 @@
"_Z3madfff",
"_Z3maxDv2_cS_",
"_Z3maxDv2_fS_",
+"_Z3maxDv2_ff",
"_Z3maxDv2_hS_",
"_Z3maxDv2_iS_",
"_Z3maxDv2_jS_",
@@ -1244,6 +1245,7 @@
"_Z3maxDv2_tS_",
"_Z3maxDv3_cS_",
"_Z3maxDv3_fS_",
+"_Z3maxDv3_ff",
"_Z3maxDv3_hS_",
"_Z3maxDv3_iS_",
"_Z3maxDv3_jS_",
@@ -1253,6 +1255,7 @@
"_Z3maxDv3_tS_",
"_Z3maxDv4_cS_",
"_Z3maxDv4_fS_",
+"_Z3maxDv4_ff",
"_Z3maxDv4_hS_",
"_Z3maxDv4_iS_",
"_Z3maxDv4_jS_",
@@ -1271,6 +1274,7 @@
"_Z3maxtt",
"_Z3minDv2_cS_",
"_Z3minDv2_fS_",
+"_Z3minDv2_ff",
"_Z3minDv2_hS_",
"_Z3minDv2_iS_",
"_Z3minDv2_jS_",
@@ -1280,6 +1284,7 @@
"_Z3minDv2_tS_",
"_Z3minDv3_cS_",
"_Z3minDv3_fS_",
+"_Z3minDv3_ff",
"_Z3minDv3_hS_",
"_Z3minDv3_iS_",
"_Z3minDv3_jS_",
@@ -1289,6 +1294,7 @@
"_Z3minDv3_tS_",
"_Z3minDv4_cS_",
"_Z3minDv4_fS_",
+"_Z3minDv4_ff",
"_Z3minDv4_hS_",
"_Z3minDv4_iS_",
"_Z3minDv4_jS_",
diff --git a/lib/Support/CompilerConfig.cpp b/lib/Support/CompilerConfig.cpp
index eac26aa..71cd7cc 100644
--- a/lib/Support/CompilerConfig.cpp
+++ b/lib/Support/CompilerConfig.cpp
@@ -155,7 +155,9 @@
#if defined(TARGET_BUILD)
if (!getProperty("debug.rs.arm-no-tune-for-cpu")) {
#ifndef FORCE_CPU_VARIANT_32
+#ifdef DEFAULT_ARM_CODEGEN
setCPU(llvm::sys::getHostCPUName());
+#endif
#else
#define XSTR(S) #S
#define STR(S) XSTR(S)
@@ -175,7 +177,9 @@
#if defined(TARGET_BUILD)
if (!getProperty("debug.rs.arm-no-tune-for-cpu")) {
#ifndef FORCE_CPU_VARIANT_64
+#ifdef DEFAULT_ARM64_CODEGEN
setCPU(llvm::sys::getHostCPUName());
+#endif
#else
#define XSTR(S) #S
#define STR(S) XSTR(S)
diff --git a/tests/libbcc/getelementptr.ll b/tests/libbcc/getelementptr.ll
index 6f3e175..1cf201a 100644
--- a/tests/libbcc/getelementptr.ll
+++ b/tests/libbcc/getelementptr.ll
@@ -3,7 +3,7 @@
; that they index into the right positions of the structure and that
; the instructions that are generated are in the loop header.
-; RUN: opt -load libbcc.so -foreachexp -S < %s | FileCheck %s
+; RUN: opt -load libbcc.so -kernelexp -S < %s | FileCheck %s
; ModuleID = 'test_getelementptr.bc'
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
@@ -41,7 +41,7 @@
; New style kernel with multiple inputs
define i32 @foo(i32 %in0, i32 %in1, i32 %x, i32 %y, i32 %z) {
ret i32 0
-; CHECK: define void @foo.expand(%RsExpandKernelDriverInfoPfx* %p, i32 %x1, i32 %x2, i32 %outstep)
+; CHECK: define void @foo.expand(%RsExpandKernelDriverInfoPfx* %p, i32 %x1, i32 %x2, i32 %arg_outstep)
; CHECK: Begin:
; CHECK: %out_buf.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 3, i32 0
; CHECK: load i8*, i8** %out_buf.gep
@@ -49,14 +49,10 @@
; CHECK: load i32, i32* %Y.gep
; CHECK: %Z.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 7, i32 2
; CHECK: load i32, i32* %Z.gep
-; CHECK: %instep_addr.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 1, i32 0
-; CHECK: load i32, i32* %instep_addr.gep
; CHECK: %input_buf.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 0, i32 0
; CHECK: load i8*, i8** %input_buf.gep
-; CHECK: %instep_addr.gep1 = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 1, i32 1
-; CHECK: load i32, i32* %instep_addr.gep1
-; CHECK: %input_buf.gep3 = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 0, i32 1
-; CHECK: load i8*, i8** %input_buf.gep3
+; CHECK: %input_buf.gep1 = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 0, i32 1
+; CHECK: load i8*, i8** %input_buf.gep1
; CHECK: Loop:
}
diff --git a/tests/libbcc/tbaa-through-alloca.ll b/tests/libbcc/tbaa-through-alloca.ll
new file mode 100644
index 0000000..5b0a270
--- /dev/null
+++ b/tests/libbcc/tbaa-through-alloca.ll
@@ -0,0 +1,71 @@
+; This test checks that the code doesn't aggressively apply TBAA
+; metadata to temporaries that are passed by pointer to kernels.
+
+; RUN: opt -load libbcc.so -kernelexp -inline -tbaa -aa-eval -print-may-aliases -evaluate-aa-metadata < %s -S -o - 2>&1 | FileCheck %s
+
+; ModuleID = 'kernel.bc'
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-linux-gnueabi"
+
+; Declarations expected by the expansion pass.
+declare void @_Z14rsGetElementAt13rs_allocationj()
+declare void @_Z14rsGetElementAt13rs_allocationjj()
+declare void @_Z14rsGetElementAt13rs_allocationjjj()
+declare void @_Z14rsSetElementAt13rs_allocationPvj()
+declare void @_Z14rsSetElementAt13rs_allocationPvjj()
+declare void @_Z14rsSetElementAt13rs_allocationPvjjj()
+declare void @_Z25rsGetElementAtYuv_uchar_Y13rs_allocationjj()
+declare void @_Z25rsGetElementAtYuv_uchar_U13rs_allocationjj()
+declare void @_Z25rsGetElementAtYuv_uchar_V13rs_allocationjj()
+
+%struct.int5.0 = type { [5 x i32] }
+
+; Function Attrs: nounwind
+define void @add1_int5(%struct.int5.0* noalias nocapture sret %agg.result, %struct.int5.0* nocapture %in) #0 {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds %struct.int5.0, %struct.int5.0* %in, i64 0, i32 0, i64 %indvars.iv
+; CHECK: MayAlias: %load_from_input{{.*}} <-> store %struct.int5.0 %input, %struct.int5.0* %input_struct_slot
+ %load_from_input = load i32, i32* %2, align 4, !tbaa !9
+ %3 = add nsw i32 %load_from_input, 1
+ store i32 %3, i32* %2, align 4, !tbaa !9
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 5
+ br i1 %exitcond, label %4, label %1
+
+; <label>:4 ; preds = %1
+ %5 = bitcast %struct.int5.0* %agg.result to i8*
+ %6 = bitcast %struct.int5.0* %in to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %5, i8* %6, i64 20, i32 4, i1 false), !tbaa.struct !13
+ ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #0
+
+attributes #0 = { nounwind }
+
+!llvm.ident = !{!0}
+!\23pragma = !{!1, !2}
+!\23rs_export_foreach_name = !{!3, !4}
+!\23rs_export_foreach = !{!5, !6}
+!\23rs_export_type = !{!7}
+!\25int5 = !{!8}
+
+!0 = !{!"clang version 3.6 "}
+!1 = !{!"version", !"1"}
+!2 = !{!"java_package_name", !"foo"}
+!3 = !{!"root"}
+!4 = !{!"add1_int5"}
+!5 = !{!"0"}
+!6 = !{!"35"}
+!7 = !{!"int5"}
+!8 = !{!"data", !"<ConstantArray>"}
+!9 = !{!10, !10, i64 0}
+!10 = !{!"int", !11, i64 0}
+!11 = !{!"omnipotent char", !12, i64 0}
+!12 = !{!"Simple C/C++ TBAA"}
+!13 = !{i64 0, i64 20, !14}
+!14 = !{!11, !11, i64 0}
diff --git a/tests/libbcc/tbaa.ll b/tests/libbcc/tbaa.ll
new file mode 100644
index 0000000..6d8cb48
--- /dev/null
+++ b/tests/libbcc/tbaa.ll
@@ -0,0 +1,43 @@
+; Basic test of TBAA that should report that pointer loads do not
+; alias with stores to allocations.
+
+; RUN: opt -load libbcc.so -kernelexp -tbaa -aa-eval -print-no-aliases -evaluate-aa-metadata < %s -S -o - 2>&1 | FileCheck %s
+
+; ModuleID = 'kernel.bc'
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-linux-gnueabi"
+
+; Declarations expected by the expansion pass.
+declare void @_Z14rsGetElementAt13rs_allocationj()
+declare void @_Z14rsGetElementAt13rs_allocationjj()
+declare void @_Z14rsGetElementAt13rs_allocationjjj()
+declare void @_Z14rsSetElementAt13rs_allocationPvj()
+declare void @_Z14rsSetElementAt13rs_allocationPvjj()
+declare void @_Z14rsSetElementAt13rs_allocationPvjjj()
+declare void @_Z25rsGetElementAtYuv_uchar_Y13rs_allocationjj()
+declare void @_Z25rsGetElementAtYuv_uchar_U13rs_allocationjj()
+declare void @_Z25rsGetElementAtYuv_uchar_V13rs_allocationjj()
+
+; CHECK: NoAlias: %0 = load {{.*}}, i8** %out_buf.gep, !tbaa {{.*}} <-> store i32 %call.result, i32* {{.*}}, !tbaa {{.*}}
+; CHECK: NoAlias: %input_buf = load i8*, i8** %input_buf.gep, !tbaa {{.*}} <-> store i32 %call.result, i32* {{.*}}, !tbaa {{.*}}
+
+; Function Attrs: nounwind readnone
+define i32 @add1(i32 %in) #0 {
+ %1 = add nsw i32 %in, 1
+ ret i32 %1
+}
+
+attributes #0 = { nounwind readnone }
+
+!llvm.ident = !{!0}
+!\23pragma = !{!1, !2}
+!\23rs_export_foreach_name = !{!3, !4}
+!\23rs_export_foreach = !{!5, !6}
+
+!0 = !{!"clang version 3.6 "}
+!1 = !{!"version", !"1"}
+!2 = !{!"java_package_name", !"foo"}
+!3 = !{!"root"}
+!4 = !{!"add1"}
+!5 = !{!"0"}
+!6 = !{!"35"}