am 434917b0: (-s ours) am a38e29ba: Fix a bug where kernels could modify the input allocation.
* commit '434917b0125bcb2a638697eb21cb3d89a0ad8f24':
diff --git a/Android.mk b/Android.mk
index 1ab8386..ad21289 100644
--- a/Android.mk
+++ b/Android.mk
@@ -98,12 +98,20 @@
libcutils \
liblog
-LOCAL_SHARED_LIBRARIES := libbcinfo libLLVM
+LOCAL_SHARED_LIBRARIES := libbcinfo
ifndef USE_MINGW
LOCAL_LDLIBS := -ldl -lpthread
endif
+include $(LIBBCC_ROOT_PATH)/llvm-loadable-libbcc.mk
+
+ifeq ($(CAN_BUILD_HOST_LLVM_LOADABLE_MODULE),true)
+LOCAL_STATIC_LIBRARIES += libLLVMLinker
+else
+LOCAL_SHARED_LIBRARIES += libLLVM
+endif
+
include $(LIBBCC_HOST_BUILD_MK)
include $(LLVM_HOST_BUILD_MK)
include $(BUILD_HOST_SHARED_LIBRARY)
diff --git a/bcinfo/Android.mk b/bcinfo/Android.mk
index 5a55014..3da0d34 100644
--- a/bcinfo/Android.mk
+++ b/bcinfo/Android.mk
@@ -80,12 +80,17 @@
LOCAL_STATIC_LIBRARIES += $(libbcinfo_STATIC_LIBRARIES)
LOCAL_STATIC_LIBRARIES += libcutils liblog
-LOCAL_SHARED_LIBRARIES += libLLVM
ifndef USE_MINGW
LOCAL_LDLIBS := -ldl -lpthread
endif
+include $(LOCAL_PATH)/../llvm-loadable-libbcc.mk
+
+ifneq ($(CAN_BUILD_HOST_LLVM_LOADABLE_MODULE),true)
+LOCAL_SHARED_LIBRARIES += libLLVM
+endif
+
include $(LLVM_ROOT_PATH)/llvm-host-build.mk
include $(BUILD_HOST_SHARED_LIBRARY)
diff --git a/bcinfo/BitReader_2_7/Android.mk b/bcinfo/BitReader_2_7/Android.mk
index 5cd3b7b..181c731 100644
--- a/bcinfo/BitReader_2_7/Android.mk
+++ b/bcinfo/BitReader_2_7/Android.mk
@@ -1,6 +1,6 @@
LOCAL_PATH:= $(call my-dir)
-LLVM_ROOT_PATH := $(LOCAL_PATH)/../../../../../external/llvm
+LLVM_ROOT_PATH := external/llvm
include $(LLVM_ROOT_PATH)/llvm.mk
bitcode_reader_2_7_SRC_FILES := \
diff --git a/bcinfo/BitReader_2_7/BitcodeReader.cpp b/bcinfo/BitReader_2_7/BitcodeReader.cpp
index ea910ee..894b801 100644
--- a/bcinfo/BitReader_2_7/BitcodeReader.cpp
+++ b/bcinfo/BitReader_2_7/BitcodeReader.cpp
@@ -262,9 +262,9 @@
bool isDematerializable(const GlobalValue *GV) const override;
std::error_code materialize(GlobalValue *GV) override;
- std::error_code MaterializeModule(Module *M) override;
+ std::error_code materializeModule(Module *M) override;
std::vector<StructType *> getIdentifiedStructTypes() const override;
- void Dematerialize(GlobalValue *GV) override;
+ void dematerialize(GlobalValue *GV) override;
/// @brief Main interface to parsing a bitcode buffer.
/// @returns true if an error occurred.
@@ -2302,8 +2302,7 @@
return Error("Invalid type for value");
auto *NewGA =
- GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(),
- getDecodedLinkage(Record[2]), "", TheModule);
+ GlobalAlias::create(PTy, getDecodedLinkage(Record[2]), "", TheModule);
// Old bitcode files didn't have visibility field.
if (Record.size() > 3)
NewGA->setVisibility(GetDecodedVisibility(Record[3]));
@@ -3141,7 +3140,7 @@
InstructionList.push_back(I);
break;
}
- case bitc::FUNC_CODE_INST_STORE: { // STORE2:[ptrty, ptr, val, align, vol]
+ case bitc::FUNC_CODE_INST_STORE_OLD: { // STORE2:[ptrty, ptr, val, align, vol]
unsigned OpNum = 0;
Value *Val, *Ptr;
if (getValueTypePair(Record, OpNum, NextValueNo, Ptr) ||
@@ -3349,7 +3348,7 @@
return DeferredFunctionInfo.count(const_cast<Function*>(F));
}
-void BitcodeReader::Dematerialize(GlobalValue *GV) {
+void BitcodeReader::dematerialize(GlobalValue *GV) {
Function *F = dyn_cast<Function>(GV);
// If this function isn't dematerializable, this is a noop.
if (!F || !isDematerializable(F))
@@ -3362,7 +3361,7 @@
F->setIsMaterializable(true);
}
-std::error_code BitcodeReader::MaterializeModule(Module *M) {
+std::error_code BitcodeReader::materializeModule(Module *M) {
assert(M == TheModule &&
"Can only Materialize the Module this BitcodeReader is attached to.");
// Iterate over the module, deserializing any functions that are still on
diff --git a/bcinfo/BitReader_3_0/Android.mk b/bcinfo/BitReader_3_0/Android.mk
index b425475..95ccd40 100644
--- a/bcinfo/BitReader_3_0/Android.mk
+++ b/bcinfo/BitReader_3_0/Android.mk
@@ -1,6 +1,6 @@
LOCAL_PATH:= $(call my-dir)
-LLVM_ROOT_PATH := $(LOCAL_PATH)/../../../../../external/llvm
+LLVM_ROOT_PATH := external/llvm
include $(LLVM_ROOT_PATH)/llvm.mk
bitcode_reader_3_0_SRC_FILES := \
diff --git a/bcinfo/BitReader_3_0/BitcodeReader.cpp b/bcinfo/BitReader_3_0/BitcodeReader.cpp
index 0c99f3b..0d1262c 100644
--- a/bcinfo/BitReader_3_0/BitcodeReader.cpp
+++ b/bcinfo/BitReader_3_0/BitcodeReader.cpp
@@ -500,9 +500,9 @@
bool isDematerializable(const GlobalValue *GV) const override;
std::error_code materialize(GlobalValue *GV) override;
- std::error_code MaterializeModule(Module *M) override;
+ std::error_code materializeModule(Module *M) override;
std::vector<StructType *> getIdentifiedStructTypes() const override;
- void Dematerialize(GlobalValue *GV) override;
+ void dematerialize(GlobalValue *GV) override;
/// @brief Main interface to parsing a bitcode buffer.
/// @returns true if an error occurred.
@@ -2570,8 +2570,7 @@
return Error("Invalid type for value");
auto *NewGA =
- GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(),
- getDecodedLinkage(Record[2]), "", TheModule);
+ GlobalAlias::create(PTy, getDecodedLinkage(Record[2]), "", TheModule);
// Old bitcode files didn't have visibility field.
if (Record.size() > 3)
NewGA->setVisibility(GetDecodedVisibility(Record[3]));
@@ -3420,7 +3419,7 @@
InstructionList.push_back(I);
break;
}
- case bitc::FUNC_CODE_INST_STORE: { // STORE2:[ptrty, ptr, val, align, vol]
+ case bitc::FUNC_CODE_INST_STORE_OLD: { // STORE2:[ptrty, ptr, val, align, vol]
unsigned OpNum = 0;
Value *Val, *Ptr;
if (getValueTypePair(Record, OpNum, NextValueNo, Ptr) ||
@@ -3684,7 +3683,7 @@
return DeferredFunctionInfo.count(const_cast<Function*>(F));
}
-void BitcodeReader::Dematerialize(GlobalValue *GV) {
+void BitcodeReader::dematerialize(GlobalValue *GV) {
Function *F = dyn_cast<Function>(GV);
// If this function isn't dematerializable, this is a noop.
if (!F || !isDematerializable(F))
@@ -3697,7 +3696,7 @@
F->setIsMaterializable(true);
}
-std::error_code BitcodeReader::MaterializeModule(Module *M) {
+std::error_code BitcodeReader::materializeModule(Module *M) {
assert(M == TheModule &&
"Can only Materialize the Module this BitcodeReader is attached to.");
// Iterate over the module, deserializing any functions that are still on
diff --git a/bcinfo/MetadataExtractor.cpp b/bcinfo/MetadataExtractor.cpp
index 468e940..add1ab1 100644
--- a/bcinfo/MetadataExtractor.cpp
+++ b/bcinfo/MetadataExtractor.cpp
@@ -21,7 +21,7 @@
#define LOG_TAG "bcinfo"
#include <cutils/log.h>
-#ifdef HAVE_ANDROID_OS
+#ifdef __ANDROID__
#include <cutils/properties.h>
#endif
@@ -56,8 +56,75 @@
return false;
}
+const char *createStringFromValue(llvm::Metadata *m) {
+ auto ref = getStringOperand(m);
+ char *c = new char[ref.size() + 1];
+ memcpy(c, ref.data(), ref.size());
+ c[ref.size()] = '\0';
+ return c;
}
+// Collect metadata from NamedMDNodes that contain a list of names
+// (strings).
+//
+// Inputs:
+//
+// NameMetadata - An LLVM metadata node, each of whose operands has
+// a string as its first entry
+//
+// NameList - A reference that will hold an allocated array of strings
+//
+// Count - A reference that will hold the length of the allocated
+// array of strings
+//
+// Return value:
+//
+// Return true on success, false on error.
+//
+// Upon success, the function sets NameList to an array of strings
+// corresponding to the names found in the metadata. The function sets
+// Count to the number of entries in NameList.
+//
+// An error occurs if one of the metadata operands doesn't have a
+// first entry.
+bool populateNameMetadata(const llvm::NamedMDNode *NameMetadata,
+ const char **&NameList, size_t &Count) {
+ if (!NameMetadata) {
+ NameList = nullptr;
+ Count = 0;
+ return true;
+ }
+
+ Count = NameMetadata->getNumOperands();
+ if (!Count) {
+ NameList = nullptr;
+ return true;
+ }
+
+ NameList = new const char *[Count];
+
+ for (size_t i = 0; i < Count; i++) {
+ llvm::MDNode *Name = NameMetadata->getOperand(i);
+ if (Name && Name->getNumOperands() > 0) {
+ NameList[i] = createStringFromValue(Name->getOperand(0));
+ } else {
+ ALOGE("Metadata operand does not contain a name string");
+ for (size_t AllocatedIndex = 0; AllocatedIndex < i; AllocatedIndex++) {
+ delete [] NameList[AllocatedIndex];
+ }
+ delete [] NameList;
+ NameList = nullptr;
+ Count = 0;
+
+ return false;
+ }
+ }
+
+ return true;
+}
+
+} // end anonymous namespace
+
// Name of metadata node where pragma info resides (should be synced with
// slang.cpp)
static const llvm::StringRef PragmaMetadataName = "#pragma";
@@ -79,6 +146,10 @@
// (should be synced with slang_rs_metadata.h)
static const llvm::StringRef ExportForEachMetadataName = "#rs_export_foreach";
+// Name of metadata node where exported reduce name information resides
+// (should be synced with slang_rs_metadata.h)
+static const llvm::StringRef ExportReduceMetadataName = "#rs_export_reduce";
+
// Name of metadata node where RS object slot info resides (should be
// synced with slang_rs_metadata.h)
static const llvm::StringRef ObjectSlotMetadataName = "#rs_object_slots";
@@ -92,28 +163,31 @@
MetadataExtractor::MetadataExtractor(const char *bitcode, size_t bitcodeSize)
: mModule(nullptr), mBitcode(bitcode), mBitcodeSize(bitcodeSize),
mExportVarCount(0), mExportFuncCount(0), mExportForEachSignatureCount(0),
- mExportVarNameList(nullptr), mExportFuncNameList(nullptr),
- mExportForEachNameList(nullptr), mExportForEachSignatureList(nullptr),
- mExportForEachInputCountList(nullptr), mPragmaCount(0),
- mPragmaKeyList(nullptr), mPragmaValueList(nullptr), mObjectSlotCount(0),
- mObjectSlotList(nullptr), mRSFloatPrecision(RS_FP_Full),
- mIsThreadable(true), mBuildChecksum(nullptr) {
+ mExportReduceCount(0), mExportVarNameList(nullptr),
+ mExportFuncNameList(nullptr), mExportForEachNameList(nullptr),
+ mExportForEachSignatureList(nullptr),
+ mExportForEachInputCountList(nullptr), mExportReduceNameList(nullptr),
+ mPragmaCount(0), mPragmaKeyList(nullptr), mPragmaValueList(nullptr),
+ mObjectSlotCount(0), mObjectSlotList(nullptr),
+ mRSFloatPrecision(RS_FP_Full), mIsThreadable(true),
+ mBuildChecksum(nullptr) {
BitcodeWrapper wrapper(bitcode, bitcodeSize);
mTargetAPI = wrapper.getTargetAPI();
mCompilerVersion = wrapper.getCompilerVersion();
mOptimizationLevel = wrapper.getOptimizationLevel();
}
-
MetadataExtractor::MetadataExtractor(const llvm::Module *module)
- : mModule(module), mBitcode(nullptr), mBitcodeSize(0), mExportVarCount(0),
- mExportFuncCount(0), mExportForEachSignatureCount(0),
- mExportVarNameList(nullptr), mExportFuncNameList(nullptr),
- mExportForEachNameList(nullptr), mExportForEachSignatureList(nullptr),
- mExportForEachInputCountList(nullptr), mPragmaCount(0),
- mPragmaKeyList(nullptr), mPragmaValueList(nullptr), mObjectSlotCount(0),
- mObjectSlotList(nullptr), mRSFloatPrecision(RS_FP_Full),
- mIsThreadable(true), mBuildChecksum(nullptr) {
+ : mModule(module), mBitcode(nullptr), mBitcodeSize(0),
+ mExportVarCount(0), mExportFuncCount(0), mExportForEachSignatureCount(0),
+ mExportReduceCount(0), mExportVarNameList(nullptr),
+ mExportFuncNameList(nullptr), mExportForEachNameList(nullptr),
+ mExportForEachSignatureList(nullptr),
+ mExportForEachInputCountList(nullptr), mExportReduceNameList(nullptr),
+ mPragmaCount(0), mPragmaKeyList(nullptr), mPragmaValueList(nullptr),
+ mObjectSlotCount(0), mObjectSlotList(nullptr),
+ mRSFloatPrecision(RS_FP_Full), mIsThreadable(true),
+ mBuildChecksum(nullptr) {
mCompilerVersion = RS_VERSION; // Default to the actual current version.
mOptimizationLevel = 3;
}
@@ -150,6 +224,15 @@
delete [] mExportForEachSignatureList;
mExportForEachSignatureList = nullptr;
+ if (mExportReduceNameList) {
+ for (size_t i = 0; i < mExportReduceCount; i++) {
+ delete [] mExportReduceNameList[i];
+ mExportReduceNameList[i] = nullptr;
+ }
+ }
+ delete [] mExportReduceNameList;
+ mExportReduceNameList = nullptr;
+
for (size_t i = 0; i < mPragmaCount; i++) {
if (mPragmaKeyList) {
delete [] mPragmaKeyList[i];
@@ -208,15 +291,6 @@
}
-static const char *createStringFromValue(llvm::Metadata *m) {
- auto ref = getStringOperand(m);
- char *c = new char[ref.size() + 1];
- memcpy(c, ref.data(), ref.size());
- c[ref.size()] = '\0';
- return c;
-}
-
-
void MetadataExtractor::populatePragmaMetadata(
const llvm::NamedMDNode *PragmaMetadata) {
if (!PragmaMetadata) {
@@ -266,7 +340,7 @@
}
mRSFloatPrecision = RelaxedPragmaSeen ? RS_FP_Relaxed : RS_FP_Full;
-#ifdef HAVE_ANDROID_OS
+#ifdef __ANDROID__
// Provide an override for precsiion via adb shell setprop
// adb shell setprop debug.rs.precision rs_fp_full
// adb shell setprop debug.rs.precision rs_fp_relaxed
@@ -292,59 +366,6 @@
#endif
}
-
-bool MetadataExtractor::populateVarNameMetadata(
- const llvm::NamedMDNode *VarNameMetadata) {
- if (!VarNameMetadata) {
- return true;
- }
-
- mExportVarCount = VarNameMetadata->getNumOperands();
- if (!mExportVarCount) {
- return true;
- }
-
- const char **TmpNameList = new const char *[mExportVarCount];
-
- for (size_t i = 0; i < mExportVarCount; i++) {
- llvm::MDNode *Name = VarNameMetadata->getOperand(i);
- if (Name != nullptr && Name->getNumOperands() > 1) {
- TmpNameList[i] = createStringFromValue(Name->getOperand(0));
- }
- }
-
- mExportVarNameList = TmpNameList;
-
- return true;
-}
-
-
-bool MetadataExtractor::populateFuncNameMetadata(
- const llvm::NamedMDNode *FuncNameMetadata) {
- if (!FuncNameMetadata) {
- return true;
- }
-
- mExportFuncCount = FuncNameMetadata->getNumOperands();
- if (!mExportFuncCount) {
- return true;
- }
-
- const char **TmpNameList = new const char*[mExportFuncCount];
-
- for (size_t i = 0; i < mExportFuncCount; i++) {
- llvm::MDNode *Name = FuncNameMetadata->getOperand(i);
- if (Name != nullptr && Name->getNumOperands() == 1) {
- TmpNameList[i] = createStringFromValue(Name->getOperand(0));
- }
- }
-
- mExportFuncNameList = TmpNameList;
-
- return true;
-}
-
-
uint32_t MetadataExtractor::calculateNumInputs(const llvm::Function *Function,
uint32_t Signature) {
@@ -521,6 +542,8 @@
mModule->getNamedMetadata(ExportForEachNameMetadataName);
const llvm::NamedMDNode *ExportForEachMetadata =
mModule->getNamedMetadata(ExportForEachMetadataName);
+ const llvm::NamedMDNode *ExportReduceMetadata =
+ mModule->getNamedMetadata(ExportReduceMetadataName);
const llvm::NamedMDNode *PragmaMetadata =
mModule->getNamedMetadata(PragmaMetadataName);
const llvm::NamedMDNode *ObjectSlotMetadata =
@@ -530,17 +553,24 @@
const llvm::NamedMDNode *ChecksumMetadata =
mModule->getNamedMetadata(ChecksumMetadataName);
-
- if (!populateVarNameMetadata(ExportVarMetadata)) {
+ if (!populateNameMetadata(ExportVarMetadata, mExportVarNameList,
+ mExportVarCount)) {
ALOGE("Could not populate export variable metadata");
return false;
}
- if (!populateFuncNameMetadata(ExportFuncMetadata)) {
+ if (!populateNameMetadata(ExportFuncMetadata, mExportFuncNameList,
+ mExportFuncCount)) {
ALOGE("Could not populate export function metadata");
return false;
}
+ if (!populateNameMetadata(ExportReduceMetadata, mExportReduceNameList,
+ mExportReduceCount)) {
+ ALOGE("Could not populate export reduce metadata");
+ return false;
+ }
+
if (!populateForEachMetadata(ExportForEachNameMetadata,
ExportForEachMetadata)) {
ALOGE("Could not populate ForEach signature metadata");
diff --git a/bcinfo/Wrap/Android.mk b/bcinfo/Wrap/Android.mk
index 7da8b3f..1b5db36 100644
--- a/bcinfo/Wrap/Android.mk
+++ b/bcinfo/Wrap/Android.mk
@@ -16,7 +16,7 @@
LOCAL_PATH:= $(call my-dir)
-LLVM_ROOT_PATH := $(LOCAL_PATH)/../../../../../external/llvm
+LLVM_ROOT_PATH := external/llvm
include $(LLVM_ROOT_PATH)/llvm.mk
llvm_wrap_SRC_FILES := \
diff --git a/bcinfo/tools/main.cpp b/bcinfo/tools/main.cpp
index 0921151..c4e40f4 100644
--- a/bcinfo/tools/main.cpp
+++ b/bcinfo/tools/main.cpp
@@ -143,6 +143,12 @@
inputCountList[i]);
}
+ fprintf(info, "exportReduceCount: %zu\n", ME->getExportReduceCount());
+ const char **reduceNameList = ME->getExportReduceNameList();
+ for (size_t i = 0; i < ME->getExportReduceCount(); i++) {
+ fprintf(info, "%s\n", reduceNameList[i]);
+ }
+
fprintf(info, "objectSlotCount: %zu\n", ME->getObjectSlotCount());
const uint32_t *slotList = ME->getObjectSlotList();
for (size_t i = 0; i < ME->getObjectSlotCount(); i++) {
@@ -197,6 +203,13 @@
}
printf("\n");
+ printf("exportReduceCount: %zu\n", ME->getExportReduceCount());
+ const char **reduceNameList = ME->getExportReduceNameList();
+ for (size_t i = 0; i < ME->getExportReduceCount(); i++) {
+ printf("func[%zu]: %s\n", i, reduceNameList[i]);
+ }
+ printf("\n");
+
printf("pragmaCount: %zu\n", ME->getPragmaCount());
const char **keyList = ME->getPragmaKeyList();
const char **valueList = ME->getPragmaValueList();
diff --git a/include/bcc/Compiler.h b/include/bcc/Compiler.h
index 75cde37..8a30c38 100644
--- a/include/bcc/Compiler.h
+++ b/include/bcc/Compiler.h
@@ -80,13 +80,11 @@
enum ErrorCode runPasses(Script &pScript, llvm::raw_pwrite_stream &pResult);
- bool addCustomPasses(Script &pScript, llvm::legacy::PassManager &pPM);
bool addInternalizeSymbolsPass(Script &pScript, llvm::legacy::PassManager &pPM);
- bool addExpandForEachPass(Script &pScript, llvm::legacy::PassManager &pPM);
- bool addGlobalInfoPass(Script &pScript, llvm::legacy::PassManager &pPM);
- bool addInvariantPass(llvm::legacy::PassManager &pPM);
- bool addInvokeHelperPass(llvm::legacy::PassManager &pPM);
- bool addPostLTOCustomPasses(llvm::legacy::PassManager &pPM);
+ void addExpandKernelPass(llvm::legacy::PassManager &pPM);
+ void addGlobalInfoPass(Script &pScript, llvm::legacy::PassManager &pPM);
+ void addInvariantPass(llvm::legacy::PassManager &pPM);
+ void addInvokeHelperPass(llvm::legacy::PassManager &pPM);
public:
Compiler();
diff --git a/include/bcc/Renderscript/RSCompilerDriver.h b/include/bcc/Renderscript/RSCompilerDriver.h
index bc3b021..cf22ce8 100644
--- a/include/bcc/Renderscript/RSCompilerDriver.h
+++ b/include/bcc/Renderscript/RSCompilerDriver.h
@@ -111,6 +111,10 @@
return mEnableGlobalMerge;
}
+ const CompilerConfig * getConfig() const {
+ return mConfig;
+ }
+
// Set to true if we should embed global variable information in the code.
void setEmbedGlobalInfo(bool v) {
mEmbedGlobalInfo = v;
diff --git a/include/bcc/Renderscript/RSScript.h b/include/bcc/Renderscript/RSScript.h
index 0023bf6..713fabf 100644
--- a/include/bcc/Renderscript/RSScript.h
+++ b/include/bcc/Renderscript/RSScript.h
@@ -28,6 +28,7 @@
class RSScript;
class Source;
+class CompilerConfig;
typedef llvm::Module* (*RSLinkRuntimeCallback) (bcc::RSScript *, llvm::Module *, llvm::Module *);
@@ -70,6 +71,10 @@
RSScript(Source &pSource);
+ // Passing in the CompilerConfig allows the optimization level to
+ // be derived rather than defaulted to aggressive (-O3)
+ RSScript(Source &pSource, const CompilerConfig * pCompilerConfig);
+
virtual ~RSScript() { }
void setCompilerVersion(unsigned pCompilerVersion) {
diff --git a/include/bcc/Renderscript/RSTransforms.h b/include/bcc/Renderscript/RSTransforms.h
index d5830ca..6dcfedd 100644
--- a/include/bcc/Renderscript/RSTransforms.h
+++ b/include/bcc/Renderscript/RSTransforms.h
@@ -25,7 +25,7 @@
namespace bcc {
llvm::ModulePass *
-createRSForEachExpandPass(bool pEnableStepOpt);
+createRSKernelExpandPass(bool pEnableStepOpt);
llvm::FunctionPass *
createRSInvariantPass();
diff --git a/include/bcc/Support/Properties.h b/include/bcc/Support/Properties.h
index c82901c..4c3c404 100644
--- a/include/bcc/Support/Properties.h
+++ b/include/bcc/Support/Properties.h
@@ -20,12 +20,12 @@
#include <stdint.h>
#include <stdlib.h>
-#if !defined(RS_SERVER) && defined(HAVE_ANDROID_OS)
+#if !defined(RS_SERVER) && defined(__ANDROID__)
#include <cutils/properties.h>
#endif
static inline uint32_t getProperty(const char *str) {
-#if !defined(RS_SERVER) && defined(HAVE_ANDROID_OS)
+#if !defined(RS_SERVER) && defined(__ANDROID__)
char buf[PROPERTY_VALUE_MAX];
property_get(str, buf, "0");
return atoi(buf);
diff --git a/include/bcinfo/MetadataExtractor.h b/include/bcinfo/MetadataExtractor.h
index 99818ca..742346a 100644
--- a/include/bcinfo/MetadataExtractor.h
+++ b/include/bcinfo/MetadataExtractor.h
@@ -54,12 +54,13 @@
size_t mExportVarCount;
size_t mExportFuncCount;
size_t mExportForEachSignatureCount;
+ size_t mExportReduceCount;
const char **mExportVarNameList;
const char **mExportFuncNameList;
const char **mExportForEachNameList;
const uint32_t *mExportForEachSignatureList;
-
const uint32_t *mExportForEachInputCountList;
+ const char **mExportReduceNameList;
size_t mPragmaCount;
const char **mPragmaKeyList;
@@ -80,8 +81,6 @@
const char *mBuildChecksum;
// Helper functions for extraction
- bool populateVarNameMetadata(const llvm::NamedMDNode *VarNameMetadata);
- bool populateFuncNameMetadata(const llvm::NamedMDNode *FuncNameMetadata);
bool populateForEachMetadata(const llvm::NamedMDNode *Names,
const llvm::NamedMDNode *Signatures);
bool populateObjectSlotMetadata(const llvm::NamedMDNode *ObjectSlotMetadata);
@@ -184,6 +183,20 @@
}
/**
+ * \return number of exported reduce kernels (slots) in this script/module.
+ */
+ size_t getExportReduceCount() const {
+ return mExportReduceCount;
+ }
+
+ /**
+ * \return array of exported reduce kernel names.
+ */
+ const char **getExportReduceNameList() const {
+ return mExportReduceNameList;
+ }
+
+ /**
* \return number of pragmas contained in pragmaKeyList and pragmaValueList.
*/
size_t getPragmaCount() const {
diff --git a/lib/Core/Compiler.cpp b/lib/Core/Compiler.cpp
index c314b6c..5c769b4 100644
--- a/lib/Core/Compiler.cpp
+++ b/lib/Core/Compiler.cpp
@@ -143,6 +143,8 @@
}
+// This function has complete responsibility for creating and executing the
+// exact list of compiler passes.
enum Compiler::ErrorCode Compiler::runPasses(Script &pScript,
llvm::raw_pwrite_stream &pResult) {
// Pass manager for link-time optimization
@@ -153,10 +155,13 @@
passes.add(createTargetTransformInfoWrapperPass(mTarget->getTargetIRAnalysis()));
- // Add our custom passes.
- if (!addCustomPasses(pScript, passes)) {
+ // Add some initial custom passes.
+ addInvokeHelperPass(passes);
+ addExpandKernelPass(passes);
+ addInvariantPass(passes);
+ if (!addInternalizeSymbolsPass(pScript, passes))
return kErrCustomPasses;
- }
+ addGlobalInfoPass(pScript, passes);
if (mTarget->getOptLevel() == llvm::CodeGenOpt::None) {
passes.add(llvm::createGlobalOptimizerPass());
@@ -187,9 +192,9 @@
// These passes have to come after LTO, since we don't want to examine
// functions that are never actually called.
- if (!addPostLTOCustomPasses(passes)) {
- return kErrCustomPasses;
- }
+ if (llvm::Triple(getTargetMachine().getTargetTriple()).getArch() == llvm::Triple::x86_64)
+ passes.add(createRSX86_64CallConvPass()); // Add pass to correct calling convention for X86-64.
+ passes.add(createRSIsThreadablePass()); // Add pass to mark script as threadable.
// RSEmbedInfoPass needs to come after we have scanned for non-threadable
// functions.
@@ -324,9 +329,11 @@
size_t exportVarCount = me.getExportVarCount();
size_t exportFuncCount = me.getExportFuncCount();
size_t exportForEachCount = me.getExportForEachSignatureCount();
+ size_t exportReduceCount = me.getExportReduceCount();
const char **exportVarNameList = me.getExportVarNameList();
const char **exportFuncNameList = me.getExportFuncNameList();
const char **exportForEachNameList = me.getExportForEachNameList();
+ const char **exportReduceNameList = me.getExportReduceNameList();
size_t i;
for (i = 0; i < exportVarCount; ++i) {
@@ -337,18 +344,22 @@
export_symbols.push_back(exportFuncNameList[i]);
}
- // Expanded foreach functions should not be internalized, too.
- // expanded_foreach_funcs keeps the .expand version of the kernel names
- // around until createInternalizePass() is finished making its own
- // copy of the visible symbols.
- std::vector<std::string> expanded_foreach_funcs;
+ // Expanded foreach and reduce functions should not be
+ // internalized. expanded_funcs keeps the names of the expanded
+ // functions around until createInternalizePass() is finished making
+ // its own copy of the visible symbols.
+ std::vector<std::string> expanded_funcs;
+ expanded_funcs.reserve(exportForEachCount + exportReduceCount);
+
for (i = 0; i < exportForEachCount; ++i) {
- expanded_foreach_funcs.push_back(
- std::string(exportForEachNameList[i]) + ".expand");
+ expanded_funcs.push_back(std::string(exportForEachNameList[i]) + ".expand");
+ }
+ for (i = 0; i < exportReduceCount; ++i) {
+ expanded_funcs.push_back(std::string(exportReduceNameList[i]) + ".expand");
}
- for (i = 0; i < exportForEachCount; i++) {
- export_symbols.push_back(expanded_foreach_funcs[i].c_str());
+ for (auto &symbol_name : expanded_funcs) {
+ export_symbols.push_back(symbol_name.c_str());
}
pPM.add(llvm::createInternalizePass(export_symbols));
@@ -356,69 +367,31 @@
return true;
}
-bool Compiler::addInvokeHelperPass(llvm::legacy::PassManager &pPM) {
+void Compiler::addInvokeHelperPass(llvm::legacy::PassManager &pPM) {
llvm::Triple arch(getTargetMachine().getTargetTriple());
if (arch.isArch64Bit()) {
pPM.add(createRSInvokeHelperPass());
}
- return true;
}
-bool Compiler::addExpandForEachPass(Script &pScript, llvm::legacy::PassManager &pPM) {
- // Expand ForEach on CPU path to reduce launch overhead.
+void Compiler::addExpandKernelPass(llvm::legacy::PassManager &pPM) {
+ // Expand ForEach and reduce on CPU path to reduce launch overhead.
bool pEnableStepOpt = true;
- pPM.add(createRSForEachExpandPass(pEnableStepOpt));
-
- return true;
+ pPM.add(createRSKernelExpandPass(pEnableStepOpt));
}
-bool Compiler::addGlobalInfoPass(Script &pScript, llvm::legacy::PassManager &pPM) {
+void Compiler::addGlobalInfoPass(Script &pScript, llvm::legacy::PassManager &pPM) {
// Add additional information about RS global variables inside the Module.
RSScript &script = static_cast<RSScript &>(pScript);
if (script.getEmbedGlobalInfo()) {
pPM.add(createRSGlobalInfoPass(script.getEmbedGlobalInfoSkipConstant()));
}
-
- return true;
}
-bool Compiler::addInvariantPass(llvm::legacy::PassManager &pPM) {
+void Compiler::addInvariantPass(llvm::legacy::PassManager &pPM) {
// Mark Loads from RsExpandKernelDriverInfo as "load.invariant".
// Should run after ExpandForEach and before inlining.
pPM.add(createRSInvariantPass());
-
- return true;
-}
-
-bool Compiler::addCustomPasses(Script &pScript, llvm::legacy::PassManager &pPM) {
- if (!addInvokeHelperPass(pPM))
- return false;
-
- if (!addExpandForEachPass(pScript, pPM))
- return false;
-
- if (!addInvariantPass(pPM))
- return false;
-
- if (!addInternalizeSymbolsPass(pScript, pPM))
- return false;
-
- if (!addGlobalInfoPass(pScript, pPM))
- return false;
-
- return true;
-}
-
-bool Compiler::addPostLTOCustomPasses(llvm::legacy::PassManager &pPM) {
- // Add pass to correct calling convention for X86-64.
- llvm::Triple arch(getTargetMachine().getTargetTriple());
- if (arch.getArch() == llvm::Triple::x86_64)
- pPM.add(createRSX86_64CallConvPass());
-
- // Add pass to mark script as threadable.
- pPM.add(createRSIsThreadablePass());
-
- return true;
}
enum Compiler::ErrorCode Compiler::screenGlobalFunctions(Script &pScript) {
diff --git a/lib/Renderscript/Android.mk b/lib/Renderscript/Android.mk
index 56cae16..4b18eda 100644
--- a/lib/Renderscript/Android.mk
+++ b/lib/Renderscript/Android.mk
@@ -24,7 +24,7 @@
libbcc_renderscript_SRC_FILES := \
RSCompilerDriver.cpp \
RSEmbedInfo.cpp \
- RSForEachExpand.cpp \
+ RSKernelExpand.cpp \
RSGlobalInfoPass.cpp \
RSInvariant.cpp \
RSScript.cpp \
diff --git a/lib/Renderscript/RSCompilerDriver.cpp b/lib/Renderscript/RSCompilerDriver.cpp
index 21874f2..7cc4ffb 100644
--- a/lib/Renderscript/RSCompilerDriver.cpp
+++ b/lib/Renderscript/RSCompilerDriver.cpp
@@ -42,7 +42,7 @@
#include <sstream>
#include <string>
-#ifdef HAVE_ANDROID_OS
+#ifdef __ANDROID__
#include <cutils/properties.h>
#endif
#include <utils/StopWatch.h>
@@ -249,7 +249,7 @@
return false;
}
- RSScript script(*source);
+ RSScript script(*source, getConfig());
if (pLinkRuntimeCallback) {
setLinkRuntimeCallback(pLinkRuntimeCallback);
}
@@ -379,6 +379,7 @@
pScript.setEmbedGlobalInfo(mEmbedGlobalInfo);
pScript.setEmbedGlobalInfoSkipConstant(mEmbedGlobalInfoSkipConstant);
+ pScript.setLinkRuntimeCallback(getLinkRuntimeCallback());
Compiler::ErrorCode status = compileScript(pScript, pOut, pOut, pRuntimePath,
pBuildChecksum, pDumpIR);
diff --git a/lib/Renderscript/RSEmbedInfo.cpp b/lib/Renderscript/RSEmbedInfo.cpp
index dc1033c..b0c2767 100644
--- a/lib/Renderscript/RSEmbedInfo.cpp
+++ b/lib/Renderscript/RSEmbedInfo.cpp
@@ -73,11 +73,13 @@
size_t exportVarCount = me.getExportVarCount();
size_t exportFuncCount = me.getExportFuncCount();
size_t exportForEachCount = me.getExportForEachSignatureCount();
+ size_t exportReduceCount = me.getExportReduceCount();
size_t objectSlotCount = me.getObjectSlotCount();
size_t pragmaCount = me.getPragmaCount();
const char **exportVarNameList = me.getExportVarNameList();
const char **exportFuncNameList = me.getExportFuncNameList();
const char **exportForEachNameList = me.getExportForEachNameList();
+ const char **exportReduceNameList = me.getExportReduceNameList();
const uint32_t *exportForEachSignatureList =
me.getExportForEachSignatureList();
const uint32_t *objectSlotList = me.getObjectSlotList();
@@ -111,6 +113,11 @@
<< exportForEachNameList[i] << "\n";
}
+ s << "exportReduceCount: " << exportReduceCount << "\n";
+ for (i = 0; i < exportReduceCount; ++i) {
+ s << exportReduceNameList[i] << "\n";
+ }
+
s << "objectSlotCount: " << objectSlotCount << "\n";
for (i = 0; i < objectSlotCount; ++i) {
s << objectSlotList[i] << "\n";
diff --git a/lib/Renderscript/RSForEachExpand.cpp b/lib/Renderscript/RSForEachExpand.cpp
deleted file mode 100644
index 3e70b1d..0000000
--- a/lib/Renderscript/RSForEachExpand.cpp
+++ /dev/null
@@ -1,1046 +0,0 @@
-/*
- * Copyright 2012, The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "bcc/Assert.h"
-#include "bcc/Renderscript/RSTransforms.h"
-
-#include <cstdlib>
-#include <functional>
-
-#include <llvm/IR/DerivedTypes.h>
-#include <llvm/IR/Function.h>
-#include <llvm/IR/Instructions.h>
-#include <llvm/IR/IRBuilder.h>
-#include <llvm/IR/MDBuilder.h>
-#include <llvm/IR/Module.h>
-#include <llvm/Pass.h>
-#include <llvm/Support/raw_ostream.h>
-#include <llvm/IR/DataLayout.h>
-#include <llvm/IR/Function.h>
-#include <llvm/IR/Type.h>
-#include <llvm/Transforms/Utils/BasicBlockUtils.h>
-
-#include "bcc/Config/Config.h"
-#include "bcc/Support/Log.h"
-
-#include "bcinfo/MetadataExtractor.h"
-
-#define NUM_EXPANDED_FUNCTION_PARAMS 4
-
-using namespace bcc;
-
-namespace {
-
-static const bool gEnableRsTbaa = true;
-
-/* RSForEachExpandPass - This pass operates on functions that are able to be
- * called via rsForEach() or "foreach_<NAME>". We create an inner loop for the
- * ForEach-able function to be invoked over the appropriate data cells of the
- * input/output allocations (adjusting other relevant parameters as we go). We
- * support doing this for any ForEach-able compute kernels. The new function
- * name is the original function name followed by ".expand". Note that we
- * still generate code for the original function.
- */
-class RSForEachExpandPass : public llvm::ModulePass {
-public:
- static char ID;
-
-private:
- static const size_t RS_KERNEL_INPUT_LIMIT = 8; // see frameworks/base/libs/rs/cpu_ref/rsCpuCoreRuntime.h
-
- enum RsLaunchDimensionsField {
- RsLaunchDimensionsFieldX,
- RsLaunchDimensionsFieldY,
- RsLaunchDimensionsFieldZ,
- RsLaunchDimensionsFieldLod,
- RsLaunchDimensionsFieldFace,
- RsLaunchDimensionsFieldArray,
-
- RsLaunchDimensionsFieldCount
- };
-
- enum RsExpandKernelDriverInfoPfxField {
- RsExpandKernelDriverInfoPfxFieldInPtr,
- RsExpandKernelDriverInfoPfxFieldInStride,
- RsExpandKernelDriverInfoPfxFieldInLen,
- RsExpandKernelDriverInfoPfxFieldOutPtr,
- RsExpandKernelDriverInfoPfxFieldOutStride,
- RsExpandKernelDriverInfoPfxFieldOutLen,
- RsExpandKernelDriverInfoPfxFieldDim,
- RsExpandKernelDriverInfoPfxFieldCurrent,
- RsExpandKernelDriverInfoPfxFieldUsr,
- RsExpandKernelDriverInfoPfxFieldUsLenr,
-
- RsExpandKernelDriverInfoPfxFieldCount
- };
-
- llvm::Module *Module;
- llvm::LLVMContext *Context;
-
- /*
- * Pointer to LLVM type information for the the function signature
- * for expanded kernels. This must be re-calculated for each
- * module the pass is run on.
- */
- llvm::FunctionType *ExpandedFunctionType;
-
- uint32_t mExportForEachCount;
- const char **mExportForEachNameList;
- const uint32_t *mExportForEachSignatureList;
-
- // Turns on optimization of allocation stride values.
- bool mEnableStepOpt;
-
- uint32_t getRootSignature(llvm::Function *Function) {
- const llvm::NamedMDNode *ExportForEachMetadata =
- Module->getNamedMetadata("#rs_export_foreach");
-
- if (!ExportForEachMetadata) {
- llvm::SmallVector<llvm::Type*, 8> RootArgTys;
- for (llvm::Function::arg_iterator B = Function->arg_begin(),
- E = Function->arg_end();
- B != E;
- ++B) {
- RootArgTys.push_back(B->getType());
- }
-
- // For pre-ICS bitcode, we may not have signature information. In that
- // case, we use the size of the RootArgTys to select the number of
- // arguments.
- return (1 << RootArgTys.size()) - 1;
- }
-
- if (ExportForEachMetadata->getNumOperands() == 0) {
- return 0;
- }
-
- bccAssert(ExportForEachMetadata->getNumOperands() > 0);
-
- // We only handle the case for legacy root() functions here, so this is
- // hard-coded to look at only the first such function.
- llvm::MDNode *SigNode = ExportForEachMetadata->getOperand(0);
- if (SigNode != nullptr && SigNode->getNumOperands() == 1) {
- llvm::Metadata *SigMD = SigNode->getOperand(0);
- if (llvm::MDString *SigS = llvm::dyn_cast<llvm::MDString>(SigMD)) {
- llvm::StringRef SigString = SigS->getString();
- uint32_t Signature = 0;
- if (SigString.getAsInteger(10, Signature)) {
- ALOGE("Non-integer signature value '%s'", SigString.str().c_str());
- return 0;
- }
- return Signature;
- }
- }
-
- return 0;
- }
-
- bool isStepOptSupported(llvm::Type *AllocType) {
-
- llvm::PointerType *PT = llvm::dyn_cast<llvm::PointerType>(AllocType);
- llvm::Type *VoidPtrTy = llvm::Type::getInt8PtrTy(*Context);
-
- if (mEnableStepOpt) {
- return false;
- }
-
- if (AllocType == VoidPtrTy) {
- return false;
- }
-
- if (!PT) {
- return false;
- }
-
- // remaining conditions are 64-bit only
- if (VoidPtrTy->getPrimitiveSizeInBits() == 32) {
- return true;
- }
-
- // coerce suggests an upconverted struct type, which we can't support
- if (AllocType->getStructName().find("coerce") != llvm::StringRef::npos) {
- return false;
- }
-
- // 2xi64 and i128 suggest an upconverted struct type, which are also unsupported
- llvm::Type *V2xi64Ty = llvm::VectorType::get(llvm::Type::getInt64Ty(*Context), 2);
- llvm::Type *Int128Ty = llvm::Type::getIntNTy(*Context, 128);
- if (AllocType == V2xi64Ty || AllocType == Int128Ty) {
- return false;
- }
-
- return true;
- }
-
- // Get the actual value we should use to step through an allocation.
- //
- // Normally the value we use to step through an allocation is given to us by
- // the driver. However, for certain primitive data types, we can derive an
- // integer constant for the step value. We use this integer constant whenever
- // possible to allow further compiler optimizations to take place.
- //
- // DL - Target Data size/layout information.
- // T - Type of allocation (should be a pointer).
- // OrigStep - Original step increment (root.expand() input from driver).
- llvm::Value *getStepValue(llvm::DataLayout *DL, llvm::Type *AllocType,
- llvm::Value *OrigStep) {
- bccAssert(DL);
- bccAssert(AllocType);
- bccAssert(OrigStep);
- llvm::PointerType *PT = llvm::dyn_cast<llvm::PointerType>(AllocType);
- if (isStepOptSupported(AllocType)) {
- llvm::Type *ET = PT->getElementType();
- uint64_t ETSize = DL->getTypeAllocSize(ET);
- llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context);
- return llvm::ConstantInt::get(Int32Ty, ETSize);
- } else {
- return OrigStep;
- }
- }
-
- /// Builds the types required by the pass for the given context.
- void buildTypes(void) {
- // Create the RsLaunchDimensionsTy and RsExpandKernelDriverInfoPfxTy structs.
-
- llvm::Type *Int8Ty = llvm::Type::getInt8Ty(*Context);
- llvm::Type *Int8PtrTy = Int8Ty->getPointerTo();
- llvm::Type *Int8PtrArrayInputLimitTy = llvm::ArrayType::get(Int8PtrTy, RS_KERNEL_INPUT_LIMIT);
- llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context);
- llvm::Type *Int32ArrayInputLimitTy = llvm::ArrayType::get(Int32Ty, RS_KERNEL_INPUT_LIMIT);
- llvm::Type *VoidPtrTy = llvm::Type::getInt8PtrTy(*Context);
- llvm::Type *Int32Array4Ty = llvm::ArrayType::get(Int32Ty, 4);
-
- /* Defined in frameworks/base/libs/rs/cpu_ref/rsCpuCore.h:
- *
- * struct RsLaunchDimensions {
- * uint32_t x;
- * uint32_t y;
- * uint32_t z;
- * uint32_t lod;
- * uint32_t face;
- * uint32_t array[4];
- * };
- */
- llvm::SmallVector<llvm::Type*, RsLaunchDimensionsFieldCount> RsLaunchDimensionsTypes;
- RsLaunchDimensionsTypes.push_back(Int32Ty); // uint32_t x
- RsLaunchDimensionsTypes.push_back(Int32Ty); // uint32_t y
- RsLaunchDimensionsTypes.push_back(Int32Ty); // uint32_t z
- RsLaunchDimensionsTypes.push_back(Int32Ty); // uint32_t lod
- RsLaunchDimensionsTypes.push_back(Int32Ty); // uint32_t face
- RsLaunchDimensionsTypes.push_back(Int32Array4Ty); // uint32_t array[4]
- llvm::StructType *RsLaunchDimensionsTy =
- llvm::StructType::create(RsLaunchDimensionsTypes, "RsLaunchDimensions");
-
- /* Defined as the beginning of RsExpandKernelDriverInfo in frameworks/base/libs/rs/cpu_ref/rsCpuCoreRuntime.h:
- *
- * struct RsExpandKernelDriverInfoPfx {
- * const uint8_t *inPtr[RS_KERNEL_INPUT_LIMIT];
- * uint32_t inStride[RS_KERNEL_INPUT_LIMIT];
- * uint32_t inLen;
- *
- * uint8_t *outPtr[RS_KERNEL_INPUT_LIMIT];
- * uint32_t outStride[RS_KERNEL_INPUT_LIMIT];
- * uint32_t outLen;
- *
- * // Dimension of the launch
- * RsLaunchDimensions dim;
- *
- * // The walking iterator of the launch
- * RsLaunchDimensions current;
- *
- * const void *usr;
- * uint32_t usrLen;
- *
- * // Items below this line are not used by the compiler and can be change in the driver.
- * // So the compiler must assume there are an unknown number of fields of unknown type
- * // beginning here.
- * };
- *
- * The name "RsExpandKernelDriverInfoPfx" is known to RSInvariantPass (RSInvariant.cpp).
- */
- llvm::SmallVector<llvm::Type*, RsExpandKernelDriverInfoPfxFieldCount> RsExpandKernelDriverInfoPfxTypes;
- RsExpandKernelDriverInfoPfxTypes.push_back(Int8PtrArrayInputLimitTy); // const uint8_t *inPtr[RS_KERNEL_INPUT_LIMIT]
- RsExpandKernelDriverInfoPfxTypes.push_back(Int32ArrayInputLimitTy); // uint32_t inStride[RS_KERNEL_INPUT_LIMIT]
- RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty); // uint32_t inLen
- RsExpandKernelDriverInfoPfxTypes.push_back(Int8PtrArrayInputLimitTy); // uint8_t *outPtr[RS_KERNEL_INPUT_LIMIT]
- RsExpandKernelDriverInfoPfxTypes.push_back(Int32ArrayInputLimitTy); // uint32_t outStride[RS_KERNEL_INPUT_LIMIT]
- RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty); // uint32_t outLen
- RsExpandKernelDriverInfoPfxTypes.push_back(RsLaunchDimensionsTy); // RsLaunchDimensions dim
- RsExpandKernelDriverInfoPfxTypes.push_back(RsLaunchDimensionsTy); // RsLaunchDimensions current
- RsExpandKernelDriverInfoPfxTypes.push_back(VoidPtrTy); // const void *usr
- RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty); // uint32_t usrLen
- llvm::StructType *RsExpandKernelDriverInfoPfxTy =
- llvm::StructType::create(RsExpandKernelDriverInfoPfxTypes, "RsExpandKernelDriverInfoPfx");
-
- // Create the function type for expanded kernels.
-
- llvm::Type *RsExpandKernelDriverInfoPfxPtrTy = RsExpandKernelDriverInfoPfxTy->getPointerTo();
-
- llvm::SmallVector<llvm::Type*, 8> ParamTypes;
- ParamTypes.push_back(RsExpandKernelDriverInfoPfxPtrTy); // const RsExpandKernelDriverInfoPfx *p
- ParamTypes.push_back(Int32Ty); // uint32_t x1
- ParamTypes.push_back(Int32Ty); // uint32_t x2
- ParamTypes.push_back(Int32Ty); // uint32_t outstep
-
- ExpandedFunctionType =
- llvm::FunctionType::get(llvm::Type::getVoidTy(*Context), ParamTypes,
- false);
- }
-
- /// @brief Create skeleton of the expanded function.
- ///
- /// This creates a function with the following signature:
- ///
- /// void (const RsForEachStubParamStruct *p, uint32_t x1, uint32_t x2,
- /// uint32_t outstep)
- ///
- llvm::Function *createEmptyExpandedFunction(llvm::StringRef OldName) {
- llvm::Function *ExpandedFunction =
- llvm::Function::Create(ExpandedFunctionType,
- llvm::GlobalValue::ExternalLinkage,
- OldName + ".expand", Module);
-
- bccAssert(ExpandedFunction->arg_size() == NUM_EXPANDED_FUNCTION_PARAMS);
-
- llvm::Function::arg_iterator AI = ExpandedFunction->arg_begin();
-
- (AI++)->setName("p");
- (AI++)->setName("x1");
- (AI++)->setName("x2");
- (AI++)->setName("arg_outstep");
-
- llvm::BasicBlock *Begin = llvm::BasicBlock::Create(*Context, "Begin",
- ExpandedFunction);
- llvm::IRBuilder<> Builder(Begin);
- Builder.CreateRetVoid();
-
- return ExpandedFunction;
- }
-
- /// @brief Create an empty loop
- ///
- /// Create a loop of the form:
- ///
- /// for (i = LowerBound; i < UpperBound; i++)
- /// ;
- ///
- /// After the loop has been created, the builder is set such that
- /// instructions can be added to the loop body.
- ///
- /// @param Builder The builder to use to build this loop. The current
- /// position of the builder is the position the loop
- /// will be inserted.
- /// @param LowerBound The first value of the loop iterator
- /// @param UpperBound The maximal value of the loop iterator
- /// @param LoopIV A reference that will be set to the loop iterator.
- /// @return The BasicBlock that will be executed after the loop.
- llvm::BasicBlock *createLoop(llvm::IRBuilder<> &Builder,
- llvm::Value *LowerBound,
- llvm::Value *UpperBound,
- llvm::PHINode **LoopIV) {
- bccAssert(LowerBound->getType() == UpperBound->getType());
-
- llvm::BasicBlock *CondBB, *AfterBB, *HeaderBB;
- llvm::Value *Cond, *IVNext;
- llvm::PHINode *IV;
-
- CondBB = Builder.GetInsertBlock();
- AfterBB = llvm::SplitBlock(CondBB, Builder.GetInsertPoint(), nullptr, nullptr);
- HeaderBB = llvm::BasicBlock::Create(*Context, "Loop", CondBB->getParent());
-
- // if (LowerBound < Upperbound)
- // goto LoopHeader
- // else
- // goto AfterBB
- CondBB->getTerminator()->eraseFromParent();
- Builder.SetInsertPoint(CondBB);
- Cond = Builder.CreateICmpULT(LowerBound, UpperBound);
- Builder.CreateCondBr(Cond, HeaderBB, AfterBB);
-
- // iv = PHI [CondBB -> LowerBound], [LoopHeader -> NextIV ]
- // iv.next = iv + 1
- // if (iv.next < Upperbound)
- // goto LoopHeader
- // else
- // goto AfterBB
- Builder.SetInsertPoint(HeaderBB);
- IV = Builder.CreatePHI(LowerBound->getType(), 2, "X");
- IV->addIncoming(LowerBound, CondBB);
- IVNext = Builder.CreateNUWAdd(IV, Builder.getInt32(1));
- IV->addIncoming(IVNext, HeaderBB);
- Cond = Builder.CreateICmpULT(IVNext, UpperBound);
- Builder.CreateCondBr(Cond, HeaderBB, AfterBB);
- AfterBB->setName("Exit");
- Builder.SetInsertPoint(HeaderBB->getFirstNonPHI());
- *LoopIV = IV;
- return AfterBB;
- }
-
- // Finish building the outgoing argument list for calling a ForEach-able function.
- //
- // ArgVector - on input, the non-special arguments
- // on output, the non-special arguments combined with the special arguments
- // from SpecialArgVector
- // SpecialArgVector - special arguments (from ExpandSpecialArguments())
- // SpecialArgContextIdx - return value of ExpandSpecialArguments()
- // (position of context argument in SpecialArgVector)
- // CalleeFunction - the ForEach-able function being called
- // Builder - for inserting code into the caller function
- template<unsigned int ArgVectorLen, unsigned int SpecialArgVectorLen>
- void finishArgList( llvm::SmallVector<llvm::Value *, ArgVectorLen> &ArgVector,
- const llvm::SmallVector<llvm::Value *, SpecialArgVectorLen> &SpecialArgVector,
- const int SpecialArgContextIdx,
- const llvm::Function &CalleeFunction,
- llvm::IRBuilder<> &CallerBuilder) {
- /* The context argument (if any) is a pointer to an opaque user-visible type that differs from
- * the RsExpandKernelDriverInfoPfx type used in the function we are generating (although the
- * two types represent the same thing). Therefore, we must introduce a pointer cast when
- * generating a call to the kernel function.
- */
- const int ArgContextIdx =
- SpecialArgContextIdx >= 0 ? (ArgVector.size() + SpecialArgContextIdx) : SpecialArgContextIdx;
- ArgVector.append(SpecialArgVector.begin(), SpecialArgVector.end());
- if (ArgContextIdx >= 0) {
- llvm::Type *ContextArgType = nullptr;
- int ArgIdx = ArgContextIdx;
- for (const auto &Arg : CalleeFunction.getArgumentList()) {
- if (!ArgIdx--) {
- ContextArgType = Arg.getType();
- break;
- }
- }
- bccAssert(ContextArgType);
- ArgVector[ArgContextIdx] = CallerBuilder.CreatePointerCast(ArgVector[ArgContextIdx], ContextArgType);
- }
- }
-
-public:
- RSForEachExpandPass(bool pEnableStepOpt = true)
- : ModulePass(ID), Module(nullptr), Context(nullptr),
- mEnableStepOpt(pEnableStepOpt) {
-
- }
-
- virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
- // This pass does not use any other analysis passes, but it does
- // add/wrap the existing functions in the module (thus altering the CFG).
- }
-
- // Build contribution to outgoing argument list for calling a
- // ForEach-able function, based on the special parameters of that
- // function.
- //
- // Signature - metadata bits for the signature of the ForEach-able function
- // X, Arg_p - values derived directly from expanded function,
- // suitable for computing arguments for the ForEach-able function
- // CalleeArgs - contribution is accumulated here
- // Bump - invoked once for each contributed outgoing argument
- //
- // Return value is the (zero-based) position of the context (Arg_p)
- // argument in the CalleeArgs vector, or a negative value if the
- // context argument is not placed in the CalleeArgs vector.
- int ExpandSpecialArguments(uint32_t Signature,
- llvm::Value *X,
- llvm::Value *Arg_p,
- llvm::IRBuilder<> &Builder,
- llvm::SmallVector<llvm::Value*, 8> &CalleeArgs,
- std::function<void ()> Bump) {
-
- bccAssert(CalleeArgs.empty());
-
- int Return = -1;
- if (bcinfo::MetadataExtractor::hasForEachSignatureCtxt(Signature)) {
- CalleeArgs.push_back(Arg_p);
- Bump();
- Return = CalleeArgs.size() - 1;
- }
-
- if (bcinfo::MetadataExtractor::hasForEachSignatureX(Signature)) {
- CalleeArgs.push_back(X);
- Bump();
- }
-
- if (bcinfo::MetadataExtractor::hasForEachSignatureY(Signature) ||
- bcinfo::MetadataExtractor::hasForEachSignatureZ(Signature)) {
-
- llvm::Value *Current = Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldCurrent);
-
- if (bcinfo::MetadataExtractor::hasForEachSignatureY(Signature)) {
- llvm::Value *Y = Builder.CreateLoad(
- Builder.CreateStructGEP(nullptr, Current, RsLaunchDimensionsFieldY), "Y");
-
- CalleeArgs.push_back(Y);
- Bump();
- }
-
- if (bcinfo::MetadataExtractor::hasForEachSignatureZ(Signature)) {
- llvm::Value *Z = Builder.CreateLoad(
- Builder.CreateStructGEP(nullptr, Current, RsLaunchDimensionsFieldZ), "Z");
- CalleeArgs.push_back(Z);
- Bump();
- }
- }
-
- return Return;
- }
-
- /* Performs the actual optimization on a selected function. On success, the
- * Module will contain a new function of the name "<NAME>.expand" that
- * invokes <NAME>() in a loop with the appropriate parameters.
- */
- bool ExpandFunction(llvm::Function *Function, uint32_t Signature) {
- ALOGV("Expanding ForEach-able Function %s",
- Function->getName().str().c_str());
-
- if (!Signature) {
- Signature = getRootSignature(Function);
- if (!Signature) {
- // We couldn't determine how to expand this function based on its
- // function signature.
- return false;
- }
- }
-
- llvm::DataLayout DL(Module);
-
- llvm::Function *ExpandedFunction =
- createEmptyExpandedFunction(Function->getName());
-
- /*
- * Extract the expanded function's parameters. It is guaranteed by
- * createEmptyExpandedFunction that there will be five parameters.
- */
-
- bccAssert(ExpandedFunction->arg_size() == NUM_EXPANDED_FUNCTION_PARAMS);
-
- llvm::Function::arg_iterator ExpandedFunctionArgIter =
- ExpandedFunction->arg_begin();
-
- llvm::Value *Arg_p = &*(ExpandedFunctionArgIter++);
- llvm::Value *Arg_x1 = &*(ExpandedFunctionArgIter++);
- llvm::Value *Arg_x2 = &*(ExpandedFunctionArgIter++);
- llvm::Value *Arg_outstep = &*(ExpandedFunctionArgIter);
-
- llvm::Value *InStep = nullptr;
- llvm::Value *OutStep = nullptr;
-
- // Construct the actual function body.
- llvm::IRBuilder<> Builder(ExpandedFunction->getEntryBlock().begin());
-
- // Collect and construct the arguments for the kernel().
- // Note that we load any loop-invariant arguments before entering the Loop.
- llvm::Function::arg_iterator FunctionArgIter = Function->arg_begin();
-
- llvm::Type *InTy = nullptr;
- llvm::Value *InBasePtr = nullptr;
- if (bcinfo::MetadataExtractor::hasForEachSignatureIn(Signature)) {
- llvm::Value *InsBasePtr = Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldInPtr, "inputs_base");
-
- llvm::Value *InStepsBase = Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldInStride, "insteps_base");
-
- llvm::Value *InStepAddr = Builder.CreateConstInBoundsGEP2_32(nullptr, InStepsBase, 0, 0);
- llvm::LoadInst *InStepArg = Builder.CreateLoad(InStepAddr,
- "instep_addr");
-
- InTy = (FunctionArgIter++)->getType();
- InStep = getStepValue(&DL, InTy, InStepArg);
-
- InStep->setName("instep");
-
- llvm::Value *InputAddr = Builder.CreateConstInBoundsGEP2_32(nullptr, InsBasePtr, 0, 0);
- InBasePtr = Builder.CreateLoad(InputAddr, "input_base");
- }
-
- llvm::Type *OutTy = nullptr;
- llvm::Value *OutBasePtr = nullptr;
- if (bcinfo::MetadataExtractor::hasForEachSignatureOut(Signature)) {
- OutTy = (FunctionArgIter++)->getType();
- OutStep = getStepValue(&DL, OutTy, Arg_outstep);
- OutStep->setName("outstep");
- OutBasePtr = Builder.CreateLoad(
- Builder.CreateConstInBoundsGEP2_32(nullptr,
- Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldOutPtr),
- 0, 0));
- }
-
- llvm::Value *UsrData = nullptr;
- if (bcinfo::MetadataExtractor::hasForEachSignatureUsrData(Signature)) {
- llvm::Type *UsrDataTy = (FunctionArgIter++)->getType();
- UsrData = Builder.CreatePointerCast(Builder.CreateLoad(
- Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldUsr)), UsrDataTy);
- UsrData->setName("UsrData");
- }
-
- llvm::PHINode *IV;
- createLoop(Builder, Arg_x1, Arg_x2, &IV);
-
- llvm::SmallVector<llvm::Value*, 8> CalleeArgs;
- const int CalleeArgsContextIdx = ExpandSpecialArguments(Signature, IV, Arg_p, Builder, CalleeArgs,
- [&FunctionArgIter]() { FunctionArgIter++; });
-
- bccAssert(FunctionArgIter == Function->arg_end());
-
- // Populate the actual call to kernel().
- llvm::SmallVector<llvm::Value*, 8> RootArgs;
-
- llvm::Value *InPtr = nullptr;
- llvm::Value *OutPtr = nullptr;
-
- // Calculate the current input and output pointers
- //
- // We always calculate the input/output pointers with a GEP operating on i8
- // values and only cast at the very end to OutTy. This is because the step
- // between two values is given in bytes.
- //
- // TODO: We could further optimize the output by using a GEP operation of
- // type 'OutTy' in cases where the element type of the allocation allows.
- if (OutBasePtr) {
- llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1);
- OutOffset = Builder.CreateMul(OutOffset, OutStep);
- OutPtr = Builder.CreateGEP(OutBasePtr, OutOffset);
- OutPtr = Builder.CreatePointerCast(OutPtr, OutTy);
- }
-
- if (InBasePtr) {
- llvm::Value *InOffset = Builder.CreateSub(IV, Arg_x1);
- InOffset = Builder.CreateMul(InOffset, InStep);
- InPtr = Builder.CreateGEP(InBasePtr, InOffset);
- InPtr = Builder.CreatePointerCast(InPtr, InTy);
- }
-
- if (InPtr) {
- RootArgs.push_back(InPtr);
- }
-
- if (OutPtr) {
- RootArgs.push_back(OutPtr);
- }
-
- if (UsrData) {
- RootArgs.push_back(UsrData);
- }
-
- finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *Function, Builder);
-
- Builder.CreateCall(Function, RootArgs);
-
- return true;
- }
-
- /* Expand a pass-by-value kernel.
- */
- bool ExpandKernel(llvm::Function *Function, uint32_t Signature) {
- bccAssert(bcinfo::MetadataExtractor::hasForEachSignatureKernel(Signature));
- ALOGV("Expanding kernel Function %s", Function->getName().str().c_str());
-
- // TODO: Refactor this to share functionality with ExpandFunction.
- llvm::DataLayout DL(Module);
-
- llvm::Function *ExpandedFunction =
- createEmptyExpandedFunction(Function->getName());
-
- /*
- * Extract the expanded function's parameters. It is guaranteed by
- * createEmptyExpandedFunction that there will be five parameters.
- */
-
- bccAssert(ExpandedFunction->arg_size() == NUM_EXPANDED_FUNCTION_PARAMS);
-
- llvm::Function::arg_iterator ExpandedFunctionArgIter =
- ExpandedFunction->arg_begin();
-
- llvm::Value *Arg_p = &*(ExpandedFunctionArgIter++);
- llvm::Value *Arg_x1 = &*(ExpandedFunctionArgIter++);
- llvm::Value *Arg_x2 = &*(ExpandedFunctionArgIter++);
- llvm::Value *Arg_outstep = &*(ExpandedFunctionArgIter);
-
- // Construct the actual function body.
- llvm::IRBuilder<> Builder(ExpandedFunction->getEntryBlock().begin());
-
- // Create TBAA meta-data.
- llvm::MDNode *TBAARenderScriptDistinct, *TBAARenderScript,
- *TBAAAllocation, *TBAAPointer;
- llvm::MDBuilder MDHelper(*Context);
-
- TBAARenderScriptDistinct =
- MDHelper.createTBAARoot("RenderScript Distinct TBAA");
- TBAARenderScript = MDHelper.createTBAANode("RenderScript TBAA",
- TBAARenderScriptDistinct);
- TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation",
- TBAARenderScript);
- TBAAAllocation = MDHelper.createTBAAStructTagNode(TBAAAllocation,
- TBAAAllocation, 0);
- TBAAPointer = MDHelper.createTBAAScalarTypeNode("pointer",
- TBAARenderScript);
- TBAAPointer = MDHelper.createTBAAStructTagNode(TBAAPointer, TBAAPointer, 0);
-
- llvm::MDNode *AliasingDomain, *AliasingScope;
- AliasingDomain = MDHelper.createAnonymousAliasScopeDomain("RS argument scope domain");
- AliasingScope = MDHelper.createAnonymousAliasScope(AliasingDomain, "RS argument scope");
-
- /*
- * Collect and construct the arguments for the kernel().
- *
- * Note that we load any loop-invariant arguments before entering the Loop.
- */
- size_t NumInputs = Function->arg_size();
-
- // No usrData parameter on kernels.
- bccAssert(
- !bcinfo::MetadataExtractor::hasForEachSignatureUsrData(Signature));
-
- llvm::Function::arg_iterator ArgIter = Function->arg_begin();
-
- // Check the return type
- llvm::Type *OutTy = nullptr;
- llvm::Value *OutStep = nullptr;
- llvm::LoadInst *OutBasePtr = nullptr;
- llvm::Value *CastedOutBasePtr = nullptr;
-
- bool PassOutByPointer = false;
-
- if (bcinfo::MetadataExtractor::hasForEachSignatureOut(Signature)) {
- llvm::Type *OutBaseTy = Function->getReturnType();
-
- if (OutBaseTy->isVoidTy()) {
- PassOutByPointer = true;
- OutTy = ArgIter->getType();
-
- ArgIter++;
- --NumInputs;
- } else {
- // We don't increment Args, since we are using the actual return type.
- OutTy = OutBaseTy->getPointerTo();
- }
-
- OutStep = getStepValue(&DL, OutTy, Arg_outstep);
- OutStep->setName("outstep");
- OutBasePtr = Builder.CreateLoad(
- Builder.CreateConstInBoundsGEP2_32(nullptr,
- Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldOutPtr),
- 0, 0));
-
- if (gEnableRsTbaa) {
- OutBasePtr->setMetadata("tbaa", TBAAPointer);
- }
-
- OutBasePtr->setMetadata("alias.scope", AliasingScope);
-
- CastedOutBasePtr = Builder.CreatePointerCast(OutBasePtr, OutTy, "casted_out");
- }
-
- llvm::PHINode *IV;
- createLoop(Builder, Arg_x1, Arg_x2, &IV);
-
- llvm::SmallVector<llvm::Value*, 8> CalleeArgs;
- const int CalleeArgsContextIdx = ExpandSpecialArguments(Signature, IV, Arg_p, Builder, CalleeArgs,
- [&NumInputs]() { --NumInputs; });
-
- llvm::SmallVector<llvm::Type*, 8> InTypes;
- llvm::SmallVector<llvm::Value*, 8> InSteps;
- llvm::SmallVector<llvm::Value*, 8> InBasePtrs;
- llvm::SmallVector<llvm::Value*, 8> InStructTempSlots;
-
- bccAssert(NumInputs <= RS_KERNEL_INPUT_LIMIT);
-
- if (NumInputs > 0) {
- llvm::Value *InsBasePtr = Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldInPtr, "inputs_base");
-
- llvm::Value *InStepsBase = Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldInStride, "insteps_base");
-
- llvm::Instruction *AllocaInsertionPoint = &*ExpandedFunction->getEntryBlock().begin();
- for (size_t InputIndex = 0; InputIndex < NumInputs;
- ++InputIndex, ArgIter++) {
-
- llvm::Value *InStepAddr = Builder.CreateConstInBoundsGEP2_32(nullptr, InStepsBase, 0, InputIndex);
- llvm::LoadInst *InStepArg = Builder.CreateLoad(InStepAddr,
- "instep_addr");
-
- llvm::Type *InType = ArgIter->getType();
-
- /*
- * AArch64 calling conventions dictate that structs of sufficient size
- * get passed by pointer instead of passed by value. This, combined
- * with the fact that we don't allow kernels to operate on pointer
- * data means that if we see a kernel with a pointer parameter we know
- * that it is struct input that has been promoted. As such we don't
- * need to convert its type to a pointer. Later we will need to know
- * to create a temporary copy on the stack, so we save this information
- * in InStructTempSlots.
- */
- if (auto PtrType = llvm::dyn_cast<llvm::PointerType>(InType)) {
- llvm::Type *ElementType = PtrType->getElementType();
- uint64_t Alignment = DL.getABITypeAlignment(ElementType);
- llvm::Value *Slot = new llvm::AllocaInst(ElementType,
- nullptr,
- Alignment,
- "input_struct_slot",
- AllocaInsertionPoint);
- InStructTempSlots.push_back(Slot);
- } else {
- InType = InType->getPointerTo();
- InStructTempSlots.push_back(nullptr);
- }
-
- llvm::Value *InStep = getStepValue(&DL, InType, InStepArg);
-
- InStep->setName("instep");
-
- llvm::Value *InputAddr = Builder.CreateConstInBoundsGEP2_32(nullptr, InsBasePtr, 0, InputIndex);
- llvm::LoadInst *InBasePtr = Builder.CreateLoad(InputAddr,
- "input_base");
- llvm::Value *CastInBasePtr = Builder.CreatePointerCast(InBasePtr,
- InType, "casted_in");
- if (gEnableRsTbaa) {
- InBasePtr->setMetadata("tbaa", TBAAPointer);
- }
-
- InBasePtr->setMetadata("alias.scope", AliasingScope);
-
- InTypes.push_back(InType);
- InSteps.push_back(InStep);
- InBasePtrs.push_back(CastInBasePtr);
- }
- }
-
- // Populate the actual call to kernel().
- llvm::SmallVector<llvm::Value*, 8> RootArgs;
-
- // Calculate the current input and output pointers
- //
- //
- // We always calculate the input/output pointers with a GEP operating on i8
- // values combined with a multiplication and only cast at the very end to
- // OutTy. This is to account for dynamic stepping sizes when the value
- // isn't apparent at compile time. In the (very common) case when we know
- // the step size at compile time, due to haveing complete type information
- // this multiplication will optmized out and produces code equivalent to a
- // a GEP on a pointer of the correct type.
-
- // Output
-
- llvm::Value *OutPtr = nullptr;
- if (CastedOutBasePtr) {
- llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1);
-
- OutPtr = Builder.CreateGEP(CastedOutBasePtr, OutOffset);
-
- if (PassOutByPointer) {
- RootArgs.push_back(OutPtr);
- }
- }
-
- // Inputs
-
- if (NumInputs > 0) {
- llvm::Value *Offset = Builder.CreateSub(IV, Arg_x1);
-
- for (size_t Index = 0; Index < NumInputs; ++Index) {
- llvm::Value *InPtr = Builder.CreateGEP(InBasePtrs[Index], Offset);
- llvm::Value *Input;
-
- if (llvm::Value *TemporarySlot = InStructTempSlots[Index]) {
- // Pass a pointer to a temporary on the stack, rather than
- // passing a pointer to the original value. We do not want
- // the kernel to potentially modify the input data.
-
- llvm::Type *ElementType = llvm::cast<llvm::PointerType>(
- InPtr->getType())->getElementType();
- uint64_t StoreSize = DL.getTypeStoreSize(ElementType);
- uint64_t Alignment = DL.getABITypeAlignment(ElementType);
-
- Builder.CreateMemCpy(TemporarySlot, InPtr, StoreSize, Alignment,
- /* isVolatile = */ false,
- /* !tbaa = */ gEnableRsTbaa ? TBAAAllocation : nullptr,
- /* !tbaa.struct = */ nullptr,
- /* !alias.scope = */ AliasingScope);
- Input = TemporarySlot;
- } else {
- llvm::LoadInst *InputLoad = Builder.CreateLoad(InPtr, "input");
-
- if (gEnableRsTbaa) {
- InputLoad->setMetadata("tbaa", TBAAAllocation);
- }
-
- InputLoad->setMetadata("alias.scope", AliasingScope);
-
- Input = InputLoad;
- }
-
- RootArgs.push_back(Input);
- }
- }
-
- finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *Function, Builder);
-
- llvm::Value *RetVal = Builder.CreateCall(Function, RootArgs);
-
- if (OutPtr && !PassOutByPointer) {
- llvm::StoreInst *Store = Builder.CreateStore(RetVal, OutPtr);
- if (gEnableRsTbaa) {
- Store->setMetadata("tbaa", TBAAAllocation);
- }
- Store->setMetadata("alias.scope", AliasingScope);
- }
-
- return true;
- }
-
- /// @brief Checks if pointers to allocation internals are exposed
- ///
- /// This function verifies if through the parameters passed to the kernel
- /// or through calls to the runtime library the script gains access to
- /// pointers pointing to data within a RenderScript Allocation.
- /// If we know we control all loads from and stores to data within
- /// RenderScript allocations and if we know the run-time internal accesses
- /// are all annotated with RenderScript TBAA metadata, only then we
- /// can safely use TBAA to distinguish between generic and from-allocation
- /// pointers.
- bool allocPointersExposed(llvm::Module &Module) {
- // Old style kernel function can expose pointers to elements within
- // allocations.
- // TODO: Extend analysis to allow simple cases of old-style kernels.
- for (size_t i = 0; i < mExportForEachCount; ++i) {
- const char *Name = mExportForEachNameList[i];
- uint32_t Signature = mExportForEachSignatureList[i];
- if (Module.getFunction(Name) &&
- !bcinfo::MetadataExtractor::hasForEachSignatureKernel(Signature)) {
- return true;
- }
- }
-
- // Check for library functions that expose a pointer to an Allocation or
- // that are not yet annotated with RenderScript-specific tbaa information.
- static const std::vector<const char *> Funcs{
- // rsGetElementAt(...)
- "_Z14rsGetElementAt13rs_allocationj",
- "_Z14rsGetElementAt13rs_allocationjj",
- "_Z14rsGetElementAt13rs_allocationjjj",
-
- // rsSetElementAt()
- "_Z14rsSetElementAt13rs_allocationPvj",
- "_Z14rsSetElementAt13rs_allocationPvjj",
- "_Z14rsSetElementAt13rs_allocationPvjjj",
-
- // rsGetElementAtYuv_uchar_Y()
- "_Z25rsGetElementAtYuv_uchar_Y13rs_allocationjj",
-
- // rsGetElementAtYuv_uchar_U()
- "_Z25rsGetElementAtYuv_uchar_U13rs_allocationjj",
-
- // rsGetElementAtYuv_uchar_V()
- "_Z25rsGetElementAtYuv_uchar_V13rs_allocationjj",
- };
-
- for (auto FI : Funcs) {
- llvm::Function *Function = Module.getFunction(FI);
-
- if (!Function) {
- ALOGE("Missing run-time function '%s'", FI);
- return true;
- }
-
- if (Function->getNumUses() > 0) {
- return true;
- }
- }
-
- return false;
- }
-
- /// @brief Connect RenderScript TBAA metadata to C/C++ metadata
- ///
- /// The TBAA metadata used to annotate loads/stores from RenderScript
- /// Allocations is generated in a separate TBAA tree with a
- /// "RenderScript Distinct TBAA" root node. LLVM does assume may-alias for
- /// all nodes in unrelated alias analysis trees. This function makes the
- /// "RenderScript TBAA" node (which is parented by the Distinct TBAA root),
- /// a subtree of the normal C/C++ TBAA tree aside of normal C/C++ types. With
- /// the connected trees every access to an Allocation is resolved to
- /// must-alias if compared to a normal C/C++ access.
- void connectRenderScriptTBAAMetadata(llvm::Module &Module) {
- llvm::MDBuilder MDHelper(*Context);
- llvm::MDNode *TBAARenderScriptDistinct =
- MDHelper.createTBAARoot("RenderScript Distinct TBAA");
- llvm::MDNode *TBAARenderScript = MDHelper.createTBAANode(
- "RenderScript TBAA", TBAARenderScriptDistinct);
- llvm::MDNode *TBAARoot = MDHelper.createTBAARoot("Simple C/C++ TBAA");
- TBAARenderScript->replaceOperandWith(1, TBAARoot);
- }
-
- virtual bool runOnModule(llvm::Module &Module) {
- bool Changed = false;
- this->Module = &Module;
- this->Context = &Module.getContext();
-
- this->buildTypes();
-
- bcinfo::MetadataExtractor me(&Module);
- if (!me.extract()) {
- ALOGE("Could not extract metadata from module!");
- return false;
- }
- mExportForEachCount = me.getExportForEachSignatureCount();
- mExportForEachNameList = me.getExportForEachNameList();
- mExportForEachSignatureList = me.getExportForEachSignatureList();
-
- bool AllocsExposed = allocPointersExposed(Module);
-
- for (size_t i = 0; i < mExportForEachCount; ++i) {
- const char *name = mExportForEachNameList[i];
- uint32_t signature = mExportForEachSignatureList[i];
- llvm::Function *kernel = Module.getFunction(name);
- if (kernel) {
- if (bcinfo::MetadataExtractor::hasForEachSignatureKernel(signature)) {
- Changed |= ExpandKernel(kernel, signature);
- kernel->setLinkage(llvm::GlobalValue::InternalLinkage);
- } else if (kernel->getReturnType()->isVoidTy()) {
- Changed |= ExpandFunction(kernel, signature);
- kernel->setLinkage(llvm::GlobalValue::InternalLinkage);
- } else {
- // There are some graphics root functions that are not
- // expanded, but that will be called directly. For those
- // functions, we can not set the linkage to internal.
- }
- }
- }
-
- if (gEnableRsTbaa && !AllocsExposed) {
- connectRenderScriptTBAAMetadata(Module);
- }
-
- return Changed;
- }
-
- virtual const char *getPassName() const {
- return "ForEach-able Function Expansion";
- }
-
-}; // end RSForEachExpandPass
-
-} // end anonymous namespace
-
-char RSForEachExpandPass::ID = 0;
-static llvm::RegisterPass<RSForEachExpandPass> X("foreachexp", "ForEach Expand Pass");
-
-namespace bcc {
-
-llvm::ModulePass *
-createRSForEachExpandPass(bool pEnableStepOpt){
- return new RSForEachExpandPass(pEnableStepOpt);
-}
-
-} // end namespace bcc
diff --git a/lib/Renderscript/RSKernelExpand.cpp b/lib/Renderscript/RSKernelExpand.cpp
new file mode 100644
index 0000000..34611d7
--- /dev/null
+++ b/lib/Renderscript/RSKernelExpand.cpp
@@ -0,0 +1,1354 @@
+/*
+ * Copyright 2012, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "bcc/Assert.h"
+#include "bcc/Renderscript/RSTransforms.h"
+
+#include <cstdlib>
+#include <functional>
+
+#include <llvm/IR/DerivedTypes.h>
+#include <llvm/IR/Function.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/MDBuilder.h>
+#include <llvm/IR/Module.h>
+#include <llvm/Pass.h>
+#include <llvm/Support/raw_ostream.h>
+#include <llvm/IR/DataLayout.h>
+#include <llvm/IR/Function.h>
+#include <llvm/IR/Type.h>
+#include <llvm/Transforms/Utils/BasicBlockUtils.h>
+
+#include "bcc/Config/Config.h"
+#include "bcc/Support/Log.h"
+
+#include "bcinfo/MetadataExtractor.h"
+
+#ifndef __DISABLE_ASSERTS
+// Expected argument counts of the generated ".expand" functions. Only used in bccAssert().
+const int kNumExpandedForeachParams = 4; // p, x1, x2, arg_outstep
+const int kNumExpandedReduceParams = 3; // inBuf, outBuf, len
+#endif
+
+const char kRenderScriptTBAARootName[] = "RenderScript Distinct TBAA"; // name of the TBAA root node created in ExpandForEach()
+const char kRenderScriptTBAANodeName[] = "RenderScript TBAA"; // name of the TBAA node parented by that root
+
+using namespace bcc;
+
+namespace {
+
+static const bool gEnableRsTbaa = true;
+
+/* RSKernelExpandPass - This pass operates on functions that are able
+ * to be called via rsForEach(), "foreach_<NAME>", or
+ * "reduce_<NAME>". We create an inner loop for the function to be
+ * invoked over the appropriate data cells of the input/output
+ * allocations (adjusting other relevant parameters as we go). We
+ * support doing this for any forEach or reduce style compute
+ * kernels. The new function name is the original function name
+ * followed by ".expand". Note that we still generate code for the
+ * original function.
+ */
+class RSKernelExpandPass : public llvm::ModulePass {
+public:
+ static char ID;
+
+private:
+ static const size_t RS_KERNEL_INPUT_LIMIT = 8; // see frameworks/base/libs/rs/cpu_ref/rsCpuCoreRuntime.h
+
+ enum RsLaunchDimensionsField { // field indices of the RsLaunchDimensions struct type (see buildTypes())
+ RsLaunchDimensionsFieldX, // uint32_t x
+ RsLaunchDimensionsFieldY, // uint32_t y
+ RsLaunchDimensionsFieldZ, // uint32_t z
+ RsLaunchDimensionsFieldLod, // uint32_t lod
+ RsLaunchDimensionsFieldFace, // uint32_t face
+ RsLaunchDimensionsFieldArray, // uint32_t array[4]
+
+ RsLaunchDimensionsFieldCount // number of fields; not itself a field index
+ };
+
+ enum RsExpandKernelDriverInfoPfxField { // field indices of the RsExpandKernelDriverInfoPfx struct type (see buildTypes())
+ RsExpandKernelDriverInfoPfxFieldInPtr, // const uint8_t *inPtr[RS_KERNEL_INPUT_LIMIT]
+ RsExpandKernelDriverInfoPfxFieldInStride, // uint32_t inStride[RS_KERNEL_INPUT_LIMIT]
+ RsExpandKernelDriverInfoPfxFieldInLen, // uint32_t inLen
+ RsExpandKernelDriverInfoPfxFieldOutPtr, // uint8_t *outPtr[RS_KERNEL_INPUT_LIMIT]
+ RsExpandKernelDriverInfoPfxFieldOutStride, // uint32_t outStride[RS_KERNEL_INPUT_LIMIT]
+ RsExpandKernelDriverInfoPfxFieldOutLen, // uint32_t outLen
+ RsExpandKernelDriverInfoPfxFieldDim, // RsLaunchDimensions dim
+ RsExpandKernelDriverInfoPfxFieldCurrent, // RsLaunchDimensions current
+ RsExpandKernelDriverInfoPfxFieldUsr, // const void *usr
+ RsExpandKernelDriverInfoPfxFieldUsLenr, // uint32_t usrLen (NOTE(review): identifier looks like a misspelling of "UsrLen")
+
+ RsExpandKernelDriverInfoPfxFieldCount // number of fields; not itself a field index
+ };
+
+ llvm::Module *Module;
+ llvm::LLVMContext *Context;
+
+ /*
+ * Pointers to LLVM type information for the function signatures
+ * for expanded functions. These must be re-calculated for each module
+ * the pass is run on.
+ */
+ llvm::FunctionType *ExpandedForEachType, *ExpandedReduceType;
+
+ uint32_t mExportForEachCount;
+ const char **mExportForEachNameList;
+ const uint32_t *mExportForEachSignatureList;
+
+ uint32_t mExportReduceCount;
+ const char **mExportReduceNameList;
+
+ // Turns on optimization of allocation stride values.
+ bool mEnableStepOpt;
+
+ uint32_t getRootSignature(llvm::Function *Function) { // Returns the foreach signature bitmask for a legacy root(); 0 if none can be determined.
+ const llvm::NamedMDNode *ExportForEachMetadata =
+ Module->getNamedMetadata("#rs_export_foreach");
+ // No "#rs_export_foreach" metadata at all: synthesize a mask from the parameter count.
+ if (!ExportForEachMetadata) {
+ llvm::SmallVector<llvm::Type*, 8> RootArgTys;
+ for (llvm::Function::arg_iterator B = Function->arg_begin(),
+ E = Function->arg_end();
+ B != E;
+ ++B) {
+ RootArgTys.push_back(B->getType());
+ }
+
+ // For pre-ICS bitcode, we may not have signature information. In that
+ // case, we use the size of the RootArgTys to select the number of
+ // arguments (one low bit set per parameter).
+ return (1 << RootArgTys.size()) - 1; // NOTE(review): shifts an int; assumes fewer than 31 parameters
+ }
+
+ if (ExportForEachMetadata->getNumOperands() == 0) {
+ return 0;
+ }
+
+ bccAssert(ExportForEachMetadata->getNumOperands() > 0); // always holds after the early return above
+
+ // We only handle the case for legacy root() functions here, so this is
+ // hard-coded to look at only the first such function.
+ llvm::MDNode *SigNode = ExportForEachMetadata->getOperand(0);
+ if (SigNode != nullptr && SigNode->getNumOperands() == 1) {
+ llvm::Metadata *SigMD = SigNode->getOperand(0);
+ if (llvm::MDString *SigS = llvm::dyn_cast<llvm::MDString>(SigMD)) {
+ llvm::StringRef SigString = SigS->getString();
+ uint32_t Signature = 0;
+ if (SigString.getAsInteger(10, Signature)) { // true branch is the parse-failure path (base 10)
+ ALOGE("Non-integer signature value '%s'", SigString.str().c_str());
+ return 0;
+ }
+ return Signature;
+ }
+ }
+ // First operand is null, has an unexpected operand count, or is not an MDString.
+ return 0;
+ }
+
+ bool isStepOptSupported(llvm::Type *AllocType) {
+ // Returns true when getStepValue() may replace the driver-provided stride with a constant.
+ llvm::PointerType *PT = llvm::dyn_cast<llvm::PointerType>(AllocType);
+ llvm::Type *VoidPtrTy = llvm::Type::getInt8PtrTy(*Context);
+ // NOTE(review): this rejects the optimization when mEnableStepOpt is set, although the member is documented as turning it on -- confirm intent.
+ if (mEnableStepOpt) {
+ return false;
+ }
+ // Allocations typed as void* (i8*) are never optimized.
+ if (AllocType == VoidPtrTy) {
+ return false;
+ }
+ // Only pointer-typed allocations can be optimized.
+ if (!PT) {
+ return false;
+ }
+
+ // remaining conditions are 64-bit only
+ if (VoidPtrTy->getPrimitiveSizeInBits() == 32) { // NOTE(review): confirm getPrimitiveSizeInBits() reports the pointer width for a pointer type
+ return true;
+ }
+
+ // coerce suggests an upconverted struct type, which we can't support
+ if (AllocType->getStructName().find("coerce") != llvm::StringRef::npos) { // NOTE(review): AllocType is a pointer type here; confirm getStructName() is valid on it
+ return false;
+ }
+
+ // 2xi64 and i128 suggest an upconverted struct type, which are also unsupported
+ llvm::Type *V2xi64Ty = llvm::VectorType::get(llvm::Type::getInt64Ty(*Context), 2);
+ llvm::Type *Int128Ty = llvm::Type::getIntNTy(*Context, 128);
+ if (AllocType == V2xi64Ty || AllocType == Int128Ty) { // NOTE(review): compares AllocType (a pointer type here), not its element type
+ return false;
+ }
+
+ return true;
+ }
+
+ // Get the actual value we should use to step through an allocation.
+ //
+ // Normally the step value is given to us by the driver. However, when
+ // isStepOptSupported() accepts the allocation type, we return the element's
+ // alloc size as an i32 constant instead of OrigStep, which allows further
+ // compiler optimizations to take place.
+ //
+ // DL - Target data size/layout information.
+ // AllocType - Type of allocation (should be a pointer).
+ // OrigStep - Original step increment (root.expand() input from driver).
+ llvm::Value *getStepValue(llvm::DataLayout *DL, llvm::Type *AllocType,
+ llvm::Value *OrigStep) {
+ bccAssert(DL);
+ bccAssert(AllocType);
+ bccAssert(OrigStep);
+ llvm::PointerType *PT = llvm::dyn_cast<llvm::PointerType>(AllocType); // may be null; only dereferenced after isStepOptSupported() accepted AllocType
+ if (isStepOptSupported(AllocType)) {
+ llvm::Type *ET = PT->getElementType();
+ uint64_t ETSize = DL->getTypeAllocSize(ET);
+ llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context);
+ return llvm::ConstantInt::get(Int32Ty, ETSize); // 64-bit size is passed to an i32 constant
+ } else {
+ return OrigStep;
+ }
+ }
+
+ /// Builds the LLVM types required by the pass in *Context; sets ExpandedForEachType and ExpandedReduceType.
+ void buildTypes(void) {
+ // Create the RsLaunchDimensionsTy and RsExpandKernelDriverInfoPfxTy structs.
+
+ llvm::Type *Int8Ty = llvm::Type::getInt8Ty(*Context);
+ llvm::Type *Int8PtrTy = Int8Ty->getPointerTo();
+ llvm::Type *Int8PtrArrayInputLimitTy = llvm::ArrayType::get(Int8PtrTy, RS_KERNEL_INPUT_LIMIT);
+ llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context);
+ llvm::Type *Int32ArrayInputLimitTy = llvm::ArrayType::get(Int32Ty, RS_KERNEL_INPUT_LIMIT);
+ llvm::Type *VoidPtrTy = llvm::Type::getInt8PtrTy(*Context); // i8*, used below where the C declarations say void*
+ llvm::Type *Int32Array4Ty = llvm::ArrayType::get(Int32Ty, 4);
+
+ /* Defined in frameworks/base/libs/rs/cpu_ref/rsCpuCore.h:
+ *
+ * struct RsLaunchDimensions {
+ * uint32_t x;
+ * uint32_t y;
+ * uint32_t z;
+ * uint32_t lod;
+ * uint32_t face;
+ * uint32_t array[4];
+ * };
+ */
+ llvm::SmallVector<llvm::Type*, RsLaunchDimensionsFieldCount> RsLaunchDimensionsTypes;
+ RsLaunchDimensionsTypes.push_back(Int32Ty); // uint32_t x
+ RsLaunchDimensionsTypes.push_back(Int32Ty); // uint32_t y
+ RsLaunchDimensionsTypes.push_back(Int32Ty); // uint32_t z
+ RsLaunchDimensionsTypes.push_back(Int32Ty); // uint32_t lod
+ RsLaunchDimensionsTypes.push_back(Int32Ty); // uint32_t face
+ RsLaunchDimensionsTypes.push_back(Int32Array4Ty); // uint32_t array[4]
+ llvm::StructType *RsLaunchDimensionsTy =
+ llvm::StructType::create(RsLaunchDimensionsTypes, "RsLaunchDimensions");
+
+ /* Defined as the beginning of RsExpandKernelDriverInfo in frameworks/base/libs/rs/cpu_ref/rsCpuCoreRuntime.h:
+ *
+ * struct RsExpandKernelDriverInfoPfx {
+ * const uint8_t *inPtr[RS_KERNEL_INPUT_LIMIT];
+ * uint32_t inStride[RS_KERNEL_INPUT_LIMIT];
+ * uint32_t inLen;
+ *
+ * uint8_t *outPtr[RS_KERNEL_INPUT_LIMIT];
+ * uint32_t outStride[RS_KERNEL_INPUT_LIMIT];
+ * uint32_t outLen;
+ *
+ * // Dimension of the launch
+ * RsLaunchDimensions dim;
+ *
+ * // The walking iterator of the launch
+ * RsLaunchDimensions current;
+ *
+ * const void *usr;
+ * uint32_t usrLen;
+ *
+ * // Items below this line are not used by the compiler and can be changed in the driver.
+ * // So the compiler must assume there are an unknown number of fields of unknown type
+ * // beginning here.
+ * };
+ *
+ * The name "RsExpandKernelDriverInfoPfx" is known to RSInvariantPass (RSInvariant.cpp).
+ */
+ llvm::SmallVector<llvm::Type*, RsExpandKernelDriverInfoPfxFieldCount> RsExpandKernelDriverInfoPfxTypes;
+ RsExpandKernelDriverInfoPfxTypes.push_back(Int8PtrArrayInputLimitTy); // const uint8_t *inPtr[RS_KERNEL_INPUT_LIMIT]
+ RsExpandKernelDriverInfoPfxTypes.push_back(Int32ArrayInputLimitTy); // uint32_t inStride[RS_KERNEL_INPUT_LIMIT]
+ RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty); // uint32_t inLen
+ RsExpandKernelDriverInfoPfxTypes.push_back(Int8PtrArrayInputLimitTy); // uint8_t *outPtr[RS_KERNEL_INPUT_LIMIT]
+ RsExpandKernelDriverInfoPfxTypes.push_back(Int32ArrayInputLimitTy); // uint32_t outStride[RS_KERNEL_INPUT_LIMIT]
+ RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty); // uint32_t outLen
+ RsExpandKernelDriverInfoPfxTypes.push_back(RsLaunchDimensionsTy); // RsLaunchDimensions dim
+ RsExpandKernelDriverInfoPfxTypes.push_back(RsLaunchDimensionsTy); // RsLaunchDimensions current
+ RsExpandKernelDriverInfoPfxTypes.push_back(VoidPtrTy); // const void *usr
+ RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty); // uint32_t usrLen
+ llvm::StructType *RsExpandKernelDriverInfoPfxTy =
+ llvm::StructType::create(RsExpandKernelDriverInfoPfxTypes, "RsExpandKernelDriverInfoPfx");
+
+ // Create the function types for expanded kernels (both return void, neither is variadic).
+ llvm::Type *VoidTy = llvm::Type::getVoidTy(*Context);
+
+ llvm::Type *RsExpandKernelDriverInfoPfxPtrTy = RsExpandKernelDriverInfoPfxTy->getPointerTo();
+ // void (const RsExpandKernelDriverInfoPfxTy *p, uint32_t x1, uint32_t x2, uint32_t outstep)
+ ExpandedForEachType = llvm::FunctionType::get(VoidTy,
+ {RsExpandKernelDriverInfoPfxPtrTy, Int32Ty, Int32Ty, Int32Ty}, false);
+
+ // void (void *inBuf, void *outBuf, uint32_t len)
+ ExpandedReduceType = llvm::FunctionType::get(VoidTy, {VoidPtrTy, VoidPtrTy, Int32Ty}, false);
+ }
+
+ /// @brief Create skeleton of the expanded foreach kernel, named OldName + ".expand".
+ ///
+ /// This creates a function with the following signature:
+ ///
+ /// void (const RsExpandKernelDriverInfoPfx *p, uint32_t x1, uint32_t x2,
+ /// uint32_t outstep)
+ ///
+ llvm::Function *createEmptyExpandedForEachKernel(llvm::StringRef OldName) {
+ llvm::Function *ExpandedFunction =
+ llvm::Function::Create(ExpandedForEachType,
+ llvm::GlobalValue::ExternalLinkage,
+ OldName + ".expand", Module);
+ bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);
+ llvm::Function::arg_iterator AI = ExpandedFunction->arg_begin();
+ (AI++)->setName("p");
+ (AI++)->setName("x1");
+ (AI++)->setName("x2");
+ (AI++)->setName("arg_outstep");
+ llvm::BasicBlock *Begin = llvm::BasicBlock::Create(*Context, "Begin",
+ ExpandedFunction);
+ llvm::IRBuilder<> Builder(Begin);
+ Builder.CreateRetVoid(); // the body so far is a single "Begin" block holding only this return
+ return ExpandedFunction;
+ }
+
+ // Create skeleton of the expanded reduce kernel, named OldName + ".expand".
+ //
+ // This creates a function with the following signature:
+ //
+ // void @func.expand(i8* nocapture %inBuf, i8* nocapture %outBuf, i32 len)
+ //
+ llvm::Function *createEmptyExpandedReduceKernel(llvm::StringRef OldName) {
+ llvm::Function *ExpandedFunction =
+ llvm::Function::Create(ExpandedReduceType,
+ llvm::GlobalValue::ExternalLinkage,
+ OldName + ".expand", Module);
+ bccAssert(ExpandedFunction->arg_size() == kNumExpandedReduceParams);
+
+ llvm::Function::arg_iterator AI = ExpandedFunction->arg_begin();
+
+ using llvm::Attribute;
+ // Both buffer arguments are marked nocapture.
+ llvm::Argument *InBuf = &(*AI++);
+ InBuf->setName("inBuf");
+ InBuf->addAttr(llvm::AttributeSet::get(*Context, InBuf->getArgNo() + 1, llvm::makeArrayRef(Attribute::NoCapture))); // NOTE(review): "+ 1" presumably because attribute index 0 denotes the return value -- confirm
+
+ llvm::Argument *OutBuf = &(*AI++);
+ OutBuf->setName("outBuf");
+ OutBuf->addAttr(llvm::AttributeSet::get(*Context, OutBuf->getArgNo() + 1, llvm::makeArrayRef(Attribute::NoCapture)));
+
+ (AI++)->setName("len");
+
+ llvm::BasicBlock *Begin = llvm::BasicBlock::Create(*Context, "Begin",
+ ExpandedFunction);
+ llvm::IRBuilder<> Builder(Begin);
+ Builder.CreateRetVoid(); // the body so far is a single "Begin" block holding only this return
+
+ return ExpandedFunction;
+ }
+
+ /// @brief Create an empty loop
+ ///
+ /// Create a loop of the form:
+ ///
+ /// for (i = LowerBound; i < UpperBound; i++)
+ /// ;
+ ///
+ /// After the loop has been created, the builder is set such that
+ /// instructions can be added to the loop body.
+ ///
+ /// @param Builder The builder to use to build this loop. The current
+ /// position of the builder is the position the loop
+ /// will be inserted.
+ /// @param LowerBound The first value of the loop iterator
+ /// @param UpperBound Exclusive upper bound of the loop iterator (unsigned compare)
+ /// @param LoopIV Output pointer: set to the PHI node that is the loop iterator.
+ /// @return The BasicBlock that will be executed after the loop.
+ llvm::BasicBlock *createLoop(llvm::IRBuilder<> &Builder,
+ llvm::Value *LowerBound,
+ llvm::Value *UpperBound,
+ llvm::PHINode **LoopIV) {
+ bccAssert(LowerBound->getType() == UpperBound->getType());
+
+ llvm::BasicBlock *CondBB, *AfterBB, *HeaderBB;
+ llvm::Value *Cond, *IVNext;
+ llvm::PHINode *IV;
+
+ CondBB = Builder.GetInsertBlock();
+ AfterBB = llvm::SplitBlock(CondBB, Builder.GetInsertPoint(), nullptr, nullptr); // everything from the insert point onwards moves to AfterBB
+ HeaderBB = llvm::BasicBlock::Create(*Context, "Loop", CondBB->getParent());
+
+ // if (LowerBound < Upperbound)
+ // goto LoopHeader
+ // else
+ // goto AfterBB
+ CondBB->getTerminator()->eraseFromParent(); // drop the terminator left in CondBB by the split; replaced by the conditional branch below
+ Builder.SetInsertPoint(CondBB);
+ Cond = Builder.CreateICmpULT(LowerBound, UpperBound);
+ Builder.CreateCondBr(Cond, HeaderBB, AfterBB);
+
+ // iv = PHI [CondBB -> LowerBound], [LoopHeader -> NextIV ]
+ // iv.next = iv + 1
+ // if (iv.next < Upperbound)
+ // goto LoopHeader
+ // else
+ // goto AfterBB
+ Builder.SetInsertPoint(HeaderBB);
+ IV = Builder.CreatePHI(LowerBound->getType(), 2, "X");
+ IV->addIncoming(LowerBound, CondBB);
+ IVNext = Builder.CreateNUWAdd(IV, Builder.getInt32(1)); // NOTE(review): the step constant is i32; assumes the bounds are i32 as well
+ IV->addIncoming(IVNext, HeaderBB);
+ Cond = Builder.CreateICmpULT(IVNext, UpperBound);
+ Builder.CreateCondBr(Cond, HeaderBB, AfterBB);
+ AfterBB->setName("Exit");
+ Builder.SetInsertPoint(HeaderBB->getFirstNonPHI()); // loop-body code is inserted after the PHI, ahead of the increment
+ *LoopIV = IV;
+ return AfterBB;
+ }
+
+ // Finish building the outgoing argument list for calling a ForEach-able function.
+ //
+ // ArgVector - on input, the non-special arguments
+ // on output, the non-special arguments followed by the special arguments
+ // from SpecialArgVector
+ // SpecialArgVector - special arguments (from ExpandSpecialArguments())
+ // SpecialArgContextIdx - return value of ExpandSpecialArguments()
+ // (position of context argument in SpecialArgVector; negative if absent)
+ // CalleeFunction - the ForEach-able function being called
+ // CallerBuilder - for inserting code into the caller function
+ template<unsigned int ArgVectorLen, unsigned int SpecialArgVectorLen>
+ void finishArgList( llvm::SmallVector<llvm::Value *, ArgVectorLen> &ArgVector,
+ const llvm::SmallVector<llvm::Value *, SpecialArgVectorLen> &SpecialArgVector,
+ const int SpecialArgContextIdx,
+ const llvm::Function &CalleeFunction,
+ llvm::IRBuilder<> &CallerBuilder) {
+ /* The context argument (if any) is a pointer to an opaque user-visible type that differs from
+ * the RsExpandKernelDriverInfoPfx type used in the function we are generating (although the
+ * two types represent the same thing). Therefore, we must introduce a pointer cast when
+ * generating a call to the kernel function.
+ */
+ const int ArgContextIdx = // position of the context argument in the combined list; computed before the append changes ArgVector.size()
+ SpecialArgContextIdx >= 0 ? (ArgVector.size() + SpecialArgContextIdx) : SpecialArgContextIdx;
+ ArgVector.append(SpecialArgVector.begin(), SpecialArgVector.end());
+ if (ArgContextIdx >= 0) {
+ llvm::Type *ContextArgType = nullptr;
+ int ArgIdx = ArgContextIdx;
+ for (const auto &Arg : CalleeFunction.getArgumentList()) { // walk to the ArgContextIdx-th formal parameter to learn its type
+ if (!ArgIdx--) {
+ ContextArgType = Arg.getType();
+ break;
+ }
+ }
+ bccAssert(ContextArgType);
+ ArgVector[ArgContextIdx] = CallerBuilder.CreatePointerCast(ArgVector[ArgContextIdx], ContextArgType);
+ }
+ }
+
+ // GEPHelper() returns a SmallVector of values suitable for passing
+ // to IRBuilder::CreateGEP(), and SmallGEPIndices is a typedef for
+ // the returned data type. It is sized so that the SmallVector
+ // returned by GEPHelper() never needs to do a heap allocation for
+ // any list of GEP indices it encounters in the code (at most 3 here).
+ typedef llvm::SmallVector<llvm::Value *, 3> SmallGEPIndices;
+
+ // Helper for turning a list of constant integer GEP indices into a
+ // SmallVector of llvm::Value*. The return value is suitable for
+ // passing to a GetElementPtrInst constructor or IRBuilder::CreateGEP().
+ //
+ // Inputs:
+ // I32Args should be integers which represent the index arguments
+ // to a GEP instruction.
+ //
+ // Returns:
+ // Returns a SmallVector of i32 ConstantInts, in the same order as I32Args.
+ SmallGEPIndices GEPHelper(const std::initializer_list<int32_t> I32Args) {
+ SmallGEPIndices Out(I32Args.size());
+ llvm::IntegerType *I32Ty = llvm::Type::getInt32Ty(*Context);
+ std::transform(I32Args.begin(), I32Args.end(), Out.begin(), // NOTE(review): <algorithm> is not among this file's visible includes; presumably pulled in transitively
+ [I32Ty](int32_t Arg) { return llvm::ConstantInt::get(I32Ty, Arg); });
+ return Out;
+ }
+
+public:
+ RSKernelExpandPass(bool pEnableStepOpt = true) // pEnableStepOpt is stored in mEnableStepOpt (see isStepOptSupported())
+ : ModulePass(ID), Module(nullptr), Context(nullptr),
+ mEnableStepOpt(pEnableStepOpt) {
+ // NOTE(review): the Expanded*Type pointers and the mExport* members are not initialized here; presumably set before use elsewhere in the pass.
+ }
+
+ virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
+ // Intentionally empty: this pass does not use any other analysis passes, and
+ // it adds/wraps the existing functions in the module (thus altering the CFG).
+ }
+
+ // Build contribution to outgoing argument list for calling a ForEach-able
+ // function, based on the special parameters (context, x, y, z) of that function.
+ // Arguments are appended in that order.
+ //
+ // Signature - metadata bits for the signature of the ForEach-able function
+ // X, Arg_p - values derived directly from expanded function,
+ // suitable for computing arguments for the ForEach-able function
+ // Builder - IRBuilder of the caller; its insertion point is saved and restored
+ // CalleeArgs - contribution is accumulated here (must be empty on entry)
+ // Bump - invoked once for each contributed outgoing argument
+ // LoopHeaderInsertionPoint - instruction before which the loop-invariant Y/Z loads go
+ //
+ // Return value is the (zero-based) position of the context (Arg_p)
+ // argument in the CalleeArgs vector, or a negative value if the
+ // context argument is not placed in the CalleeArgs vector.
+ int ExpandSpecialArguments(uint32_t Signature,
+ llvm::Value *X,
+ llvm::Value *Arg_p,
+ llvm::IRBuilder<> &Builder,
+ llvm::SmallVector<llvm::Value*, 8> &CalleeArgs,
+ std::function<void ()> Bump,
+ llvm::Instruction *LoopHeaderInsertionPoint) {
+
+ bccAssert(CalleeArgs.empty());
+
+ int Return = -1; // stays negative unless the context argument is added below
+ if (bcinfo::MetadataExtractor::hasForEachSignatureCtxt(Signature)) {
+ CalleeArgs.push_back(Arg_p);
+ Bump();
+ Return = CalleeArgs.size() - 1;
+ }
+
+ if (bcinfo::MetadataExtractor::hasForEachSignatureX(Signature)) {
+ CalleeArgs.push_back(X);
+ Bump();
+ }
+
+ if (bcinfo::MetadataExtractor::hasForEachSignatureY(Signature) ||
+ bcinfo::MetadataExtractor::hasForEachSignatureZ(Signature)) {
+ bccAssert(LoopHeaderInsertionPoint);
+
+ // Y and Z are loop invariant, so they can be hoisted out of the
+ // loop. Set the IRBuilder insertion point to the loop header.
+ auto OldInsertionPoint = Builder.saveIP();
+ Builder.SetInsertPoint(LoopHeaderInsertionPoint);
+
+ if (bcinfo::MetadataExtractor::hasForEachSignatureY(Signature)) {
+ SmallGEPIndices YValueGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldCurrent, // address of p->current.y
+ RsLaunchDimensionsFieldY}));
+ llvm::Value *YAddr = Builder.CreateInBoundsGEP(Arg_p, YValueGEP, "Y.gep");
+ CalleeArgs.push_back(Builder.CreateLoad(YAddr, "Y"));
+ Bump();
+ }
+
+ if (bcinfo::MetadataExtractor::hasForEachSignatureZ(Signature)) {
+ SmallGEPIndices ZValueGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldCurrent, // address of p->current.z
+ RsLaunchDimensionsFieldZ}));
+ llvm::Value *ZAddr = Builder.CreateInBoundsGEP(Arg_p, ZValueGEP, "Z.gep");
+ CalleeArgs.push_back(Builder.CreateLoad(ZAddr, "Z"));
+ Bump();
+ }
+
+ Builder.restoreIP(OldInsertionPoint);
+ }
+
+ return Return;
+ }
+
+ /* Expands an old-style ForEach-able function (in/out passed by pointer): adds
+ * "<NAME>.expand" to the Module, which invokes <NAME>() in a loop from x1 up to
+ * (excluding) x2. Returns false if no signature could be determined.
+ */
+ bool ExpandOldStyleForEach(llvm::Function *Function, uint32_t Signature) {
+ ALOGV("Expanding ForEach-able Function %s",
+ Function->getName().str().c_str());
+
+ if (!Signature) { // a zero Signature means "unknown": fall back to getRootSignature()
+ Signature = getRootSignature(Function);
+ if (!Signature) {
+ // We couldn't determine how to expand this function based on its
+ // function signature.
+ return false;
+ }
+ }
+
+ llvm::DataLayout DL(Module);
+
+ llvm::Function *ExpandedFunction =
+ createEmptyExpandedForEachKernel(Function->getName());
+
+ /*
+ * Extract the expanded function's parameters. It is guaranteed by
+ * createEmptyExpandedForEachKernel that there will be four parameters.
+ */
+
+ bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);
+
+ llvm::Function::arg_iterator ExpandedFunctionArgIter =
+ ExpandedFunction->arg_begin();
+
+ llvm::Value *Arg_p = &*(ExpandedFunctionArgIter++);
+ llvm::Value *Arg_x1 = &*(ExpandedFunctionArgIter++);
+ llvm::Value *Arg_x2 = &*(ExpandedFunctionArgIter++);
+ llvm::Value *Arg_outstep = &*(ExpandedFunctionArgIter);
+
+ llvm::Value *InStep = nullptr;
+ llvm::Value *OutStep = nullptr;
+
+ // Construct the actual function body (inserting at the start of the entry block).
+ llvm::IRBuilder<> Builder(ExpandedFunction->getEntryBlock().begin());
+
+ // Collect and construct the arguments for the kernel().
+ // Note that we load any loop-invariant arguments before entering the Loop.
+ llvm::Function::arg_iterator FunctionArgIter = Function->arg_begin(); // consumed in order: in, out, usrData, then the special arguments
+
+ llvm::Type *InTy = nullptr;
+ llvm::Value *InBufPtr = nullptr;
+ if (bcinfo::MetadataExtractor::hasForEachSignatureIn(Signature)) {
+ SmallGEPIndices InStepGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInStride, 0})); // p->inStride[0]
+ llvm::LoadInst *InStepArg = Builder.CreateLoad(
+ Builder.CreateInBoundsGEP(Arg_p, InStepGEP, "instep_addr.gep"), "instep_addr");
+
+ InTy = (FunctionArgIter++)->getType();
+ InStep = getStepValue(&DL, InTy, InStepArg);
+
+ InStep->setName("instep");
+
+ SmallGEPIndices InputAddrGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInPtr, 0})); // p->inPtr[0]
+ InBufPtr = Builder.CreateLoad(
+ Builder.CreateInBoundsGEP(Arg_p, InputAddrGEP, "input_buf.gep"), "input_buf");
+ }
+
+ llvm::Type *OutTy = nullptr;
+ llvm::Value *OutBasePtr = nullptr;
+ if (bcinfo::MetadataExtractor::hasForEachSignatureOut(Signature)) {
+ OutTy = (FunctionArgIter++)->getType();
+ OutStep = getStepValue(&DL, OutTy, Arg_outstep); // the output stride comes from the arg_outstep parameter, unlike the input stride
+ OutStep->setName("outstep");
+ SmallGEPIndices OutBaseGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldOutPtr, 0})); // p->outPtr[0]
+ OutBasePtr = Builder.CreateLoad(Builder.CreateInBoundsGEP(Arg_p, OutBaseGEP, "out_buf.gep"));
+ }
+
+ llvm::Value *UsrData = nullptr;
+ if (bcinfo::MetadataExtractor::hasForEachSignatureUsrData(Signature)) {
+ llvm::Type *UsrDataTy = (FunctionArgIter++)->getType();
+ llvm::Value *UsrDataPointerAddr = Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldUsr); // &p->usr
+ UsrData = Builder.CreatePointerCast(Builder.CreateLoad(UsrDataPointerAddr), UsrDataTy);
+ UsrData->setName("UsrData");
+ }
+
+ llvm::BasicBlock *LoopHeader = Builder.GetInsertBlock(); // block preceding the loop; its terminator is where the Y/Z loads are inserted
+ llvm::PHINode *IV;
+ createLoop(Builder, Arg_x1, Arg_x2, &IV);
+
+ llvm::SmallVector<llvm::Value*, 8> CalleeArgs;
+ const int CalleeArgsContextIdx = ExpandSpecialArguments(Signature, IV, Arg_p, Builder, CalleeArgs,
+ [&FunctionArgIter]() { FunctionArgIter++; },
+ LoopHeader->getTerminator());
+
+ bccAssert(FunctionArgIter == Function->arg_end()); // every formal parameter of Function must have been accounted for
+
+ // Populate the actual call to kernel().
+ llvm::SmallVector<llvm::Value*, 8> RootArgs;
+
+ llvm::Value *InPtr = nullptr;
+ llvm::Value *OutPtr = nullptr;
+
+ // Calculate the current input and output pointers
+ //
+ // We always calculate the input/output pointers with a GEP operating on i8
+ // values and only cast at the very end to OutTy. This is because the step
+ // between two values is given in bytes.
+ //
+ // TODO: We could further optimize the output by using a GEP operation of
+ // type 'OutTy' in cases where the element type of the allocation allows.
+ if (OutBasePtr) {
+ llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1); // element index relative to x1
+ OutOffset = Builder.CreateMul(OutOffset, OutStep);
+ OutPtr = Builder.CreateInBoundsGEP(OutBasePtr, OutOffset);
+ OutPtr = Builder.CreatePointerCast(OutPtr, OutTy);
+ }
+
+ if (InBufPtr) {
+ llvm::Value *InOffset = Builder.CreateSub(IV, Arg_x1); // element index relative to x1
+ InOffset = Builder.CreateMul(InOffset, InStep);
+ InPtr = Builder.CreateInBoundsGEP(InBufPtr, InOffset);
+ InPtr = Builder.CreatePointerCast(InPtr, InTy);
+ }
+
+ if (InPtr) {
+ RootArgs.push_back(InPtr);
+ }
+
+ if (OutPtr) {
+ RootArgs.push_back(OutPtr);
+ }
+
+ if (UsrData) {
+ RootArgs.push_back(UsrData);
+ }
+
+ finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *Function, Builder); // appends the special arguments and casts the context pointer
+
+ Builder.CreateCall(Function, RootArgs);
+
+ return true;
+ }
+
+ /* Expand a pass-by-value foreach kernel.
+ *
+ * Creates the expanded wrapper for Function (through
+ * createEmptyExpandedForEachKernel) and fills it with a loop, built by
+ * createLoop over [Arg_x1, Arg_x2), that calls Function once per iteration.
+ * Each input is loaded from its input buffer and passed by value, or through
+ * a stack temporary when the kernel parameter is a pointer. A non-void
+ * return value is stored to the output buffer.
+ *
+ * Function is the kernel to expand and Signature is its foreach signature;
+ * the signature must be a kernel signature without usrData (both asserted).
+ * Always returns true.
+ */
+ bool ExpandForEach(llvm::Function *Function, uint32_t Signature) {
+ bccAssert(bcinfo::MetadataExtractor::hasForEachSignatureKernel(Signature));
+ ALOGV("Expanding kernel Function %s", Function->getName().str().c_str());
+
+ // TODO: Refactor this to share functionality with ExpandOldStyleForEach.
+
+ llvm::Function *ExpandedFunction =
+ createEmptyExpandedForEachKernel(Function->getName());
+
+ /*
+ * Extract the expanded function's parameters. It is guaranteed by
+ * createEmptyExpandedForEachKernel that there will be four parameters.
+ */
+
+ bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);
+
+ llvm::Function::arg_iterator ExpandedFunctionArgIter =
+ ExpandedFunction->arg_begin();
+
+ llvm::Value *Arg_p = &*(ExpandedFunctionArgIter++);
+ llvm::Value *Arg_x1 = &*(ExpandedFunctionArgIter++);
+ llvm::Value *Arg_x2 = &*(ExpandedFunctionArgIter++);
+ // Arg_outstep is not used by expanded new-style forEach kernels.
+
+ // Construct the actual function body.
+ llvm::IRBuilder<> Builder(ExpandedFunction->getEntryBlock().begin());
+
+ // Create TBAA meta-data.
+ llvm::MDNode *TBAARenderScriptDistinct, *TBAARenderScript,
+ *TBAAAllocation, *TBAAPointer;
+ llvm::MDBuilder MDHelper(*Context);
+
+ TBAARenderScriptDistinct =
+ MDHelper.createTBAARoot(kRenderScriptTBAARootName);
+ TBAARenderScript = MDHelper.createTBAANode(kRenderScriptTBAANodeName,
+ TBAARenderScriptDistinct);
+ TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation",
+ TBAARenderScript);
+ TBAAAllocation = MDHelper.createTBAAStructTagNode(TBAAAllocation,
+ TBAAAllocation, 0);
+ TBAAPointer = MDHelper.createTBAAScalarTypeNode("pointer",
+ TBAARenderScript);
+ TBAAPointer = MDHelper.createTBAAStructTagNode(TBAAPointer, TBAAPointer, 0);
+
+ /*
+ * Collect and construct the arguments for the kernel().
+ *
+ * Note that we load any loop-invariant arguments before entering the Loop.
+ */
+ size_t NumRemainingInputs = Function->arg_size();
+
+ // No usrData parameter on kernels.
+ bccAssert(
+ !bcinfo::MetadataExtractor::hasForEachSignatureUsrData(Signature));
+
+ llvm::Function::arg_iterator ArgIter = Function->arg_begin();
+
+ // Check the return type
+ llvm::Type *OutTy = nullptr;
+ llvm::LoadInst *OutBasePtr = nullptr;
+ llvm::Value *CastedOutBasePtr = nullptr;
+
+ // Set when the kernel returns void and instead takes its output slot as
+ // its first (pointer) parameter.
+ bool PassOutByPointer = false;
+
+ if (bcinfo::MetadataExtractor::hasForEachSignatureOut(Signature)) {
+ llvm::Type *OutBaseTy = Function->getReturnType();
+
+ if (OutBaseTy->isVoidTy()) {
+ PassOutByPointer = true;
+ OutTy = ArgIter->getType();
+
+ ArgIter++;
+ --NumRemainingInputs;
+ } else {
+ // We don't increment ArgIter, since we are using the actual return type.
+ OutTy = OutBaseTy->getPointerTo();
+ }
+
+ // Load the output base pointer from the driver info structure and cast it
+ // to the kernel's output pointer type.
+ SmallGEPIndices OutBaseGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldOutPtr, 0}));
+ OutBasePtr = Builder.CreateLoad(Builder.CreateInBoundsGEP(Arg_p, OutBaseGEP, "out_buf.gep"));
+
+ if (gEnableRsTbaa) {
+ OutBasePtr->setMetadata("tbaa", TBAAPointer);
+ }
+
+ CastedOutBasePtr = Builder.CreatePointerCast(OutBasePtr, OutTy, "casted_out");
+ }
+
+ llvm::SmallVector<llvm::Type*, 8> InTypes;
+ llvm::SmallVector<llvm::Value*, 8> InBufPtrs;
+ llvm::SmallVector<llvm::Value*, 8> InStructTempSlots;
+
+ bccAssert(NumRemainingInputs <= RS_KERNEL_INPUT_LIMIT);
+
+ // Create the loop structure.
+ llvm::BasicBlock *LoopHeader = Builder.GetInsertBlock();
+ llvm::PHINode *IV;
+ createLoop(Builder, Arg_x1, Arg_x2, &IV);
+
+ llvm::SmallVector<llvm::Value*, 8> CalleeArgs;
+ const int CalleeArgsContextIdx =
+ ExpandSpecialArguments(Signature, IV, Arg_p, Builder, CalleeArgs,
+ [&NumRemainingInputs]() { --NumRemainingInputs; },
+ LoopHeader->getTerminator());
+
+ // After ExpandSpecialArguments() gets called, NumRemainingInputs
+ // counts the number of arguments to the kernel that correspond to
+ // an array entry from the InPtr field of the DriverInfo
+ // structure.
+ const size_t NumInPtrArguments = NumRemainingInputs;
+
+ if (NumInPtrArguments > 0) {
+ // Extract information about input slots and step sizes. The work done
+ // here is loop-invariant, so we can hoist the operations out of the loop.
+ auto OldInsertionPoint = Builder.saveIP();
+ Builder.SetInsertPoint(LoopHeader->getTerminator());
+
+ for (size_t InputIndex = 0; InputIndex < NumInPtrArguments; ++InputIndex, ArgIter++) {
+ llvm::Type *InType = ArgIter->getType();
+
+ /*
+ * AArch64 calling conventions dictate that structs of sufficient size
+ * get passed by pointer instead of passed by value. This, combined
+ * with the fact that we don't allow kernels to operate on pointer
+ * data means that if we see a kernel with a pointer parameter we know
+ * that it is a struct input that has been promoted. As such we don't
+ * need to convert its type to a pointer. Later we will need to know
+ * to create a temporary copy on the stack, so we save this information
+ * in InStructTempSlots.
+ */
+ if (auto PtrType = llvm::dyn_cast<llvm::PointerType>(InType)) {
+ llvm::Type *ElementType = PtrType->getElementType();
+ InStructTempSlots.push_back(Builder.CreateAlloca(ElementType, nullptr,
+ "input_struct_slot"));
+ } else {
+ InType = InType->getPointerTo();
+ InStructTempSlots.push_back(nullptr);
+ }
+
+ SmallGEPIndices InBufPtrGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInPtr,
+ static_cast<int32_t>(InputIndex)}));
+ llvm::Value *InBufPtrAddr = Builder.CreateInBoundsGEP(Arg_p, InBufPtrGEP, "input_buf.gep");
+ llvm::LoadInst *InBufPtr = Builder.CreateLoad(InBufPtrAddr, "input_buf");
+ llvm::Value *CastInBufPtr = Builder.CreatePointerCast(InBufPtr, InType, "casted_in");
+ if (gEnableRsTbaa) {
+ InBufPtr->setMetadata("tbaa", TBAAPointer);
+ }
+
+ InTypes.push_back(InType);
+ InBufPtrs.push_back(CastInBufPtr);
+ }
+
+ Builder.restoreIP(OldInsertionPoint);
+ }
+
+ // Populate the actual call to kernel().
+ llvm::SmallVector<llvm::Value*, 8> RootArgs;
+
+ // Calculate the current input and output pointers.
+
+ // Output
+
+ llvm::Value *OutPtr = nullptr;
+ if (CastedOutBasePtr) {
+ // The element offset is relative to the start of the range (IV - x1).
+ llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1);
+ OutPtr = Builder.CreateInBoundsGEP(CastedOutBasePtr, OutOffset);
+
+ if (PassOutByPointer) {
+ RootArgs.push_back(OutPtr);
+ }
+ }
+
+ // Inputs
+
+ if (NumInPtrArguments > 0) {
+ llvm::Value *Offset = Builder.CreateSub(IV, Arg_x1);
+
+ for (size_t Index = 0; Index < NumInPtrArguments; ++Index) {
+ llvm::Value *InPtr = Builder.CreateInBoundsGEP(InBufPtrs[Index], Offset);
+ llvm::Value *Input;
+
+ llvm::LoadInst *InputLoad = Builder.CreateLoad(InPtr, "input");
+
+ if (gEnableRsTbaa) {
+ InputLoad->setMetadata("tbaa", TBAAAllocation);
+ }
+
+ if (llvm::Value *TemporarySlot = InStructTempSlots[Index]) {
+ // Pass a pointer to a temporary on the stack, rather than
+ // passing a pointer to the original value. We do not want
+ // the kernel to potentially modify the input data.
+
+ // Note: don't annotate with TBAA, since the kernel might
+ // have its own TBAA annotations for the pointer argument.
+ Builder.CreateStore(InputLoad, TemporarySlot);
+ Input = TemporarySlot;
+ } else {
+ Input = InputLoad;
+ }
+
+ RootArgs.push_back(Input);
+ }
+ }
+
+ finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *Function, Builder);
+
+ llvm::Value *RetVal = Builder.CreateCall(Function, RootArgs);
+
+ // A kernel that returns its result by value needs an explicit store into
+ // the current output element.
+ if (OutPtr && !PassOutByPointer) {
+ RetVal->setName("call.result");
+ llvm::StoreInst *Store = Builder.CreateStore(RetVal, OutPtr);
+ if (gEnableRsTbaa) {
+ Store->setMetadata("tbaa", TBAAAllocation);
+ }
+ }
+
+ return true;
+ }
+
+ // Expand a reduce-style kernel function.
+ //
+ // The input is a kernel which represents a binary operation,
+ // of the form
+ //
+ // define foo @func(foo %a, foo %b),
+ //
+ // (More generally, it can be of the forms
+ //
+ // define void @func(foo* %ret, foo* %a, foo* %b)
+ // define void @func(foo* %ret, foo1 %a, foo1 %b)
+ // define foo1 @func(foo2 %a, foo2 %b)
+ //
+ // as a result of argument / return value conversions. Here, "foo1"
+ // and "foo2" refer to possibly coerced types, and the coerced
+ // argument type may be different from the coerced return type. See
+ // "Note on coercion" below.)
+ //
+ // (Note also, we do not expect to encounter any case when the
+ // arguments are promoted to pointers but the return value is
+ // unpromoted to pointer, e.g.
+ //
+ // define foo1 @func(foo* %a, foo* %b)
+ //
+ // and we will throw an assertion in this case.)
+ //
+ // The input kernel gets expanded into a kernel of the form
+ //
+ // define void @func.expand(i8* %inBuf, i8* %outBuf, i32 %len)
+ //
+ // which performs a serial reduction of `len` elements from `inBuf`,
+ // and stores the result into `outBuf`. In pseudocode, @func.expand
+ // does:
+ //
+ // inArr := (foo *)inBuf;
+ // accum := inArr[0];
+ // for (i := 1; i < len; ++i) {
+ // accum := func(accum, inArr[i]);
+ // }
+ // *(foo *)outBuf := accum;
+ //
+ // Note on coercion
+ //
+ // Both the return value and the argument types may undergo internal
+ // coercion in clang as part of call lowering. As a result, the
+ // return value type may differ from the argument type even if the
+ // types in the RenderScript signature are the same. For instance, the
+ // kernel
+ //
+ // int3 add(int3 a, int3 b) { return a + b; }
+ //
+ // gets lowered by clang as
+ //
+ // define <3 x i32> @add(<4 x i32> %a.coerce, <4 x i32> %b.coerce)
+ //
+ // under AArch64. The details of this process are found in clang,
+ // lib/CodeGen/TargetInfo.cpp, under classifyArgumentType() and
+ // classifyReturnType() in ARMABIInfo, AArch64ABIInfo. If the value
+ // is passed by pointer, then the pointed-to type is not coerced.
+ //
+ // Since we lack the original type information, this code does loads
+ // and stores of allocation data by way of pointers to the coerced
+ // type.
+ //
+ // Always returns true.
+ bool ExpandReduce(llvm::Function *Function) {
+ bccAssert(Function);
+
+ ALOGV("Expanding reduce kernel %s", Function->getName().str().c_str());
+
+ llvm::DataLayout DL(Module);
+
+ // TBAA Metadata
+ llvm::MDNode *TBAARenderScriptDistinct, *TBAARenderScript, *TBAAAllocation;
+ llvm::MDBuilder MDHelper(*Context);
+
+ TBAARenderScriptDistinct =
+ MDHelper.createTBAARoot(kRenderScriptTBAARootName);
+ TBAARenderScript = MDHelper.createTBAANode(kRenderScriptTBAANodeName,
+ TBAARenderScriptDistinct);
+ TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation",
+ TBAARenderScript);
+ TBAAAllocation = MDHelper.createTBAAStructTagNode(TBAAAllocation,
+ TBAAAllocation, 0);
+
+ llvm::Function *ExpandedFunction =
+ createEmptyExpandedReduceKernel(Function->getName());
+
+ // Extract the expanded kernel's parameters. It is guaranteed by
+ // createEmptyExpandedReduceKernel that there will be 3 parameters.
+ auto ExpandedFunctionArgIter = ExpandedFunction->arg_begin();
+
+ llvm::Value *Arg_inBuf = &*(ExpandedFunctionArgIter++);
+ llvm::Value *Arg_outBuf = &*(ExpandedFunctionArgIter++);
+ llvm::Value *Arg_len = &*(ExpandedFunctionArgIter++);
+
+ bccAssert(Function->arg_size() == 2 || Function->arg_size() == 3);
+
+ // Check if, instead of returning a value, the original kernel has
+ // a pointer parameter which points to a temporary buffer into
+ // which the return value gets written.
+ const bool ReturnValuePointerStyle = (Function->arg_size() == 3);
+ bccAssert(Function->getReturnType()->isVoidTy() == ReturnValuePointerStyle);
+
+ // Check if, instead of being passed by value, the inputs to the
+ // original kernel are passed by pointer.
+ auto FirstArgIter = Function->arg_begin();
+ // The second argument is always an input to the original kernel.
+ auto SecondArgIter = std::next(FirstArgIter);
+ const bool InputsPointerStyle = SecondArgIter->getType()->isPointerTy();
+
+ // Get the output type (i.e. return type of the original kernel).
+ llvm::PointerType *OutPtrTy = nullptr;
+ llvm::Type *OutTy = nullptr;
+ if (ReturnValuePointerStyle) {
+ OutPtrTy = llvm::dyn_cast<llvm::PointerType>(FirstArgIter->getType());
+ bccAssert(OutPtrTy && "Expected a pointer parameter to kernel");
+ OutTy = OutPtrTy->getElementType();
+ } else {
+ OutTy = Function->getReturnType();
+ bccAssert(!OutTy->isVoidTy());
+ OutPtrTy = OutTy->getPointerTo();
+ }
+
+ // Get the input type (type of the arguments to the original
+ // kernel). Some input types are different from the output type,
+ // due to explicit coercion that the compiler performs when
+ // lowering the parameters. See "Note on coercion" above.
+ llvm::PointerType *InPtrTy;
+ llvm::Type *InTy;
+ if (InputsPointerStyle) {
+ InPtrTy = llvm::dyn_cast<llvm::PointerType>(SecondArgIter->getType());
+ bccAssert(InPtrTy && "Expected a pointer parameter to kernel");
+ // Pointer-style inputs are only expected together with a
+ // pointer-style return value.
+ bccAssert(ReturnValuePointerStyle);
+ bccAssert(std::next(SecondArgIter)->getType() == InPtrTy &&
+ "Input type mismatch");
+ InTy = InPtrTy->getElementType();
+ } else {
+ InTy = SecondArgIter->getType();
+ InPtrTy = InTy->getPointerTo();
+ // Both inputs must have the same type; which parameter holds the
+ // other input depends on whether a return slot comes first.
+ if (!ReturnValuePointerStyle) {
+ bccAssert(InTy == FirstArgIter->getType() && "Input type mismatch");
+ } else {
+ bccAssert(InTy == std::next(SecondArgIter)->getType() &&
+ "Input type mismatch");
+ }
+ }
+
+ // The input type should take up the same amount of space in
+ // memory as the output type.
+ bccAssert(DL.getTypeAllocSize(InTy) == DL.getTypeAllocSize(OutTy));
+
+ // Construct the actual function body.
+ llvm::IRBuilder<> Builder(ExpandedFunction->getEntryBlock().begin());
+
+ // Cast input and output buffers to appropriate types.
+ llvm::Value *InBuf = Builder.CreatePointerCast(Arg_inBuf, InPtrTy);
+ llvm::Value *OutBuf = Builder.CreatePointerCast(Arg_outBuf, OutPtrTy);
+
+ // Create a slot to pass temporary results back. This needs to be
+ // separate from the accumulator slot because the kernel may mark
+ // the return value slot as noalias.
+ llvm::Value *ReturnBuf = nullptr;
+ if (ReturnValuePointerStyle) {
+ ReturnBuf = Builder.CreateAlloca(OutTy, nullptr, "ret.tmp");
+ }
+
+ // Create a slot to hold the second input if the inputs are passed
+ // by pointer to the original kernel. We cannot directly pass a
+ // pointer to the input buffer, because the kernel may modify its
+ // inputs.
+ llvm::Value *SecondInputTempBuf = nullptr;
+ if (InputsPointerStyle) {
+ SecondInputTempBuf = Builder.CreateAlloca(InTy, nullptr, "in.tmp");
+ }
+
+ // Create a slot to accumulate temporary results, and fill it with
+ // the first value.
+ // NOTE(review): the first element is loaded unconditionally, so this
+ // presumably relies on len >= 1 -- confirm against the caller.
+ llvm::Value *AccumBuf = Builder.CreateAlloca(OutTy, nullptr, "accum");
+ // Cast to OutPtrTy before loading, since AccumBuf has type OutPtrTy.
+ llvm::LoadInst *FirstElementLoad = Builder.CreateLoad(
+ Builder.CreatePointerCast(InBuf, OutPtrTy));
+ if (gEnableRsTbaa) {
+ FirstElementLoad->setMetadata("tbaa", TBAAAllocation);
+ }
+ // Memory operations with AccumBuf shouldn't be marked with
+ // RenderScript TBAA, since this might conflict with TBAA metadata
+ // in the kernel function when AccumBuf is passed by pointer.
+ Builder.CreateStore(FirstElementLoad, AccumBuf);
+
+ // Loop body
+
+ // Create the loop structure. Note that the first input in the input buffer
+ // has already been accumulated, so that we start at index 1.
+ llvm::PHINode *IndVar;
+ llvm::Value *Start = llvm::ConstantInt::get(Arg_len->getType(), 1);
+ llvm::BasicBlock *Exit = createLoop(Builder, Start, Arg_len, &IndVar);
+
+ llvm::Value *InputPtr = Builder.CreateInBoundsGEP(InBuf, IndVar, "next_input.gep");
+
+ // Set up arguments and call the original (unexpanded) kernel.
+ //
+ // The original kernel can have at most 3 arguments, which is
+ // achieved when the signature looks like:
+ //
+ // define void @func(foo* %ret, bar %a, bar %b)
+ //
+ // (bar can be one of foo/foo.coerce/foo*).
+ llvm::SmallVector<llvm::Value *, 3> KernelArgs;
+
+ if (ReturnValuePointerStyle) {
+ KernelArgs.push_back(ReturnBuf);
+ }
+
+ if (InputsPointerStyle) {
+ bccAssert(ReturnValuePointerStyle);
+ // Because the return buffer is copied back into the
+ // accumulator, it's okay if the accumulator is overwritten.
+ KernelArgs.push_back(AccumBuf);
+
+ // Copy the next input into the stack temporary so the kernel never
+ // receives a pointer into the input buffer itself.
+ llvm::LoadInst *InputLoad = Builder.CreateLoad(InputPtr);
+ if (gEnableRsTbaa) {
+ InputLoad->setMetadata("tbaa", TBAAAllocation);
+ }
+ Builder.CreateStore(InputLoad, SecondInputTempBuf);
+
+ KernelArgs.push_back(SecondInputTempBuf);
+ } else {
+ // InPtrTy may be different from OutPtrTy (the type of
+ // AccumBuf), so first cast the accumulator buffer to the
+ // pointer type corresponding to the input argument type.
+ KernelArgs.push_back(
+ Builder.CreateLoad(Builder.CreatePointerCast(AccumBuf, InPtrTy)));
+
+ llvm::LoadInst *LoadedArg = Builder.CreateLoad(InputPtr);
+ if (gEnableRsTbaa) {
+ LoadedArg->setMetadata("tbaa", TBAAAllocation);
+ }
+ KernelArgs.push_back(LoadedArg);
+ }
+
+ llvm::Value *RetVal = Builder.CreateCall(Function, KernelArgs);
+
+ const uint64_t ElementSize = DL.getTypeStoreSize(OutTy);
+ const uint64_t ElementAlign = DL.getABITypeAlignment(OutTy);
+
+ // Store the output in the accumulator.
+ if (ReturnValuePointerStyle) {
+ Builder.CreateMemCpy(AccumBuf, ReturnBuf, ElementSize, ElementAlign);
+ } else {
+ Builder.CreateStore(RetVal, AccumBuf);
+ }
+
+ // Loop exit
+ Builder.SetInsertPoint(Exit, Exit->begin());
+
+ // Write the final accumulated value to the output buffer.
+ llvm::LoadInst *OutputLoad = Builder.CreateLoad(AccumBuf);
+ llvm::StoreInst *OutputStore = Builder.CreateStore(OutputLoad, OutBuf);
+ if (gEnableRsTbaa) {
+ OutputStore->setMetadata("tbaa", TBAAAllocation);
+ }
+
+ return true;
+ }
+
+ /// @brief Reports whether the script can obtain pointers into Allocations
+ ///
+ /// RenderScript TBAA may only be used to separate generic pointers from
+ /// from-allocation pointers when every load from and store to allocation
+ /// data is under our control and every run-time internal access carries
+ /// RenderScript TBAA metadata. This check looks at the two ways a script
+ /// can break that assumption: kernel parameters that point into an
+ /// Allocation, and calls to run-time library functions that hand out (or
+ /// are not yet annotated for) such pointers.
+ ///
+ /// @param Module The module being processed.
+ /// @return true if allocation pointers may be exposed (or a required
+ ///         run-time function is missing), false otherwise.
+ bool allocPointersExposed(llvm::Module &Module) {
+   // An old-style (non-kernel) forEach function can expose pointers to
+   // elements within allocations, so one such function defined in the
+   // module settles the question.
+   // TODO: Extend analysis to allow simple cases of old-style kernels.
+   for (size_t Idx = 0; Idx != mExportForEachCount; ++Idx) {
+     if (bcinfo::MetadataExtractor::hasForEachSignatureKernel(
+             mExportForEachSignatureList[Idx])) {
+       continue;
+     }
+     if (Module.getFunction(mExportForEachNameList[Idx])) {
+       return true;
+     }
+   }
+
+   // Library functions that expose a pointer to an Allocation or that are
+   // not yet annotated with RenderScript-specific tbaa information.
+   static const char *const ExposingFuncNames[] = {
+     // rsGetElementAt(...)
+     "_Z14rsGetElementAt13rs_allocationj",
+     "_Z14rsGetElementAt13rs_allocationjj",
+     "_Z14rsGetElementAt13rs_allocationjjj",
+
+     // rsSetElementAt()
+     "_Z14rsSetElementAt13rs_allocationPvj",
+     "_Z14rsSetElementAt13rs_allocationPvjj",
+     "_Z14rsSetElementAt13rs_allocationPvjjj",
+
+     // rsGetElementAtYuv_uchar_Y()
+     "_Z25rsGetElementAtYuv_uchar_Y13rs_allocationjj",
+
+     // rsGetElementAtYuv_uchar_U()
+     "_Z25rsGetElementAtYuv_uchar_U13rs_allocationjj",
+
+     // rsGetElementAtYuv_uchar_V()
+     "_Z25rsGetElementAtYuv_uchar_V13rs_allocationjj",
+   };
+
+   for (const char *FuncName : ExposingFuncNames) {
+     llvm::Function *Callee = Module.getFunction(FuncName);
+
+     // A missing declaration is treated as "exposed".
+     if (Callee == nullptr) {
+       ALOGE("Missing run-time function '%s'", FuncName);
+       return true;
+     }
+
+     // Any use of one of these functions exposes allocation data.
+     if (Callee->getNumUses() != 0) {
+       return true;
+     }
+   }
+
+   return false;
+ }
+
+ /// @brief Connect RenderScript TBAA metadata to C/C++ metadata
+ ///
+ /// The TBAA metadata used to annotate loads/stores from RenderScript
+ /// Allocations is generated in a separate TBAA tree whose root is named
+ /// kRenderScriptTBAARootName. LLVM assumes may-alias for nodes that live
+ /// in unrelated TBAA trees. This function re-parents the
+ /// kRenderScriptTBAANodeName node (created as a child of that distinct
+ /// root) under the "Simple C/C++ TBAA" root, making it a sibling of the
+ /// normal C/C++ types. With the trees connected, TBAA can tell accesses
+ /// to an Allocation apart from normal C/C++ accesses (sibling subtrees
+ /// under a common root do not alias).
+ ///
+ /// The node names come from the same constants the expansion functions
+ /// use. Metadata nodes are uniqued by content, so re-creating them here
+ /// only yields the nodes already attached to the generated loads/stores
+ /// when the names match exactly.
+ ///
+ /// @param Module Unused; the metadata is looked up through Context.
+ void connectRenderScriptTBAAMetadata(llvm::Module &Module) {
+   llvm::MDBuilder MDHelper(*Context);
+
+   llvm::MDNode *TBAARenderScriptDistinct =
+       MDHelper.createTBAARoot(kRenderScriptTBAARootName);
+   llvm::MDNode *TBAARenderScript = MDHelper.createTBAANode(
+       kRenderScriptTBAANodeName, TBAARenderScriptDistinct);
+   llvm::MDNode *TBAARoot = MDHelper.createTBAARoot("Simple C/C++ TBAA");
+
+   // Operand 1 of the node is its parent; point it at the C/C++ root.
+   TBAARenderScript->replaceOperandWith(1, TBAARoot);
+ }
+
+ /// @brief Pass entry point: expand every exported forEach and reduce kernel
+ ///
+ /// Reads the export lists from the module metadata, expands each exported
+ /// function that is defined in the module, and finally connects the
+ /// RenderScript TBAA tree when TBAA is enabled and no allocation pointers
+ /// are exposed.
+ ///
+ /// @param Module The module to transform.
+ /// @return true if any kernel was expanded; false otherwise, including
+ ///         when the metadata could not be extracted.
+ virtual bool runOnModule(llvm::Module &Module) {
+   this->Module = &Module;
+   Context = &Module.getContext();
+
+   buildTypes();
+
+   bcinfo::MetadataExtractor Extractor(&Module);
+   if (!Extractor.extract()) {
+     ALOGE("Could not extract metadata from module!");
+     return false;
+   }
+
+   bool Modified = false;
+
+   // Expand forEach_* style kernels.
+   mExportForEachCount = Extractor.getExportForEachSignatureCount();
+   mExportForEachNameList = Extractor.getExportForEachNameList();
+   mExportForEachSignatureList = Extractor.getExportForEachSignatureList();
+
+   for (size_t Idx = 0; Idx < mExportForEachCount; ++Idx) {
+     llvm::Function *ForEachFn = Module.getFunction(mExportForEachNameList[Idx]);
+     if (!ForEachFn) {
+       continue;
+     }
+
+     const uint32_t Sig = mExportForEachSignatureList[Idx];
+     const bool IsNewStyle =
+         bcinfo::MetadataExtractor::hasForEachSignatureKernel(Sig);
+
+     if (!IsNewStyle && !ForEachFn->getReturnType()->isVoidTy()) {
+       // There are some graphics root functions that are not
+       // expanded, but that will be called directly. For those
+       // functions, we can not set the linkage to internal.
+       continue;
+     }
+
+     if (IsNewStyle) {
+       Modified |= ExpandForEach(ForEachFn, Sig);
+     } else {
+       Modified |= ExpandOldStyleForEach(ForEachFn, Sig);
+     }
+     ForEachFn->setLinkage(llvm::GlobalValue::InternalLinkage);
+   }
+
+   // Expand reduce_* style kernels.
+   mExportReduceCount = Extractor.getExportReduceCount();
+   mExportReduceNameList = Extractor.getExportReduceNameList();
+
+   for (size_t Idx = 0; Idx < mExportReduceCount; ++Idx) {
+     if (llvm::Function *ReduceFn =
+             Module.getFunction(mExportReduceNameList[Idx])) {
+       Modified |= ExpandReduce(ReduceFn);
+     }
+   }
+
+   if (gEnableRsTbaa && !allocPointersExposed(Module)) {
+     connectRenderScriptTBAAMetadata(Module);
+   }
+
+   return Modified;
+ }
+
+ /// @brief Human-readable name of this pass.
+ virtual const char *getPassName() const {
+   static const char PassName[] = "forEach_* and reduce_* function expansion";
+   return PassName;
+ }
+
+}; // end RSKernelExpandPass
+
+} // end anonymous namespace
+
+// Definition of the pass's static ID member.
+char RSKernelExpandPass::ID = 0;
+// Registers the pass under the command-line name "kernelexp" (the lit tests
+// run it as `opt -load libbcc.so -kernelexp`).
+static llvm::RegisterPass<RSKernelExpandPass> X("kernelexp", "Kernel Expand Pass");
+
+namespace bcc {
+
+// Factory for the kernel expansion pass. Returns a newly allocated
+// RSKernelExpandPass built with pEnableStepOpt.
+llvm::ModulePass *createRSKernelExpandPass(bool pEnableStepOpt) {
+  llvm::ModulePass *ExpandPass = new RSKernelExpandPass(pEnableStepOpt);
+  return ExpandPass;
+}
+
+} // end namespace bcc
diff --git a/lib/Renderscript/RSScript.cpp b/lib/Renderscript/RSScript.cpp
index 90261ed..e04c35c 100644
--- a/lib/Renderscript/RSScript.cpp
+++ b/lib/Renderscript/RSScript.cpp
@@ -19,6 +19,7 @@
#include "bcc/Assert.h"
#include "bcc/Source.h"
#include "bcc/Support/Log.h"
+#include "bcc/Support/CompilerConfig.h"
using namespace bcc;
@@ -54,6 +55,20 @@
mEmbedInfo(false), mEmbedGlobalInfo(false),
mEmbedGlobalInfoSkipConstant(false) { }
+// Builds a script from pSource (delegating to the single-argument
+// constructor) and then derives mOptimizationLevel from the code generation
+// level configured in pCompilerConfig.
+RSScript::RSScript(Source &pSource, const CompilerConfig * pCompilerConfig): RSScript(pSource)
+{
+  const auto CodeGenLevel = pCompilerConfig->getOptimizationLevel();
+
+  if (CodeGenLevel == llvm::CodeGenOpt::None) {
+    mOptimizationLevel = kOptLvl0;
+  } else if (CodeGenLevel == llvm::CodeGenOpt::Less) {
+    mOptimizationLevel = kOptLvl1;
+  } else if (CodeGenLevel == llvm::CodeGenOpt::Default) {
+    mOptimizationLevel = kOptLvl2;
+  } else {
+    // llvm::CodeGenOpt::Aggressive and any other value.
+    mOptimizationLevel = kOptLvl3;
+  }
+}
+
bool RSScript::doReset() {
mCompilerVersion = 0;
mOptimizationLevel = kOptLvl3;
diff --git a/lib/Renderscript/RSStubsWhiteList.cpp b/lib/Renderscript/RSStubsWhiteList.cpp
index b69681d..426fb43 100644
--- a/lib/Renderscript/RSStubsWhiteList.cpp
+++ b/lib/Renderscript/RSStubsWhiteList.cpp
@@ -1235,6 +1235,7 @@
"_Z3madfff",
"_Z3maxDv2_cS_",
"_Z3maxDv2_fS_",
+"_Z3maxDv2_ff",
"_Z3maxDv2_hS_",
"_Z3maxDv2_iS_",
"_Z3maxDv2_jS_",
@@ -1244,6 +1245,7 @@
"_Z3maxDv2_tS_",
"_Z3maxDv3_cS_",
"_Z3maxDv3_fS_",
+"_Z3maxDv3_ff",
"_Z3maxDv3_hS_",
"_Z3maxDv3_iS_",
"_Z3maxDv3_jS_",
@@ -1253,6 +1255,7 @@
"_Z3maxDv3_tS_",
"_Z3maxDv4_cS_",
"_Z3maxDv4_fS_",
+"_Z3maxDv4_ff",
"_Z3maxDv4_hS_",
"_Z3maxDv4_iS_",
"_Z3maxDv4_jS_",
@@ -1271,6 +1274,7 @@
"_Z3maxtt",
"_Z3minDv2_cS_",
"_Z3minDv2_fS_",
+"_Z3minDv2_ff",
"_Z3minDv2_hS_",
"_Z3minDv2_iS_",
"_Z3minDv2_jS_",
@@ -1280,6 +1284,7 @@
"_Z3minDv2_tS_",
"_Z3minDv3_cS_",
"_Z3minDv3_fS_",
+"_Z3minDv3_ff",
"_Z3minDv3_hS_",
"_Z3minDv3_iS_",
"_Z3minDv3_jS_",
@@ -1289,6 +1294,7 @@
"_Z3minDv3_tS_",
"_Z3minDv4_cS_",
"_Z3minDv4_fS_",
+"_Z3minDv4_ff",
"_Z3minDv4_hS_",
"_Z3minDv4_iS_",
"_Z3minDv4_jS_",
diff --git a/lib/Support/CompilerConfig.cpp b/lib/Support/CompilerConfig.cpp
index eac26aa..71cd7cc 100644
--- a/lib/Support/CompilerConfig.cpp
+++ b/lib/Support/CompilerConfig.cpp
@@ -155,7 +155,9 @@
#if defined(TARGET_BUILD)
if (!getProperty("debug.rs.arm-no-tune-for-cpu")) {
#ifndef FORCE_CPU_VARIANT_32
+#ifdef DEFAULT_ARM_CODEGEN
setCPU(llvm::sys::getHostCPUName());
+#endif
#else
#define XSTR(S) #S
#define STR(S) XSTR(S)
@@ -175,7 +177,9 @@
#if defined(TARGET_BUILD)
if (!getProperty("debug.rs.arm-no-tune-for-cpu")) {
#ifndef FORCE_CPU_VARIANT_64
+#ifdef DEFAULT_ARM64_CODEGEN
setCPU(llvm::sys::getHostCPUName());
+#endif
#else
#define XSTR(S) #S
#define STR(S) XSTR(S)
diff --git a/libbcc-host-build.mk b/libbcc-host-build.mk
index 3a8839f..22f0f72 100644
--- a/libbcc-host-build.mk
+++ b/libbcc-host-build.mk
@@ -26,14 +26,6 @@
$(RS_VERSION_DEFINE) \
$(LOCAL_CFLAGS)
-ifneq ($(BOARD_OVERRIDE_RS_CPU_VARIANT_32),)
-LOCAL_CFLAGS += -DFORCE_CPU_VARIANT_32=$(BOARD_OVERRIDE_RS_CPU_VARIANT_32)
-endif
-
-ifneq ($(BOARD_OVERRIDE_RS_CPU_VARIANT_64),)
-LOCAL_CFLAGS += -DFORCE_CPU_VARIANT_64=$(BOARD_OVERRIDE_RS_CPU_VARIANT_64)
-endif
-
ifeq ($(TARGET_BUILD_VARIANT),eng)
libbcc_CFLAGS += -DANDROID_ENGINEERING_BUILD
else
diff --git a/llvm-loadable-libbcc.mk b/llvm-loadable-libbcc.mk
new file mode 100644
index 0000000..7675167
--- /dev/null
+++ b/llvm-loadable-libbcc.mk
@@ -0,0 +1,31 @@
+#
+# Copyright (C) 2015 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Checks whether libbcc can be built as an LLVM loadable module on the
+# host.
+#
+# Sets CAN_BUILD_HOST_LLVM_LOADABLE_MODULE to "true" or "false". The value
+# starts out as "true" and is turned off by each of the checks below.
+CAN_BUILD_HOST_LLVM_LOADABLE_MODULE := true
+
+# Not when building with MinGW.
+ifdef USE_MINGW
+CAN_BUILD_HOST_LLVM_LOADABLE_MODULE := false
+endif
+
+# Not on Darwin hosts.
+ifeq ($(HOST_OS),darwin)
+CAN_BUILD_HOST_LLVM_LOADABLE_MODULE := false
+endif
+
+# Only when the build explicitly sets FORCE_BUILD_LLVM_COMPONENTS=true.
+ifneq ($(FORCE_BUILD_LLVM_COMPONENTS),true)
+CAN_BUILD_HOST_LLVM_LOADABLE_MODULE := false
+endif
diff --git a/tests/README.lit b/tests/README.lit
new file mode 100644
index 0000000..16fa305
--- /dev/null
+++ b/tests/README.lit
@@ -0,0 +1,7 @@
+To run the libbcc lit tests:
+ * Ensure `llvm-rs-as` is built, either by doing a top-level `make
+ checkbuild` or by doing `mm` from frameworks/compile/slang.
+ * Ensure that LLVM and libbcc are built with
+ `FORCE_BUILD_LLVM_COMPONENTS=true`.
+ * Ensure `opt` is built from external/llvm, either by top-level `make
+ checkbuild` or by doing `mm` from external/llvm.
diff --git a/tests/libbcc/getelementptr.ll b/tests/libbcc/getelementptr.ll
new file mode 100644
index 0000000..1cf201a
--- /dev/null
+++ b/tests/libbcc/getelementptr.ll
@@ -0,0 +1,70 @@
+; This checks that RSForEachExpand generates getelementptr
+; instructions into the driver info structure as expected - namely,
+; that they index into the right positions of the structure and that
+; the instructions that are generated are in the loop header.
+
+; RUN: opt -load libbcc.so -kernelexp -S < %s | FileCheck %s
+
+; ModuleID = 'test_getelementptr.bc'
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-linux-gnueabi"
+
+; Declarations expected by the expansion pass.
+declare void @_Z14rsGetElementAt13rs_allocationj()
+declare void @_Z14rsGetElementAt13rs_allocationjj()
+declare void @_Z14rsGetElementAt13rs_allocationjjj()
+declare void @_Z14rsSetElementAt13rs_allocationPvj()
+declare void @_Z14rsSetElementAt13rs_allocationPvjj()
+declare void @_Z14rsSetElementAt13rs_allocationPvjjj()
+declare void @_Z25rsGetElementAtYuv_uchar_Y13rs_allocationjj()
+declare void @_Z25rsGetElementAtYuv_uchar_U13rs_allocationjj()
+declare void @_Z25rsGetElementAtYuv_uchar_V13rs_allocationjj()
+
+; Old-style kernel
+define void @root(i32* nocapture %ain, i32* nocapture %out, i32 %x, i32 %y, i32 %z) {
+ ret void
+; CHECK: define void @root.expand(%RsExpandKernelDriverInfoPfx* %p, i32 %x1, i32 %x2, i32 %outstep)
+; CHECK: Begin:
+; CHECK: %instep_addr.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 1, i32 0
+; CHECK: load i32, i32* %instep_addr.gep
+; CHECK: %input_buf.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 0, i32 0
+; CHECK: load i8*, i8** %input_buf.gep
+; CHECK: %out_buf.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 3, i32 0
+; CHECK: load i8*, i8** %out_buf.gep
+; CHECK: %Y.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 7, i32 1
+; CHECK: load i32, i32* %Y.gep
+; CHECK: %Z.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 7, i32 2
+; CHECK: load i32, i32* %Z.gep
+; CHECK: Loop:
+}
+
+; New style kernel with multiple inputs
+define i32 @foo(i32 %in0, i32 %in1, i32 %x, i32 %y, i32 %z) {
+ ret i32 0
+; CHECK: define void @foo.expand(%RsExpandKernelDriverInfoPfx* %p, i32 %x1, i32 %x2, i32 %arg_outstep)
+; CHECK: Begin:
+; CHECK: %out_buf.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 3, i32 0
+; CHECK: load i8*, i8** %out_buf.gep
+; CHECK: %Y.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 7, i32 1
+; CHECK: load i32, i32* %Y.gep
+; CHECK: %Z.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 7, i32 2
+; CHECK: load i32, i32* %Z.gep
+; CHECK: %input_buf.gep = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 0, i32 0
+; CHECK: load i8*, i8** %input_buf.gep
+; CHECK: %input_buf.gep1 = getelementptr inbounds %RsExpandKernelDriverInfoPfx, %RsExpandKernelDriverInfoPfx* %p, i32 0, i32 0, i32 1
+; CHECK: load i8*, i8** %input_buf.gep1
+; CHECK: Loop:
+}
+
+!llvm.ident = !{!0}
+!\23pragma = !{!1, !2}
+!\23rs_export_foreach_name = !{!3, !4}
+!\23rs_export_foreach = !{!5, !6}
+
+!0 = !{!"clang version 3.6 "}
+!1 = !{!"version", !"1"}
+!2 = !{!"java_package_name", !"foo"}
+!3 = !{!"root"}
+!4 = !{!"foo"}
+!5 = !{!"91"}
+!6 = !{!"123"}
diff --git a/tests/libbcc/lit.cfg b/tests/libbcc/lit.cfg
new file mode 100644
index 0000000..109a9d7
--- /dev/null
+++ b/tests/libbcc/lit.cfg
@@ -0,0 +1,49 @@
+# -*- Python -*-
+
+# Configuration file for the 'lit' test runner.
+
+import re
+
+# name: The name of this test suite.
+config.name = 'libbcc'
+
+# suffixes: A list of file extensions to treat as test files.
+config.suffixes = ['.ll']
+
+# testFormat: The test format to use to interpret tests.
+import lit.formats
+config.test_format = lit.formats.ShTest()
+
+ANDROID_HOST_OUT = os.getenv("ANDROID_HOST_OUT")
+ANDROID_PRODUCT_OUT = os.getenv("ANDROID_PRODUCT_OUT")
+
+if not ANDROID_HOST_OUT or not ANDROID_PRODUCT_OUT:
+ import sys
+ sys.exit(1)
+
+# test_source_root: The path where tests are located (default is the test suite
+# root).
+config.test_source_root = None
+config.test_exec_root = os.path.join(ANDROID_HOST_OUT, 'tests', 'libbcc')
+
+tools_dir = os.pathsep.join([os.path.join(ANDROID_HOST_OUT, 'bin'),
+ os.path.join(ANDROID_HOST_OUT, 'lib64')])
+
+# Based on LLVM's lit.cfg: "For each occurrence of an llvm tool name
+# as its own word, replace it with the full path to the build directory
+# holding that tool."
+for pattern in [r"\bFileCheck\b",
+ r"\bllvm-rs-as\b",
+ r"\bbcinfo\b",
+ r"\bopt\b",
+ r"\blibbcc.so\b"]:
+ tool_match = re.match(r"^(\\)?((\| )?)\W+b([\.0-9A-Za-z-_]+)\\b\W*$",
+ pattern)
+ tool_pipe = tool_match.group(2)
+ tool_name = tool_match.group(4)
+ import lit.util
+ tool_path = lit.util.which(tool_name, tools_dir)
+ if not tool_path:
+ lit_config.note("Did not find " + tool_name + " in " + tools_dir)
+ tool_path = os.path.join(tools_dir, tool_name)
+ config.substitutions.append((pattern, tool_pipe + tool_path))
diff --git a/tests/libbcc/tbaa-through-alloca.ll b/tests/libbcc/tbaa-through-alloca.ll
new file mode 100644
index 0000000..5b0a270
--- /dev/null
+++ b/tests/libbcc/tbaa-through-alloca.ll
@@ -0,0 +1,71 @@
+; This test checks that the code doesn't aggressively apply TBAA metadata to temporaries that
+; are passed by pointer to kernels: the load in @add1_int5 must stay MayAlias with the store to %input_struct_slot.
+
+; RUN: opt -load libbcc.so -kernelexp -inline -tbaa -aa-eval -print-may-aliases -evaluate-aa-metadata < %s -S -o - 2>&1 | FileCheck %s
+
+; ModuleID = 'kernel.bc'
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-linux-gnueabi"
+
+; Declarations expected by the expansion pass.
+declare void @_Z14rsGetElementAt13rs_allocationj()
+declare void @_Z14rsGetElementAt13rs_allocationjj()
+declare void @_Z14rsGetElementAt13rs_allocationjjj()
+declare void @_Z14rsSetElementAt13rs_allocationPvj()
+declare void @_Z14rsSetElementAt13rs_allocationPvjj()
+declare void @_Z14rsSetElementAt13rs_allocationPvjjj()
+declare void @_Z25rsGetElementAtYuv_uchar_Y13rs_allocationjj()
+declare void @_Z25rsGetElementAtYuv_uchar_U13rs_allocationjj()
+declare void @_Z25rsGetElementAtYuv_uchar_V13rs_allocationjj()
+
+%struct.int5.0 = type { [5 x i32] }
+
+; Function Attrs: nounwind. Adds 1 to each of the five i32 elements of %in in place, then copies %in to %agg.result.
+define void @add1_int5(%struct.int5.0* noalias nocapture sret %agg.result, %struct.int5.0* nocapture %in) #0 {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds %struct.int5.0, %struct.int5.0* %in, i64 0, i32 0, i64 %indvars.iv
+; CHECK: MayAlias: %load_from_input{{.*}} <-> store %struct.int5.0 %input, %struct.int5.0* %input_struct_slot
+ %load_from_input = load i32, i32* %2, align 4, !tbaa !9
+ %3 = add nsw i32 %load_from_input, 1
+ store i32 %3, i32* %2, align 4, !tbaa !9
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 5
+ br i1 %exitcond, label %4, label %1
+
+; <label>:4 ; preds = %1
+ %5 = bitcast %struct.int5.0* %agg.result to i8*
+ %6 = bitcast %struct.int5.0* %in to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %5, i8* %6, i64 20, i32 4, i1 false), !tbaa.struct !13
+ ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #0
+
+attributes #0 = { nounwind }
+
+!llvm.ident = !{!0}
+!\23pragma = !{!1, !2}
+!\23rs_export_foreach_name = !{!3, !4}
+!\23rs_export_foreach = !{!5, !6}
+!\23rs_export_type = !{!7}
+!\25int5 = !{!8}
+
+!0 = !{!"clang version 3.6 "}
+!1 = !{!"version", !"1"}
+!2 = !{!"java_package_name", !"foo"}
+!3 = !{!"root"}
+!4 = !{!"add1_int5"}
+!5 = !{!"0"}
+!6 = !{!"35"}
+!7 = !{!"int5"}
+!8 = !{!"data", !"<ConstantArray>"}
+!9 = !{!10, !10, i64 0}
+!10 = !{!"int", !11, i64 0}
+!11 = !{!"omnipotent char", !12, i64 0}
+!12 = !{!"Simple C/C++ TBAA"}
+!13 = !{i64 0, i64 20, !14}
+!14 = !{!11, !11, i64 0}
diff --git a/tests/libbcc/tbaa.ll b/tests/libbcc/tbaa.ll
new file mode 100644
index 0000000..6d8cb48
--- /dev/null
+++ b/tests/libbcc/tbaa.ll
@@ -0,0 +1,43 @@
+; Basic test of TBAA that should report that pointer loads (from %out_buf.gep and
+; %input_buf.gep) do not alias with stores to allocations (here, the store of %call.result).
+
+; RUN: opt -load libbcc.so -kernelexp -tbaa -aa-eval -print-no-aliases -evaluate-aa-metadata < %s -S -o - 2>&1 | FileCheck %s
+
+; ModuleID = 'kernel.bc'
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-linux-gnueabi"
+
+; Declarations expected by the expansion pass.
+declare void @_Z14rsGetElementAt13rs_allocationj()
+declare void @_Z14rsGetElementAt13rs_allocationjj()
+declare void @_Z14rsGetElementAt13rs_allocationjjj()
+declare void @_Z14rsSetElementAt13rs_allocationPvj()
+declare void @_Z14rsSetElementAt13rs_allocationPvjj()
+declare void @_Z14rsSetElementAt13rs_allocationPvjjj()
+declare void @_Z25rsGetElementAtYuv_uchar_Y13rs_allocationjj()
+declare void @_Z25rsGetElementAtYuv_uchar_U13rs_allocationjj()
+declare void @_Z25rsGetElementAtYuv_uchar_V13rs_allocationjj()
+
+; CHECK: NoAlias: %0 = load {{.*}}, i8** %out_buf.gep, !tbaa {{.*}} <-> store i32 %call.result, i32* {{.*}}, !tbaa {{.*}}
+; CHECK: NoAlias: %input_buf = load i8*, i8** %input_buf.gep, !tbaa {{.*}} <-> store i32 %call.result, i32* {{.*}}, !tbaa {{.*}}
+
+; Function Attrs: nounwind readnone. Returns %in + 1; named in #rs_export_foreach_name below.
+define i32 @add1(i32 %in) #0 {
+ %1 = add nsw i32 %in, 1
+ ret i32 %1
+}
+
+attributes #0 = { nounwind readnone }
+
+!llvm.ident = !{!0}
+!\23pragma = !{!1, !2}
+!\23rs_export_foreach_name = !{!3, !4}
+!\23rs_export_foreach = !{!5, !6}
+
+!0 = !{!"clang version 3.6 "}
+!1 = !{!"version", !"1"}
+!2 = !{!"java_package_name", !"foo"}
+!3 = !{!"root"}
+!4 = !{!"add1"}
+!5 = !{!"0"}
+!6 = !{!"35"}
diff --git a/tests/libbcc/test_reduce_metadata.ll b/tests/libbcc/test_reduce_metadata.ll
new file mode 100644
index 0000000..aea8f36
--- /dev/null
+++ b/tests/libbcc/test_reduce_metadata.ll
@@ -0,0 +1,28 @@
+; Check that the #rs_export_reduce node is recognized: bcinfo must report one exported reduce function, add.
+
+; RUN: llvm-rs-as %s -o %t
+; RUN: bcinfo %t | FileCheck %s
+
+; CHECK: exportReduceCount: 1
+; CHECK: func[0]: add
+
+; ModuleID = 'reduce.bc'
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-linux-gnueabi"
+
+; Function Attrs: nounwind readnone. Returns %b + %a; named by the #rs_export_reduce node below.
+define i32 @add(i32 %a, i32 %b) #0 {
+ %1 = add nsw i32 %b, %a
+ ret i32 %1
+}
+
+attributes #0 = { nounwind readnone }
+
+!llvm.ident = !{!0}
+!\23pragma = !{!1, !2}
+!\23rs_export_reduce = !{!3}
+
+!0 = !{!"clang version 3.6 "}
+!1 = !{!"version", !"1"}
+!2 = !{!"java_package_name", !"com.android.rs.test"}
+!3 = !{!"add"}
diff --git a/tests/run-lit-tests.sh b/tests/run-lit-tests.sh
new file mode 100755
index 0000000..8976555
--- /dev/null
+++ b/tests/run-lit-tests.sh
@@ -0,0 +1,6 @@
+#!/bin/bash -e
+# Runs the libbcc lit tests; any extra arguments are forwarded to llvm-lit.
+LIT_PATH="${ANDROID_BUILD_TOP:?ANDROID_BUILD_TOP is not set}/frameworks/compile/libbcc/tests/debuginfo/llvm-lit"
+LIBBCC_TESTS="$ANDROID_BUILD_TOP/frameworks/compile/libbcc/tests/libbcc"
+# Quote every expansion so paths and arguments containing spaces stay intact.
+"$LIT_PATH" "$LIBBCC_TESTS" "$@"
diff --git a/tools/bcc/Main.cpp b/tools/bcc/Main.cpp
index 2832694..47dc60f 100644
--- a/tools/bcc/Main.cpp
+++ b/tools/bcc/Main.cpp
@@ -365,7 +365,14 @@
// into the .rs.info symbol.
Source *source = Source::CreateFromBuffer(context, OptInputFilenames[0].c_str(),
bitcode, bitcodeSize);
- RSScript *s = new (std::nothrow) RSScript(*source);
+
+ // If the bitcode fails verification in the bitcode loader, the returned Source is set to NULL.
+ if (!source) {
+ ALOGE("Failed to load source from file %s", OptInputFilenames[0].c_str());
+ return EXIT_FAILURE;
+ }
+
+ RSScript *s = new (std::nothrow) RSScript(*source, RSCD.getConfig());
if (s == nullptr) {
llvm::errs() << "Out of memory when creating script for file `"
<< OptInputFilenames[0] << "'!\n";