X86: Fix kernel Input/Output GetElementPtr offset issue
http://b/24142721
The disagreement between module and x86 target datalayout causes
mismatched size calculation of structs with 64-bit scalar fields.
This patch solves the issue by treating input and output pointers as
byte* pointers, indexing them with explicit byte offsets computed using
a modified data layout for x86, with alignment for 64-bit scalars set to
8-bytes, and casting the indexed pointers back to the real type.
Change-Id: Id99ea0e37f5cc9ad93cc6a5979db79ff11b3998c
Signed-off-by: Yong Chen <yong.a.chen@intel.com>
diff --git a/lib/Renderscript/RSKernelExpand.cpp b/lib/Renderscript/RSKernelExpand.cpp
index 7337a30..aa139f2 100644
--- a/lib/Renderscript/RSKernelExpand.cpp
+++ b/lib/Renderscript/RSKernelExpand.cpp
@@ -665,8 +665,9 @@
// ArgIter - iterator pointing to first input of the UNexpanded function
// NumInputs - number of inputs (NOT number of ARGUMENTS)
//
- // InBufPtrs[] - this function sets each array element to point to the first
- // cell of the corresponding input allocation
+ // InTypes[] - this function saves input type, they will be used in ExpandInputsBody().
+ // InBufPtrs[] - this function sets each array element to point to the first cell / byte
+ // (byte for x86, cell for other platforms) of the corresponding input allocation
// InStructTempSlots[] - this function sets each array element either to nullptr
// or to the result of an alloca (for the case where the
// calling convention dictates that a value must be passed
@@ -677,6 +678,7 @@
llvm::MDNode *TBAAPointer,
llvm::Function::arg_iterator ArgIter,
const size_t NumInputs,
+ llvm::SmallVectorImpl<llvm::Type *> &InTypes,
llvm::SmallVectorImpl<llvm::Value *> &InBufPtrs,
llvm::SmallVectorImpl<llvm::Value *> &InStructTempSlots) {
bccAssert(NumInputs <= RS_KERNEL_INPUT_LIMIT);
@@ -712,12 +714,25 @@
static_cast<int32_t>(InputIndex)}));
llvm::Value *InBufPtrAddr = Builder.CreateInBoundsGEP(Arg_p, InBufPtrGEP, "input_buf.gep");
llvm::LoadInst *InBufPtr = Builder.CreateLoad(InBufPtrAddr, "input_buf");
- llvm::Value *CastInBufPtr = Builder.CreatePointerCast(InBufPtr, InType, "casted_in");
+
+ llvm::Value *CastInBufPtr = nullptr;
+ if (Module->getTargetTriple() != DEFAULT_X86_TRIPLE_STRING) {
+ CastInBufPtr = Builder.CreatePointerCast(InBufPtr, InType, "casted_in");
+ } else {
+ // The disagreement between module and x86 target machine datalayout
+ // causes mismatched input/output data offset between slang reflected
+ // code and bcc codegen for GetElementPtr. To solve this issue, skip the
+ // cast to InType and leave CastInBufPtr as an int8_t*. The buffer is
+ // later indexed with an explicit byte offset computed based on
+ // X86_CUSTOM_DL_STRING and then bitcast it to actual input type.
+ CastInBufPtr = InBufPtr;
+ }
if (gEnableRsTbaa) {
InBufPtr->setMetadata("tbaa", TBAAPointer);
}
+ InTypes.push_back(InType);
InBufPtrs.push_back(CastInBufPtr);
}
@@ -732,6 +747,8 @@
// Arg_x1 - first X coordinate to be processed by the expanded function
// TBAAAllocation - metadata for marking loads of input values out of allocations
// NumInputs -- number of inputs (NOT number of ARGUMENTS)
+ // InTypes[] - this function uses the saved input types in ExpandInputsLoopInvariant()
+ // to convert the pointer of byte InPtr to its real type.
// InBufPtrs[] - this function consumes the information produced by ExpandInputsLoopInvariant()
// InStructTempSlots[] - this function consumes the information produced by ExpandInputsLoopInvariant()
// IndVar - value of loop induction variable (X coordinate) for a given loop iteration
@@ -742,16 +759,32 @@
llvm::Value *Arg_x1,
llvm::MDNode *TBAAAllocation,
const size_t NumInputs,
+ const llvm::SmallVectorImpl<llvm::Type *> &InTypes,
const llvm::SmallVectorImpl<llvm::Value *> &InBufPtrs,
const llvm::SmallVectorImpl<llvm::Value *> &InStructTempSlots,
llvm::Value *IndVar,
llvm::SmallVectorImpl<llvm::Value *> &RootArgs) {
llvm::Value *Offset = Builder.CreateSub(IndVar, Arg_x1);
+ llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context);
for (size_t Index = 0; Index < NumInputs; ++Index) {
- llvm::Value *InPtr = Builder.CreateInBoundsGEP(InBufPtrs[Index], Offset);
- llvm::Value *Input;
+ llvm::Value *InPtr = nullptr;
+ if (Module->getTargetTriple() != DEFAULT_X86_TRIPLE_STRING) {
+ InPtr = Builder.CreateInBoundsGEP(InBufPtrs[Index], Offset);
+ } else {
+ // Treat x86 input buffer as byte[], get indexed pointer with explicit
+ // byte offset computed using a datalayout based on
+ // X86_CUSTOM_DL_STRING, then bitcast it to actual input type.
+ llvm::DataLayout DL(X86_CUSTOM_DL_STRING);
+ llvm::Type *InTy = InTypes[Index];
+ uint64_t InStep = DL.getTypeAllocSize(InTy->getPointerElementType());
+ llvm::Value *OffsetInBytes = Builder.CreateMul(Offset, llvm::ConstantInt::get(Int32Ty, InStep));
+ InPtr = Builder.CreateInBoundsGEP(InBufPtrs[Index], OffsetInBytes);
+ InPtr = Builder.CreatePointerCast(InPtr, InTy);
+ }
+
+ llvm::Value *Input;
llvm::LoadInst *InputLoad = Builder.CreateLoad(InPtr, "input");
if (gEnableRsTbaa) {
@@ -793,6 +826,9 @@
}
llvm::DataLayout DL(Module);
+ if (Module->getTargetTriple() == DEFAULT_X86_TRIPLE_STRING) {
+ DL.reset(X86_CUSTOM_DL_STRING);
+ }
llvm::Function *ExpandedFunction =
createEmptyExpandedForEachKernel(Function->getName());
@@ -923,6 +959,10 @@
// TODO: Refactor this to share functionality with ExpandOldStyleForEach.
llvm::DataLayout DL(Module);
+ if (Module->getTargetTriple() == DEFAULT_X86_TRIPLE_STRING) {
+ DL.reset(X86_CUSTOM_DL_STRING);
+ }
+ llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context);
llvm::Function *ExpandedFunction =
createEmptyExpandedForEachKernel(Function->getName());
@@ -1003,9 +1043,20 @@
OutBasePtr->setMetadata("tbaa", TBAAPointer);
}
- CastedOutBasePtr = Builder.CreatePointerCast(OutBasePtr, OutTy, "casted_out");
+ if (Module->getTargetTriple() != DEFAULT_X86_TRIPLE_STRING) {
+ CastedOutBasePtr = Builder.CreatePointerCast(OutBasePtr, OutTy, "casted_out");
+ } else {
+ // The disagreement between module and x86 target machine datalayout
+ // causes mismatched input/output data offset between slang reflected
+ // code and bcc codegen for GetElementPtr. To solve this issue, skip the
+ // cast to OutTy and leave CastedOutBasePtr as an int8_t*. The buffer
+ // is later indexed with an explicit byte offset computed based on
+ // X86_CUSTOM_DL_STRING and then bitcast it to actual output type.
+ CastedOutBasePtr = OutBasePtr;
+ }
}
+ llvm::SmallVector<llvm::Type*, 8> InTypes;
llvm::SmallVector<llvm::Value*, 8> InBufPtrs;
llvm::SmallVector<llvm::Value*, 8> InStructTempSlots;
@@ -1030,7 +1081,7 @@
if (NumInPtrArguments > 0) {
ExpandInputsLoopInvariant(Builder, LoopHeader, Arg_p, TBAAPointer, ArgIter, NumInPtrArguments,
- InBufPtrs, InStructTempSlots);
+ InTypes, InBufPtrs, InStructTempSlots);
}
// Populate the actual call to kernel().
@@ -1043,7 +1094,18 @@
llvm::Value *OutPtr = nullptr;
if (CastedOutBasePtr) {
llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1);
- OutPtr = Builder.CreateInBoundsGEP(CastedOutBasePtr, OutOffset);
+
+ if (Module->getTargetTriple() != DEFAULT_X86_TRIPLE_STRING) {
+ OutPtr = Builder.CreateInBoundsGEP(CastedOutBasePtr, OutOffset);
+ } else {
+ // Treat x86 output buffer as byte[], get indexed pointer with explicit
+ // byte offset computed using a datalayout based on
+ // X86_CUSTOM_DL_STRING, then bitcast it to actual output type.
+ uint64_t OutStep = DL.getTypeAllocSize(OutTy->getPointerElementType());
+ llvm::Value *OutOffsetInBytes = Builder.CreateMul(OutOffset, llvm::ConstantInt::get(Int32Ty, OutStep));
+ OutPtr = Builder.CreateInBoundsGEP(CastedOutBasePtr, OutOffsetInBytes);
+ OutPtr = Builder.CreatePointerCast(OutPtr, OutTy);
+ }
if (PassOutByPointer) {
RootArgs.push_back(OutPtr);
@@ -1054,7 +1116,7 @@
if (NumInPtrArguments > 0) {
ExpandInputsBody(Builder, Arg_x1, TBAAAllocation, NumInPtrArguments,
- InBufPtrs, InStructTempSlots, IV, RootArgs);
+ InTypes, InBufPtrs, InStructTempSlots, IV, RootArgs);
}
finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *Function, Builder);
@@ -1141,6 +1203,9 @@
ALOGV("Expanding simple reduce kernel %s", Function->getName().str().c_str());
llvm::DataLayout DL(Module);
+ if (Module->getTargetTriple() == DEFAULT_X86_TRIPLE_STRING) {
+ DL.reset(X86_CUSTOM_DL_STRING);
+ }
// TBAA Metadata
llvm::MDNode *TBAARenderScriptDistinct, *TBAARenderScript, *TBAAAllocation;
@@ -1431,15 +1496,16 @@
ExpandSpecialArguments(Signature, IndVar, Arg_p, Builder, CalleeArgs,
[](){}, LoopHeader->getTerminator());
+ llvm::SmallVector<llvm::Type*, 8> InTypes;
llvm::SmallVector<llvm::Value*, 8> InBufPtrs;
llvm::SmallVector<llvm::Value*, 8> InStructTempSlots;
ExpandInputsLoopInvariant(Builder, LoopHeader, Arg_p, TBAAPointer, AccumulatorArgIter, NumInputs,
- InBufPtrs, InStructTempSlots);
+ InTypes, InBufPtrs, InStructTempSlots);
// Populate the actual call to the original accumulator.
llvm::SmallVector<llvm::Value*, 8> RootArgs;
RootArgs.push_back(Arg_accum);
- ExpandInputsBody(Builder, Arg_x1, TBAAAllocation, NumInputs, InBufPtrs, InStructTempSlots,
+ ExpandInputsBody(Builder, Arg_x1, TBAAAllocation, NumInputs, InTypes, InBufPtrs, InStructTempSlots,
IndVar, RootArgs);
finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *FnAccumulator, Builder);
Builder.CreateCall(FnAccumulator, RootArgs);