//===- subzero/src/IceTargetLoweringX8632.cpp - x86-32 lowering -----------===//
//
// The Subzero Code Generator
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the TargetLoweringX8632 class, which
// consists almost entirely of the lowering sequence for each
// high-level instruction.
//
//===----------------------------------------------------------------------===//
#include "llvm/Support/MathExtras.h"
#include "IceCfg.h"
#include "IceCfgNode.h"
#include "IceClFlags.h"
#include "IceDefs.h"
#include "IceELFObjectWriter.h"
#include "IceGlobalInits.h"
#include "IceInstX8632.h"
#include "IceLiveness.h"
#include "IceOperand.h"
#include "IceRegistersX8632.h"
#include "IceTargetLoweringX8632.def"
#include "IceTargetLoweringX8632.h"
#include "IceUtils.h"
namespace Ice {
namespace {
// The following table summarizes the logic for lowering the fcmp
// instruction. There is one table entry for each of the 16 conditions.
//
// The first four columns describe the case when the operands are
// floating point scalar values. A comment in lowerFcmp() describes the
// lowering template. In the most general case, there is a compare
// followed by two conditional branches, because some fcmp conditions
// don't map to a single x86 conditional branch. However, in many cases
// it is possible to swap the operands in the comparison and have a
// single conditional branch. Since it's quite tedious to validate the
// table by hand, good execution tests are helpful.
//
// The last two columns describe the case when the operands are vectors
// of floating point values. For most fcmp conditions, there is a clear
// mapping to a single x86 cmpps instruction variant. Some fcmp
// conditions require special code to handle and these are marked in the
// table with a Cmpps_Invalid predicate.
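//
// As an illustrative sketch only (the authoritative entries are the
// X-macro rows of FCMPX8632_TABLE in IceTargetLoweringX8632.def, and the
// exact template is documented in lowerFcmp()), the scalar lowering has
// roughly this shape:
//
//   mov dest, <Default>
//   ucomiss a, b      ; operands may be swapped per SwapScalarOperands
//   j<C1> label       ; omitted when C1 is Br_None
//   j<C2> label       ; omitted when C2 is Br_None
//   mov dest, <!Default>
// label: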
const struct TableFcmp_ {
uint32_t Default;
bool SwapScalarOperands;
CondX86::BrCond C1, C2;
bool SwapVectorOperands;
CondX86::CmppsCond Predicate;
} TableFcmp[] = {
#define X(val, dflt, swapS, C1, C2, swapV, pred) \
{ dflt, swapS, CondX86::C1, CondX86::C2, swapV, CondX86::pred } \
,
FCMPX8632_TABLE
#undef X
};
const size_t TableFcmpSize = llvm::array_lengthof(TableFcmp);
// The following table summarizes the logic for lowering the icmp instruction
// for i32 and narrower types. Each icmp condition has a clear mapping to an
// x86 conditional branch instruction.
const struct TableIcmp32_ {
CondX86::BrCond Mapping;
} TableIcmp32[] = {
#define X(val, C_32, C1_64, C2_64, C3_64) \
{ CondX86::C_32 } \
,
ICMPX8632_TABLE
#undef X
};
const size_t TableIcmp32Size = llvm::array_lengthof(TableIcmp32);
// The following table summarizes the logic for lowering the icmp instruction
// for the i64 type. For Eq and Ne, two separate 32-bit comparisons and
// conditional branches are needed. For the other conditions, three separate
// conditional branches are needed.
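//
// As an illustrative sketch only (the actual conditions C1/C2/C3 come
// from the X-macro rows of ICMPX8632_TABLE), a three-branch condition
// such as an unsigned 64-bit "less than" is lowered roughly as:
//
//   cmp a.hi, b.hi
//   j<C1> true        ; high words already prove the relation
//   j<C2> false       ; high words already disprove the relation
//   cmp a.lo, b.lo
//   j<C3> true        ; low words decide the remaining (equal-high) case
//   jmp false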
const struct TableIcmp64_ {
CondX86::BrCond C1, C2, C3;
} TableIcmp64[] = {
#define X(val, C_32, C1_64, C2_64, C3_64) \
{ CondX86::C1_64, CondX86::C2_64, CondX86::C3_64 } \
,
ICMPX8632_TABLE
#undef X
};
const size_t TableIcmp64Size = llvm::array_lengthof(TableIcmp64);
CondX86::BrCond getIcmp32Mapping(InstIcmp::ICond Cond) {
size_t Index = static_cast<size_t>(Cond);
assert(Index < TableIcmp32Size);
return TableIcmp32[Index].Mapping;
}
const struct TableTypeX8632Attributes_ {
Type InVectorElementType;
} TableTypeX8632Attributes[] = {
#define X(tag, elementty, cvt, sdss, pack, width, fld) \
{ elementty } \
,
ICETYPEX8632_TABLE
#undef X
};
const size_t TableTypeX8632AttributesSize =
llvm::array_lengthof(TableTypeX8632Attributes);
// Return the type which the elements of the vector have in the X86
// representation of the vector.
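// For example, a v4i1 mask vector is held in an XMM register as four
// 32-bit lanes, so its in-vector element type (per ICETYPEX8632_TABLE)
// is i32 rather than i1.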
Type getInVectorElementType(Type Ty) {
assert(isVectorType(Ty));
size_t Index = static_cast<size_t>(Ty);
(void)Index;
assert(Index < TableTypeX8632AttributesSize);
return TableTypeX8632Attributes[Ty].InVectorElementType;
}
// The maximum number of arguments to pass in XMM registers
const uint32_t X86_MAX_XMM_ARGS = 4;
// The number of bits in a byte
const uint32_t X86_CHAR_BIT = 8;
// Stack alignment
const uint32_t X86_STACK_ALIGNMENT_BYTES = 16;
// Size of the return address on the stack
const uint32_t X86_RET_IP_SIZE_BYTES = 4;
// The number of different NOP instructions
const uint32_t X86_NUM_NOP_VARIANTS = 5;
// Value is in bytes. Return Value rounded up to the next multiple of
// the stack alignment.
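// For example, with X86_STACK_ALIGNMENT_BYTES == 16,
// applyStackAlignment(20) returns 32 and applyStackAlignment(32)
// returns 32.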
uint32_t applyStackAlignment(uint32_t Value) {
return Utils::applyAlignment(Value, X86_STACK_ALIGNMENT_BYTES);
}
// In some cases, there are x-macros tables for both high-level and
// low-level instructions/operands that use the same enum key value.
// The tables are kept separate to maintain a proper separation
// between abstraction layers. There is a risk that the tables could
// get out of sync if enum values are reordered or if entries are
// added or deleted. The following dummy namespaces use
// static_asserts to ensure everything is kept in sync.
// Validate the enum values in FCMPX8632_TABLE.
namespace dummy1 {
// Define a temporary set of enum values based on low-level table
// entries.
enum _tmp_enum {
#define X(val, dflt, swapS, C1, C2, swapV, pred) _tmp_##val,
FCMPX8632_TABLE
#undef X
_num
};
// Define a set of constants based on high-level table entries.
#define X(tag, str) static const int _table1_##tag = InstFcmp::tag;
ICEINSTFCMP_TABLE
#undef X
// Define a set of constants based on low-level table entries, and
// ensure the table entry keys are consistent.
#define X(val, dflt, swapS, C1, C2, swapV, pred) \
static const int _table2_##val = _tmp_##val; \
static_assert( \
_table1_##val == _table2_##val, \
"Inconsistency between FCMPX8632_TABLE and ICEINSTFCMP_TABLE");
FCMPX8632_TABLE
#undef X
// Repeat the static asserts with respect to the high-level table
// entries in case the high-level table has extra entries.
#define X(tag, str) \
static_assert( \
_table1_##tag == _table2_##tag, \
"Inconsistency between FCMPX8632_TABLE and ICEINSTFCMP_TABLE");
ICEINSTFCMP_TABLE
#undef X
} // end of namespace dummy1
// Validate the enum values in ICMPX8632_TABLE.
namespace dummy2 {
// Define a temporary set of enum values based on low-level table
// entries.
enum _tmp_enum {
#define X(val, C_32, C1_64, C2_64, C3_64) _tmp_##val,
ICMPX8632_TABLE
#undef X
_num
};
// Define a set of constants based on high-level table entries.
#define X(tag, str) static const int _table1_##tag = InstIcmp::tag;
ICEINSTICMP_TABLE
#undef X
// Define a set of constants based on low-level table entries, and
// ensure the table entry keys are consistent.
#define X(val, C_32, C1_64, C2_64, C3_64) \
static const int _table2_##val = _tmp_##val; \
static_assert( \
_table1_##val == _table2_##val, \
"Inconsistency between ICMPX8632_TABLE and ICEINSTICMP_TABLE");
ICMPX8632_TABLE
#undef X
// Repeat the static asserts with respect to the high-level table
// entries in case the high-level table has extra entries.
#define X(tag, str) \
static_assert( \
_table1_##tag == _table2_##tag, \
"Inconsistency between ICMPX8632_TABLE and ICEINSTICMP_TABLE");
ICEINSTICMP_TABLE
#undef X
} // end of namespace dummy2
// Validate the enum values in ICETYPEX8632_TABLE.
namespace dummy3 {
// Define a temporary set of enum values based on low-level table
// entries.
enum _tmp_enum {
#define X(tag, elementty, cvt, sdss, pack, width, fld) _tmp_##tag,
ICETYPEX8632_TABLE
#undef X
_num
};
// Define a set of constants based on high-level table entries.
#define X(tag, size, align, elts, elty, str) \
static const int _table1_##tag = tag;
ICETYPE_TABLE
#undef X
// Define a set of constants based on low-level table entries, and
// ensure the table entry keys are consistent.
#define X(tag, elementty, cvt, sdss, pack, width, fld) \
static const int _table2_##tag = _tmp_##tag; \
static_assert(_table1_##tag == _table2_##tag, \
"Inconsistency between ICETYPEX8632_TABLE and ICETYPE_TABLE");
ICETYPEX8632_TABLE
#undef X
// Repeat the static asserts with respect to the high-level table
// entries in case the high-level table has extra entries.
#define X(tag, size, align, elts, elty, str) \
static_assert(_table1_##tag == _table2_##tag, \
"Inconsistency between ICETYPEX8632_TABLE and ICETYPE_TABLE");
ICETYPE_TABLE
#undef X
} // end of namespace dummy3
} // end of anonymous namespace
BoolFoldingEntry::BoolFoldingEntry(Inst *I)
: Instr(I), IsComplex(BoolFolding::hasComplexLowering(I)), IsLiveOut(true),
NumUses(0) {}
BoolFolding::BoolFoldingProducerKind
BoolFolding::getProducerKind(const Inst *Instr) {
if (llvm::isa<InstIcmp>(Instr)) {
if (Instr->getSrc(0)->getType() != IceType_i64)
return PK_Icmp32;
return PK_None; // TODO(stichnot): actually PK_Icmp64;
}
return PK_None; // TODO(stichnot): remove this
if (llvm::isa<InstFcmp>(Instr))
return PK_Fcmp;
if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
switch (Cast->getCastKind()) {
default:
return PK_None;
case InstCast::Trunc:
return PK_Trunc;
}
}
return PK_None;
}
BoolFolding::BoolFoldingConsumerKind
BoolFolding::getConsumerKind(const Inst *Instr) {
if (llvm::isa<InstBr>(Instr))
return CK_Br;
if (llvm::isa<InstSelect>(Instr))
return CK_Select;
return CK_None; // TODO(stichnot): remove this
if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
switch (Cast->getCastKind()) {
default:
return CK_None;
case InstCast::Sext:
return CK_Sext;
case InstCast::Zext:
return CK_Zext;
}
}
return CK_None;
}
// Returns true if the producing instruction has a "complex" lowering
// sequence. This generally means that its lowering sequence requires
// more than one conditional branch, namely 64-bit integer compares
// and some floating-point compares. When this is true and there is
// more than one consumer, we prefer to disable the folding
// optimization, since disabling it minimizes the total number of
// branches generated.
bool BoolFolding::hasComplexLowering(const Inst *Instr) {
switch (getProducerKind(Instr)) {
default:
return false;
case PK_Icmp64:
return true;
case PK_Fcmp:
return TableFcmp[llvm::cast<InstFcmp>(Instr)->getCondition()].C2 !=
CondX86::Br_None;
}
}
void BoolFolding::init(CfgNode *Node) {
Producers.clear();
for (Inst &Instr : Node->getInsts()) {
// Check whether Instr is a valid producer.
Variable *Var = Instr.getDest();
if (!Instr.isDeleted() // only consider non-deleted instructions
&& Var // only instructions with an actual dest var
&& Var->getType() == IceType_i1 // only bool-type dest vars
&& getProducerKind(&Instr) != PK_None) { // white-listed instructions
Producers[Var->getIndex()] = BoolFoldingEntry(&Instr);
}
// Check each src variable against the map.
for (SizeT I = 0; I < Instr.getSrcSize(); ++I) {
Operand *Src = Instr.getSrc(I);
SizeT NumVars = Src->getNumVars();
for (SizeT J = 0; J < NumVars; ++J) {
const Variable *Var = Src->getVar(J);
SizeT VarNum = Var->getIndex();
if (containsValid(VarNum)) {
if (I != 0 // All valid consumers use Var as the first source operand
|| getConsumerKind(&Instr) == CK_None // must be white-listed
|| (Producers[VarNum].IsComplex && // complex can't be multi-use
Producers[VarNum].NumUses > 0)) {
setInvalid(VarNum);
continue;
}
++Producers[VarNum].NumUses;
if (Instr.isLastUse(Var)) {
Producers[VarNum].IsLiveOut = false;
}
}
}
}
}
for (auto &I : Producers) {
// Ignore entries previously marked invalid.
if (I.second.Instr == nullptr)
continue;
// Disable the producer if its dest may be live beyond this block.
if (I.second.IsLiveOut) {
setInvalid(I.first);
continue;
}
// Mark as "dead" rather than outright deleting. This is so that
// other peephole style optimizations during or before lowering
// have access to this instruction in undeleted form. See for
// example tryOptimizedCmpxchgCmpBr().
I.second.Instr->setDead();
}
}
const Inst *BoolFolding::getProducerFor(const Operand *Opnd) const {
auto *Var = llvm::dyn_cast<const Variable>(Opnd);
if (Var == nullptr)
return nullptr;
SizeT VarNum = Var->getIndex();
auto Element = Producers.find(VarNum);
if (Element == Producers.end())
return nullptr;
return Element->second.Instr;
}
void BoolFolding::dump(const Cfg *Func) const {
if (!ALLOW_DUMP || !Func->isVerbose(IceV_Folding))
return;
OstreamLocker L(Func->getContext());
Ostream &Str = Func->getContext()->getStrDump();
for (auto &I : Producers) {
if (I.second.Instr == nullptr)
continue;
Str << "Found foldable producer:\n ";
I.second.Instr->dump(Func);
Str << "\n";
}
}
void TargetX8632::initNodeForLowering(CfgNode *Node) {
FoldingInfo.init(Node);
FoldingInfo.dump(Func);
}
TargetX8632::TargetX8632(Cfg *Func)
: TargetLowering(Func),
InstructionSet(static_cast<X86InstructionSet>(
Func->getContext()->getFlags().getTargetInstructionSet() -
TargetInstructionSet::X86InstructionSet_Begin)),
IsEbpBasedFrame(false), NeedsStackAlignment(false),
SpillAreaSizeBytes(0) {
static_assert((X86InstructionSet::End - X86InstructionSet::Begin) ==
(TargetInstructionSet::X86InstructionSet_End -
TargetInstructionSet::X86InstructionSet_Begin),
"X86InstructionSet range different from TargetInstructionSet");
// TODO: Don't initialize IntegerRegisters and friends every time.
// Instead, initialize in some sort of static initializer for the
// class.
llvm::SmallBitVector IntegerRegisters(RegX8632::Reg_NUM);
llvm::SmallBitVector IntegerRegistersI8(RegX8632::Reg_NUM);
llvm::SmallBitVector FloatRegisters(RegX8632::Reg_NUM);
llvm::SmallBitVector VectorRegisters(RegX8632::Reg_NUM);
llvm::SmallBitVector InvalidRegisters(RegX8632::Reg_NUM);
ScratchRegs.resize(RegX8632::Reg_NUM);
#define X(val, encode, name, name16, name8, scratch, preserved, stackptr, \
frameptr, isI8, isInt, isFP) \
IntegerRegisters[RegX8632::val] = isInt; \
IntegerRegistersI8[RegX8632::val] = isI8; \
FloatRegisters[RegX8632::val] = isFP; \
VectorRegisters[RegX8632::val] = isFP; \
ScratchRegs[RegX8632::val] = scratch;
REGX8632_TABLE;
#undef X
TypeToRegisterSet[IceType_void] = InvalidRegisters;
TypeToRegisterSet[IceType_i1] = IntegerRegistersI8;
TypeToRegisterSet[IceType_i8] = IntegerRegistersI8;
TypeToRegisterSet[IceType_i16] = IntegerRegisters;
TypeToRegisterSet[IceType_i32] = IntegerRegisters;
TypeToRegisterSet[IceType_i64] = IntegerRegisters;
TypeToRegisterSet[IceType_f32] = FloatRegisters;
TypeToRegisterSet[IceType_f64] = FloatRegisters;
TypeToRegisterSet[IceType_v4i1] = VectorRegisters;
TypeToRegisterSet[IceType_v8i1] = VectorRegisters;
TypeToRegisterSet[IceType_v16i1] = VectorRegisters;
TypeToRegisterSet[IceType_v16i8] = VectorRegisters;
TypeToRegisterSet[IceType_v8i16] = VectorRegisters;
TypeToRegisterSet[IceType_v4i32] = VectorRegisters;
TypeToRegisterSet[IceType_v4f32] = VectorRegisters;
}
void TargetX8632::translateO2() {
TimerMarker T(TimerStack::TT_O2, Func);
if (!Ctx->getFlags().getPhiEdgeSplit()) {
// Lower Phi instructions.
Func->placePhiLoads();
if (Func->hasError())
return;
Func->placePhiStores();
if (Func->hasError())
return;
Func->deletePhis();
if (Func->hasError())
return;
Func->dump("After Phi lowering");
}
// Address mode optimization.
Func->getVMetadata()->init(VMK_SingleDefs);
Func->doAddressOpt();
// Argument lowering
Func->doArgLowering();
// Target lowering. This requires liveness analysis for some parts
// of the lowering decisions, such as compare/branch fusing. If
// non-lightweight liveness analysis is used, the instructions need
// to be renumbered first. TODO: This renumbering should only be
// necessary if we're actually calculating live intervals, which we
// only do for register allocation.
Func->renumberInstructions();
if (Func->hasError())
return;
// TODO: It should be sufficient to use the fastest liveness
// calculation, i.e. livenessLightweight(). However, for some
// reason that slows down the rest of the translation. Investigate.
Func->liveness(Liveness_Basic);
if (Func->hasError())
return;
Func->dump("After x86 address mode opt");
doLoadOpt();
Func->genCode();
if (Func->hasError())
return;
Func->dump("After x86 codegen");
// Register allocation. This requires instruction renumbering and
// full liveness analysis.
Func->renumberInstructions();
if (Func->hasError())
return;
Func->liveness(Liveness_Intervals);
if (Func->hasError())
return;
// Validate the live range computations. The expensive validation
// call is deliberately only made when assertions are enabled.
assert(Func->validateLiveness());
// The post-codegen dump is done here, after liveness analysis and
// associated cleanup, to make the dump cleaner and more useful.
Func->dump("After initial x8632 codegen");
Func->getVMetadata()->init(VMK_All);
regAlloc(RAK_Global);
if (Func->hasError())
return;
Func->dump("After linear scan regalloc");
if (Ctx->getFlags().getPhiEdgeSplit()) {
Func->advancedPhiLowering();
Func->dump("After advanced Phi lowering");
}
// Stack frame mapping.
Func->genFrame();
if (Func->hasError())
return;
Func->dump("After stack frame mapping");
Func->contractEmptyNodes();
Func->reorderNodes();
// Branch optimization. This needs to be done just before code
// emission. In particular, no transformations that insert or
// reorder CfgNodes should be done after branch optimization. We go
// ahead and do it before nop insertion to reduce the amount of work
// needed to search for branch optimization opportunities.
Func->doBranchOpt();
Func->dump("After branch optimization");
// Nop insertion
if (Ctx->getFlags().shouldDoNopInsertion()) {
Func->doNopInsertion();
}
}
void TargetX8632::translateOm1() {
TimerMarker T(TimerStack::TT_Om1, Func);
Func->placePhiLoads();
if (Func->hasError())
return;
Func->placePhiStores();
if (Func->hasError())
return;
Func->deletePhis();
if (Func->hasError())
return;
Func->dump("After Phi lowering");
Func->doArgLowering();
Func->genCode();
if (Func->hasError())
return;
Func->dump("After initial x8632 codegen");
regAlloc(RAK_InfOnly);
if (Func->hasError())
return;
Func->dump("After regalloc of infinite-weight variables");
Func->genFrame();
if (Func->hasError())
return;
Func->dump("After stack frame mapping");
// Nop insertion
if (Ctx->getFlags().shouldDoNopInsertion()) {
Func->doNopInsertion();
}
}
namespace {
// Converts a ConstantInteger32 operand into its constant value, or
// MemoryOrderInvalid if the operand is not a ConstantInteger32.
uint64_t getConstantMemoryOrder(Operand *Opnd) {
if (auto Integer = llvm::dyn_cast<ConstantInteger32>(Opnd))
return Integer->getValue();
return Intrinsics::MemoryOrderInvalid;
}
// Determines whether the dest of a Load instruction can be folded
// into one of the src operands of a 2-operand instruction. This is
// true as long as the load dest matches exactly one of the binary
// instruction's src operands. Replaces Src0 or Src1 with LoadSrc if
// the answer is true.
bool canFoldLoadIntoBinaryInst(Operand *LoadSrc, Variable *LoadDest,
Operand *&Src0, Operand *&Src1) {
if (Src0 == LoadDest && Src1 != LoadDest) {
Src0 = LoadSrc;
return true;
}
if (Src0 != LoadDest && Src1 == LoadDest) {
Src1 = LoadSrc;
return true;
}
return false;
}
} // end of anonymous namespace
void TargetX8632::doLoadOpt() {
for (CfgNode *Node : Func->getNodes()) {
Context.init(Node);
while (!Context.atEnd()) {
Variable *LoadDest = nullptr;
Operand *LoadSrc = nullptr;
Inst *CurInst = Context.getCur();
Inst *Next = Context.getNextInst();
// Determine whether the current instruction is a Load
// instruction or equivalent.
if (auto *Load = llvm::dyn_cast<InstLoad>(CurInst)) {
// An InstLoad always qualifies.
LoadDest = Load->getDest();
const bool DoLegalize = false;
LoadSrc = formMemoryOperand(Load->getSourceAddress(),
LoadDest->getType(), DoLegalize);
} else if (auto *Intrin = llvm::dyn_cast<InstIntrinsicCall>(CurInst)) {
// An AtomicLoad intrinsic qualifies as long as it has a valid
// memory ordering, and can be implemented in a single
// instruction (i.e., not i64).
Intrinsics::IntrinsicID ID = Intrin->getIntrinsicInfo().ID;
if (ID == Intrinsics::AtomicLoad &&
Intrin->getDest()->getType() != IceType_i64 &&
Intrinsics::isMemoryOrderValid(
ID, getConstantMemoryOrder(Intrin->getArg(1)))) {
LoadDest = Intrin->getDest();
const bool DoLegalize = false;
LoadSrc = formMemoryOperand(Intrin->getArg(0), LoadDest->getType(),
DoLegalize);
}
}
// A Load instruction can be folded into the following
// instruction only if the following instruction ends the Load's
// Dest variable's live range.
if (LoadDest && Next && Next->isLastUse(LoadDest)) {
assert(LoadSrc);
Inst *NewInst = nullptr;
if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Next)) {
Operand *Src0 = Arith->getSrc(0);
Operand *Src1 = Arith->getSrc(1);
if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
NewInst = InstArithmetic::create(Func, Arith->getOp(),
Arith->getDest(), Src0, Src1);
}
} else if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Next)) {
Operand *Src0 = Icmp->getSrc(0);
Operand *Src1 = Icmp->getSrc(1);
if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
NewInst = InstIcmp::create(Func, Icmp->getCondition(),
Icmp->getDest(), Src0, Src1);
}
} else if (auto *Fcmp = llvm::dyn_cast<InstFcmp>(Next)) {
Operand *Src0 = Fcmp->getSrc(0);
Operand *Src1 = Fcmp->getSrc(1);
if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
NewInst = InstFcmp::create(Func, Fcmp->getCondition(),
Fcmp->getDest(), Src0, Src1);
}
} else if (auto *Select = llvm::dyn_cast<InstSelect>(Next)) {
Operand *Src0 = Select->getTrueOperand();
Operand *Src1 = Select->getFalseOperand();
if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
NewInst = InstSelect::create(Func, Select->getDest(),
Select->getCondition(), Src0, Src1);
}
} else if (auto *Cast = llvm::dyn_cast<InstCast>(Next)) {
// The load dest can always be folded into a Cast
// instruction.
Variable *Src0 = llvm::dyn_cast<Variable>(Cast->getSrc(0));
if (Src0 == LoadDest) {
NewInst = InstCast::create(Func, Cast->getCastKind(),
Cast->getDest(), LoadSrc);
}
}
if (NewInst) {
CurInst->setDeleted();
Next->setDeleted();
Context.insert(NewInst);
// Update NewInst->LiveRangesEnded so that target lowering
// may benefit. Also update NewInst->HasSideEffects.
NewInst->spliceLivenessInfo(Next, CurInst);
}
}
Context.advanceCur();
Context.advanceNext();
}
}
Func->dump("After load optimization");
}
bool TargetX8632::doBranchOpt(Inst *I, const CfgNode *NextNode) {
if (InstX8632Br *Br = llvm::dyn_cast<InstX8632Br>(I)) {
return Br->optimizeBranch(NextNode);
}
return false;
}
IceString TargetX8632::RegNames[] = {
#define X(val, encode, name, name16, name8, scratch, preserved, stackptr, \
frameptr, isI8, isInt, isFP) \
name,
REGX8632_TABLE
#undef X
};
Variable *TargetX8632::getPhysicalRegister(SizeT RegNum, Type Ty) {
if (Ty == IceType_void)
Ty = IceType_i32;
if (PhysicalRegisters[Ty].empty())
PhysicalRegisters[Ty].resize(RegX8632::Reg_NUM);
assert(RegNum < PhysicalRegisters[Ty].size());
Variable *Reg = PhysicalRegisters[Ty][RegNum];
if (Reg == nullptr) {
Reg = Func->makeVariable(Ty);
Reg->setRegNum(RegNum);
PhysicalRegisters[Ty][RegNum] = Reg;
// Specially mark esp as an "argument" so that it is considered
// live upon function entry.
if (RegNum == RegX8632::Reg_esp) {
Func->addImplicitArg(Reg);
Reg->setIgnoreLiveness();
}
}
return Reg;
}
IceString TargetX8632::getRegName(SizeT RegNum, Type Ty) const {
assert(RegNum < RegX8632::Reg_NUM);
static IceString RegNames8[] = {
#define X(val, encode, name, name16, name8, scratch, preserved, stackptr, \
frameptr, isI8, isInt, isFP) \
name8,
REGX8632_TABLE
#undef X
};
static IceString RegNames16[] = {
#define X(val, encode, name, name16, name8, scratch, preserved, stackptr, \
frameptr, isI8, isInt, isFP) \
name16,
REGX8632_TABLE
#undef X
};
switch (Ty) {
case IceType_i1:
case IceType_i8:
return RegNames8[RegNum];
case IceType_i16:
return RegNames16[RegNum];
default:
return RegNames[RegNum];
}
}
void TargetX8632::emitVariable(const Variable *Var) const {
Ostream &Str = Ctx->getStrEmit();
if (Var->hasReg()) {
Str << "%" << getRegName(Var->getRegNum(), Var->getType());
return;
}
if (Var->getWeight().isInf())
llvm_unreachable("Infinite-weight Variable has no register assigned");
int32_t Offset = Var->getStackOffset();
if (!hasFramePointer())
Offset += getStackAdjustment();
if (Offset)
Str << Offset;
const Type FrameSPTy = IceType_i32;
Str << "(%" << getRegName(getFrameOrStackReg(), FrameSPTy) << ")";
}
X8632::Address TargetX8632::stackVarToAsmOperand(const Variable *Var) const {
if (Var->hasReg())
llvm_unreachable("Stack Variable has a register assigned");
if (Var->getWeight().isInf())
llvm_unreachable("Infinite-weight Variable has no register assigned");
int32_t Offset = Var->getStackOffset();
if (!hasFramePointer())
Offset += getStackAdjustment();
return X8632::Address(RegX8632::getEncodedGPR(getFrameOrStackReg()), Offset);
}
void TargetX8632::lowerArguments() {
VarList &Args = Func->getArgs();
// The first four arguments of vector type, regardless of their
// position relative to the other arguments in the argument list, are
// passed in registers xmm0 - xmm3.
unsigned NumXmmArgs = 0;
Context.init(Func->getEntryNode());
Context.setInsertPoint(Context.getCur());
for (SizeT I = 0, E = Args.size(); I < E && NumXmmArgs < X86_MAX_XMM_ARGS;
++I) {
Variable *Arg = Args[I];
Type Ty = Arg->getType();
if (!isVectorType(Ty))
continue;
// Replace Arg in the argument list with the home register. Then
// generate an instruction in the prolog to copy the home register
// to the assigned location of Arg.
int32_t RegNum = RegX8632::Reg_xmm0 + NumXmmArgs;
++NumXmmArgs;
Variable *RegisterArg = Func->makeVariable(Ty);
if (ALLOW_DUMP)
RegisterArg->setName(Func, "home_reg:" + Arg->getName(Func));
RegisterArg->setRegNum(RegNum);
RegisterArg->setIsArg();
Arg->setIsArg(false);
Args[I] = RegisterArg;
Context.insert(InstAssign::create(Func, Arg, RegisterArg));
}
}
// Helper function for addProlog().
//
// This assumes Arg is an argument passed on the stack. This sets the
// frame offset for Arg and updates InArgsSizeBytes according to Arg's
// width. For an I64 arg that has been split into Lo and Hi components,
// it calls itself recursively on the components, taking care to handle
// Lo first because of the little-endian architecture. Lastly, this
// function generates an instruction to copy Arg into its assigned
// register if applicable.
void TargetX8632::finishArgumentLowering(Variable *Arg, Variable *FramePtr,
size_t BasicFrameOffset,
size_t &InArgsSizeBytes) {
Variable *Lo = Arg->getLo();
Variable *Hi = Arg->getHi();
Type Ty = Arg->getType();
if (Lo && Hi && Ty == IceType_i64) {
assert(Lo->getType() != IceType_i64); // don't want infinite recursion
assert(Hi->getType() != IceType_i64); // don't want infinite recursion
finishArgumentLowering(Lo, FramePtr, BasicFrameOffset, InArgsSizeBytes);
finishArgumentLowering(Hi, FramePtr, BasicFrameOffset, InArgsSizeBytes);
return;
}
if (isVectorType(Ty)) {
InArgsSizeBytes = applyStackAlignment(InArgsSizeBytes);
}
Arg->setStackOffset(BasicFrameOffset + InArgsSizeBytes);
InArgsSizeBytes += typeWidthInBytesOnStack(Ty);
if (Arg->hasReg()) {
assert(Ty != IceType_i64);
OperandX8632Mem *Mem = OperandX8632Mem::create(
Func, Ty, FramePtr, Ctx->getConstantInt32(Arg->getStackOffset()));
if (isVectorType(Arg->getType())) {
_movp(Arg, Mem);
} else {
_mov(Arg, Mem);
}
// This argument-copying instruction uses an explicit
// OperandX8632Mem operand instead of a Variable, so its
// fill-from-stack operation has to be tracked separately for
// statistics.
Ctx->statsUpdateFills();
}
}
Type TargetX8632::stackSlotType() { return IceType_i32; }
void TargetX8632::addProlog(CfgNode *Node) {
// Stack frame layout:
//
// +------------------------+
// | 1. return address |
// +------------------------+
// | 2. preserved registers |
// +------------------------+
// | 3. padding |
// +------------------------+
// | 4. global spill area |
// +------------------------+
// | 5. padding |
// +------------------------+
// | 6. local spill area |
// +------------------------+
// | 7. padding |
// +------------------------+
// | 8. allocas |
// +------------------------+
//
// The following variables record the size in bytes of the given areas:
// * X86_RET_IP_SIZE_BYTES: area 1
// * PreservedRegsSizeBytes: area 2
// * SpillAreaPaddingBytes: area 3
// * GlobalsSize: area 4
// * GlobalsAndSubsequentPaddingSize: areas 4 - 5
// * LocalsSpillAreaSize: area 6
// * SpillAreaSizeBytes: areas 3 - 7
// Determine stack frame offsets for each Variable without a
// register assignment. This can be done as one variable per stack
// slot. Or, do coalescing by running the register allocator again
// with an infinite set of registers (as a side effect, this gives
// variables a second chance at physical register assignment).
//
// A middle ground approach is to leverage sparsity and allocate one
// block of space on the frame for globals (variables with
// multi-block lifetime), and one block to share for locals
// (single-block lifetime).
Context.init(Node);
Context.setInsertPoint(Context.getCur());
llvm::SmallBitVector CalleeSaves =
getRegisterSet(RegSet_CalleeSave, RegSet_None);
RegsUsed = llvm::SmallBitVector(CalleeSaves.size());
VarList SortedSpilledVariables, VariablesLinkedToSpillSlots;
size_t GlobalsSize = 0;
// If there is a separate locals area, this represents that area.
// Otherwise it counts any variable not counted by GlobalsSize.
SpillAreaSizeBytes = 0;
// If there is a separate locals area, this specifies the alignment
// for it.
uint32_t LocalsSlotsAlignmentBytes = 0;
// The entire spill locations area gets aligned to largest natural
// alignment of the variables that have a spill slot.
uint32_t SpillAreaAlignmentBytes = 0;
// A spill slot linked to a variable with a stack slot should reuse
// that stack slot.
std::function<bool(Variable *)> TargetVarHook =
[&VariablesLinkedToSpillSlots](Variable *Var) {
if (SpillVariable *SpillVar = llvm::dyn_cast<SpillVariable>(Var)) {
assert(Var->getWeight().isZero());
if (SpillVar->getLinkedTo() && !SpillVar->getLinkedTo()->hasReg()) {
VariablesLinkedToSpillSlots.push_back(Var);
return true;
}
}
return false;
};
// Compute the list of spilled variables and bounds for GlobalsSize, etc.
getVarStackSlotParams(SortedSpilledVariables, RegsUsed, &GlobalsSize,
&SpillAreaSizeBytes, &SpillAreaAlignmentBytes,
&LocalsSlotsAlignmentBytes, TargetVarHook);
uint32_t LocalsSpillAreaSize = SpillAreaSizeBytes;
SpillAreaSizeBytes += GlobalsSize;
// Add push instructions for preserved registers.
uint32_t NumCallee = 0;
size_t PreservedRegsSizeBytes = 0;
for (SizeT i = 0; i < CalleeSaves.size(); ++i) {
if (CalleeSaves[i] && RegsUsed[i]) {
++NumCallee;
PreservedRegsSizeBytes += 4;
_push(getPhysicalRegister(i));
}
}
Ctx->statsUpdateRegistersSaved(NumCallee);
// Generate "push ebp; mov ebp, esp"
if (IsEbpBasedFrame) {
assert((RegsUsed & getRegisterSet(RegSet_FramePointer, RegSet_None))
.count() == 0);
PreservedRegsSizeBytes += 4;
Variable *ebp = getPhysicalRegister(RegX8632::Reg_ebp);
Variable *esp = getPhysicalRegister(RegX8632::Reg_esp);
_push(ebp);
_mov(ebp, esp);
// Keep ebp live for late-stage liveness analysis
// (e.g. asm-verbose mode).
Context.insert(InstFakeUse::create(Func, ebp));
}
// Align the variables area. SpillAreaPaddingBytes is the size of
// the region after the preserved registers and before the spill areas.
// LocalsSlotsPaddingBytes is the amount of padding between the globals
// and locals area if they are separate.
assert(SpillAreaAlignmentBytes <= X86_STACK_ALIGNMENT_BYTES);
assert(LocalsSlotsAlignmentBytes <= SpillAreaAlignmentBytes);
uint32_t SpillAreaPaddingBytes = 0;
uint32_t LocalsSlotsPaddingBytes = 0;
alignStackSpillAreas(X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes,
SpillAreaAlignmentBytes, GlobalsSize,
LocalsSlotsAlignmentBytes, &SpillAreaPaddingBytes,
&LocalsSlotsPaddingBytes);
SpillAreaSizeBytes += SpillAreaPaddingBytes + LocalsSlotsPaddingBytes;
uint32_t GlobalsAndSubsequentPaddingSize =
GlobalsSize + LocalsSlotsPaddingBytes;
// Align esp if necessary.
if (NeedsStackAlignment) {
uint32_t StackOffset = X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes;
uint32_t StackSize = applyStackAlignment(StackOffset + SpillAreaSizeBytes);
SpillAreaSizeBytes = StackSize - StackOffset;
}
// Generate "sub esp, SpillAreaSizeBytes"
if (SpillAreaSizeBytes)
_sub(getPhysicalRegister(RegX8632::Reg_esp),
Ctx->getConstantInt32(SpillAreaSizeBytes));
Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);
resetStackAdjustment();
// Fill in stack offsets for stack args, and copy args into registers
// for those that were register-allocated. Args are pushed right to
// left, so Arg[0] is closest to the stack/frame pointer.
Variable *FramePtr = getPhysicalRegister(getFrameOrStackReg());
size_t BasicFrameOffset = PreservedRegsSizeBytes + X86_RET_IP_SIZE_BYTES;
if (!IsEbpBasedFrame)
BasicFrameOffset += SpillAreaSizeBytes;
const VarList &Args = Func->getArgs();
size_t InArgsSizeBytes = 0;
unsigned NumXmmArgs = 0;
for (Variable *Arg : Args) {
// Skip arguments passed in registers.
if (isVectorType(Arg->getType()) && NumXmmArgs < X86_MAX_XMM_ARGS) {
++NumXmmArgs;
continue;
}
finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, InArgsSizeBytes);
}
// Fill in stack offsets for locals.
assignVarStackSlots(SortedSpilledVariables, SpillAreaPaddingBytes,
SpillAreaSizeBytes, GlobalsAndSubsequentPaddingSize,
IsEbpBasedFrame);
// Assign stack offsets to variables that have been linked to spilled
// variables.
for (Variable *Var : VariablesLinkedToSpillSlots) {
Variable *Linked = (llvm::cast<SpillVariable>(Var))->getLinkedTo();
Var->setStackOffset(Linked->getStackOffset());
}
this->HasComputedFrame = true;
if (ALLOW_DUMP && Func->isVerbose(IceV_Frame)) {
OstreamLocker L(Func->getContext());
Ostream &Str = Func->getContext()->getStrDump();
Str << "Stack layout:\n";
uint32_t EspAdjustmentPaddingSize =
SpillAreaSizeBytes - LocalsSpillAreaSize -
GlobalsAndSubsequentPaddingSize - SpillAreaPaddingBytes;
Str << " in-args = " << InArgsSizeBytes << " bytes\n"
<< " return address = " << X86_RET_IP_SIZE_BYTES << " bytes\n"
<< " preserved registers = " << PreservedRegsSizeBytes << " bytes\n"
<< " spill area padding = " << SpillAreaPaddingBytes << " bytes\n"
<< " globals spill area = " << GlobalsSize << " bytes\n"
<< " globals-locals spill areas intermediate padding = "
<< GlobalsAndSubsequentPaddingSize - GlobalsSize << " bytes\n"
<< " locals spill area = " << LocalsSpillAreaSize << " bytes\n"
<< " esp alignment padding = " << EspAdjustmentPaddingSize
<< " bytes\n";
Str << "Stack details:\n"
<< " esp adjustment = " << SpillAreaSizeBytes << " bytes\n"
<< " spill area alignment = " << SpillAreaAlignmentBytes << " bytes\n"
<< " locals spill area alignment = " << LocalsSlotsAlignmentBytes
<< " bytes\n"
<< " is ebp based = " << IsEbpBasedFrame << "\n";
}
}
void TargetX8632::addEpilog(CfgNode *Node) {
InstList &Insts = Node->getInsts();
InstList::reverse_iterator RI, E;
for (RI = Insts.rbegin(), E = Insts.rend(); RI != E; ++RI) {
if (llvm::isa<InstX8632Ret>(*RI))
break;
}
if (RI == E)
return;
// Convert the reverse_iterator position into its corresponding
// (forward) iterator position.
InstList::iterator InsertPoint = RI.base();
--InsertPoint;
Context.init(Node);
Context.setInsertPoint(InsertPoint);
Variable *esp = getPhysicalRegister(RegX8632::Reg_esp);
if (IsEbpBasedFrame) {
Variable *ebp = getPhysicalRegister(RegX8632::Reg_ebp);
// For late-stage liveness analysis (e.g. asm-verbose mode),
// adding a fake use of esp before the assignment of esp=ebp keeps
// previous esp adjustments from being dead-code eliminated.
Context.insert(InstFakeUse::create(Func, esp));
_mov(esp, ebp);
_pop(ebp);
} else {
// add esp, SpillAreaSizeBytes
if (SpillAreaSizeBytes)
_add(esp, Ctx->getConstantInt32(SpillAreaSizeBytes));
}
// Add pop instructions for preserved registers.
llvm::SmallBitVector CalleeSaves =
getRegisterSet(RegSet_CalleeSave, RegSet_None);
for (SizeT i = 0; i < CalleeSaves.size(); ++i) {
SizeT j = CalleeSaves.size() - i - 1;
if (j == RegX8632::Reg_ebp && IsEbpBasedFrame)
continue;
if (CalleeSaves[j] && RegsUsed[j]) {
_pop(getPhysicalRegister(j));
}
}
if (!Ctx->getFlags().getUseSandboxing())
return;
// Change the original ret instruction into a sandboxed return sequence.
// t:ecx = pop
// bundle_lock
// and t, ~31
// jmp *t
// bundle_unlock
// FakeUse <original_ret_operand>
const SizeT BundleSize = 1
<< Func->getAssembler<>()->getBundleAlignLog2Bytes();
Variable *T_ecx = makeReg(IceType_i32, RegX8632::Reg_ecx);
_pop(T_ecx);
_bundle_lock();
_and(T_ecx, Ctx->getConstantInt32(~(BundleSize - 1)));
_jmp(T_ecx);
_bundle_unlock();
if (RI->getSrcSize()) {
Variable *RetValue = llvm::cast<Variable>(RI->getSrc(0));
Context.insert(InstFakeUse::create(Func, RetValue));
}
RI->setDeleted();
}
void TargetX8632::split64(Variable *Var) {
switch (Var->getType()) {
default:
return;
case IceType_i64:
// TODO: Only consider F64 if we need to push each half when
// passing as an argument to a function call. Note that each half
// is still typed as I32.
case IceType_f64:
break;
}
Variable *Lo = Var->getLo();
Variable *Hi = Var->getHi();
if (Lo) {
assert(Hi);
return;
}
assert(Hi == nullptr);
Lo = Func->makeVariable(IceType_i32);
Hi = Func->makeVariable(IceType_i32);
if (ALLOW_DUMP) {
Lo->setName(Func, Var->getName(Func) + "__lo");
Hi->setName(Func, Var->getName(Func) + "__hi");
}
Var->setLoHi(Lo, Hi);
if (Var->getIsArg()) {
Lo->setIsArg();
Hi->setIsArg();
}
}
Operand *TargetX8632::loOperand(Operand *Operand) {
assert(Operand->getType() == IceType_i64 ||
Operand->getType() == IceType_f64);
if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
return Operand;
if (Variable *Var = llvm::dyn_cast<Variable>(Operand)) {
split64(Var);
return Var->getLo();
}
if (ConstantInteger64 *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
return Ctx->getConstantInt32(static_cast<uint32_t>(Const->getValue()));
}
if (OperandX8632Mem *Mem = llvm::dyn_cast<OperandX8632Mem>(Operand)) {
return OperandX8632Mem::create(Func, IceType_i32, Mem->getBase(),
Mem->getOffset(), Mem->getIndex(),
Mem->getShift(), Mem->getSegmentRegister());
}
llvm_unreachable("Unsupported operand type");
return nullptr;
}
Operand *TargetX8632::hiOperand(Operand *Operand) {
assert(Operand->getType() == IceType_i64 ||
Operand->getType() == IceType_f64);
if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
return Operand;
if (Variable *Var = llvm::dyn_cast<Variable>(Operand)) {
split64(Var);
return Var->getHi();
}
if (ConstantInteger64 *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
return Ctx->getConstantInt32(
static_cast<uint32_t>(Const->getValue() >> 32));
}
if (OperandX8632Mem *Mem = llvm::dyn_cast<OperandX8632Mem>(Operand)) {
Constant *Offset = Mem->getOffset();
if (Offset == nullptr) {
Offset = Ctx->getConstantInt32(4);
} else if (ConstantInteger32 *IntOffset =
llvm::dyn_cast<ConstantInteger32>(Offset)) {
Offset = Ctx->getConstantInt32(4 + IntOffset->getValue());
} else if (ConstantRelocatable *SymOffset =
llvm::dyn_cast<ConstantRelocatable>(Offset)) {
assert(!Utils::WouldOverflowAdd(SymOffset->getOffset(), 4));
Offset =
Ctx->getConstantSym(4 + SymOffset->getOffset(), SymOffset->getName(),
SymOffset->getSuppressMangling());
}
return OperandX8632Mem::create(Func, IceType_i32, Mem->getBase(), Offset,
Mem->getIndex(), Mem->getShift(),
Mem->getSegmentRegister());
}
llvm_unreachable("Unsupported operand type");
return nullptr;
}
llvm::SmallBitVector TargetX8632::getRegisterSet(RegSetMask Include,
RegSetMask Exclude) const {
llvm::SmallBitVector Registers(RegX8632::Reg_NUM);
#define X(val, encode, name, name16, name8, scratch, preserved, stackptr, \
frameptr, isI8, isInt, isFP) \
if (scratch && (Include & RegSet_CallerSave)) \
Registers[RegX8632::val] = true; \
if (preserved && (Include & RegSet_CalleeSave)) \
Registers[RegX8632::val] = true; \
if (stackptr && (Include & RegSet_StackPointer)) \
Registers[RegX8632::val] = true; \
if (frameptr && (Include & RegSet_FramePointer)) \
Registers[RegX8632::val] = true; \
if (scratch && (Exclude & RegSet_CallerSave)) \
Registers[RegX8632::val] = false; \
if (preserved && (Exclude & RegSet_CalleeSave)) \
Registers[RegX8632::val] = false; \
if (stackptr && (Exclude & RegSet_StackPointer)) \
Registers[RegX8632::val] = false; \
if (frameptr && (Exclude & RegSet_FramePointer)) \
Registers[RegX8632::val] = false;
REGX8632_TABLE
#undef X
return Registers;
}
void TargetX8632::lowerAlloca(const InstAlloca *Inst) {
IsEbpBasedFrame = true;
// Conservatively require the stack to be aligned. Some stack
// adjustment operations implemented below assume that the stack is
// aligned before the alloca. All the alloca code ensures that the
// stack alignment is preserved after the alloca. The stack alignment
// restriction can be relaxed in some cases.
NeedsStackAlignment = true;
// TODO(stichnot): minimize the number of adjustments of esp, etc.
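// As an illustrative example (values chosen here, not taken from any
// caller): an alloca of 20 bytes with a requested alignment of 32 first
// aligns esp with "and esp, -32" (since 32 > X86_STACK_ALIGNMENT_BYTES)
// and then subtracts 32, i.e. 20 rounded up to the 32-byte alignment.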
Variable *esp = getPhysicalRegister(RegX8632::Reg_esp);
Operand *TotalSize = legalize(Inst->getSizeInBytes());
Variable *Dest = Inst->getDest();
uint32_t AlignmentParam = Inst->getAlignInBytes();
// For default align=0, set it to the real value 1, to avoid any
// bit-manipulation problems below.
AlignmentParam = std::max(AlignmentParam, 1u);
// LLVM enforces power of 2 alignment.
assert(llvm::isPowerOf2_32(AlignmentParam));
assert(llvm::isPowerOf2_32(X86_STACK_ALIGNMENT_BYTES));
uint32_t Alignment = std::max(AlignmentParam, X86_STACK_ALIGNMENT_BYTES);
if (Alignment > X86_STACK_ALIGNMENT_BYTES) {
_and(esp, Ctx->getConstantInt32(-Alignment));
}
if (const auto *ConstantTotalSize =
llvm::dyn_cast<ConstantInteger32>(TotalSize)) {
uint32_t Value = ConstantTotalSize->getValue();
Value = Utils::applyAlignment(Value, Alignment);
_sub(esp, Ctx->getConstantInt32(Value));
} else {
// Non-constant sizes need to be adjusted to the next highest
// multiple of the required alignment at runtime.
Variable *T = makeReg(IceType_i32);
_mov(T, TotalSize);
_add(T, Ctx->getConstantInt32(Alignment - 1));
_and(T, Ctx->getConstantInt32(-Alignment));
_sub(esp, T);
}
_mov(Dest, esp);
}
void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {
Variable *Dest = Inst->getDest();
Operand *Src0 = legalize(Inst->getSrc(0));
Operand *Src1 = legalize(Inst->getSrc(1));
if (Inst->isCommutative()) {
if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1))
std::swap(Src0, Src1);
}
if (Dest->getType() == IceType_i64) {
Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
Operand *Src0Lo = loOperand(Src0);
Operand *Src0Hi = hiOperand(Src0);
Operand *Src1Lo = loOperand(Src1);
Operand *Src1Hi = hiOperand(Src1);
Variable *T_Lo = nullptr, *T_Hi = nullptr;
switch (Inst->getOp()) {
case InstArithmetic::_num:
llvm_unreachable("Unknown arithmetic operator");
break;
case InstArithmetic::Add:
_mov(T_Lo, Src0Lo);
_add(T_Lo, Src1Lo);
_mov(DestLo, T_Lo);
_mov(T_Hi, Src0Hi);
_adc(T_Hi, Src1Hi);
_mov(DestHi, T_Hi);
break;
case InstArithmetic::And:
_mov(T_Lo, Src0Lo);
_and(T_Lo, Src1Lo);
_mov(DestLo, T_Lo);
_mov(T_Hi, Src0Hi);
_and(T_Hi, Src1Hi);
_mov(DestHi, T_Hi);
break;
case InstArithmetic::Or:
_mov(T_Lo, Src0Lo);
_or(T_Lo, Src1Lo);
_mov(DestLo, T_Lo);
_mov(T_Hi, Src0Hi);
_or(T_Hi, Src1Hi);
_mov(DestHi, T_Hi);
break;
case InstArithmetic::Xor:
_mov(T_Lo, Src0Lo);
_xor(T_Lo, Src1Lo);
_mov(DestLo, T_Lo);
_mov(T_Hi, Src0Hi);
_xor(T_Hi, Src1Hi);
_mov(DestHi, T_Hi);
break;
case InstArithmetic::Sub:
_mov(T_Lo, Src0Lo);
_sub(T_Lo, Src1Lo);
_mov(DestLo, T_Lo);
_mov(T_Hi, Src0Hi);
_sbb(T_Hi, Src1Hi);
_mov(DestHi, T_Hi);
break;
case InstArithmetic::Mul: {
Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
Variable *T_4Lo = makeReg(IceType_i32, RegX8632::Reg_eax);
Variable *T_4Hi = makeReg(IceType_i32, RegX8632::Reg_edx);
// gcc does the following:
// a=b*c ==>
// t1 = b.hi; t1 *=(imul) c.lo
// t2 = c.hi; t2 *=(imul) b.lo
// t3:eax = b.lo
// t4.hi:edx,t4.lo:eax = t3:eax *(mul) c.lo
// a.lo = t4.lo
// t4.hi += t1
// t4.hi += t2
// a.hi = t4.hi
// The mul instruction cannot take an immediate operand.
Src1Lo = legalize(Src1Lo, Legal_Reg | Legal_Mem);
_mov(T_1, Src0Hi);
_imul(T_1, Src1Lo);
_mov(T_2, Src1Hi);
_imul(T_2, Src0Lo);
_mov(T_3, Src0Lo, RegX8632::Reg_eax);
_mul(T_4Lo, T_3, Src1Lo);
// The mul instruction produces two dest variables, edx:eax. We
// create a fake definition of edx to account for this.
Context.insert(InstFakeDef::create(Func, T_4Hi, T_4Lo));
_mov(DestLo, T_4Lo);
_add(T_4Hi, T_1);
_add(T_4Hi, T_2);
_mov(DestHi, T_4Hi);
} break;
case InstArithmetic::Shl: {
// TODO: Refactor the similarities between Shl, Lshr, and Ashr.
// gcc does the following:
// a=b<<c ==>
// t1:ecx = c.lo & 0xff
// t2 = b.lo
// t3 = b.hi
// t3 = shld t3, t2, t1
// t2 = shl t2, t1
// test t1, 0x20
// je L1
// use(t3)
// t3 = t2
// t2 = 0
// L1:
// a.lo = t2
// a.hi = t3
Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
Constant *BitTest = Ctx->getConstantInt32(0x20);
Constant *Zero = Ctx->getConstantZero(IceType_i32);
InstX8632Label *Label = InstX8632Label::create(Func, this);
_mov(T_1, Src1Lo, RegX8632::Reg_ecx);
_mov(T_2, Src0Lo);
_mov(T_3, Src0Hi);
_shld(T_3, T_2, T_1);
_shl(T_2, T_1);
_test(T_1, BitTest);
_br(CondX86::Br_e, Label);
// T_2 and T_3 are being assigned again because of the
// intra-block control flow, so we need the _mov_nonkillable
// variant to avoid liveness problems.
_mov_nonkillable(T_3, T_2);
_mov_nonkillable(T_2, Zero);
Context.insert(Label);
_mov(DestLo, T_2);
_mov(DestHi, T_3);
} break;
case InstArithmetic::Lshr: {
// a=b>>c (unsigned) ==>
// t1:ecx = c.lo & 0xff
// t2 = b.lo
// t3 = b.hi
// t2 = shrd t2, t3, t1
// t3 = shr t3, t1
// test t1, 0x20
// je L1
// use(t2)
// t2 = t3
// t3 = 0
// L1:
// a.lo = t2
// a.hi = t3
Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
Constant *BitTest = Ctx->getConstantInt32(0x20);
Constant *Zero = Ctx->getConstantZero(IceType_i32);
InstX8632Label *Label = InstX8632Label::create(Func, this);
_mov(T_1, Src1Lo, RegX8632::Reg_ecx);
_mov(T_2, Src0Lo);
_mov(T_3, Src0Hi);
_shrd(T_2, T_3, T_1);
_shr(T_3, T_1);
_test(T_1, BitTest);
_br(CondX86::Br_e, Label);
// T_2 and T_3 are being assigned again because of the
// intra-block control flow, so we need the _mov_nonkillable
// variant to avoid liveness problems.
_mov_nonkillable(T_2, T_3);
_mov_nonkillable(T_3, Zero);
Context.insert(Label);
_mov(DestLo, T_2);
_mov(DestHi, T_3);
} break;
case InstArithmetic::Ashr: {
// a=b>>c (signed) ==>
// t1:ecx = c.lo & 0xff
// t2 = b.lo
// t3 = b.hi
// t2 = shrd t2, t3, t1
// t3 = sar t3, t1
// test t1, 0x20
// je L1
// use(t2)
// t2 = t3
// t3 = sar t3, 0x1f
// L1:
// a.lo = t2
// a.hi = t3
Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
Constant *BitTest = Ctx->getConstantInt32(0x20);
Constant *SignExtend = Ctx->getConstantInt32(0x1f);
InstX8632Label *Label = InstX8632Label::create(Func, this);
_mov(T_1, Src1Lo, RegX8632::Reg_ecx);
_mov(T_2, Src0Lo);
_mov(T_3, Src0Hi);
_shrd(T_2, T_3, T_1);
_sar(T_3, T_1);
_test(T_1, BitTest);
_br(CondX86::Br_e, Label);
// T_2 and T_3 are being assigned again because of the
// intra-block control flow, so T_2 needs the _mov_nonkillable
// variant to avoid liveness problems. T_3 doesn't need special
// treatment because it is reassigned via _sar instead of _mov.
_mov_nonkillable(T_2, T_3);
_sar(T_3, SignExtend);
Context.insert(Label);
_mov(DestLo, T_2);
_mov(DestHi, T_3);
} break;
case InstArithmetic::Udiv: {
const SizeT MaxSrcs = 2;
InstCall *Call = makeHelperCall(H_udiv_i64, Dest, MaxSrcs);
Call->addArg(Inst->getSrc(0));
Call->addArg(Inst->getSrc(1));
lowerCall(Call);
} break;
case InstArithmetic::Sdiv: {
const SizeT MaxSrcs = 2;
InstCall *Call = makeHelperCall(H_sdiv_i64, Dest, MaxSrcs);
Call->addArg(Inst->getSrc(0));
Call->addArg(Inst->getSrc(1));
lowerCall(Call);
} break;
case InstArithmetic::Urem: {
const SizeT MaxSrcs = 2;
InstCall *Call = makeHelperCall(H_urem_i64, Dest, MaxSrcs);
Call->addArg(Inst->getSrc(0));
Call->addArg(Inst->getSrc(1));
lowerCall(Call);
} break;
case InstArithmetic::Srem: {
const SizeT MaxSrcs = 2;
InstCall *Call = makeHelperCall(H_srem_i64, Dest, MaxSrcs);
Call->addArg(Inst->getSrc(0));
Call->addArg(Inst->getSrc(1));
lowerCall(Call);
} break;
case InstArithmetic::Fadd:
case InstArithmetic::Fsub:
case InstArithmetic::Fmul:
case InstArithmetic::Fdiv:
case InstArithmetic::Frem:
llvm_unreachable("FP instruction with i64 type");
break;
}
} else if (isVectorType(Dest->getType())) {
// TODO: Trap on integer divide and integer modulo by zero.
// See: https://code.google.com/p/nativeclient/issues/detail?id=3899
if (llvm::isa<OperandX8632Mem>(Src1))
Src1 = legalizeToVar(Src1);
switch (Inst->getOp()) {
case InstArithmetic::_num:
llvm_unreachable("Unknown arithmetic operator");
break;
case InstArithmetic::Add: {
Variable *T = makeReg(Dest->getType());
_movp(T, Src0);
_padd(T, Src1);
_movp(Dest, T);
} break;
case InstArithmetic::And: {
Variable *T = makeReg(Dest->getType());
_movp(T, Src0);
_pand(T, Src1);
_movp(Dest, T);
} break;
case InstArithmetic::Or: {
Variable *T = makeReg(Dest->getType());
_movp(T, Src0);
_por(T, Src1);
_movp(Dest, T);
} break;
case InstArithmetic::Xor: {
Variable *T = makeReg(Dest->getType());
_movp(T, Src0);
_pxor(T, Src1);
_movp(Dest, T);
} break;
case InstArithmetic::Sub: {
Variable *T = makeReg(Dest->getType());
_movp(T, Src0);
_psub(T, Src1);
_movp(Dest, T);
} break;
case InstArithmetic::Mul: {
bool TypesAreValidForPmull =
Dest->getType() == IceType_v4i32 || Dest->getType() == IceType_v8i16;
bool InstructionSetIsValidForPmull =
Dest->getType() == IceType_v8i16 || InstructionSet >= SSE4_1;
if (TypesAreValidForPmull && InstructionSetIsValidForPmull) {
Variable *T = makeReg(Dest->getType());
_movp(T, Src0);
_pmull(T, Src1);
_movp(Dest, T);
} else if (Dest->getType() == IceType_v4i32) {
// Lowering sequence:
// Note: The mask arguments have index 0 on the left.
//
// movups T1, Src0
// pshufd T2, Src0, {1,0,3,0}
// pshufd T3, Src1, {1,0,3,0}
// # T1 = {Src0[0] * Src1[0], Src0[2] * Src1[2]}
// pmuludq T1, Src1
// # T2 = {Src0[1] * Src1[1], Src0[3] * Src1[3]}
// pmuludq T2, T3
// # T1 = {lo(T1[0]), lo(T1[2]), lo(T2[0]), lo(T2[2])}
// shufps T1, T2, {0,2,0,2}
// pshufd T4, T1, {0,2,1,3}
// movups Dest, T4
// Mask that directs pshufd to create a vector with entries
// Src[1, 0, 3, 0]
const unsigned Constant1030 = 0x31;
Constant *Mask1030 = Ctx->getConstantInt32(Constant1030);
// Mask that directs shufps to create a vector with entries
// Dest[0, 2], Src[0, 2]
const unsigned Mask0202 = 0x88;
// Mask that directs pshufd to create a vector with entries
// Src[0, 2, 1, 3]
const unsigned Mask0213 = 0xd8;
Variable *T1 = makeReg(IceType_v4i32);
Variable *T2 = makeReg(IceType_v4i32);
Variable *T3 = makeReg(IceType_v4i32);
Variable *T4 = makeReg(IceType_v4i32);
_movp(T1, Src0);
_pshufd(T2, Src0, Mask1030);
_pshufd(T3, Src1, Mask1030);
_pmuludq(T1, Src1);
_pmuludq(T2, T3);
_shufps(T1, T2, Ctx->getConstantInt32(Mask0202));
_pshufd(T4, T1, Ctx->getConstantInt32(Mask0213));
_movp(Dest, T4);
} else {
assert(Dest->getType() == IceType_v16i8);
scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
}
} break;
case InstArithmetic::Shl:
case InstArithmetic::Lshr:
case InstArithmetic::Ashr:
case InstArithmetic::Udiv:
case InstArithmetic::Urem:
case InstArithmetic::Sdiv:
case InstArithmetic::Srem:
scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
break;
case InstArithmetic::Fadd: {
Variable *T = makeReg(Dest->getType());
_movp(T, Src0);
_addps(T, Src1);
_movp(Dest, T);
} break;
case InstArithmetic::Fsub: {
Variable *T = makeReg(Dest->getType());
_movp(T, Src0);
_subps(T, Src1);
_movp(Dest, T);
} break;
case InstArithmetic::Fmul: {
Variable *T = makeReg(Dest->getType());
_movp(T, Src0);
_mulps(T, Src1);
_movp(Dest, T);
} break;
case InstArithmetic::Fdiv: {
Variable *T = makeReg(Dest->getType());
_movp(T, Src0);
_divps(T, Src1);
_movp(Dest, T);
} break;
case InstArithmetic::Frem:
scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
break;
}
} else { // Dest->getType() is non-i64 scalar
Variable *T_edx = nullptr;
Variable *T = nullptr;
switch (Inst->getOp()) {
case InstArithmetic::_num:
llvm_unreachable("Unknown arithmetic operator");
break;
case InstArithmetic::Add:
_mov(T, Src0);
_add(T, Src1);
_mov(Dest, T);
break;
case InstArithmetic::And:
_mov(T, Src0);
_and(T, Src1);
_mov(Dest, T);
break;
case InstArithmetic::Or:
_mov(T, Src0);
_or(T, Src1);
_mov(Dest, T);
break;
case InstArithmetic::Xor:
_mov(T, Src0);
_xor(T, Src1);
_mov(Dest, T);
break;
case InstArithmetic::Sub:
_mov(T, Src0);
_sub(T, Src1);
_mov(Dest, T);
break;
case InstArithmetic::Mul:
// TODO: Optimize for llvm::isa<Constant>(Src1)
// TODO: Strength-reduce multiplications by a constant,
// particularly -1 and powers of 2. Advanced: use lea to
// multiply by 3, 5, 9.
//
// The 8-bit version of imul only allows the form "imul r/m8"
// where T must be in eax.
if (isByteSizedArithType(Dest->getType())) {
_mov(T, Src0, RegX8632::Reg_eax);
Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
} else {
_mov(T, Src0);
}
_imul(T, Src1);
_mov(Dest, T);
break;
case InstArithmetic::Shl:
_mov(T, Src0);
if (!llvm::isa<Constant>(Src1))
Src1 = legalizeToVar(Src1, RegX8632::Reg_ecx);
_shl(T, Src1);
_mov(Dest, T);
break;
case InstArithmetic::Lshr:
_mov(T, Src0);
if (!llvm::isa<Constant>(Src1))
Src1 = legalizeToVar(Src1, RegX8632::Reg_ecx);
_shr(T, Src1);
_mov(Dest, T);
break;
case InstArithmetic::Ashr:
_mov(T, Src0);
if (!llvm::isa<Constant>(Src1))
Src1 = legalizeToVar(Src1, RegX8632::Reg_ecx);
_sar(T, Src1);
_mov(Dest, T);
break;
case InstArithmetic::Udiv:
// div and idiv are among the few arithmetic operators that do not
// allow an immediate as the operand.
Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
if (isByteSizedArithType(Dest->getType())) {
Variable *T_ah = nullptr;
Constant *Zero = Ctx->getConstantZero(IceType_i8);
_mov(T, Src0, RegX8632::Reg_eax);
_mov(T_ah, Zero, RegX8632::Reg_ah);
_div(T, Src1, T_ah);
_mov(Dest, T);
} else {
Constant *Zero = Ctx->getConstantZero(IceType_i32);
_mov(T, Src0, RegX8632::Reg_eax);
_mov(T_edx, Zero, RegX8632::Reg_edx);
_div(T, Src1, T_edx);
_mov(Dest, T);
}
break;
case InstArithmetic::Sdiv:
Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
if (isByteSizedArithType(Dest->getType())) {
_mov(T, Src0, RegX8632::Reg_eax);
_cbwdq(T, T);
_idiv(T, Src1, T);
_mov(Dest, T);
} else {
T_edx = makeReg(IceType_i32, RegX8632::Reg_edx);
_mov(T, Src0, RegX8632::Reg_eax);
_cbwdq(T_edx, T);
_idiv(T, Src1, T_edx);
_mov(Dest, T);
}
break;
case InstArithmetic::Urem:
Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
if (isByteSizedArithType(Dest->getType())) {
Variable *T_ah = nullptr;
Constant *Zero = Ctx->getConstantZero(IceType_i8);
_mov(T, Src0, RegX8632::Reg_eax);
_mov(T_ah, Zero, RegX8632::Reg_ah);
_div(T_ah, Src1, T);
_mov(Dest, T_ah);
} else {
Constant *Zero = Ctx->getConstantZero(IceType_i32);
_mov(T_edx, Zero, RegX8632::Reg_edx);
_mov(T, Src0, RegX8632::Reg_eax);
_div(T_edx, Src1, T);
_mov(Dest, T_edx);
}
break;
case InstArithmetic::Srem:
Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
if (isByteSizedArithType(Dest->getType())) {
Variable *T_ah = makeReg(IceType_i8, RegX8632::Reg_ah);
_mov(T, Src0, RegX8632::Reg_eax);
_cbwdq(T, T);
Context.insert(InstFakeDef::create(Func, T_ah));
_idiv(T_ah, Src1, T);
_mov(Dest, T_ah);
} else {
T_edx = makeReg(IceType_i32, RegX8632::Reg_edx);
_mov(T, Src0, RegX8632::Reg_eax);
_cbwdq(T_edx, T);
_idiv(T_edx, Src1, T);
_mov(Dest, T_edx);
}
break;
case InstArithmetic::Fadd:
_mov(T, Src0);
_addss(T, Src1);
_mov(Dest, T);
break;
case InstArithmetic::Fsub:
_mov(T, Src0);
_subss(T, Src1);
_mov(Dest, T);
break;
case InstArithmetic::Fmul:
_mov(T, Src0);
_mulss(T, Src1);
_mov(Dest, T);
break;
case InstArithmetic::Fdiv:
_mov(T, Src0);
_divss(T, Src1);
_mov(Dest, T);
break;
case InstArithmetic::Frem: {
const SizeT MaxSrcs = 2;
Type Ty = Dest->getType();
InstCall *Call =
makeHelperCall(isFloat32Asserting32Or64(Ty) ? H_frem_f32 : H_frem_f64,
Dest, MaxSrcs);
Call->addArg(Src0);
Call->addArg(Src1);
return lowerCall(Call);
} break;
}
}
}
void TargetX8632::lowerAssign(const InstAssign *Inst) {
Variable *Dest = Inst->getDest();
Operand *Src0 = Inst->getSrc(0);
assert(Dest->getType() == Src0->getType());
if (Dest->getType() == IceType_i64) {
Src0 = legalize(Src0);
Operand *Src0Lo = loOperand(Src0);
Operand *Src0Hi = hiOperand(Src0);
Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
Variable *T_Lo = nullptr, *T_Hi = nullptr;
_mov(T_Lo, Src0Lo);
_mov(DestLo, T_Lo);
_mov(T_Hi, Src0Hi);
_mov(DestHi, T_Hi);
} else {
Operand *RI;
if (Dest->hasReg())
// If Dest already has a physical register, then legalize the
// Src operand into a Variable with the same register
// assignment. This is mostly a workaround for advanced phi
// lowering's ad-hoc register allocation which assumes no
// register allocation is needed when at least one of the
// operands is non-memory.
RI = legalize(Src0, Legal_Reg, Dest->getRegNum());
else
// If Dest could be a stack operand, then RI must be a physical
// register or a scalar integer immediate.
RI = legalize(Src0, Legal_Reg | Legal_Imm);
if (isVectorType(Dest->getType()))
_movp(Dest, RI);
else
_mov(Dest, RI);
}
}
void TargetX8632::lowerBr(const InstBr *Inst) {
if (Inst->isUnconditional()) {
_br(Inst->getTargetUnconditional());
return;
}
Operand *Cond = Inst->getCondition();
// Handle folding opportunities.
if (const class Inst *Producer = FoldingInfo.getProducerFor(Cond)) {
assert(Producer->isDeleted());
switch (BoolFolding::getProducerKind(Producer)) {
default:
break;
case BoolFolding::PK_Icmp32: {
// TODO(stichnot): Refactor similarities between this block and
// the corresponding code in lowerIcmp().
auto *Cmp = llvm::dyn_cast<InstIcmp>(Producer);
Operand *Src0 = Producer->getSrc(0);
Operand *Src1 = legalize(Producer->getSrc(1));
Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1);
_cmp(Src0RM, Src1);
_br(getIcmp32Mapping(Cmp->getCondition()), Inst->getTargetTrue(),
Inst->getTargetFalse());
return;
}
}
}
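// No foldable producer was found, so fall back to the generic lowering:
// compare the condition operand against zero and branch on ne.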
Operand *Src0 = legalize(Cond, Legal_Reg | Legal_Mem);
Constant *Zero = Ctx->getConstantZero(IceType_i32);
_cmp(Src0, Zero);
_br(CondX86::Br_ne, Inst->getTargetTrue(), Inst->getTargetFalse());
}
void TargetX8632::lowerCall(const InstCall *Instr) {
// x86-32 calling convention:
//
// * At the point before the call, the stack must be aligned to 16
// bytes.
//
// * The first four arguments of vector type, regardless of their
// position relative to the other arguments in the argument list, are
// placed in registers xmm0 - xmm3.
//
// * Other arguments are pushed onto the stack in right-to-left order,
// such that the left-most argument ends up on the top of the stack at
// the lowest memory address.
//
// * Stack arguments of vector type are aligned to start at the next
// highest multiple of 16 bytes. Other stack arguments are aligned to
// 4 bytes.
//
// This is intended to match the section "IA-32 Function Calling
// Convention" of the document "OS X ABI Function Call Guide" by
// Apple.
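// For example (illustrative): for a call f(i32 a, v4f32 b, i32 c), b is
// passed in xmm0, while a and c are stored to [esp+0] and [esp+4]
// respectively before the call.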
NeedsStackAlignment = true;
typedef std::vector<Operand *> OperandList;
OperandList XmmArgs;
OperandList StackArgs, StackArgLocations;
uint32_t ParameterAreaSizeBytes = 0;
// Classify each argument operand according to the location where the
// argument is passed.
for (SizeT i = 0, NumArgs = Instr->getNumArgs(); i < NumArgs; ++i) {
Operand *Arg = Instr->getArg(i);
Type Ty = Arg->getType();
// The PNaCl ABI requires the width of arguments to be at least 32 bits.
assert(typeWidthInBytes(Ty) >= 4);
if (isVectorType(Ty) && XmmArgs.size() < X86_MAX_XMM_ARGS) {
XmmArgs.push_back(Arg);
} else {
StackArgs.push_back(Arg);
if (isVectorType(Arg->getType())) {
ParameterAreaSizeBytes = applyStackAlignment(ParameterAreaSizeBytes);
}
Variable *esp = Func->getTarget()->getPhysicalRegister(RegX8632::Reg_esp);
Constant *Loc = Ctx->getConstantInt32(ParameterAreaSizeBytes);
StackArgLocations.push_back(OperandX8632Mem::create(Func, Ty, esp, Loc));
ParameterAreaSizeBytes += typeWidthInBytesOnStack(Arg->getType());
}
}
// Adjust the parameter area so that the stack is aligned. It is
// assumed that the stack is already aligned at the start of the
// calling sequence.
ParameterAreaSizeBytes = applyStackAlignment(ParameterAreaSizeBytes);
// Subtract the appropriate amount for the argument area. This also
// takes care of setting the stack adjustment during emission.
//
// TODO: If for some reason the call instruction gets dead-code
// eliminated after lowering, we would need to ensure that the
// pre-call and the post-call esp adjustment get eliminated as well.
if (ParameterAreaSizeBytes) {
_adjust_stack(ParameterAreaSizeBytes);
}
// Copy arguments that are passed on the stack to the appropriate
// stack locations.
for (SizeT i = 0, e = StackArgs.size(); i < e; ++i) {
lowerStore(InstStore::create(Func, StackArgs[i], StackArgLocations[i]));
}
// Copy arguments to be passed in registers to the appropriate
// registers.
// TODO: Investigate the impact of lowering arguments passed in
// registers after lowering stack arguments as opposed to the other
// way around. Lowering register arguments after stack arguments may
// reduce register pressure. On the other hand, lowering register
// arguments first (before stack arguments) may result in more compact
// code, as the memory operand displacements may end up being smaller
// before any stack adjustment is done.
for (SizeT i = 0, NumXmmArgs = XmmArgs.size(); i < NumXmmArgs; ++i) {
Variable *Reg = legalizeToVar(XmmArgs[i], RegX8632::Reg_xmm0 + i);
// Generate a FakeUse of register arguments so that they do not get
// dead code eliminated as a result of the FakeKill of scratch
// registers after the call.
Context.insert(InstFakeUse::create(Func, Reg));
}
// Generate the call instruction. Assign its result to a temporary
// with high register allocation weight.
Variable *Dest = Instr->getDest();
// ReturnReg doubles as ReturnRegLo as necessary.
Variable *ReturnReg = nullptr;
Variable *ReturnRegHi = nullptr;
if (Dest) {
switch (Dest->getType()) {
case IceType_NUM:
llvm_unreachable("Invalid Call dest type");
break;
case IceType_void:
break;
case IceType_i1:
case IceType_i8:
case IceType_i16:
case IceType_i32:
ReturnReg = makeReg(Dest->getType(), RegX8632::Reg_eax);
break;
case IceType_i64:
ReturnReg = makeReg(IceType_i32, RegX8632::Reg_eax);
ReturnRegHi = makeReg(IceType_i32, RegX8632::Reg_edx);
break;
case IceType_f32:
case IceType_f64:
// Leave ReturnReg==ReturnRegHi==nullptr, and capture the result with
// the fstp instruction.
break;
case IceType_v4i1:
case IceType_v8i1:
case IceType_v16i1:
case IceType_v16i8:
case IceType_v8i16:
case IceType_v4i32:
case IceType_v4f32:
ReturnReg = makeReg(Dest->getType(), RegX8632::Reg_xmm0);
break;
}
}
Operand *CallTarget = legalize(Instr->getCallTarget());
const bool NeedSandboxing = Ctx->getFlags().getUseSandboxing();
if (NeedSandboxing) {
if (llvm::isa<Constant>(CallTarget)) {
_bundle_lock(InstBundleLock::Opt_AlignToEnd);
} else {
Variable *CallTargetVar = nullptr;
_mov(CallTargetVar, CallTarget);
_bundle_lock(InstBundleLock::Opt_AlignToEnd);
const SizeT BundleSize =
1 << Func->getAssembler<>()->getBundleAlignLog2Bytes();
_and(CallTargetVar, Ctx->getConstantInt32(~(BundleSize - 1)));
CallTarget = CallTargetVar;
}
}
Inst *NewCall = InstX8632Call::create(Func, ReturnReg, CallTarget);
Context.insert(NewCall);
if (NeedSandboxing)
_bundle_unlock();
if (ReturnRegHi)
Context.insert(InstFakeDef::create(Func, ReturnRegHi));
// Add the appropriate offset to esp. The call instruction takes care
// of resetting the stack offset during emission.
if (ParameterAreaSizeBytes) {
Variable *esp = Func->getTarget()->getPhysicalRegister(RegX8632::Reg_esp);
_add(esp, Ctx->getConstantInt32(ParameterAreaSizeBytes));
}
// Insert a register-kill pseudo instruction.
Context.insert(InstFakeKill::create(Func, NewCall));
// Generate a FakeUse to keep the call live if necessary.
if (Instr->hasSideEffects() && ReturnReg) {
Inst *FakeUse = InstFakeUse::create(Func, ReturnReg);
Context.insert(FakeUse);
}
if (!Dest)
return;
// Assign the result of the call to Dest.
if (ReturnReg) {
if (ReturnRegHi) {
assert(Dest->getType() == IceType_i64);
split64(Dest);
Variable *DestLo = Dest->getLo();
Variable *DestHi = Dest->getHi();
_mov(DestLo, ReturnReg);
_mov(DestHi, ReturnRegHi);
} else {
assert(Dest->getType() == IceType_i32 || Dest->getType() == IceType_i16 ||
Dest->getType() == IceType_i8 || Dest->getType() == IceType_i1 ||
isVectorType(Dest->getType()));
if (isVectorType(Dest->getType())) {
_movp(Dest, ReturnReg);
} else {
_mov(Dest, ReturnReg);
}
}
} else if (isScalarFloatingType(Dest->getType())) {
// Special treatment for an FP function which returns its result in
// st(0).
// If Dest ends up being a physical xmm register, the fstp emit code
// will route st(0) through a temporary stack slot.
_fstp(Dest);
// Create a fake use of Dest in case it actually isn't used,
// because st(0) still needs to be popped.
Context.insert(InstFakeUse::create(Func, Dest));
}
}
void TargetX8632::lowerCast(const InstCast *Inst) {
// a = cast(b) ==> t=cast(b); a=t; (link t->b, link a->t, no overlap)
InstCast::OpKind CastKind = Inst->getCastKind();
Variable *Dest = Inst->getDest();
switch (CastKind) {
default:
Func->setError("Cast type not supported");
return;
case InstCast::Sext: {
// Src0RM is the source operand legalized to physical register or memory,
// but not immediate, since the relevant x86 native instructions don't
// allow an immediate operand. If the operand is an immediate, we could
// consider computing the strength-reduced result at translation time,
// but we're unlikely to see something like that in the bitcode that
// the optimizer wouldn't have already taken care of.
Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
if (isVectorType(Dest->getType())) {
Type DestTy = Dest->getType();
if (DestTy == IceType_v16i8) {
// onemask = materialize(1,1,...); dst = (src & onemask) > 0
Variable *OneMask = makeVectorOfOnes(Dest->getType());
Variable *T = makeReg(DestTy);
_movp(T, Src0RM);
_pand(T, OneMask);
Variable *Zeros = makeVectorOfZeros(Dest->getType());
_pcmpgt(T, Zeros);
_movp(Dest, T);
} else {
// width = width(elty) - 1; dest = (src << width) >> width
SizeT ShiftAmount =
X86_CHAR_BIT * typeWidthInBytes(typeElementType(DestTy)) - 1;
Constant *ShiftConstant = Ctx->getConstantInt8(ShiftAmount);
Variable *T = makeReg(DestTy);
_movp(T, Src0RM);
_psll(T, ShiftConstant);
_psra(T, ShiftConstant);
_movp(Dest, T);
}
} else if (Dest->getType() == IceType_i64) {
// t1=movsx src; t2=t1; t2=sar t2, 31; dst.lo=t1; dst.hi=t2
Constant *Shift = Ctx->getConstantInt32(31);
Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
Variable *T_Lo = makeReg(DestLo->getType());
if (Src0RM->getType() == IceType_i32) {
_mov(T_Lo, Src0RM);
} else if (Src0RM->getType() == IceType_i1) {
_movzx(T_Lo, Src0RM);
_shl(T_Lo, Shift);
_sar(T_Lo, Shift);
} else {
_movsx(T_Lo, Src0RM);
}
_mov(DestLo, T_Lo);
Variable *T_Hi = nullptr;
_mov(T_Hi, T_Lo);
if (Src0RM->getType() != IceType_i1)
// For i1, the sar instruction is already done above.
_sar(T_Hi, Shift);
_mov(DestHi, T_Hi);
} else if (Src0RM->getType() == IceType_i1) {
// t1 = src
// shl t1, dst_bitwidth - 1
// sar t1, dst_bitwidth - 1
// dst = t1
size_t DestBits = X86_CHAR_BIT * typeWidthInBytes(Dest->getType());
Constant *ShiftAmount = Ctx->getConstantInt32(DestBits - 1);
Variable *T = makeReg(Dest->getType());
if (typeWidthInBytes(Dest->getType()) <=
typeWidthInBytes(Src0RM->getType())) {
_mov(T, Src0RM);
} else {
// Widen the source using movsx or movzx. (It doesn't matter
// which one, since the following shl/sar overwrite the bits.)
_movzx(T, Src0RM);
}
_shl(T, ShiftAmount);
_sar(T, ShiftAmount);
_mov(Dest, T);
} else {
// t1 = movsx src; dst = t1
Variable *T = makeReg(Dest->getType());
_movsx(T, Src0RM);
_mov(Dest, T);
}
break;
}
case InstCast::Zext: {
Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
if (isVectorType(Dest->getType())) {
// onemask = materialize(1,1,...); dest = onemask & src
Type DestTy = Dest->getType();
Variable *OneMask = makeVectorOfOnes(DestTy);
Variable *T = makeReg(DestTy);
_movp(T, Src0RM);
_pand(T, OneMask);
_movp(Dest, T);
} else if (Dest->getType() == IceType_i64) {
// t1=movzx src; dst.lo=t1; dst.hi=0
Constant *Zero = Ctx->getConstantZero(IceType_i32);
Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
Variable *Tmp = makeReg(DestLo->getType());
if (Src0RM->getType() == IceType_i32) {
_mov(Tmp, Src0RM);
} else {
_movzx(Tmp, Src0RM);
}
if (Src0RM->getType() == IceType_i1) {
Constant *One = Ctx->getConstantInt32(1);
_and(Tmp, One);
}
_mov(DestLo, Tmp);
_mov(DestHi, Zero);
} else if (Src0RM->getType() == IceType_i1) {
// t = Src0RM; t &= 1; Dest = t
Constant *One = Ctx->getConstantInt32(1);
Type DestTy = Dest->getType();
Variable *T;
if (DestTy == IceType_i8) {
T = makeReg(DestTy);
_mov(T, Src0RM);
} else {
// Use 32-bit for both 16-bit and 32-bit, since 32-bit ops are shorter.
T = makeReg(IceType_i32);
_movzx(T, Src0RM);
}
_and(T, One);
_mov(Dest, T);
} else {
// t1 = movzx src; dst = t1
Variable *T = makeReg(Dest->getType());
_movzx(T, Src0RM);
_mov(Dest, T);
}
break;
}
case InstCast::Trunc: {
if (isVectorType(Dest->getType())) {
// onemask = materialize(1,1,...); dst = src & onemask
Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
Type Src0Ty = Src0RM->getType();
Variable *OneMask = makeVectorOfOnes(Src0Ty);
Variable *T = makeReg(Dest->getType());
_movp(T, Src0RM);
_pand(T, OneMask);
_movp(Dest, T);
} else {
Operand *Src0 = Inst->getSrc(0);
if (Src0->getType() == IceType_i64)
Src0 = loOperand(Src0);
Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
// t1 = trunc Src0RM; Dest = t1
Variable *T = nullptr;
_mov(T, Src0RM);
if (Dest->getType() == IceType_i1)
_and(T, Ctx->getConstantInt1(1));
_mov(Dest, T);
}
break;
}
case InstCast::Fptrunc:
case InstCast::Fpext: {
Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
// t1 = cvt Src0RM; Dest = t1
Variable *T = makeReg(Dest->getType());
_cvt(T, Src0RM, InstX8632Cvt::Float2float);
_mov(Dest, T);
break;
}
case InstCast::Fptosi:
if (isVectorType(Dest->getType())) {
assert(Dest->getType() == IceType_v4i32 &&
Inst->getSrc(0)->getType() == IceType_v4f32);
Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
if (llvm::isa<OperandX8632Mem>(Src0RM))
Src0RM = legalizeToVar(Src0RM);
Variable *T = makeReg(Dest->getType());
_cvt(T, Src0RM, InstX8632Cvt::Tps2dq);
_movp(Dest, T);
} else if (Dest->getType() == IceType_i64) {
// Use a helper for converting floating-point values to 64-bit
// integers. SSE2 appears to have no way to convert from xmm
// registers to something like the edx:eax register pair, and
// gcc and clang both want to use x87 instructions complete with
// temporary manipulation of the status word. This helper is
// not needed for x86-64.
split64(Dest);
const SizeT MaxSrcs = 1;
Type SrcType = Inst->getSrc(0)->getType();
InstCall *Call =
makeHelperCall(isFloat32Asserting32Or64(SrcType) ? H_fptosi_f32_i64
: H_fptosi_f64_i64,
Dest, MaxSrcs);
Call->addArg(Inst->getSrc(0));
lowerCall(Call);
} else {
Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
// t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
Variable *T_1 = makeReg(IceType_i32);
Variable *T_2 = makeReg(Dest->getType());
_cvt(T_1, Src0RM, InstX8632Cvt::Tss2si);
_mov(T_2, T_1); // T_1 and T_2 may have different integer types
if (Dest->getType() == IceType_i1)
_and(T_2, Ctx->getConstantInt1(1));
_mov(Dest, T_2);
}
break;
case InstCast::Fptoui:
if (isVectorType(Dest->getType())) {
assert(Dest->getType() == IceType_v4i32 &&
Inst->getSrc(0)->getType() == IceType_v4f32);
const SizeT MaxSrcs = 1;
InstCall *Call = makeHelperCall(H_fptoui_4xi32_f32, Dest, MaxSrcs);
Call->addArg(Inst->getSrc(0));
lowerCall(Call);
} else if (Dest->getType() == IceType_i64 ||
Dest->getType() == IceType_i32) {
// Use a helper for both x86-32 and x86-64.
split64(Dest);
const SizeT MaxSrcs = 1;
Type DestType = Dest->getType();
Type SrcType = Inst->getSrc(0)->getType();
IceString TargetString;
if (isInt32Asserting32Or64(DestType)) {
TargetString = isFloat32Asserting32Or64(SrcType) ? H_fptoui_f32_i32
: H_fptoui_f64_i32;
} else {
TargetString = isFloat32Asserting32Or64(SrcType) ? H_fptoui_f32_i64
: H_fptoui_f64_i64;
}
InstCall *Call = makeHelperCall(TargetString, Dest, MaxSrcs);
Call->addArg(Inst->getSrc(0));
lowerCall(Call);
return;
} else {
Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
// t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
Variable *T_1 = makeReg(IceType_i32);
Variable *T_2 = makeReg(Dest->getType());
_cvt(T_1, Src0RM, InstX8632Cvt::Tss2si);
_mov(T_2, T_1); // T_1 and T_2 may have different integer types
if (Dest->getType() == IceType_i1)
_and(T_2, Ctx->getConstantInt1(1));
_mov(Dest, T_2);
}
break;
case InstCast::Sitofp:
if (isVectorType(Dest->getType())) {
assert(Dest->getType() == IceType_v4f32 &&
Inst->getSrc(0)->getType() == IceType_v4i32);
Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
if (llvm::isa<OperandX8632Mem>(Src0RM))
Src0RM = legalizeToVar(Src0RM);
Variable *T = makeReg(Dest->getType());
_cvt(T, Src0RM, InstX8632Cvt::Dq2ps);
_movp(Dest, T);
} else if (Inst->getSrc(0)->getType() == IceType_i64) {
// Use a helper for x86-32.
const SizeT MaxSrcs = 1;
Type DestType = Dest->getType();
InstCall *Call =
makeHelperCall(isFloat32Asserting32Or64(DestType) ? H_sitofp_i64_f32
: H_sitofp_i64_f64,
Dest, MaxSrcs);
// TODO: Call the correct compiler-rt helper function.
Call->addArg(Inst->getSrc(0));
lowerCall(Call);
return;
} else {
Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
// Sign-extend the operand.
// t1.i32 = movsx Src0RM; t2 = Cvt t1.i32; Dest = t2
Variable *T_1 = makeReg(IceType_i32);
Variable *T_2 = makeReg(Dest->getType());
if (Src0RM->getType() == IceType_i32)
_mov(T_1, Src0RM);
else
_movsx(T_1, Src0RM);
_cvt(T_2, T_1, InstX8632Cvt::Si2ss);
_mov(Dest, T_2);
}
break;
case InstCast::Uitofp: {
Operand *Src0 = Inst->getSrc(0);
if (isVectorType(Src0->getType())) {
assert(Dest->getType() == IceType_v4f32 &&
Src0->getType() == IceType_v4i32);
const SizeT MaxSrcs = 1;
InstCall *Call = makeHelperCall(H_uitofp_4xi32_4xf32, Dest, MaxSrcs);
Call->addArg(Src0);
lowerCall(Call);
} else if (Src0->getType() == IceType_i64 ||
Src0->getType() == IceType_i32) {
// Use a helper for i64 on both x86-32 and x86-64, and also a
// helper for i32 on x86-32.
const SizeT MaxSrcs = 1;
Type DestType = Dest->getType();
IceString TargetString;
if (isInt32Asserting32Or64(Src0->getType())) {
TargetString = isFloat32Asserting32Or64(DestType) ? H_uitofp_i32_f32
: H_uitofp_i32_f64;
} else {
TargetString = isFloat32Asserting32Or64(DestType) ? H_uitofp_i64_f32
: H_uitofp_i64_f64;
}
InstCall *Call = makeHelperCall(TargetString, Dest, MaxSrcs);
Call->addArg(Src0);
lowerCall(Call);
return;
} else {
Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
// Zero-extend the operand.
// t1.i32 = movzx Src0RM; t2 = Cvt t1.i32; Dest = t2
Variable *T_1 = makeReg(IceType_i32);
Variable *T_2 = makeReg(Dest->getType());
if (Src0RM->getType() == IceType_i32)
_mov(T_1, Src0RM);
else
_movzx(T_1, Src0RM);
_cvt(T_2, T_1, InstX8632Cvt::Si2ss);
_mov(Dest, T_2);
}
break;
}
case InstCast::Bitcast: {
Operand *Src0 = Inst->getSrc(0);
if (Dest->getType() == Src0->getType()) {
InstAssign *Assign = InstAssign::create(Func, Dest, Src0);
lowerAssign(Assign);
return;
}
switch (Dest->getType()) {
default:
llvm_unreachable("Unexpected Bitcast dest type");
case IceType_i8: {
assert(Src0->getType() == IceType_v8i1);
InstCall *Call = makeHelperCall(H_bitcast_8xi1_i8, Dest, 1);
Call->addArg(Src0);
lowerCall(Call);
} break;
case IceType_i16: {
assert(Src0->getType() == IceType_v16i1);
InstCall *Call = makeHelperCall(H_bitcast_16xi1_i16, Dest, 1);
Call->addArg(Src0);
lowerCall(Call);
} break;
case IceType_i32:
case IceType_f32: {
Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
Type DestType = Dest->getType();
Type SrcType = Src0RM->getType();
(void)DestType;
assert((DestType == IceType_i32 && SrcType == IceType_f32) ||
(DestType == IceType_f32 && SrcType == IceType_i32));
// a.i32 = bitcast b.f32 ==>
// t.f32 = b.f32
// s.f32 = spill t.f32
// a.i32 = s.f32
Variable *T = nullptr;
// TODO: Should be able to force a spill setup by calling legalize() with
// Legal_Mem and not Legal_Reg or Legal_Imm.
SpillVariable *SpillVar = Func->makeVariable<SpillVariable>(SrcType);
SpillVar->setLinkedTo(Dest);
Variable *Spill = SpillVar;
Spill->setWeight(RegWeight::Zero);
_mov(T, Src0RM);
_mov(Spill, T);
_mov(Dest, Spill);
} break;
case IceType_i64: {
Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
assert(Src0RM->getType() == IceType_f64);
// a.i64 = bitcast b.f64 ==>
// s.f64 = spill b.f64
// t_lo.i32 = lo(s.f64)
// a_lo.i32 = t_lo.i32
// t_hi.i32 = hi(s.f64)
// a_hi.i32 = t_hi.i32
Operand *SpillLo, *SpillHi;
if (auto *Src0Var = llvm::dyn_cast<Variable>(Src0RM)) {
SpillVariable *SpillVar =
Func->makeVariable<SpillVariable>(IceType_f64);
SpillVar->setLinkedTo(Src0Var);
Variable *Spill = SpillVar;
Spill->setWeight(RegWeight::Zero);
_movq(Spill, Src0RM);
SpillLo = VariableSplit::create(Func, Spill, VariableSplit::Low);
SpillHi = VariableSplit::create(Func, Spill, VariableSplit::High);
} else {
SpillLo = loOperand(Src0RM);
SpillHi = hiOperand(Src0RM);
}
Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
Variable *T_Lo = makeReg(IceType_i32);
Variable *T_Hi = makeReg(IceType_i32);
_mov(T_Lo, SpillLo);
_mov(DestLo, T_Lo);
_mov(T_Hi, SpillHi);
_mov(DestHi, T_Hi);
} break;
case IceType_f64: {
Src0 = legalize(Src0);
assert(Src0->getType() == IceType_i64);
if (llvm::isa<OperandX8632Mem>(Src0)) {
Variable *T = Func->makeVariable(Dest->getType());
_movq(T, Src0);
_movq(Dest, T);
break;
}
// a.f64 = bitcast b.i64 ==>
// t_lo.i32 = b_lo.i32
// FakeDef(s.f64)
// lo(s.f64) = t_lo.i32
// t_hi.i32 = b_hi.i32
// hi(s.f64) = t_hi.i32
// a.f64 = s.f64
SpillVariable *SpillVar = Func->makeVariable<SpillVariable>(IceType_f64);
SpillVar->setLinkedTo(Dest);
Variable *Spill = SpillVar;
Spill->setWeight(RegWeight::Zero);
Variable *T_Lo = nullptr, *T_Hi = nullptr;
VariableSplit *SpillLo =
VariableSplit::create(Func, Spill, VariableSplit::Low);
VariableSplit *SpillHi =
VariableSplit::create(Func, Spill, VariableSplit::High);
_mov(T_Lo, loOperand(Src0));
// Technically, the Spill is defined after the _store happens, but
// SpillLo is considered a "use" of Spill, so define Spill before it
// is used.
Context.insert(InstFakeDef::create(Func, Spill));
_store(T_Lo, SpillLo);
_mov(T_Hi, hiOperand(Src0));
_store(T_Hi, SpillHi);
_movq(Dest, Spill);
} break;
case IceType_v8i1: {
assert(Src0->getType() == IceType_i8);
InstCall *Call = makeHelperCall(H_bitcast_i8_8xi1, Dest, 1);
Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
// Arguments to functions are required to be at least 32 bits wide.
lowerCast(InstCast::create(Func, InstCast::Zext, Src0AsI32, Src0));
Call->addArg(Src0AsI32);
lowerCall(Call);
} break;
case IceType_v16i1: {
assert(Src0->getType() == IceType_i16);
InstCall *Call = makeHelperCall(H_bitcast_i16_16xi1, Dest, 1);
Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
// Arguments to functions are required to be at least 32 bits wide.
lowerCast(InstCast::create(Func, InstCast::Zext, Src0AsI32, Src0));
Call->addArg(Src0AsI32);
lowerCall(Call);
} break;
case IceType_v8i16:
case IceType_v16i8:
case IceType_v4i32:
case IceType_v4f32: {
_movp(Dest, legalizeToVar(Src0));
} break;
}
break;
}
}
}
void TargetX8632::lowerExtractElement(const InstExtractElement *Inst) {
Operand *SourceVectNotLegalized = Inst->getSrc(0);
ConstantInteger32 *ElementIndex =
llvm::dyn_cast<ConstantInteger32>(Inst->getSrc(1));
// Only constant indices are allowed in PNaCl IR.
assert(ElementIndex);
unsigned Index = ElementIndex->getValue();
Type Ty = SourceVectNotLegalized->getType();
Type ElementTy = typeElementType(Ty);
Type InVectorElementTy = getInVectorElementType(Ty);
Variable *ExtractedElementR = makeReg(InVectorElementTy);
// TODO(wala): Determine the best lowering sequences for each type.
bool CanUsePextr =
Ty == IceType_v8i16 || Ty == IceType_v8i1 || InstructionSet >= SSE4_1;
if (CanUsePextr && Ty != IceType_v4f32) {
// Use pextrb, pextrw, or pextrd.
Constant *Mask = Ctx->getConstantInt32(Index);
Variable *SourceVectR = legalizeToVar(SourceVectNotLegalized);
_pextr(ExtractedElementR, SourceVectR, Mask);
} else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
// Use pshufd and movd/movss.
Variable *T = nullptr;
if (Index) {
// The shuffle only needs to occur if the element to be extracted
// is not at the lowest index.
Constant *Mask = Ctx->getConstantInt32(Index);
T = makeReg(Ty);
_pshufd(T, legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem), Mask);
} else {
T = legalizeToVar(SourceVectNotLegalized);
}
if (InVectorElementTy == IceType_i32) {
_movd(ExtractedElementR, T);
} else { // Ty == IceType_f32
// TODO(wala): _movss is only used here because _mov does not
// allow a vector source and a scalar destination. _mov should be
// able to be used here.
// _movss is a binary instruction, so the FakeDef is needed to
// keep the live range analysis consistent.
Context.insert(InstFakeDef::create(Func, ExtractedElementR));
_movss(ExtractedElementR, T);
}
} else {
assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
// Spill the value to a stack slot and do the extraction in memory.
//
// TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when
// support for legalizing to mem is implemented.
Variable *Slot = Func->makeVariable(Ty);
Slot->setWeight(RegWeight::Zero);
_movp(Slot, legalizeToVar(SourceVectNotLegalized));
// Compute the location of the element in memory.
unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
OperandX8632Mem *Loc =
getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
_mov(ExtractedElementR, Loc);
}
if (ElementTy == IceType_i1) {
// Truncate extracted integers to i1s if necessary.
Variable *T = makeReg(IceType_i1);
InstCast *Cast =
InstCast::create(Func, InstCast::Trunc, T, ExtractedElementR);
lowerCast(Cast);
ExtractedElementR = T;
}
// Copy the element to the destination.
Variable *Dest = Inst->getDest();
_mov(Dest, ExtractedElementR);
}
void TargetX8632::lowerFcmp(const InstFcmp *Inst) {
Operand *Src0 = Inst->getSrc(0);
Operand *Src1 = Inst->getSrc(1);
Variable *Dest = Inst->getDest();
if (isVectorType(Dest->getType())) {
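// Vector fcmp is lowered to a single cmpps using the predicate from
// TableFcmp, except for the True/False, One, and Ueq conditions, which
// get the special-case sequences below.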
InstFcmp::FCond Condition = Inst->getCondition();
size_t Index = static_cast<size_t>(Condition);
assert(Index < TableFcmpSize);
if (TableFcmp[Index].SwapVectorOperands) {
Operand *T = Src0;
Src0 = Src1;
Src1 = T;
}
Variable *T = nullptr;
if (Condition == InstFcmp::True) {
// makeVectorOfOnes() requires an integer vector type.
T = makeVectorOfMinusOnes(IceType_v4i32);
} else if (Condition == InstFcmp::False) {
T = makeVectorOfZeros(Dest->getType());
} else {
Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
if (llvm::isa<OperandX8632Mem>(Src1RM))
Src1RM = legalizeToVar(Src1RM);
switch (Condition) {
default: {
CondX86::CmppsCond Predicate = TableFcmp[Index].Predicate;
assert(Predicate != CondX86::Cmpps_Invalid);
T = makeReg(Src0RM->getType());
_movp(T, Src0RM);
_cmpps(T, Src1RM, Predicate);
} break;
case InstFcmp::One: {
// Check both unequal and ordered.
T = makeReg(Src0RM->getType());
Variable *T2 = makeReg(Src0RM->getType());
_movp(T, Src0RM);
_cmpps(T, Src1RM, CondX86::Cmpps_neq);
_movp(T2, Src0RM);
_cmpps(T2, Src1RM, CondX86::Cmpps_ord);
_pand(T, T2);
} break;
case InstFcmp::Ueq: {
// Check both equal or unordered.
T = makeReg(Src0RM->getType());
Variable *T2 = makeReg(Src0RM->getType());
_movp(T, Src0RM);
_cmpps(T, Src1RM, CondX86::Cmpps_eq);
_movp(T2, Src0RM);
_cmpps(T2, Src1RM, CondX86::Cmpps_unord);
_por(T, T2);
} break;
}
}
_movp(Dest, T);
eliminateNextVectorSextInstruction(Dest);
return;
}
// Lowering a = fcmp cond, b, c
// ucomiss b, c /* only if C1 != Br_None */
// /* but swap b,c order if SwapOperands==true */
// mov a, <default>
// j<C1> label /* only if C1 != Br_None */
// j<C2> label /* only if C2 != Br_None */
// FakeUse(a) /* only if C1 != Br_None */
// mov a, !<default> /* only if C1 != Br_None */
// label: /* only if C1 != Br_None */
//
// setcc lowering when C1 != Br_None && C2 == Br_None:
// ucomiss b, c /* but swap b,c order if SwapOperands==true */
// setcc a, C1
InstFcmp::FCond Condition = Inst->getCondition();
size_t Index = static_cast<size_t>(Condition);
assert(Index < TableFcmpSize);
if (TableFcmp[Index].SwapScalarOperands)
std::swap(Src0, Src1);
bool HasC1 = (TableFcmp[Index].C1 != CondX86::Br_None);
bool HasC2 = (TableFcmp[Index].C2 != CondX86::Br_None);
if (HasC1) {
Src0 = legalize(Src0);
Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
Variable *T = nullptr;
_mov(T, Src0);
_ucomiss(T, Src1RM);
if (!HasC2) {
assert(TableFcmp[Index].Default);
_setcc(Dest, TableFcmp[Index].C1);
return;
}
}
Constant *Default = Ctx->getConstantInt32(TableFcmp[Index].Default);
_mov(Dest, Default);
if (HasC1) {
InstX8632Label *Label = InstX8632Label::create(Func, this);
_br(TableFcmp[Index].C1, Label);
if (HasC2) {
_br(TableFcmp[Index].C2, Label);
}
Constant *NonDefault = Ctx->getConstantInt32(!TableFcmp[Index].Default);
_mov_nonkillable(Dest, NonDefault);
Context.insert(Label);
}
}
void TargetX8632::lowerIcmp(const InstIcmp *Inst) {
Operand *Src0 = legalize(Inst->getSrc(0));
Operand *Src1 = legalize(Inst->getSrc(1));
Variable *Dest = Inst->getDest();
if (isVectorType(Dest->getType())) {
Type Ty = Src0->getType();
// Promote i1 vectors to 128 bit integer vector types.
if (typeElementType(Ty) == IceType_i1) {
Type NewTy = IceType_NUM;
switch (Ty) {
default:
llvm_unreachable("unexpected type");
break;
case IceType_v4i1:
NewTy = IceType_v4i32;
break;
case IceType_v8i1:
NewTy = IceType_v8i16;
break;
case IceType_v16i1:
NewTy = IceType_v16i8;
break;
}
Variable *NewSrc0 = Func->makeVariable(NewTy);
Variable *NewSrc1 = Func->makeVariable(NewTy);
lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc0, Src0));
lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc1, Src1));
Src0 = NewSrc0;
Src1 = NewSrc1;
Ty = NewTy;
}
InstIcmp::ICond Condition = Inst->getCondition();
Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
// SSE2 only has signed comparison operations. Transform unsigned
// inputs in a manner that allows for the use of signed comparison
// operations by flipping the high order bits.
if (Condition == InstIcmp::Ugt || Condition == InstIcmp::Uge ||
Condition == InstIcmp::Ult || Condition == InstIcmp::Ule) {
Variable *T0 = makeReg(Ty);
Variable *T1 = makeReg(Ty);
Variable *HighOrderBits = makeVectorOfHighOrderBits(Ty);
_movp(T0, Src0RM);
_pxor(T0, HighOrderBits);
_movp(T1, Src1RM);
_pxor(T1, HighOrderBits);
Src0RM = T0;
Src1RM = T1;
}
Variable *T = makeReg(Ty);
switch (Condition) {
default:
llvm_unreachable("unexpected condition");
break;
case InstIcmp::Eq: {
if (llvm::isa<OperandX8632Mem>(Src1RM))
Src1RM = legalizeToVar(Src1RM);
_movp(T, Src0RM);
_pcmpeq(T, Src1RM);
} break;
case InstIcmp::Ne: {
if (llvm::isa<OperandX8632Mem>(Src1RM))
Src1RM = legalizeToVar(Src1RM);
_movp(T, Src0RM);
_pcmpeq(T, Src1RM);
Variable *MinusOne = makeVectorOfMinusOnes(Ty);
_pxor(T, MinusOne);
} break;
case InstIcmp::Ugt:
case InstIcmp::Sgt: {
if (llvm::isa<OperandX8632Mem>(Src1RM))
Src1RM = legalizeToVar(Src1RM);
_movp(T, Src0RM);
_pcmpgt(T, Src1RM);
} break;
case InstIcmp::Uge:
case InstIcmp::Sge: {
// !(Src1RM > Src0RM)
if (llvm::isa<OperandX8632Mem>(Src0RM))
Src0RM = legalizeToVar(Src0RM);
_movp(T, Src1RM);
_pcmpgt(T, Src0RM);
Variable *MinusOne = makeVectorOfMinusOnes(Ty);
_pxor(T, MinusOne);
} break;
case InstIcmp::Ult:
case InstIcmp::Slt: {
if (llvm::isa<OperandX8632Mem>(Src0RM))
Src0RM = legalizeToVar(Src0RM);
_movp(T, Src1RM);
_pcmpgt(T, Src0RM);
} break;
case InstIcmp::Ule:
case InstIcmp::Sle: {
// !(Src0RM > Src1RM)
if (llvm::isa<OperandX8632Mem>(Src1RM))
Src1RM = legalizeToVar(Src1RM);
_movp(T, Src0RM);
_pcmpgt(T, Src1RM);
Variable *MinusOne = makeVectorOfMinusOnes(Ty);
_pxor(T, MinusOne);
} break;
}
_movp(Dest, T);
eliminateNextVectorSextInstruction(Dest);
return;
}
// a=icmp cond, b, c ==> cmp b,c; a=1; br cond,L1; FakeUse(a); a=0; L1:
if (Src0->getType() == IceType_i64) {
InstIcmp::ICond Condition = Inst->getCondition();
size_t Index = static_cast<size_t>(Condition);
assert(Index < TableIcmp64Size);
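// Compare the high words first; if C1/C2 decide the result, branch
// directly to the true/false label, otherwise compare the low words
// and branch on C3.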
Operand *Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem);
Operand *Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem);
Operand *Src1LoRI = legalize(loOperand(Src1), Legal_Reg | Legal_Imm);
Operand *Src1HiRI = legalize(hiOperand(Src1), Legal_Reg | Legal_Imm);
Constant *Zero = Ctx->getConstantZero(IceType_i32);
Constant *One = Ctx->getConstantInt32(1);
InstX8632Label *LabelFalse = InstX8632Label::create(Func, this);
InstX8632Label *LabelTrue = InstX8632Label::create(Func, this);
_mov(Dest, One);
_cmp(Src0HiRM, Src1HiRI);
if (TableIcmp64[Index].C1 != CondX86::Br_None)
_br(TableIcmp64[Index].C1, LabelTrue);
if (TableIcmp64[Index].C2 != CondX86::Br_None)
_br(TableIcmp64[Index].C2, LabelFalse);
_cmp(Src0LoRM, Src1LoRI);
_br(TableIcmp64[Index].C3, LabelTrue);
Context.insert(LabelFalse);
_mov_nonkillable(Dest, Zero);
Context.insert(LabelTrue);
return;
}
// cmp b, c
Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1);
_cmp(Src0RM, Src1);
_setcc(Dest, getIcmp32Mapping(Inst->getCondition()));
}
void TargetX8632::lowerInsertElement(const InstInsertElement *Inst) {
Operand *SourceVectNotLegalized = Inst->getSrc(0);
Operand *ElementToInsertNotLegalized = Inst->getSrc(1);
ConstantInteger32 *ElementIndex =
llvm::dyn_cast<ConstantInteger32>(Inst->getSrc(2));
// Only constant indices are allowed in PNaCl IR.
assert(ElementIndex);
unsigned Index = ElementIndex->getValue();
assert(Index < typeNumElements(SourceVectNotLegalized->getType()));
Type Ty = SourceVectNotLegalized->getType();
Type ElementTy = typeElementType(Ty);
Type InVectorElementTy = getInVectorElementType(Ty);
if (ElementTy == IceType_i1) {
// Expand the element to the appropriate size for it to be inserted
// in the vector.
Variable *Expanded = Func->makeVariable(InVectorElementTy);
InstCast *Cast = InstCast::create(Func, InstCast::Zext, Expanded,
ElementToInsertNotLegalized);
lowerCast(Cast);
ElementToInsertNotLegalized = Expanded;
}
if (Ty == IceType_v8i16 || Ty == IceType_v8i1 || InstructionSet >= SSE4_1) {
// Use insertps, pinsrb, pinsrw, or pinsrd.
Operand *ElementRM =
legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem);
Operand *SourceVectRM =
legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
Variable *T = makeReg(Ty);
_movp(T, SourceVectRM);
if (Ty == IceType_v4f32)
_insertps(T, ElementRM, Ctx->getConstantInt32(Index << 4));
else
_pinsr(T, ElementRM, Ctx->getConstantInt32(Index));
_movp(Inst->getDest(), T);
} else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
// Use shufps or movss.
Variable *ElementR = nullptr;
Operand *SourceVectRM =
legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
if (InVectorElementTy == IceType_f32) {
// ElementR will be in an XMM register since it is floating point.
ElementR = legalizeToVar(ElementToInsertNotLegalized);
} else {
// Copy an integer to an XMM register.
Operand *T = legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem);
ElementR = makeReg(Ty);
_movd(ElementR, T);
}
if (Index == 0) {
Variable *T = makeReg(Ty);
_movp(T, SourceVectRM);
_movss(T, ElementR);
_movp(Inst->getDest(), T);
return;
}
// shufps treats the source and destination operands as vectors of
// four doublewords. The destination's two high doublewords are
// selected from the source operand and the two low doublewords are
// selected from (the original value of) the destination operand.
// An insertelement operation can be effected with a sequence of two
// shufps operations with appropriate masks. In all cases below,
// Element[0] is being inserted into SourceVectOperand. Indices are
// ordered from left to right.
//
// insertelement into index 1 (result is stored in ElementR):
// ElementR := ElementR[0, 0] SourceVectRM[0, 0]
// ElementR := ElementR[3, 0] SourceVectRM[2, 3]
//
// insertelement into index 2 (result is stored in T):
// T := SourceVectRM
// ElementR := ElementR[0, 0] T[0, 3]
// T := T[0, 1] ElementR[0, 3]
//
// insertelement into index 3 (result is stored in T):
// T := SourceVectRM
// ElementR := ElementR[0, 0] T[0, 2]
// T := T[0, 1] ElementR[3, 0]
const unsigned char Mask1[3] = {0, 192, 128};
const unsigned char Mask2[3] = {227, 196, 52};
Constant *Mask1Constant = Ctx->getConstantInt32(Mask1[Index - 1]);
Constant *Mask2Constant = Ctx->getConstantInt32(Mask2[Index - 1]);
if (Index == 1) {
_shufps(ElementR, SourceVectRM, Mask1Constant);
_shufps(ElementR, SourceVectRM, Mask2Constant);
_movp(Inst->getDest(), ElementR);
} else {
Variable *T = makeReg(Ty);
_movp(T, SourceVectRM);
_shufps(ElementR, T, Mask1Constant);
_shufps(T, ElementR, Mask2Constant);
_movp(Inst->getDest(), T);
}
} else {
assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
// Spill the value to a stack slot and perform the insertion in
// memory.
//
// TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when
// support for legalizing to mem is implemented.
Variable *Slot = Func->makeVariable(Ty);
Slot->setWeight(RegWeight::Zero);
_movp(Slot, legalizeToVar(SourceVectNotLegalized));
// Compute the location of the position to insert in memory.
unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
OperandX8632Mem *Loc =
getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
_store(legalizeToVar(ElementToInsertNotLegalized), Loc);
Variable *T = makeReg(Ty);
_movp(T, Slot);
_movp(Inst->getDest(), T);
}
}
void TargetX8632::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
switch (Intrinsics::IntrinsicID ID = Instr->getIntrinsicInfo().ID) {
case Intrinsics::AtomicCmpxchg: {
if (!Intrinsics::isMemoryOrderValid(
ID, getConstantMemoryOrder(Instr->getArg(3)),
getConstantMemoryOrder(Instr->getArg(4)))) {
Func->setError("Unexpected memory ordering for AtomicCmpxchg");
return;
}
Variable *DestPrev = Instr->getDest();
Operand *PtrToMem = Instr->getArg(0);
Operand *Expected = Instr->getArg(1);
Operand *Desired = Instr->getArg(2);
if (tryOptimizedCmpxchgCmpBr(DestPrev, PtrToMem, Expected, Desired))
return;
lowerAtomicCmpxchg(DestPrev, PtrToMem, Expected, Desired);
return;
}
case Intrinsics::AtomicFence:
if (!Intrinsics::isMemoryOrderValid(
ID, getConstantMemoryOrder(Instr->getArg(0)))) {
Func->setError("Unexpected memory ordering for AtomicFence");
return;
}
_mfence();
return;
case Intrinsics::AtomicFenceAll:
// NOTE: FenceAll should prevent any load/store from being moved
// across the fence (both atomic and non-atomic). The InstX8632Mfence
// instruction is currently marked coarsely as "HasSideEffects".
_mfence();
return;
case Intrinsics::AtomicIsLockFree: {
// X86 is always lock free for 8/16/32/64 bit accesses.
// TODO(jvoung): Since the result is constant when given a constant
// byte size, this opens up DCE opportunities.
Operand *ByteSize = Instr->getArg(0);
Variable *Dest = Instr->getDest();
if (ConstantInteger32 *CI = llvm::dyn_cast<ConstantInteger32>(ByteSize)) {
Constant *Result;
switch (CI->getValue()) {
default:
// Some x86-64 processors support the cmpxchg16b instruction, which
// can make 16-byte operations lock free (when used with the LOCK
// prefix). However, that's not supported in 32-bit mode, so just
// return 0 even for large sizes.
Result = Ctx->getConstantZero(IceType_i32);
break;
case 1:
case 2:
case 4:
case 8:
Result = Ctx->getConstantInt32(1);
break;
}
_mov(Dest, Result);
return;
}
// The PNaCl ABI requires the byte size to be a compile-time constant.
Func->setError("AtomicIsLockFree byte size should be compile-time const");
return;
}
case Intrinsics::AtomicLoad: {
// We require the memory address to be naturally aligned.
// Given that is the case, normal loads are atomic.
if (!Intrinsics::isMemoryOrderValid(
ID, getConstantMemoryOrder(Instr->getArg(1)))) {
Func->setError("Unexpected memory ordering for AtomicLoad");
return;
}
Variable *Dest = Instr->getDest();
if (Dest->getType() == IceType_i64) {
// Follow what GCC does and use a movq instead of what lowerLoad()
// normally does (split the load into two).
// Thus, this skips load/arithmetic op folding. Load/arithmetic folding
// can't happen anyway, since this is x86-32 and integer arithmetic only
// happens on 32-bit quantities.
Variable *T = makeReg(IceType_f64);
OperandX8632Mem *Addr = formMemoryOperand(Instr->getArg(0), IceType_f64);
_movq(T, Addr);
// Then cast the bits back out of the XMM register to the i64 Dest.
InstCast *Cast = InstCast::create(Func, InstCast::Bitcast, Dest, T);
lowerCast(Cast);
// Make sure that the atomic load isn't elided when unused.
Context.insert(InstFakeUse::create(Func, Dest->getLo()));
Context.insert(InstFakeUse::create(Func, Dest->getHi()));
return;
}
InstLoad *Load = InstLoad::create(Func, Dest, Instr->getArg(0));
lowerLoad(Load);
// Make sure the atomic load isn't elided when unused, by adding a FakeUse.
// Since lowerLoad may fuse the load w/ an arithmetic instruction,
// insert the FakeUse on the last-inserted instruction's dest.
Context.insert(
InstFakeUse::create(Func, Context.getLastInserted()->getDest()));
return;
}
case Intrinsics::AtomicRMW:
if (!Intrinsics::isMemoryOrderValid(
ID, getConstantMemoryOrder(Instr->getArg(3)))) {
Func->setError("Unexpected memory ordering for AtomicRMW");
return;
}
lowerAtomicRMW(Instr->getDest(),
static_cast<uint32_t>(llvm::cast<ConstantInteger32>(
Instr->getArg(0))->getValue()),
Instr->getArg(1), Instr->getArg(2));
return;
case Intrinsics::AtomicStore: {
if (!Intrinsics::isMemoryOrderValid(
ID, getConstantMemoryOrder(Instr->getArg(2)))) {
Func->setError("Unexpected memory ordering for AtomicStore");
return;
}
// We require the memory address to be naturally aligned.
// Given that is the case, normal stores are atomic.
// Add a fence after the store to make it visible.
Operand *Value = Instr->getArg(0);
Operand *Ptr = Instr->getArg(1);
if (Value->getType() == IceType_i64) {
// Use a movq instead of what lowerStore() normally does
// (split the store into two), following what GCC does.
// Cast the bits from the i64 into an xmm register first.
Variable *T = makeReg(IceType_f64);
InstCast *Cast = InstCast::create(Func, InstCast::Bitcast, T, Value);
lowerCast(Cast);
// Then store XMM w/ a movq.
OperandX8632Mem *Addr = formMemoryOperand(Ptr, IceType_f64);
_storeq(T, Addr);
_mfence();
return;
}
InstStore *Store = InstStore::create(Func, Value, Ptr);
lowerStore(Store);
_mfence();
return;
}
case Intrinsics::Bswap: {
Variable *Dest = Instr->getDest();
Operand *Val = Instr->getArg(0);
// In 32-bit mode, bswap only works on 32-bit arguments, and the
// argument must be a register. Use rotate left for 16-bit bswap.
if (Val->getType() == IceType_i64) {
Variable *T_Lo = legalizeToVar(loOperand(Val));
Variable *T_Hi = legalizeToVar(hiOperand(Val));
Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
_bswap(T_Lo);
_bswap(T_Hi);
_mov(DestLo, T_Hi);
_mov(DestHi, T_Lo);
} else if (Val->getType() == IceType_i32) {
Variable *T = legalizeToVar(Val);
_bswap(T);
_mov(Dest, T);
} else {
assert(Val->getType() == IceType_i16);
Val = legalize(Val);
Constant *Eight = Ctx->getConstantInt16(8);
Variable *T = nullptr;
_mov(T, Val);
_rol(T, Eight);
_mov(Dest, T);
}
return;
}
case Intrinsics::Ctpop: {
Variable *Dest = Instr->getDest();
Operand *Val = Instr->getArg(0);
InstCall *Call = makeHelperCall(isInt32Asserting32Or64(Val->getType())
? H_call_ctpop_i32
: H_call_ctpop_i64,
Dest, 1);
Call->addArg(Val);
lowerCall(Call);
// The popcount helpers always return 32-bit values, while the intrinsic's
// signature matches the native POPCNT instruction and fills a 64-bit reg
// (in 64-bit mode). Thus, clear the upper bits of the dest just in case
// the user doesn't do that in the IR. If the user does that in the IR,
// then this zeroing instruction is dead and gets optimized out.
if (Val->getType() == IceType_i64) {
Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
Constant *Zero = Ctx->getConstantZero(IceType_i32);
_mov(DestHi, Zero);
}
return;
}
case Intrinsics::Ctlz: {
// The "is zero undef" parameter is ignored and we always return
// a well-defined value.
Operand *Val = legalize(Instr->getArg(0));
Operand *FirstVal;
Operand *SecondVal = nullptr;
if (Val->getType() == IceType_i64) {
FirstVal = loOperand(Val);
SecondVal = hiOperand(Val);
} else {
FirstVal = Val;
}
const bool IsCttz = false;
lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
SecondVal);
return;
}
case Intrinsics::Cttz: {
// The "is zero undef" parameter is ignored and we always return
// a well-defined value.
Operand *Val = legalize(Instr->getArg(0));
Operand *FirstVal;
Operand *SecondVal = nullptr;
if (Val->getType() == IceType_i64) {
FirstVal = hiOperand(Val);
SecondVal = loOperand(Val);
} else {
FirstVal = Val;
}
const bool IsCttz = true;
lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
SecondVal);
return;
}
case Intrinsics::Fabs: {
Operand *Src = legalize(Instr->getArg(0));
Type Ty = Src->getType();
Variable *Dest = Instr->getDest();
Variable *T = makeVectorOfFabsMask(Ty);
// The pand instruction operates on an m128 memory operand, so if
// Src is an f32 or f64, we need to make sure it's in a register.
if (isVectorType(Ty)) {
if (llvm::isa<OperandX8632Mem>(Src))
Src = legalizeToVar(Src);
} else {
Src = legalizeToVar(Src);
}
_pand(T, Src);
if (isVectorType(Ty))
_movp(Dest, T);
else
_mov(Dest, T);
return;
}
case Intrinsics::Longjmp: {
InstCall *Call = makeHelperCall(H_call_longjmp, nullptr, 2);
Call->addArg(Instr->getArg(0));
Call->addArg(Instr->getArg(1));
lowerCall(Call);
return;
}
case Intrinsics::Memcpy: {
// In the future, we could potentially emit an inline memcpy/memset, etc.
// for intrinsic calls w/ a known length.
InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3);
Call->addArg(Instr->getArg(0));
Call->addArg(Instr->getArg(1));
Call->addArg(Instr->getArg(2));
lowerCall(Call);
return;
}
case Intrinsics::Memmove: {
InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3);
Call->addArg(Instr->getArg(0));
Call->addArg(Instr->getArg(1));
Call->addArg(Instr->getArg(2));
lowerCall(Call);
return;
}
case Intrinsics::Memset: {
// The value operand needs to be extended to a stack slot size
// because the PNaCl ABI requires arguments to be at least 32 bits
// wide.
Operand *ValOp = Instr->getArg(1);
assert(ValOp->getType() == IceType_i8);
Variable *ValExt = Func->makeVariable(stackSlotType());
lowerCast(InstCast::create(Func, InstCast::Zext, ValExt, ValOp));
InstCall *Call = makeHelperCall(H_call_memset, nullptr, 3);
Call->addArg(Instr->getArg(0));
Call->addArg(ValExt);
Call->addArg(Instr->getArg(2));
lowerCall(Call);
return;
}
case Intrinsics::NaClReadTP: {
if (Ctx->getFlags().getUseSandboxing()) {
Constant *Zero = Ctx->getConstantZero(IceType_i32);
Operand *Src =
OperandX8632Mem::create(Func, IceType_i32, nullptr, Zero, nullptr, 0,
OperandX8632Mem::SegReg_GS);
Variable *Dest = Instr->getDest();
Variable *T = nullptr;
_mov(T, Src);
_mov(Dest, T);
} else {
InstCall *Call = makeHelperCall(H_call_read_tp, Instr->getDest(), 0);
lowerCall(Call);
}
return;
}
case Intrinsics::Setjmp: {
InstCall *Call = makeHelperCall(H_call_setjmp, Instr->getDest(), 1);
Call->addArg(Instr->getArg(0));
lowerCall(Call);
return;
}
case Intrinsics::Sqrt: {
Operand *Src = legalize(Instr->getArg(0));
Variable *Dest = Instr->getDest();
Variable *T = makeReg(Dest->getType());
_sqrtss(T, Src);
_mov(Dest, T);
return;
}
case Intrinsics::Stacksave: {
Variable *esp = Func->getTarget()->getPhysicalRegister(RegX8632::Reg_esp);
Variable *Dest = Instr->getDest();
_mov(Dest, esp);
return;
}
case Intrinsics::Stackrestore: {
Variable *esp = Func->getTarget()->getPhysicalRegister(RegX8632::Reg_esp);
_mov_nonkillable(esp, Instr->getArg(0));
return;
}
case Intrinsics::Trap:
_ud2();
return;
case Intrinsics::UnknownIntrinsic:
Func->setError("Should not be lowering UnknownIntrinsic");
return;
}
return;
}
void TargetX8632::lowerAtomicCmpxchg(Variable *DestPrev, Operand *Ptr,
Operand *Expected, Operand *Desired) {
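// For i64:
//   edx:eax = Expected; ecx:ebx = Desired
//   lock cmpxchg8b [Ptr]
//   DestPrev = edx:eax
// For 8/16/32-bit types:
//   eax = Expected
//   lock cmpxchg [Ptr], <Desired in a register>
//   DestPrev = eax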
if (Expected->getType() == IceType_i64) {
// Reserve the pre-colored registers first, before adding any more
// infinite-weight variables from formMemoryOperand's legalization.
Variable *T_edx = makeReg(IceType_i32, RegX8632::Reg_edx);
Variable *T_eax = makeReg(IceType_i32, RegX8632::Reg_eax);
Variable *T_ecx = makeReg(IceType_i32, RegX8632::Reg_ecx);
Variable *T_ebx = makeReg(IceType_i32, RegX8632::Reg_ebx);
_mov(T_eax, loOperand(Expected));
_mov(T_edx, hiOperand(Expected));
_mov(T_ebx, loOperand(Desired));
_mov(T_ecx, hiOperand(Desired));
OperandX8632Mem *Addr = formMemoryOperand(Ptr, Expected->getType());
const bool Locked = true;
_cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
Variable *DestLo = llvm::cast<Variable>(loOperand(DestPrev));
Variable *DestHi = llvm::cast<Variable>(hiOperand(DestPrev));
_mov(DestLo, T_eax);
_mov(DestHi, T_edx);
return;
}
Variable *T_eax = makeReg(Expected->getType(), RegX8632::Reg_eax);
_mov(T_eax, Expected);
OperandX8632Mem *Addr = formMemoryOperand(Ptr, Expected->getType());
Variable *DesiredReg = legalizeToVar(Desired);
const bool Locked = true;
_cmpxchg(Addr, T_eax, DesiredReg, Locked);
_mov(DestPrev, T_eax);
}
bool TargetX8632::tryOptimizedCmpxchgCmpBr(Variable *Dest, Operand *PtrToMem,
Operand *Expected,
Operand *Desired) {
if (Ctx->getFlags().getOptLevel() == Opt_m1)
return false;
// Peek ahead a few instructions and see how Dest is used.
// It's very common to have:
//
// %x = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* ptr, i32 %expected, ...)
// [%y_phi = ...] // list of phi stores
// %p = icmp eq i32 %x, %expected
// br i1 %p, label %l1, label %l2
//
// which we can optimize into:
//
// %x = <cmpxchg code>
// [%y_phi = ...] // list of phi stores
// br eq, %l1, %l2
InstList::iterator I = Context.getCur();
// I is currently the InstIntrinsicCall. Peek past that.
// This assumes that the atomic cmpxchg has not been lowered yet,
// so that the instructions seen in the scan from "Cur" are simple.
assert(llvm::isa<InstIntrinsicCall>(*I));
Inst *NextInst = Context.getNextInst(I);
if (!NextInst)
return false;
// There might be phi assignments right before the compare+branch, since this
// could be a backward branch for a loop. This placement of assignments is
// determined by placePhiStores().
std::vector<InstAssign *> PhiAssigns;
while (InstAssign *PhiAssign = llvm::dyn_cast<InstAssign>(NextInst)) {
if (PhiAssign->getDest() == Dest)
return false;
PhiAssigns.push_back(PhiAssign);
NextInst = Context.getNextInst(I);
if (!NextInst)
return false;
}
if (InstIcmp *NextCmp = llvm::dyn_cast<InstIcmp>(NextInst)) {
if (!(NextCmp->getCondition() == InstIcmp::Eq &&
((NextCmp->getSrc(0) == Dest && NextCmp->getSrc(1) == Expected) ||
(NextCmp->getSrc(1) == Dest && NextCmp->getSrc(0) == Expected)))) {
return false;
}
NextInst = Context.getNextInst(I);
if (!NextInst)
return false;
if (InstBr *NextBr = llvm::dyn_cast<InstBr>(NextInst)) {
if (!NextBr->isUnconditional() &&
NextCmp->getDest() == NextBr->getCondition() &&
NextBr->isLastUse(NextCmp->getDest())) {
lowerAtomicCmpxchg(Dest, PtrToMem, Expected, Desired);
for (size_t i = 0; i < PhiAssigns.size(); ++i) {
// Lower the phi assignments now, before the branch (same placement
// as before).
InstAssign *PhiAssign = PhiAssigns[i];
PhiAssign->setDeleted();
lowerAssign(PhiAssign);
Context.advanceNext();
}
_br(CondX86::Br_e, NextBr->getTargetTrue(), NextBr->getTargetFalse());
// Skip over the old compare and branch, by deleting them.
NextCmp->setDeleted();
NextBr->setDeleted();
Context.advanceNext();
Context.advanceNext();
return true;
}
}
}
return false;
}
void TargetX8632::lowerAtomicRMW(Variable *Dest, uint32_t Operation,
Operand *Ptr, Operand *Val) {
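// For 8/16/32-bit add, sub, and exchange, a single locked xadd/xchg
// (plus a neg for sub) suffices. All other operations, and all i64
// operations, are expanded into a cmpxchg/cmpxchg8b loop by
// expandAtomicRMWAsCmpxchg().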
bool NeedsCmpxchg = false;
LowerBinOp Op_Lo = nullptr;
LowerBinOp Op_Hi = nullptr;
switch (Operation) {
default:
Func->setError("Unknown AtomicRMW operation");
return;
case Intrinsics::AtomicAdd: {
if (Dest->getType() == IceType_i64) {
// All the fall-through paths must set this to true; it is only used
// for asserting.
NeedsCmpxchg = true;
Op_Lo = &TargetX8632::_add;
Op_Hi = &TargetX8632::_adc;
break;
}
OperandX8632Mem *Addr = formMemoryOperand(Ptr, Dest->getType());
const bool Locked = true;
Variable *T = nullptr;
_mov(T, Val);
_xadd(Addr, T, Locked);
_mov(Dest, T);
return;
}
case Intrinsics::AtomicSub: {
if (Dest->getType() == IceType_i64) {
NeedsCmpxchg = true;
Op_Lo = &TargetX8632::_sub;
Op_Hi = &TargetX8632::_sbb;
break;
}
OperandX8632Mem *Addr = formMemoryOperand(Ptr, Dest->getType());
const bool Locked = true;
Variable *T = nullptr;
_mov(T, Val);
_neg(T);
_xadd(Addr, T, Locked);
_mov(Dest, T);
return;
}
case Intrinsics::AtomicOr:
// TODO(jvoung): If Dest is null or dead, then some of these
// operations do not need an "exchange", but just a locked op.
// That appears to be "worth" it for sub, or, and, and xor.
// xadd is probably fine vs lock add for add, and xchg is fine
// vs an atomic store.
NeedsCmpxchg = true;
Op_Lo = &TargetX8632::_or;
Op_Hi = &TargetX8632::_or;
break;
case Intrinsics::AtomicAnd:
NeedsCmpxchg = true;
Op_Lo = &TargetX8632::_and;
Op_Hi = &TargetX8632::_and;
break;
case Intrinsics::AtomicXor:
NeedsCmpxchg = true;
Op_Lo = &TargetX8632::_xor;
Op_Hi = &TargetX8632::_xor;
break;
case Intrinsics::AtomicExchange:
if (Dest->getType() == IceType_i64) {
NeedsCmpxchg = true;
// NeedsCmpxchg, but no real Op_Lo/Op_Hi need to be done. The values
// just need to be moved to the ecx and ebx registers.
Op_Lo = nullptr;
Op_Hi = nullptr;
break;
}
OperandX8632Mem *Addr = formMemoryOperand(Ptr, Dest->getType());
Variable *T = nullptr;
_mov(T, Val);
_xchg(Addr, T);
_mov(Dest, T);
return;
}
// Otherwise, we need a cmpxchg loop.
(void)NeedsCmpxchg;
assert(NeedsCmpxchg);
expandAtomicRMWAsCmpxchg(Op_Lo, Op_Hi, Dest, Ptr, Val);
}
void TargetX8632::expandAtomicRMWAsCmpxchg(LowerBinOp Op_Lo, LowerBinOp Op_Hi,
Variable *Dest, Operand *Ptr,
Operand *Val) {
// Expand a more complex RMW operation as a cmpxchg loop:
// For 64-bit:
// mov eax, [ptr]
// mov edx, [ptr + 4]
// .LABEL:
// mov ebx, eax
// <Op_Lo> ebx, <desired_adj_lo>
// mov ecx, edx
// <Op_Hi> ecx, <desired_adj_hi>
// lock cmpxchg8b [ptr]
// jne .LABEL
// mov <dest_lo>, eax
// mov <dest_hi>, edx
//
// For 32-bit:
// mov eax, [ptr]
// .LABEL:
// mov <reg>, eax
// op <reg>, [desired_adj]
// lock cmpxchg [ptr], <reg>
// jne .LABEL
// mov <dest>, eax
//
// If Op_{Lo,Hi} are nullptr, then just copy the value.
Val = legalize(Val);
Type Ty = Val->getType();
if (Ty == IceType_i64) {
Variable *T_edx = makeReg(IceType_i32, RegX8632::Reg_edx);
Variable *T_eax = makeReg(IceType_i32, RegX8632::Reg_eax);
OperandX8632Mem *Addr = formMemoryOperand(Ptr, Ty);
_mov(T_eax, loOperand(Addr));
_mov(T_edx, hiOperand(Addr));
Variable *T_ecx = makeReg(IceType_i32, RegX8632::Reg_ecx);
Variable *T_ebx = makeReg(IceType_i32, RegX8632::Reg_ebx);
InstX8632Label *Label = InstX8632Label::create(Func, this);
const bool IsXchg8b = Op_Lo == nullptr && Op_Hi == nullptr;
if (!IsXchg8b) {
Context.insert(Label);
_mov(T_ebx, T_eax);
(this->*Op_Lo)(T_ebx, loOperand(Val));
_mov(T_ecx, T_edx);
(this->*Op_Hi)(T_ecx, hiOperand(Val));
} else {
// This is for xchg, which doesn't need an actual Op_Lo/Op_Hi.
// It just needs the Val loaded into ebx and ecx.
// That can also be done before the loop.
_mov(T_ebx, loOperand(Val));
_mov(T_ecx, hiOperand(Val));
Context.insert(Label);
}
const bool Locked = true;
_cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
_br(CondX86::Br_ne, Label);
if (!IsXchg8b) {
// If Val is a variable, model the extended live range of Val through
// the end of the loop, since it will be re-used by the loop.
if (Variable *ValVar = llvm::dyn_cast<Variable>(Val)) {
Variable *ValLo = llvm::cast<Variable>(loOperand(ValVar));
Variable *ValHi = llvm::cast<Variable>(hiOperand(ValVar));
Context.insert(InstFakeUse::create(Func, ValLo));
Context.insert(InstFakeUse::create(Func, ValHi));
}
} else {
// For xchg, the loop is slightly smaller and ebx/ecx are used.
Context.insert(InstFakeUse::create(Func, T_ebx));
Context.insert(InstFakeUse::create(Func, T_ecx));
}
// The address base (if any) is also reused in the loop.
if (Variable *Base = Addr->getBase())
Context.insert(InstFakeUse::create(Func, Base));
Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
_mov(DestLo, T_eax);
_mov(DestHi, T_edx);
return;
}
OperandX8632Mem *Addr = formMemoryOperand(Ptr, Ty);
Variable *T_eax = makeReg(Ty, RegX8632::Reg_eax);
_mov(T_eax, Addr);
InstX8632Label *Label = InstX8632Label::create(Func, this);
Context.insert(Label);
// We want to pick a different register for T than eax, so don't use
// the _mov(T, T_eax) form with T == nullptr.
Variable *T = makeReg(Ty);
_mov(T, T_eax);
(this->*Op_Lo)(T, Val);
const bool Locked = true;
_cmpxchg(Addr, T_eax, T, Locked);
_br(CondX86::Br_ne, Label);
// If Val is a variable, model the extended live range of Val through
// the end of the loop, since it will be re-used by the loop.
if (Variable *ValVar = llvm::dyn_cast<Variable>(Val)) {
Context.insert(InstFakeUse::create(Func, ValVar));
}
// The address base (if any) is also reused in the loop.
if (Variable *Base = Addr->getBase())
Context.insert(InstFakeUse::create(Func, Base));
_mov(Dest, T_eax);
}
// Lowers count {trailing, leading} zeros intrinsic.
//
// We could do constant folding here, but that should have
// been done by the front-end/middle-end optimizations.
void TargetX8632::lowerCountZeros(bool Cttz, Type Ty, Variable *Dest,
Operand *FirstVal, Operand *SecondVal) {
// TODO(jvoung): Determine if the user CPU supports LZCNT (BMI).
// Then the instructions will handle the Val == 0 case much more simply
// and won't require conversion from bit position to number of zeros.
//
// Otherwise:
// bsr IF_NOT_ZERO, Val
// mov T_DEST, 63
// cmovne T_DEST, IF_NOT_ZERO
// xor T_DEST, 31
// mov DEST, T_DEST
//
// NOTE: T_DEST must be a register because cmov requires its dest to be a
// register. Also, bsf and bsr require their dest to be a register.
//
  // The xor T_DEST, 31 converts a bit position to # of leading zeroes.
// E.g., for 000... 00001100, bsr will say that the most significant bit
// set is at position 3, while the number of leading zeros is 28. Xor is
// like (31 - N) for N <= 31, and converts 63 to 32 (for the all-zeros case).
//
  // Similar for 64-bit, but start with speculating that the upper 32 bits
// are all zero, and compute the result for that case (checking the lower
// 32 bits). Then actually compute the result for the upper bits and
// cmov in the result from the lower computation if the earlier speculation
// was correct.
//
  // Cttz is similar, but uses bsf instead, doesn't require the xor
  // bit-position conversion, and reverses the speculation.
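  //
  // Worked example for 32-bit ctlz (illustrative): for Val = 0x00000008,
  // bsr produces 3, cmovne keeps 3, and 3 ^ 31 == 28 leading zeros. For
  // Val == 0, bsr leaves its dest undefined and sets ZF, so the cmovne is
  // not taken, T_DEST stays 63, and 63 ^ 31 == 32.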
assert(Ty == IceType_i32 || Ty == IceType_i64);
Variable *T = makeReg(IceType_i32);
Operand *FirstValRM = legalize(FirstVal, Legal_Mem | Legal_Reg);
if (Cttz) {
_bsf(T, FirstValRM);
} else {
_bsr(T, FirstValRM);
}
Variable *T_Dest = makeReg(IceType_i32);
Constant *ThirtyTwo = Ctx->getConstantInt32(32);
Constant *ThirtyOne = Ctx->getConstantInt32(31);
if (Cttz) {
_mov(T_Dest, ThirtyTwo);
} else {
Constant *SixtyThree = Ctx->getConstantInt32(63);
_mov(T_Dest, SixtyThree);
}
_cmov(T_Dest, T, CondX86::Br_ne);
if (!Cttz) {
_xor(T_Dest, ThirtyOne);
}
if (Ty == IceType_i32) {
_mov(Dest, T_Dest);
return;
}
_add(T_Dest, ThirtyTwo);
Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
// Will be using "test" on this, so we need a registerized variable.
Variable *SecondVar = legalizeToVar(SecondVal);
Variable *T_Dest2 = makeReg(IceType_i32);
if (Cttz) {
_bsf(T_Dest2, SecondVar);
} else {
_bsr(T_Dest2, SecondVar);
_xor(T_Dest2, ThirtyOne);
}
_test(SecondVar, SecondVar);
_cmov(T_Dest2, T_Dest, CondX86::Br_e);
_mov(DestLo, T_Dest2);
_mov(DestHi, Ctx->getConstantZero(IceType_i32));
}
namespace {
bool isAdd(const Inst *Inst) {
if (const InstArithmetic *Arith =
llvm::dyn_cast_or_null<const InstArithmetic>(Inst)) {
return (Arith->getOp() == InstArithmetic::Add);
}
return false;
}
void dumpAddressOpt(const Cfg *Func, const Variable *Base,
const Variable *Index, uint16_t Shift, int32_t Offset,
const Inst *Reason) {
if (!ALLOW_DUMP)
return;
if (!Func->isVerbose(IceV_AddrOpt))
return;
OstreamLocker L(Func->getContext());
Ostream &Str = Func->getContext()->getStrDump();
Str << "Instruction: ";
Reason->dumpDecorated(Func);
Str << " results in Base=";
if (Base)
Base->dump(Func);
else
Str << "<null>";
Str << ", Index=";
if (Index)
Index->dump(Func);
else
Str << "<null>";
Str << ", Shift=" << Shift << ", Offset=" << Offset << "\n";
}
bool matchTransitiveAssign(const VariablesMetadata *VMetadata, Variable *&Var,
const Inst *&Reason) {
// Var originates from Var=SrcVar ==>
// set Var:=SrcVar
if (Var == nullptr)
return false;
if (const Inst *VarAssign = VMetadata->getSingleDefinition(Var)) {
assert(!VMetadata->isMultiDef(Var));
if (llvm::isa<InstAssign>(VarAssign)) {
Operand *SrcOp = VarAssign->getSrc(0);
assert(SrcOp);
if (Variable *SrcVar = llvm::dyn_cast<Variable>(SrcOp)) {
if (!VMetadata->isMultiDef(SrcVar) &&
// TODO: ensure SrcVar stays single-BB
true) {
Var = SrcVar;
Reason = VarAssign;
return true;
}
}
}
}
return false;
}
bool matchCombinedBaseIndex(const VariablesMetadata *VMetadata, Variable *&Base,
Variable *&Index, uint16_t &Shift,
const Inst *&Reason) {
// Index==nullptr && Base is Base=Var1+Var2 ==>
// set Base=Var1, Index=Var2, Shift=0
if (Base == nullptr)
return false;
if (Index != nullptr)
return false;
const Inst *BaseInst = VMetadata->getSingleDefinition(Base);
if (BaseInst == nullptr)
return false;
assert(!VMetadata->isMultiDef(Base));
if (BaseInst->getSrcSize() < 2)
return false;
if (Variable *Var1 = llvm::dyn_cast<Variable>(BaseInst->getSrc(0))) {
if (VMetadata->isMultiDef(Var1))
return false;
if (Variable *Var2 = llvm::dyn_cast<Variable>(BaseInst->getSrc(1))) {
if (VMetadata->isMultiDef(Var2))
return false;
if (isAdd(BaseInst) &&
// TODO: ensure Var1 and Var2 stay single-BB
true) {
Base = Var1;
Index = Var2;
Shift = 0; // should already have been 0
Reason = BaseInst;
return true;
}
}
}
return false;
}
bool matchShiftedIndex(const VariablesMetadata *VMetadata, Variable *&Index,
uint16_t &Shift, const Inst *&Reason) {
// Index is Index=Var*Const && log2(Const)+Shift<=3 ==>
// Index=Var, Shift+=log2(Const)
if (Index == nullptr)
return false;
const Inst *IndexInst = VMetadata->getSingleDefinition(Index);
if (IndexInst == nullptr)
return false;
assert(!VMetadata->isMultiDef(Index));
if (IndexInst->getSrcSize() < 2)
return false;
if (const InstArithmetic *ArithInst =
llvm::dyn_cast<InstArithmetic>(IndexInst)) {
if (Variable *Var = llvm::dyn_cast<Variable>(ArithInst->getSrc(0))) {
if (ConstantInteger32 *Const =
llvm::dyn_cast<ConstantInteger32>(ArithInst->getSrc(1))) {
if (ArithInst->getOp() == InstArithmetic::Mul &&
!VMetadata->isMultiDef(Var) && Const->getType() == IceType_i32) {
uint64_t Mult = Const->getValue();
uint32_t LogMult;
switch (Mult) {
case 1:
LogMult = 0;
break;
case 2:
LogMult = 1;
break;
case 4:
LogMult = 2;
break;
case 8:
LogMult = 3;
break;
default:
return false;
}
if (Shift + LogMult <= 3) {
Index = Var;
Shift += LogMult;
Reason = IndexInst;
return true;
}
}
}
}
}
return false;
}
bool matchOffsetBase(const VariablesMetadata *VMetadata, Variable *&Base,
int32_t &Offset, const Inst *&Reason) {
// Base is Base=Var+Const || Base is Base=Const+Var ==>
// set Base=Var, Offset+=Const
// Base is Base=Var-Const ==>
// set Base=Var, Offset-=Const
if (Base == nullptr)
return false;
const Inst *BaseInst = VMetadata->getSingleDefinition(Base);
if (BaseInst == nullptr)
return false;
assert(!VMetadata->isMultiDef(Base));
if (const InstArithmetic *ArithInst =
llvm::dyn_cast<const InstArithmetic>(BaseInst)) {
if (ArithInst->getOp() != InstArithmetic::Add &&
ArithInst->getOp() != InstArithmetic::Sub)
return false;
bool IsAdd = ArithInst->getOp() == InstArithmetic::Add;
Variable *Var = nullptr;
ConstantInteger32 *Const = nullptr;
if (Variable *VariableOperand =
llvm::dyn_cast<Variable>(ArithInst->getSrc(0))) {
Var = VariableOperand;
Const = llvm::dyn_cast<ConstantInteger32>(ArithInst->getSrc(1));
} else if (IsAdd) {
Const = llvm::dyn_cast<ConstantInteger32>(ArithInst->getSrc(0));
Var = llvm::dyn_cast<Variable>(ArithInst->getSrc(1));
}
if (Var == nullptr || Const == nullptr || VMetadata->isMultiDef(Var))
return false;
int32_t MoreOffset = IsAdd ? Const->getValue() : -Const->getValue();
if (Utils::WouldOverflowAdd(Offset, MoreOffset))
return false;
Base = Var;
Offset += MoreOffset;
Reason = BaseInst;
return true;
}
return false;
}
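// As an illustrative sketch of how the matchers above compose (assuming
// hypothetical temporaries t1..t3, each defined exactly once in the same
// basic block):
//   t1 = b + 16
//   t2 = i * 4
//   t3 = t1 + t2
//   load t3
// Repeated matching in computeAddressOpt() reduces this to Base=b, Index=i,
// Shift=2, Offset=16, i.e. a single [b + i*4 + 16] address operand.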
void computeAddressOpt(Cfg *Func, const Inst *Instr, Variable *&Base,
Variable *&Index, uint16_t &Shift, int32_t &Offset) {
Func->resetCurrentNode();
if (Func->isVerbose(IceV_AddrOpt)) {
OstreamLocker L(Func->getContext());
Ostream &Str = Func->getContext()->getStrDump();
Str << "\nStarting computeAddressOpt for instruction:\n ";
Instr->dumpDecorated(Func);
}
(void)Offset; // TODO: pattern-match for non-zero offsets.
if (Base == nullptr)
return;
// If the Base has more than one use or is live across multiple
// blocks, then don't go further. Alternatively (?), never consider
// a transformation that would change a variable that is currently
// *not* live across basic block boundaries into one that *is*.
if (Func->getVMetadata()->isMultiBlock(Base) /* || Base->getUseCount() > 1*/)
return;
const VariablesMetadata *VMetadata = Func->getVMetadata();
bool Continue = true;
while (Continue) {
const Inst *Reason = nullptr;
if (matchTransitiveAssign(VMetadata, Base, Reason) ||
matchTransitiveAssign(VMetadata, Index, Reason) ||
matchCombinedBaseIndex(VMetadata, Base, Index, Shift, Reason) ||
matchShiftedIndex(VMetadata, Index, Shift, Reason) ||
matchOffsetBase(VMetadata, Base, Offset, Reason)) {
dumpAddressOpt(Func, Base, Index, Shift, Offset, Reason);
} else {
Continue = false;
}
// Index is Index=Var<<Const && Const+Shift<=3 ==>
// Index=Var, Shift+=Const
// Index is Index=Const*Var && log2(Const)+Shift<=3 ==>
// Index=Var, Shift+=log2(Const)
// Index && Shift==0 && Base is Base=Var*Const && log2(Const)+Shift<=3 ==>
// swap(Index,Base)
// Similar for Base=Const*Var and Base=Var<<Const
// Index is Index=Var+Const ==>
// set Index=Var, Offset+=(Const<<Shift)
// Index is Index=Const+Var ==>
// set Index=Var, Offset+=(Const<<Shift)
// Index is Index=Var-Const ==>
// set Index=Var, Offset-=(Const<<Shift)
// TODO: consider overflow issues with respect to Offset.
// TODO: handle symbolic constants.
}
}
} // anonymous namespace
void TargetX8632::lowerLoad(const InstLoad *Load) {
// A Load instruction can be treated the same as an Assign
// instruction, after the source operand is transformed into an
// OperandX8632Mem operand. Note that the address mode
// optimization already creates an OperandX8632Mem operand, so it
// doesn't need another level of transformation.
Variable *DestLoad = Load->getDest();
Type Ty = DestLoad->getType();
Operand *Src0 = formMemoryOperand(Load->getSourceAddress(), Ty);
InstAssign *Assign = InstAssign::create(Func, DestLoad, Src0);
lowerAssign(Assign);
}
void TargetX8632::doAddressOptLoad() {
Inst *Inst = Context.getCur();
Variable *Dest = Inst->getDest();
Operand *Addr = Inst->getSrc(0);
Variable *Index = nullptr;
uint16_t Shift = 0;
int32_t Offset = 0; // TODO: make Constant
// Vanilla ICE load instructions should not use the segment registers,
// and computeAddressOpt only works at the level of Variables and Constants,
// not other OperandX8632Mem, so there should be no mention of segment
// registers there either.
const OperandX8632Mem::SegmentRegisters SegmentReg =
OperandX8632Mem::DefaultSegment;
Variable *Base = llvm::dyn_cast<Variable>(Addr);
computeAddressOpt(Func, Inst, Base, Index, Shift, Offset);
if (Base && Addr != Base) {
Inst->setDeleted();
Constant *OffsetOp = Ctx->getConstantInt32(Offset);
Addr = OperandX8632Mem::create(Func, Dest->getType(), Base, OffsetOp, Index,
Shift, SegmentReg);
Context.insert(InstLoad::create(Func, Dest, Addr));
}
}
void TargetX8632::randomlyInsertNop(float Probability) {
RandomNumberGeneratorWrapper RNG(Ctx->getRNG());
if (RNG.getTrueWithProbability(Probability)) {
_nop(RNG(X86_NUM_NOP_VARIANTS));
}
}
void TargetX8632::lowerPhi(const InstPhi * /*Inst*/) {
Func->setError("Phi found in regular instruction list");
}
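// Return values follow the usual x86-32 convention implemented below:
// i32 and narrower values are returned in eax, i64 in edx:eax, scalar
// floating point on the x87 stack (st(0), via fld), and vectors in xmm0.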
void TargetX8632::lowerRet(const InstRet *Inst) {
Variable *Reg = nullptr;
if (Inst->hasRetValue()) {
Operand *Src0 = legalize(Inst->getRetValue());
if (Src0->getType() == IceType_i64) {
Variable *eax = legalizeToVar(loOperand(Src0), RegX8632::Reg_eax);
Variable *edx = legalizeToVar(hiOperand(Src0), RegX8632::Reg_edx);
Reg = eax;
Context.insert(InstFakeUse::create(Func, edx));
} else if (isScalarFloatingType(Src0->getType())) {
_fld(Src0);
} else if (isVectorType(Src0->getType())) {
Reg = legalizeToVar(Src0, RegX8632::Reg_xmm0);
} else {
_mov(Reg, Src0, RegX8632::Reg_eax);
}
}
// Add a ret instruction even if sandboxing is enabled, because
// addEpilog explicitly looks for a ret instruction as a marker for
// where to insert the frame removal instructions.
_ret(Reg);
// Add a fake use of esp to make sure esp stays alive for the entire
// function. Otherwise post-call esp adjustments get dead-code
// eliminated. TODO: Are there more places where the fake use
// should be inserted? E.g. "void f(int n){while(1) g(n);}" may not
// have a ret instruction.
Variable *esp = Func->getTarget()->getPhysicalRegister(RegX8632::Reg_esp);
Context.insert(InstFakeUse::create(Func, esp));
}
void TargetX8632::lowerSelect(const InstSelect *Inst) {
Variable *Dest = Inst->getDest();
Type DestTy = Dest->getType();
Operand *SrcT = Inst->getTrueOperand();
Operand *SrcF = Inst->getFalseOperand();
Operand *Condition = Inst->getCondition();
if (isVectorType(DestTy)) {
Type SrcTy = SrcT->getType();
Variable *T = makeReg(SrcTy);
Operand *SrcTRM = legalize(SrcT, Legal_Reg | Legal_Mem);
Operand *SrcFRM = legalize(SrcF, Legal_Reg | Legal_Mem);
if (InstructionSet >= SSE4_1) {
// TODO(wala): If the condition operand is a constant, use blendps
// or pblendw.
//
// Use blendvps or pblendvb to implement select.
if (SrcTy == IceType_v4i1 || SrcTy == IceType_v4i32 ||
SrcTy == IceType_v4f32) {
Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
Variable *xmm0 = makeReg(IceType_v4i32, RegX8632::Reg_xmm0);
_movp(xmm0, ConditionRM);
_psll(xmm0, Ctx->getConstantInt8(31));
_movp(T, SrcFRM);
_blendvps(T, SrcTRM, xmm0);
_movp(Dest, T);
} else {
assert(typeNumElements(SrcTy) == 8 || typeNumElements(SrcTy) == 16);
Type SignExtTy = Condition->getType() == IceType_v8i1 ? IceType_v8i16
: IceType_v16i8;
Variable *xmm0 = makeReg(SignExtTy, RegX8632::Reg_xmm0);
lowerCast(InstCast::create(Func, InstCast::Sext, xmm0, Condition));
_movp(T, SrcFRM);
_pblendvb(T, SrcTRM, xmm0);
_movp(Dest, T);
}
return;
}
// Lower select without SSE4.1:
// a=d?b:c ==>
// if elementtype(d) != i1:
// d=sext(d);
// a=(b&d)|(c&~d);
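    // Per lane, d is all ones after the sext when the condition is true and
    // all zeros when it is false, so (b&d)|(c&~d) selects b or c respectively.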
Variable *T2 = makeReg(SrcTy);
// Sign extend the condition operand if applicable.
if (SrcTy == IceType_v4f32) {
// The sext operation takes only integer arguments.
Variable *T3 = Func->makeVariable(IceType_v4i32);
lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition));
_movp(T, T3);
} else if (typeElementType(SrcTy) != IceType_i1) {
lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition));
} else {
Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
_movp(T, ConditionRM);
}
_movp(T2, T);
_pand(T, SrcTRM);
_pandn(T2, SrcFRM);
_por(T, T2);
_movp(Dest, T);
return;
}
CondX86::BrCond Cond = CondX86::Br_ne;
Operand *CmpOpnd0 = nullptr;
Operand *CmpOpnd1 = nullptr;
// Handle folding opportunities.
if (const class Inst *Producer = FoldingInfo.getProducerFor(Condition)) {
assert(Producer->isDeleted());
switch (BoolFolding::getProducerKind(Producer)) {
default:
break;
case BoolFolding::PK_Icmp32: {
auto *Cmp = llvm::dyn_cast<InstIcmp>(Producer);
Cond = getIcmp32Mapping(Cmp->getCondition());
CmpOpnd1 = legalize(Producer->getSrc(1));
CmpOpnd0 = legalizeSrc0ForCmp(Producer->getSrc(0), CmpOpnd1);
} break;
}
}
if (CmpOpnd0 == nullptr) {
CmpOpnd0 = legalize(Condition, Legal_Reg | Legal_Mem);
CmpOpnd1 = Ctx->getConstantZero(IceType_i32);
}
assert(CmpOpnd0);
assert(CmpOpnd1);
_cmp(CmpOpnd0, CmpOpnd1);
if (typeWidthInBytes(DestTy) == 1 || isFloatingType(DestTy)) {
// The cmov instruction doesn't allow 8-bit or FP operands, so
// we need explicit control flow.
// d=cmp e,f; a=d?b:c ==> cmp e,f; a=b; jne L1; a=c; L1:
InstX8632Label *Label = InstX8632Label::create(Func, this);
SrcT = legalize(SrcT, Legal_Reg | Legal_Imm);
_mov(Dest, SrcT);
_br(Cond, Label);
SrcF = legalize(SrcF, Legal_Reg | Legal_Imm);
_mov_nonkillable(Dest, SrcF);
Context.insert(Label);
return;
}
// mov t, SrcF; cmov_cond t, SrcT; mov dest, t
// But if SrcT is immediate, we might be able to do better, as
// the cmov instruction doesn't allow an immediate operand:
// mov t, SrcT; cmov_!cond t, SrcF; mov dest, t
if (llvm::isa<Constant>(SrcT) && !llvm::isa<Constant>(SrcF)) {
std::swap(SrcT, SrcF);
Cond = InstX8632::getOppositeCondition(Cond);
}
if (DestTy == IceType_i64) {
// Set the low portion.
Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
Variable *TLo = nullptr;
Operand *SrcFLo = legalize(loOperand(SrcF));
_mov(TLo, SrcFLo);
Operand *SrcTLo = legalize(loOperand(SrcT), Legal_Reg | Legal_Mem);
_cmov(TLo, SrcTLo, Cond);
_mov(DestLo, TLo);
// Set the high portion.
Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
Variable *THi = nullptr;
Operand *SrcFHi = legalize(hiOperand(SrcF));
_mov(THi, SrcFHi);
Operand *SrcTHi = legalize(hiOperand(SrcT), Legal_Reg | Legal_Mem);
_cmov(THi, SrcTHi, Cond);
_mov(DestHi, THi);
return;
}
assert(DestTy == IceType_i16 || DestTy == IceType_i32);
Variable *T = nullptr;
SrcF = legalize(SrcF);
_mov(T, SrcF);
SrcT = legalize(SrcT, Legal_Reg | Legal_Mem);
_cmov(T, SrcT, Cond);
_mov(Dest, T);
}
void TargetX8632::lowerStore(const InstStore *Inst) {
Operand *Value = Inst->getData();
Operand *Addr = Inst->getAddr();
OperandX8632Mem *NewAddr = formMemoryOperand(Addr, Value->getType());
Type Ty = NewAddr->getType();
if (Ty == IceType_i64) {
Value = legalize(Value);
Operand *ValueHi = legalize(hiOperand(Value), Legal_Reg | Legal_Imm);
Operand *ValueLo = legalize(loOperand(Value), Legal_Reg | Legal_Imm);
_store(ValueHi, llvm::cast<OperandX8632Mem>(hiOperand(NewAddr)));
_store(ValueLo, llvm::cast<OperandX8632Mem>(loOperand(NewAddr)));
} else if (isVectorType(Ty)) {
_storep(legalizeToVar(Value), NewAddr);
} else {
Value = legalize(Value, Legal_Reg | Legal_Imm);
_store(Value, NewAddr);
}
}
void TargetX8632::doAddressOptStore() {
InstStore *Inst = llvm::cast<InstStore>(Context.getCur());
Operand *Data = Inst->getData();
Operand *Addr = Inst->getAddr();
Variable *Index = nullptr;
uint16_t Shift = 0;
int32_t Offset = 0; // TODO: make Constant
Variable *Base = llvm::dyn_cast<Variable>(Addr);
// Vanilla ICE store instructions should not use the segment registers,
// and computeAddressOpt only works at the level of Variables and Constants,
// not other OperandX8632Mem, so there should be no mention of segment
// registers there either.
const OperandX8632Mem::SegmentRegisters SegmentReg =
OperandX8632Mem::DefaultSegment;
computeAddressOpt(Func, Inst, Base, Index, Shift, Offset);
if (Base && Addr != Base) {
Inst->setDeleted();
Constant *OffsetOp = Ctx->getConstantInt32(Offset);
Addr = OperandX8632Mem::create(Func, Data->getType(), Base, OffsetOp, Index,
Shift, SegmentReg);
Context.insert(InstStore::create(Func, Data, Addr));
}
}
void TargetX8632::lowerSwitch(const InstSwitch *Inst) {
// This implements the most naive possible lowering.
  // cmp a,val[0]; je label[0]; cmp a,val[1]; je label[1]; ... jmp default
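  //
  // For i64, each case's comparison is split into lo/hi halves, roughly
  // (.Lskip_i is an illustrative name for the per-case local label):
  //   cmp lo(a), lo(val[i]); jne .Lskip_i
  //   cmp hi(a), hi(val[i]); je label[i]
  // .Lskip_i: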
Operand *Src0 = Inst->getComparison();
SizeT NumCases = Inst->getNumCases();
if (Src0->getType() == IceType_i64) {
Src0 = legalize(Src0); // get Base/Index into physical registers
Operand *Src0Lo = loOperand(Src0);
Operand *Src0Hi = hiOperand(Src0);
if (NumCases >= 2) {
Src0Lo = legalizeToVar(Src0Lo);
Src0Hi = legalizeToVar(Src0Hi);
} else {
Src0Lo = legalize(Src0Lo, Legal_Reg | Legal_Mem);
Src0Hi = legalize(Src0Hi, Legal_Reg | Legal_Mem);
}
for (SizeT I = 0; I < NumCases; ++I) {
Constant *ValueLo = Ctx->getConstantInt32(Inst->getValue(I));
Constant *ValueHi = Ctx->getConstantInt32(Inst->getValue(I) >> 32);
InstX8632Label *Label = InstX8632Label::create(Func, this);
_cmp(Src0Lo, ValueLo);
_br(CondX86::Br_ne, Label);
_cmp(Src0Hi, ValueHi);
_br(CondX86::Br_e, Inst->getLabel(I));
Context.insert(Label);
}
_br(Inst->getLabelDefault());
return;
}
  // OK, we'll be slightly less naive by forcing Src0 into a physical
  // register if there are 2 or more uses.
if (NumCases >= 2)
Src0 = legalizeToVar(Src0);
else
Src0 = legalize(Src0, Legal_Reg | Legal_Mem);
for (SizeT I = 0; I < NumCases; ++I) {
Constant *Value = Ctx->getConstantInt32(Inst->getValue(I));
_cmp(Src0, Value);
_br(CondX86::Br_e, Inst->getLabel(I));
}
_br(Inst->getLabelDefault());
}
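// Expand a vector arithmetic operation that has no direct instruction
// support into a per-element sequence: extract each pair of source
// elements, perform the scalar operation, and insert the result back into
// position.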
void TargetX8632::scalarizeArithmetic(InstArithmetic::OpKind Kind,
Variable *Dest, Operand *Src0,
Operand *Src1) {
assert(isVectorType(Dest->getType()));
Type Ty = Dest->getType();
Type ElementTy = typeElementType(Ty);
SizeT NumElements = typeNumElements(Ty);
Operand *T = Ctx->getConstantUndef(Ty);
for (SizeT I = 0; I < NumElements; ++I) {
Constant *Index = Ctx->getConstantInt32(I);
// Extract the next two inputs.
Variable *Op0 = Func->makeVariable(ElementTy);
lowerExtractElement(InstExtractElement::create(Func, Op0, Src0, Index));
Variable *Op1 = Func->makeVariable(ElementTy);
lowerExtractElement(InstExtractElement::create(Func, Op1, Src1, Index));
// Perform the arithmetic as a scalar operation.
Variable *Res = Func->makeVariable(ElementTy);
lowerArithmetic(InstArithmetic::create(Func, Kind, Res, Op0, Op1));
// Insert the result into position.
Variable *DestT = Func->makeVariable(Ty);
lowerInsertElement(InstInsertElement::create(Func, DestT, T, Res, Index));
T = DestT;
}
lowerAssign(InstAssign::create(Func, Dest, T));
}
// The following pattern occurs often in lowered C and C++ code:
//
// %cmp = fcmp/icmp pred <n x ty> %src0, %src1
// %cmp.ext = sext <n x i1> %cmp to <n x ty>
//
// We can eliminate the sext operation by copying the result of pcmpeqd,
// pcmpgtd, or cmpps (which produce sign extended results) to the result
// of the sext operation.
void TargetX8632::eliminateNextVectorSextInstruction(
Variable *SignExtendedResult) {
if (InstCast *NextCast =
llvm::dyn_cast_or_null<InstCast>(Context.getNextInst())) {
if (NextCast->getCastKind() == InstCast::Sext &&
NextCast->getSrc(0) == SignExtendedResult) {
NextCast->setDeleted();
_movp(NextCast->getDest(), legalizeToVar(SignExtendedResult));
// Skip over the instruction.
Context.advanceNext();
}
}
}
void TargetX8632::lowerUnreachable(const InstUnreachable * /*Inst*/) { _ud2(); }
// Turn an i64 Phi instruction into a pair of i32 Phi instructions, to
// preserve integrity of liveness analysis. Undef values are also
// turned into zeroes, since loOperand() and hiOperand() don't expect
// Undef input.
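//
// As an illustrative example, an i64 phi
//   a = phi i64 [ %x, %b1 ], [ undef, %b2 ]
// is replaced by
//   a.lo = phi i32 [ lo(%x), %b1 ], [ 0, %b2 ]
//   a.hi = phi i32 [ hi(%x), %b1 ], [ 0, %b2 ]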
void TargetX8632::prelowerPhis() {
CfgNode *Node = Context.getNode();
for (Inst &I : Node->getPhis()) {
auto Phi = llvm::dyn_cast<InstPhi>(&I);
if (Phi->isDeleted())
continue;
Variable *Dest = Phi->getDest();
if (Dest->getType() == IceType_i64) {
Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
InstPhi *PhiLo = InstPhi::create(Func, Phi->getSrcSize(), DestLo);
InstPhi *PhiHi = InstPhi::create(Func, Phi->getSrcSize(), DestHi);
for (SizeT I = 0; I < Phi->getSrcSize(); ++I) {
Operand *Src = Phi->getSrc(I);
CfgNode *Label = Phi->getLabel(I);
if (llvm::isa<ConstantUndef>(Src))
Src = Ctx->getConstantZero(Dest->getType());
PhiLo->addArgument(loOperand(Src), Label);
PhiHi->addArgument(hiOperand(Src), Label);
}
Node->getPhis().push_back(PhiLo);
Node->getPhis().push_back(PhiHi);
Phi->setDeleted();
}
}
}
namespace {
bool isMemoryOperand(const Operand *Opnd) {
if (const auto Var = llvm::dyn_cast<Variable>(Opnd))
return !Var->hasReg();
  // We treat undef values of scalar floating-point or vector type the
  // same as a memory operand, because materializing them (e.g. as a
  // vector of zeroes) requires a register.
if (llvm::isa<ConstantUndef>(Opnd))
return isScalarFloatingType(Opnd->getType()) ||
isVectorType(Opnd->getType());
if (llvm::isa<Constant>(Opnd))
return isScalarFloatingType(Opnd->getType());
return true;
}
} // end of anonymous namespace
// Lower the pre-ordered list of assignments into mov instructions.
// Also has to do some ad-hoc register allocation as necessary.
void TargetX8632::lowerPhiAssignments(CfgNode *Node,
const AssignList &Assignments) {
// Check that this is a properly initialized shell of a node.
assert(Node->getOutEdges().size() == 1);
assert(Node->getInsts().empty());
assert(Node->getPhis().empty());
CfgNode *Succ = Node->getOutEdges().front();
getContext().init(Node);
// Register set setup similar to regAlloc().
RegSetMask RegInclude = RegSet_All;
RegSetMask RegExclude = RegSet_StackPointer;
if (hasFramePointer())
RegExclude |= RegSet_FramePointer;
llvm::SmallBitVector Available = getRegisterSet(RegInclude, RegExclude);
bool NeedsRegs = false;
// Initialize the set of available registers to the set of what is
// available (not live) at the beginning of the successor block,
// minus all registers used as Dest operands in the Assignments. To
// do this, we start off assuming all registers are available, then
// iterate through the Assignments and remove Dest registers.
// During this iteration, we also determine whether we will actually
// need any extra registers for memory-to-memory copies. If so, we
// do the actual work of removing the live-in registers from the
// set. TODO(stichnot): This work is being repeated for every split
// edge to the successor, so consider updating LiveIn just once
// after all the edges are split.
for (const Inst &I : Assignments) {
Variable *Dest = I.getDest();
if (Dest->hasReg()) {
Available[Dest->getRegNum()] = false;
} else if (isMemoryOperand(I.getSrc(0))) {
NeedsRegs = true; // Src and Dest are both in memory
}
}
if (NeedsRegs) {
LivenessBV &LiveIn = Func->getLiveness()->getLiveIn(Succ);
for (int i = LiveIn.find_first(); i != -1; i = LiveIn.find_next(i)) {
Variable *Var = Func->getLiveness()->getVariable(i, Succ);
if (Var->hasReg())
Available[Var->getRegNum()] = false;
}
}
// Iterate backwards through the Assignments. After lowering each
// assignment, add Dest to the set of available registers, and
  // remove Src from the set of available registers. Iteration is done
  // backwards to enable incremental updates of the available register
  // set. The lowered instruction numbers may end up out of order, but
  // that can be worked around by renumbering the block afterwards if
  // necessary.
for (const Inst &I : reverse_range(Assignments)) {
Context.rewind();
auto Assign = llvm::dyn_cast<InstAssign>(&I);
Variable *Dest = Assign->getDest();
Operand *Src = Assign->getSrc(0);
Variable *SrcVar = llvm::dyn_cast<Variable>(Src);
// Use normal assignment lowering, except lower mem=mem specially
// so we can register-allocate at the same time.
if (!isMemoryOperand(Dest) || !isMemoryOperand(Src)) {
lowerAssign(Assign);
} else {
assert(Dest->getType() == Src->getType());
const llvm::SmallBitVector &RegsForType =
getRegisterSetForType(Dest->getType());
llvm::SmallBitVector AvailRegsForType = RegsForType & Available;
Variable *SpillLoc = nullptr;
Variable *Preg = nullptr;
// TODO(stichnot): Opportunity for register randomization.
int32_t RegNum = AvailRegsForType.find_first();
bool IsVector = isVectorType(Dest->getType());
bool NeedSpill = (RegNum == -1);
if (NeedSpill) {
// Pick some register to spill and update RegNum.
// TODO(stichnot): Opportunity for register randomization.
RegNum = RegsForType.find_first();
Preg = getPhysicalRegister(RegNum, Dest->getType());
SpillLoc = Func->makeVariable(Dest->getType());
// Create a fake def of the physical register to avoid
// liveness inconsistency problems during late-stage liveness
// analysis (e.g. asm-verbose mode).
Context.insert(InstFakeDef::create(Func, Preg));
if (IsVector)
_movp(SpillLoc, Preg);
else
_mov(SpillLoc, Preg);
}
assert(RegNum >= 0);
if (llvm::isa<ConstantUndef>(Src))
// Materialize an actual constant instead of undef. RegNum is
// passed in for vector types because undef vectors are
        // lowered to a vector register of zeroes.
Src =
legalize(Src, Legal_All, IsVector ? RegNum : Variable::NoRegister);
Variable *Tmp = makeReg(Dest->getType(), RegNum);
if (IsVector) {
_movp(Tmp, Src);
_movp(Dest, Tmp);
} else {
_mov(Tmp, Src);
_mov(Dest, Tmp);
}
if (NeedSpill) {
// Restore the spilled register.
if (IsVector)
_movp(Preg, SpillLoc);
else
_mov(Preg, SpillLoc);
// Create a fake use of the physical register to keep it live
// for late-stage liveness analysis (e.g. asm-verbose mode).
Context.insert(InstFakeUse::create(Func, Preg));
}
}
// Update register availability before moving to the previous
// instruction on the Assignments list.
if (Dest->hasReg())
Available[Dest->getRegNum()] = true;
if (SrcVar && SrcVar->hasReg())
Available[SrcVar->getRegNum()] = false;
}
// Add the terminator branch instruction to the end.
Context.setInsertPoint(Context.getEnd());
_br(Succ);
}
// There is no support for loading or emitting vector constants, so the
// vector values returned from makeVectorOfZeros, makeVectorOfOnes,
// etc. are initialized with register operations.
//
// TODO(wala): Add limited support for vector constants so that
// complex initialization in registers is unnecessary.
Variable *TargetX8632::makeVectorOfZeros(Type Ty, int32_t RegNum) {
Variable *Reg = makeReg(Ty, RegNum);
// Insert a FakeDef, since otherwise the live range of Reg might
// be overestimated.
Context.insert(InstFakeDef::create(Func, Reg));
_pxor(Reg, Reg);
return Reg;
}
Variable *TargetX8632::makeVectorOfMinusOnes(Type Ty, int32_t RegNum) {
Variable *MinusOnes = makeReg(Ty, RegNum);
// Insert a FakeDef so the live range of MinusOnes is not overestimated.
Context.insert(InstFakeDef::create(Func, MinusOnes));
_pcmpeq(MinusOnes, MinusOnes);
return MinusOnes;
}
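// A vector of ones is built by subtracting a vector of minus ones from a
// vector of zeros: 0 - (-1) == 1 in each lane.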
Variable *TargetX8632::makeVectorOfOnes(Type Ty, int32_t RegNum) {
Variable *Dest = makeVectorOfZeros(Ty, RegNum);
Variable *MinusOne = makeVectorOfMinusOnes(Ty);
_psub(Dest, MinusOne);
return Dest;
}
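// The mask of high-order bits is built by shifting a vector of ones left
// by (element width - 1) bits, e.g. 1 << 31 == 0x80000000 in each i32
// lane; v16i8 takes a different path below because SSE has no per-byte
// shift.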
Variable *TargetX8632::makeVectorOfHighOrderBits(Type Ty, int32_t RegNum) {
assert(Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v8i16 ||
Ty == IceType_v16i8);
if (Ty == IceType_v4f32 || Ty == IceType_v4i32 || Ty == IceType_v8i16) {
Variable *Reg = makeVectorOfOnes(Ty, RegNum);
SizeT Shift = typeWidthInBytes(typeElementType(Ty)) * X86_CHAR_BIT - 1;
_psll(Reg, Ctx->getConstantInt8(Shift));
return Reg;
} else {
// SSE has no left shift operation for vectors of 8 bit integers.
const uint32_t HIGH_ORDER_BITS_MASK = 0x80808080;
Constant *ConstantMask = Ctx->getConstantInt32(HIGH_ORDER_BITS_MASK);
Variable *Reg = makeReg(Ty, RegNum);
_movd(Reg, legalize(ConstantMask, Legal_Reg | Legal_Mem));
_pshufd(Reg, Reg, Ctx->getConstantZero(IceType_i8));
return Reg;
}
}
// Construct a mask in a register that can be and'ed with a
// floating-point value to mask off its sign bit. The value will be
// <4 x 0x7fffffff> for f32 and v4f32, and <2 x 0x7fffffffffffffff>
// for f64. Construct it as a vector of all-ones values (minus ones)
// logically shifted right by one bit. TODO(stichnot): Fix the wala TODO
// above, to represent vector constants in memory.
Variable *TargetX8632::makeVectorOfFabsMask(Type Ty, int32_t RegNum) {
Variable *Reg = makeVectorOfMinusOnes(Ty, RegNum);
_psrl(Reg, Ctx->getConstantInt8(1));
return Reg;
}
OperandX8632Mem *TargetX8632::getMemoryOperandForStackSlot(Type Ty,
Variable *Slot,
uint32_t Offset) {
  // Ensure that Slot is a stack slot.
assert(Slot->getWeight().isZero());
assert(Slot->getRegNum() == Variable::NoRegister);
  // Compute the location of Slot in memory.
// TODO(wala,stichnot): lea should not be required. The address of
// the stack slot is known at compile time (although not until after
// addProlog()).
const Type PointerType = IceType_i32;
Variable *Loc = makeReg(PointerType);
_lea(Loc, Slot);
Constant *ConstantOffset = Ctx->getConstantInt32(Offset);
return OperandX8632Mem::create(Func, Ty, Loc, ConstantOffset);
}
// Helper for legalize() to emit the right code to lower an operand to a
// register of the appropriate type.
Variable *TargetX8632::copyToReg(Operand *Src, int32_t RegNum) {
Type Ty = Src->getType();
Variable *Reg = makeReg(Ty, RegNum);
if (isVectorType(Ty)) {
_movp(Reg, Src);
} else {
_mov(Reg, Src);
}
return Reg;
}
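// Produce a legal version of From that satisfies the Allowed mask
// (register, memory, and/or immediate), copying it into a physical
// register (a specific one if RegNum is given) when necessary.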
Operand *TargetX8632::legalize(Operand *From, LegalMask Allowed,
int32_t RegNum) {
Type Ty = From->getType();
// Assert that a physical register is allowed. To date, all calls
// to legalize() allow a physical register. If a physical register
// needs to be explicitly disallowed, then new code will need to be
// written to force a spill.
assert(Allowed & Legal_Reg);
// If we're asking for a specific physical register, make sure we're
// not allowing any other operand kinds. (This could be future
// work, e.g. allow the shl shift amount to be either an immediate
// or in ecx.)
assert(RegNum == Variable::NoRegister || Allowed == Legal_Reg);
if (auto Mem = llvm::dyn_cast<OperandX8632Mem>(From)) {
// Before doing anything with a Mem operand, we need to ensure
// that the Base and Index components are in physical registers.
Variable *Base = Mem->getBase();
Variable *Index = Mem->getIndex();
Variable *RegBase = nullptr;
Variable *RegIndex = nullptr;
if (Base) {
RegBase = legalizeToVar(Base);
}
if (Index) {
RegIndex = legalizeToVar(Index);
}
if (Base != RegBase || Index != RegIndex) {
From =
OperandX8632Mem::create(Func, Ty, RegBase, Mem->getOffset(), RegIndex,
Mem->getShift(), Mem->getSegmentRegister());
}
if (!(Allowed & Legal_Mem)) {
From = copyToReg(From, RegNum);
}
return From;
}
if (llvm::isa<Constant>(From)) {
if (llvm::isa<ConstantUndef>(From)) {
// Lower undefs to zero. Another option is to lower undefs to an
// uninitialized register; however, using an uninitialized register
// results in less predictable code.
//
// If in the future the implementation is changed to lower undef
// values to uninitialized registers, a FakeDef will be needed:
// Context.insert(InstFakeDef::create(Func, Reg));
// This is in order to ensure that the live range of Reg is not
// overestimated. If the constant being lowered is a 64 bit value,
// then the result should be split and the lo and hi components will
// need to go in uninitialized registers.
if (isVectorType(Ty))
return makeVectorOfZeros(Ty, RegNum);
From = Ctx->getConstantZero(Ty);
}
// There should be no constants of vector type (other than undef).
assert(!isVectorType(Ty));
// Convert a scalar floating point constant into an explicit
// memory operand.
if (isScalarFloatingType(Ty)) {
Variable *Base = nullptr;
std::string Buffer;
llvm::raw_string_ostream StrBuf(Buffer);
llvm::cast<Constant>(From)->emitPoolLabel(StrBuf);
Constant *Offset = Ctx->getConstantSym(0, StrBuf.str(), true);
From = OperandX8632Mem::create(Func, Ty, Base, Offset);
}
bool NeedsReg = false;
if (!(Allowed & Legal_Imm) && !isScalarFloatingType(Ty))
// Immediate specifically not allowed
NeedsReg = true;
if (!(Allowed & Legal_Mem) && isScalarFloatingType(Ty))
// On x86, FP constants are lowered to mem operands.
NeedsReg = true;
if (NeedsReg) {
From = copyToReg(From, RegNum);
}
return From;
}
if (auto Var = llvm::dyn_cast<Variable>(From)) {
// Check if the variable is guaranteed a physical register. This
// can happen either when the variable is pre-colored or when it is
// assigned infinite weight.
bool MustHaveRegister = (Var->hasReg() || Var->getWeight().isInf());
// We need a new physical register for the operand if:
// Mem is not allowed and Var isn't guaranteed a physical
// register, or
// RegNum is required and Var->getRegNum() doesn't match.
if ((!(Allowed & Legal_Mem) && !MustHaveRegister) ||
(RegNum != Variable::NoRegister && RegNum != Var->getRegNum())) {
From = copyToReg(From, RegNum);
}
return From;
}
llvm_unreachable("Unhandled operand kind in legalize()");
return From;
}
// Provide a trivial wrapper to legalize() for this common usage.
Variable *TargetX8632::legalizeToVar(Operand *From, int32_t RegNum) {
return llvm::cast<Variable>(legalize(From, Legal_Reg, RegNum));
}
// For the cmp instruction, if Src1 is an immediate, or known to be a
// physical register, we can allow Src0 to be a memory operand.
// Otherwise, Src0 must be copied into a physical register.
// (Actually, either Src0 or Src1 can be chosen for the physical
// register, but unfortunately we have to commit to one or the other
// before register allocation.)
Operand *TargetX8632::legalizeSrc0ForCmp(Operand *Src0, Operand *Src1) {
bool IsSrc1ImmOrReg = false;
if (llvm::isa<Constant>(Src1)) {
IsSrc1ImmOrReg = true;
} else if (Variable *Var = llvm::dyn_cast<Variable>(Src1)) {
if (Var->hasReg())
IsSrc1ImmOrReg = true;
}
return legalize(Src0, IsSrc1ImmOrReg ? (Legal_Reg | Legal_Mem) : Legal_Reg);
}
OperandX8632Mem *TargetX8632::formMemoryOperand(Operand *Operand, Type Ty,
bool DoLegalize) {
OperandX8632Mem *Mem = llvm::dyn_cast<OperandX8632Mem>(Operand);
  // Address mode optimization may already have created an OperandX8632Mem,
  // in which case no further transformation is needed.
if (!Mem) {
Variable *Base = llvm::dyn_cast<Variable>(Operand);
Constant *Offset = llvm::dyn_cast<Constant>(Operand);
assert(Base || Offset);
if (Offset) {
// Make sure Offset is not undef.
Offset = llvm::cast<Constant>(legalize(Offset));
assert(llvm::isa<ConstantInteger32>(Offset) ||
llvm::isa<ConstantRelocatable>(Offset));
}
Mem = OperandX8632Mem::create(Func, Ty, Base, Offset);
}
return llvm::cast<OperandX8632Mem>(DoLegalize ? legalize(Mem) : Mem);
}
Variable *TargetX8632::makeReg(Type Type, int32_t RegNum) {
// There aren't any 64-bit integer registers for x86-32.
assert(Type != IceType_i64);
Variable *Reg = Func->makeVariable(Type);
if (RegNum == Variable::NoRegister)
Reg->setWeightInfinite();
else
Reg->setRegNum(RegNum);
return Reg;
}
void TargetX8632::postLower() {
if (Ctx->getFlags().getOptLevel() == Opt_m1)
return;
inferTwoAddress();
}
void TargetX8632::makeRandomRegisterPermutation(
llvm::SmallVectorImpl<int32_t> &Permutation,
const llvm::SmallBitVector &ExcludeRegisters) const {
// TODO(stichnot): Declaring Permutation this way loses type/size
// information. Fix this in conjunction with the caller-side TODO.
assert(Permutation.size() >= RegX8632::Reg_NUM);
// Expected upper bound on the number of registers in a single
// equivalence class. For x86-32, this would comprise the 8 XMM
// registers. This is for performance, not correctness.
static const unsigned MaxEquivalenceClassSize = 8;
typedef llvm::SmallVector<int32_t, MaxEquivalenceClassSize> RegisterList;
typedef std::map<uint32_t, RegisterList> EquivalenceClassMap;
EquivalenceClassMap EquivalenceClasses;
SizeT NumShuffled = 0, NumPreserved = 0;
// Build up the equivalence classes of registers by looking at the
// register properties as well as whether the registers should be
// explicitly excluded from shuffling.
#define X(val, encode, name, name16, name8, scratch, preserved, stackptr, \
frameptr, isI8, isInt, isFP) \
if (ExcludeRegisters[RegX8632::val]) { \
/* val stays the same in the resulting permutation. */ \
Permutation[RegX8632::val] = RegX8632::val; \
++NumPreserved; \
} else { \
const uint32_t Index = (scratch << 0) | (preserved << 1) | (isI8 << 2) | \
(isInt << 3) | (isFP << 4); \
/* val is assigned to an equivalence class based on its properties. */ \
EquivalenceClasses[Index].push_back(RegX8632::val); \
}
REGX8632_TABLE
#undef X
RandomNumberGeneratorWrapper RNG(Ctx->getRNG());
// Shuffle the resulting equivalence classes.
for (auto I : EquivalenceClasses) {
const RegisterList &List = I.second;
RegisterList Shuffled(List);
RandomShuffle(Shuffled.begin(), Shuffled.end(), RNG);
for (size_t SI = 0, SE = Shuffled.size(); SI < SE; ++SI) {
Permutation[List[SI]] = Shuffled[SI];
++NumShuffled;
}
}
assert(NumShuffled + NumPreserved == RegX8632::Reg_NUM);
if (Func->isVerbose(IceV_Random)) {
OstreamLocker L(Func->getContext());
Ostream &Str = Func->getContext()->getStrDump();
Str << "Register equivalence classes:\n";
for (auto I : EquivalenceClasses) {
Str << "{";
const RegisterList &List = I.second;
bool First = true;
for (int32_t Register : List) {
if (!First)
Str << " ";
First = false;
Str << getRegName(Register, IceType_i32);
}
Str << "}\n";
}
}
}
void TargetX8632::emit(const ConstantInteger32 *C) const {
if (!ALLOW_DUMP)
return;
Ostream &Str = Ctx->getStrEmit();
Str << getConstantPrefix() << C->getValue();
}
void TargetX8632::emit(const ConstantInteger64 *) const {
llvm::report_fatal_error("Not expecting to emit 64-bit integers");
}
void TargetX8632::emit(const ConstantFloat *C) const {
if (!ALLOW_DUMP)
return;
Ostream &Str = Ctx->getStrEmit();
C->emitPoolLabel(Str);
}
void TargetX8632::emit(const ConstantDouble *C) const {
if (!ALLOW_DUMP)
return;
Ostream &Str = Ctx->getStrEmit();
C->emitPoolLabel(Str);
}
void TargetX8632::emit(const ConstantUndef *) const {
llvm::report_fatal_error("undef value encountered by emitter.");
}
TargetDataX8632::TargetDataX8632(GlobalContext *Ctx)
: TargetDataLowering(Ctx) {}
void TargetDataX8632::lowerGlobal(const VariableDeclaration &Var) const {
// If external and not initialized, this must be a cross test.
// Don't generate a declaration for such cases.
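  //
  // For example (illustrative, assuming a hypothetical 4-byte internal
  // constant named "foo" with data sections enabled), the output looks
  // roughly like:
  //   .type foo,@object
  //   .section .rodata.foo,"a",@progbits
  //   foo:
  //   .byte ...
  //   .size foo, 4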
bool IsExternal = Var.isExternal() || Ctx->getFlags().getDisableInternal();
if (IsExternal && !Var.hasInitializer())
return;
Ostream &Str = Ctx->getStrEmit();
const VariableDeclaration::InitializerListType &Initializers =
Var.getInitializers();
bool HasNonzeroInitializer = Var.hasNonzeroInitializer();
bool IsConstant = Var.getIsConstant();
uint32_t Align = Var.getAlignment();
SizeT Size = Var.getNumBytes();
IceString MangledName = Var.mangleName(Ctx);
IceString SectionSuffix = "";
if (Ctx->getFlags().getDataSections())
SectionSuffix = "." + MangledName;
Str << "\t.type\t" << MangledName << ",@object\n";
if (IsConstant)
Str << "\t.section\t.rodata" << SectionSuffix << ",\"a\",@progbits\n";
else if (HasNonzeroInitializer)
Str << "\t.section\t.data" << SectionSuffix << ",\"aw\",@progbits\n";
else
Str << "\t.section\t.bss" << SectionSuffix << ",\"aw\",@nobits\n";
if (IsExternal)
Str << "\t.globl\t" << MangledName << "\n";
if (Align > 1)
Str << "\t.align\t" << Align << "\n";
Str << MangledName << ":\n";
if (HasNonzeroInitializer) {
for (VariableDeclaration::Initializer *Init : Initializers) {
switch (Init->getKind()) {
case VariableDeclaration::Initializer::DataInitializerKind: {
const auto Data = llvm::cast<VariableDeclaration::DataInitializer>(Init)
->getContents();
for (SizeT i = 0; i < Init->getNumBytes(); ++i) {
Str << "\t.byte\t" << (((unsigned)Data[i]) & 0xff) << "\n";
}
break;
}
case VariableDeclaration::Initializer::ZeroInitializerKind:
Str << "\t.zero\t" << Init->getNumBytes() << "\n";
break;
case VariableDeclaration::Initializer::RelocInitializerKind: {
const auto Reloc =
llvm::cast<VariableDeclaration::RelocInitializer>(Init);
Str << "\t.long\t";
Str << Reloc->getDeclaration()->mangleName(Ctx);
if (RelocOffsetT Offset = Reloc->getOffset()) {
if (Offset >= 0 || (Offset == INT32_MIN))
Str << " + " << Offset;
else
Str << " - " << -Offset;
}
Str << "\n";
break;
}
}
}
} else
    // NOTE: for non-constant zero initializers, this is BSS (no bits), so
    // an ELF writer would not write to the file and would only track
    // virtual offsets. The .s writer, however, still needs this .zero and
    // cannot simply use the .size to advance offsets.
Str << "\t.zero\t" << Size << "\n";
Str << "\t.size\t" << MangledName << ", " << Size << "\n";
}
void TargetDataX8632::lowerGlobals(
std::unique_ptr<VariableDeclarationList> Vars) const {
switch (Ctx->getFlags().getOutFileType()) {
case FT_Elf: {
ELFObjectWriter *Writer = Ctx->getObjectWriter();
Writer->writeDataSection(*Vars, llvm::ELF::R_386_32);
} break;
case FT_Asm:
case FT_Iasm: {
const IceString &TranslateOnly = Ctx->getFlags().getTranslateOnly();
OstreamLocker L(Ctx);
for (const VariableDeclaration *Var : *Vars) {
if (GlobalContext::matchSymbolName(Var->getName(), TranslateOnly)) {
lowerGlobal(*Var);
}
}
} break;
}
}
template <typename T> struct PoolTypeConverter {};
template <> struct PoolTypeConverter<float> {
typedef uint32_t PrimitiveIntType;
typedef ConstantFloat IceType;
static const Type Ty = IceType_f32;
static const char *TypeName;
static const char *AsmTag;
static const char *PrintfString;
};
const char *PoolTypeConverter<float>::TypeName = "float";
const char *PoolTypeConverter<float>::AsmTag = ".long";
const char *PoolTypeConverter<float>::PrintfString = "0x%x";
template <> struct PoolTypeConverter<double> {
typedef uint64_t PrimitiveIntType;
typedef ConstantDouble IceType;
static const Type Ty = IceType_f64;
static const char *TypeName;
static const char *AsmTag;
static const char *PrintfString;
};
const char *PoolTypeConverter<double>::TypeName = "double";
const char *PoolTypeConverter<double>::AsmTag = ".quad";
const char *PoolTypeConverter<double>::PrintfString = "0x%llx";
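// Illustrative output for the f32 pool (the pool label name is elided):
//   .section .rodata.cst4,"aM",@progbits,4
//   .align 4
//   <pool label>:
//     .long 0x3f800000  # float <value>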
template <typename T>
void TargetDataX8632::emitConstantPool(GlobalContext *Ctx) {
if (!ALLOW_DUMP)
return;
Ostream &Str = Ctx->getStrEmit();
Type Ty = T::Ty;
SizeT Align = typeAlignInBytes(Ty);
ConstantList Pool = Ctx->getConstantPool(Ty);
Str << "\t.section\t.rodata.cst" << Align << ",\"aM\",@progbits," << Align
<< "\n";
Str << "\t.align\t" << Align << "\n";
for (Constant *C : Pool) {
typename T::IceType *Const = llvm::cast<typename T::IceType>(C);
typename T::IceType::PrimType Value = Const->getValue();
// Use memcpy() to copy bits from Value into RawValue in a way
// that avoids breaking strict-aliasing rules.
typename T::PrimitiveIntType RawValue;
memcpy(&RawValue, &Value, sizeof(Value));
char buf[30];
int CharsPrinted =
snprintf(buf, llvm::array_lengthof(buf), T::PrintfString, RawValue);
assert(CharsPrinted >= 0 &&
(size_t)CharsPrinted < llvm::array_lengthof(buf));
(void)CharsPrinted; // avoid warnings if asserts are disabled
Const->emitPoolLabel(Str);
Str << ":\n\t" << T::AsmTag << "\t" << buf << "\t# " << T::TypeName << " "
<< Value << "\n";
}
}
void TargetDataX8632::lowerConstants() const {
if (Ctx->getFlags().getDisableTranslation())
return;
  // No need to emit constants from the int pool, since (for x86) they are
  // embedded as immediates in the instructions; just emit float/double.
switch (Ctx->getFlags().getOutFileType()) {
case FT_Elf: {
ELFObjectWriter *Writer = Ctx->getObjectWriter();
Writer->writeConstantPool<ConstantFloat>(IceType_f32);
Writer->writeConstantPool<ConstantDouble>(IceType_f64);
} break;
case FT_Asm:
case FT_Iasm: {
OstreamLocker L(Ctx);
emitConstantPool<PoolTypeConverter<float>>(Ctx);
emitConstantPool<PoolTypeConverter<double>>(Ctx);
} break;
}
}
TargetHeaderX8632::TargetHeaderX8632(GlobalContext *Ctx)
: TargetHeaderLowering(Ctx) {}
} // end of namespace Ice