Subzero: Use the linear-scan register allocator for Om1 as well.
This removes the need for Om1's postLower() code, which did its own ad-hoc register allocation, and it actually speeds up Om1 translation significantly.
This mode of register allocation only allocates for infinite-weight Variables, while respecting live ranges of pre-colored Variables.
BUG= none
R=jvoung@chromium.org
Review URL: https://codereview.chromium.org/733643005
diff --git a/src/IceDefs.h b/src/IceDefs.h
index 9812e3e..a28da4b 100644
--- a/src/IceDefs.h
+++ b/src/IceDefs.h
@@ -102,6 +102,11 @@
Liveness_Intervals
};
+enum RegAllocKind {
+ RAK_Global, // full, global register allocation
+ RAK_InfOnly // allocation only for infinite-weight Variables
+};
+
enum VerboseItem {
IceV_None = 0,
IceV_Instructions = 1 << 0,
diff --git a/src/IceRegAlloc.cpp b/src/IceRegAlloc.cpp
index 80a9432..569a616 100644
--- a/src/IceRegAlloc.cpp
+++ b/src/IceRegAlloc.cpp
@@ -73,17 +73,16 @@
} // end of anonymous namespace
-void LinearScan::initForGlobalAlloc() {
+// Prepare for full register allocation of all variables. We depend
+// on liveness analysis to have calculated live ranges.
+void LinearScan::initForGlobal() {
TimerMarker T(TimerStack::TT_initUnhandled, Func);
- Unhandled.clear();
- UnhandledPrecolored.clear();
- Handled.clear();
- Inactive.clear();
- Active.clear();
- // Gather the live ranges of all variables and add them to the
- // Unhandled set.
+ FindPreference = true;
+ FindOverlap = true;
const VarList &Vars = Func->getVariables();
Unhandled.reserve(Vars.size());
+ // Gather the live ranges of all variables and add them to the
+ // Unhandled set.
for (Variable *Var : Vars) {
// Explicitly don't consider zero-weight variables, which are
// meant to be spill slots.
@@ -101,6 +100,128 @@
UnhandledPrecolored.push_back(Var);
}
}
+
+ // Build the (ordered) list of FakeKill instruction numbers.
+ Kills.clear();
+ for (CfgNode *Node : Func->getNodes()) {
+ for (auto I = Node->getInsts().begin(), E = Node->getInsts().end(); I != E;
+ ++I) {
+ if (auto Kill = llvm::dyn_cast<InstFakeKill>(I)) {
+ if (!Kill->isDeleted() && !Kill->getLinked()->isDeleted())
+ Kills.push_back(I->getNumber());
+ }
+ }
+ }
+}
+
+// Prepare for very simple register allocation of only infinite-weight
+// Variables while respecting pre-colored Variables. Some properties
+// we take advantage of:
+//
+// * Live ranges of interest consist of a single segment.
+//
+// * Live ranges of interest never span a call instruction.
+//
+// * Phi instructions are not considered because either phis have
+// already been lowered, or they don't contain any pre-colored or
+// infinite-weight Variables.
+//
+// * We don't need to renumber instructions before computing live
+// ranges because all the high-level ICE instructions are deleted
+// prior to lowering, and the low-level instructions are added in
+// monotonically increasing order.
+//
+// * There are no opportunities for register preference or allowing
+// overlap.
+//
+// Some properties we aren't (yet) taking advantage of:
+//
+// * Because live ranges are a single segment, the Inactive set will
+// always be empty, and the live range trimming operation is
+// unnecessary.
+//
+// * Calculating overlap of single-segment live ranges could be
+// optimized a bit.
+void LinearScan::initForInfOnly() {
+ TimerMarker T(TimerStack::TT_initUnhandled, Func);
+ FindPreference = false;
+ FindOverlap = false;
+ SizeT NumVars = 0;
+ const VarList &Vars = Func->getVariables();
+
+ // Iterate across all instructions and record the begin and end of
+ // the live range for each variable that is pre-colored or infinite
+ // weight.
+ std::vector<InstNumberT> LRBegin(Vars.size(), Inst::NumberSentinel);
+ std::vector<InstNumberT> LREnd(Vars.size(), Inst::NumberSentinel);
+ for (CfgNode *Node : Func->getNodes()) {
+ for (auto Inst = Node->getInsts().begin(), E = Node->getInsts().end();
+ Inst != E; ++Inst) {
+ if (Inst->isDeleted())
+ continue;
+ if (const Variable *Var = Inst->getDest()) {
+ if (Var->hasReg() || Var->getWeight() == RegWeight::Inf) {
+ if (LRBegin[Var->getIndex()] == Inst::NumberSentinel) {
+ LRBegin[Var->getIndex()] = Inst->getNumber();
+ ++NumVars;
+ }
+ }
+ }
+ for (SizeT I = 0; I < Inst->getSrcSize(); ++I) {
+ Operand *Src = Inst->getSrc(I);
+ SizeT NumVars = Src->getNumVars();
+ for (SizeT J = 0; J < NumVars; ++J) {
+ const Variable *Var = Src->getVar(J);
+ if (Var->hasReg() || Var->getWeight() == RegWeight::Inf)
+ LREnd[Var->getIndex()] = Inst->getNumber();
+ }
+ }
+ }
+ }
+
+ Unhandled.reserve(NumVars);
+ for (SizeT i = 0; i < Vars.size(); ++i) {
+ Variable *Var = Vars[i];
+ if (LRBegin[i] != Inst::NumberSentinel) {
+ assert(LREnd[i] != Inst::NumberSentinel);
+ Unhandled.push_back(Var);
+ Var->resetLiveRange();
+ const uint32_t WeightDelta = 1;
+ Var->addLiveRange(LRBegin[i], LREnd[i], WeightDelta);
+ Var->untrimLiveRange();
+ if (Var->hasReg()) {
+ Var->setRegNumTmp(Var->getRegNum());
+ Var->setLiveRangeInfiniteWeight();
+ UnhandledPrecolored.push_back(Var);
+ }
+ --NumVars;
+ }
+ }
+ // This isn't actually a fatal condition, but it would be nice to
+ // know if we somehow pre-calculated Unhandled's size wrong.
+ assert(NumVars == 0);
+
+ // Don't build up the list of Kills because we know that no
+ // infinite-weight Variable has a live range spanning a call.
+ Kills.clear();
+}
+
+void LinearScan::init(RegAllocKind Kind) {
+ Unhandled.clear();
+ UnhandledPrecolored.clear();
+ Handled.clear();
+ Inactive.clear();
+ Active.clear();
+
+ switch (Kind) {
+ case RAK_Global:
+ initForGlobal();
+ break;
+ case RAK_InfOnly:
+ initForInfOnly();
+ break;
+ }
+
struct CompareRanges {
bool operator()(const Variable *L, const Variable *R) {
InstNumberT Lstart = L->getLiveRange().getStart();
@@ -114,20 +235,6 @@
std::sort(Unhandled.rbegin(), Unhandled.rend(), CompareRanges());
std::sort(UnhandledPrecolored.rbegin(), UnhandledPrecolored.rend(),
CompareRanges());
-
- // Build the (ordered) list of FakeKill instruction numbers.
- Kills.clear();
- for (CfgNode *Node : Func->getNodes()) {
- for (auto I = Node->getInsts().begin(), E = Node->getInsts().end(); I != E;
- ++I) {
- if (I->isDeleted())
- continue;
- if (auto Kill = llvm::dyn_cast<InstFakeKill>(I)) {
- if (!Kill->getLinked()->isDeleted())
- Kills.push_back(I->getNumber());
- }
- }
- }
}
// Implements the linear-scan algorithm. Based on "Linear Scan
@@ -292,41 +399,41 @@
Variable *Prefer = NULL;
int32_t PreferReg = Variable::NoRegister;
bool AllowOverlap = false;
- if (const Inst *DefInst = VMetadata->getFirstDefinition(Cur)) {
- assert(DefInst->getDest() == Cur);
- bool IsAssign = DefInst->isSimpleAssign();
- bool IsSingleDef = !VMetadata->isMultiDef(Cur);
- for (SizeT i = 0; i < DefInst->getSrcSize(); ++i) {
- // TODO(stichnot): Iterate through the actual Variables of the
- // instruction, not just the source operands. This could
- // capture Load instructions, including address mode
- // optimization, for Prefer (but not for AllowOverlap).
- if (Variable *SrcVar = llvm::dyn_cast<Variable>(DefInst->getSrc(i))) {
- int32_t SrcReg = SrcVar->getRegNumTmp();
- // Only consider source variables that have (so far) been
- // assigned a register. That register must be one in the
- // RegMask set, e.g. don't try to prefer the stack pointer
- // as a result of the stacksave intrinsic.
- if (SrcVar->hasRegTmp() && RegMask[SrcReg]) {
- if (!Free[SrcReg]) {
- // Don't bother trying to enable AllowOverlap if the
- // register is already free.
- AllowOverlap =
- IsSingleDef && IsAssign && !overlapsDefs(Func, Cur, SrcVar);
- }
- if (AllowOverlap || Free[SrcReg]) {
- Prefer = SrcVar;
- PreferReg = SrcReg;
+ if (FindPreference) {
+ if (const Inst *DefInst = VMetadata->getFirstDefinition(Cur)) {
+ assert(DefInst->getDest() == Cur);
+ bool IsAssign = DefInst->isSimpleAssign();
+ bool IsSingleDef = !VMetadata->isMultiDef(Cur);
+ for (SizeT i = 0; i < DefInst->getSrcSize(); ++i) {
+ // TODO(stichnot): Iterate through the actual Variables of the
+ // instruction, not just the source operands. This could
+ // capture Load instructions, including address mode
+ // optimization, for Prefer (but not for AllowOverlap).
+ if (Variable *SrcVar = llvm::dyn_cast<Variable>(DefInst->getSrc(i))) {
+ int32_t SrcReg = SrcVar->getRegNumTmp();
+ // Only consider source variables that have (so far) been
+ // assigned a register. That register must be one in the
+ // RegMask set, e.g. don't try to prefer the stack pointer
+ // as a result of the stacksave intrinsic.
+ if (SrcVar->hasRegTmp() && RegMask[SrcReg]) {
+ if (FindOverlap && !Free[SrcReg]) {
+ // Don't bother trying to enable AllowOverlap if the
+ // register is already free.
+ AllowOverlap =
+ IsSingleDef && IsAssign && !overlapsDefs(Func, Cur, SrcVar);
+ }
+ if (AllowOverlap || Free[SrcReg]) {
+ Prefer = SrcVar;
+ PreferReg = SrcReg;
+ }
}
}
}
- }
- }
- if (Verbose) {
- if (Prefer) {
- Str << "Initial Prefer=" << *Prefer << " R=" << PreferReg
- << " LIVE=" << Prefer->getLiveRange() << " Overlap=" << AllowOverlap
- << "\n";
+ if (Verbose && Prefer) {
+ Str << "Initial Prefer=" << *Prefer << " R=" << PreferReg
+ << " LIVE=" << Prefer->getLiveRange()
+ << " Overlap=" << AllowOverlap << "\n";
+ }
}
}
@@ -353,12 +460,14 @@
// Disable AllowOverlap if an Active variable, which is not
// Prefer, shares Prefer's register, and has a definition within
// Cur's live range.
- for (const Variable *Item : Active) {
- int32_t RegNum = Item->getRegNumTmp();
- if (Item != Prefer && RegNum == PreferReg &&
- overlapsDefs(Func, Cur, Item)) {
- AllowOverlap = false;
- dumpDisableOverlap(Func, Item, "Active");
+ if (AllowOverlap) {
+ for (const Variable *Item : Active) {
+ int32_t RegNum = Item->getRegNumTmp();
+ if (Item != Prefer && RegNum == PreferReg &&
+ overlapsDefs(Func, Cur, Item)) {
+ AllowOverlap = false;
+ dumpDisableOverlap(Func, Item, "Active");
+ }
}
}
diff --git a/src/IceRegAlloc.h b/src/IceRegAlloc.h
index f409e7a..9b9992c 100644
--- a/src/IceRegAlloc.h
+++ b/src/IceRegAlloc.h
@@ -26,12 +26,16 @@
LinearScan &operator=(const LinearScan &) = delete;
public:
- LinearScan(Cfg *Func) : Func(Func) {}
- void initForGlobalAlloc();
+ LinearScan(Cfg *Func)
+ : Func(Func), FindPreference(false), FindOverlap(false) {}
+ void init(RegAllocKind Kind);
void scan(const llvm::SmallBitVector &RegMask);
void dump(Cfg *Func) const;
private:
+ void initForGlobal();
+ void initForInfOnly();
+
Cfg *const Func;
typedef std::vector<Variable *> OrderedRanges;
typedef std::list<Variable *> UnorderedRanges;
@@ -41,6 +45,12 @@
OrderedRanges UnhandledPrecolored;
UnorderedRanges Active, Inactive, Handled;
std::vector<InstNumberT> Kills;
+ bool FindPreference;
+ bool FindOverlap;
+ // TODO(stichnot): We're not really using FindOverlap yet, but we
+ // may want a flavor of register allocation where FindPreference is
+ // useful but we don't want to initialize VMetadata with VMK_All,
+ // and therefore we can't safely allow overlap.
};
} // end of namespace Ice
diff --git a/src/IceTargetLowering.cpp b/src/IceTargetLowering.cpp
index c8f504e..bab46f6 100644
--- a/src/IceTargetLowering.cpp
+++ b/src/IceTargetLowering.cpp
@@ -225,7 +225,7 @@
// perhaps for the frame pointer) to be allocated. This set of
// registers could potentially be parameterized if we want to restrict
// registers e.g. for performance testing.
-void TargetLowering::regAlloc() {
+void TargetLowering::regAlloc(RegAllocKind Kind) {
TimerMarker T(TimerStack::TT_regAlloc, Func);
LinearScan LinearScan(Func);
RegSetMask RegInclude = RegSet_None;
@@ -234,7 +234,7 @@
RegInclude |= RegSet_CalleeSave;
if (hasFramePointer())
RegExclude |= RegSet_FramePointer;
- LinearScan.initForGlobalAlloc();
+ LinearScan.init(Kind);
llvm::SmallBitVector RegMask = getRegisterSet(RegInclude, RegExclude);
LinearScan.scan(RegMask);
}
diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h
index a8b0fcc..07d665f 100644
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -195,7 +195,7 @@
virtual llvm::SmallBitVector getRegisterSet(RegSetMask Include,
RegSetMask Exclude) const = 0;
virtual const llvm::SmallBitVector &getRegisterSetForType(Type Ty) const = 0;
- void regAlloc();
+ void regAlloc(RegAllocKind Kind);
virtual void emitVariable(const Variable *Var) const = 0;
@@ -236,11 +236,7 @@
virtual void doAddressOptStore() {}
virtual void randomlyInsertNop(float Probability) = 0;
// This gives the target an opportunity to post-process the lowered
- // expansion before returning. The primary intention is to do some
- // Register Manager activity as necessary, specifically to eagerly
- // allocate registers based on affinity and other factors. The
- // simplest lowering does nothing here and leaves it all to a
- // subsequent global register allocation pass.
+ // expansion before returning.
virtual void postLower() {}
Cfg *Func;
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 6c3b624..cd40e09 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -9,9 +9,7 @@
//
// This file implements the TargetLoweringX8632 class, which
// consists almost entirely of the lowering sequence for each
-// high-level instruction. It also implements
-// TargetX8632Fast::postLower() which does the simplest possible
-// register allocation for the "fast" target.
+// high-level instruction.
//
//===----------------------------------------------------------------------===//
@@ -375,7 +373,7 @@
// associated cleanup, to make the dump cleaner and more useful.
Func->dump("After initial x8632 codegen");
Func->getVMetadata()->init(VMK_All);
- regAlloc();
+ regAlloc(RAK_Global);
if (Func->hasError())
return;
Func->dump("After linear scan regalloc");
@@ -429,6 +427,11 @@
return;
Func->dump("After initial x8632 codegen");
+ regAlloc(RAK_InfOnly);
+ if (Func->hasError())
+ return;
+ Func->dump("After regalloc of infinite-weight variables");
+
Func->genFrame();
if (Func->hasError())
return;
@@ -1816,9 +1819,6 @@
// stack locations.
for (SizeT i = 0, e = StackArgs.size(); i < e; ++i) {
lowerStore(InstStore::create(Func, StackArgs[i], StackArgLocations[i]));
- // TODO: Consider calling postLower() here to reduce the register
- // pressure associated with using too many infinite weight
- // temporaries when lowering the call sequence in -Om1 mode.
}
// Copy arguments to be passed in registers to the appropriate
@@ -4112,8 +4112,6 @@
Variable *DestT = Func->makeVariable(Ty);
lowerInsertElement(InstInsertElement::create(Func, DestT, T, Res, Index));
T = DestT;
- // TODO(stichnot): Use postLower() in -Om1 mode to avoid buildup of
- // infinite weight temporaries.
}
lowerAssign(InstAssign::create(Func, Dest, T));
@@ -4200,7 +4198,7 @@
assert(Node->getPhis().empty());
CfgNode *Succ = Node->getOutEdges().front();
getContext().init(Node);
- // Register set setup similar to regAlloc() and postLower().
+ // Register set setup similar to regAlloc().
RegSetMask RegInclude = RegSet_All;
RegSetMask RegExclude = RegSet_StackPointer;
if (hasFramePointer())
@@ -4512,116 +4510,21 @@
}
void TargetX8632::postLower() {
- if (Ctx->getOptLevel() != Opt_m1) {
- // Find two-address non-SSA instructions where Dest==Src0, and set
- // the DestNonKillable flag to keep liveness analysis consistent.
- for (auto Inst = Context.begin(), E = Context.end(); Inst != E; ++Inst) {
- if (Inst->isDeleted())
- continue;
- if (Variable *Dest = Inst->getDest()) {
- // TODO(stichnot): We may need to consider all source
- // operands, not just the first one, if using 3-address
- // instructions.
- if (Inst->getSrcSize() > 0 && Inst->getSrc(0) == Dest)
- Inst->setDestNonKillable();
- }
- }
+ if (Ctx->getOptLevel() == Opt_m1)
return;
- }
- // TODO: Avoid recomputing WhiteList every instruction.
- RegSetMask RegInclude = RegSet_All;
- RegSetMask RegExclude = RegSet_StackPointer;
- if (hasFramePointer())
- RegExclude |= RegSet_FramePointer;
- llvm::SmallBitVector WhiteList = getRegisterSet(RegInclude, RegExclude);
- // Make one pass to black-list pre-colored registers. TODO: If
- // there was some prior register allocation pass that made register
- // assignments, those registers need to be black-listed here as
- // well.
- llvm::DenseMap<const Variable *, const Inst *> LastUses;
- // The first pass also keeps track of which instruction is the last
- // use for each infinite-weight variable. After the last use, the
- // variable is released to the free list.
+ // Find two-address non-SSA instructions where Dest==Src0, and set
+ // the DestNonKillable flag to keep liveness analysis consistent.
for (auto Inst = Context.begin(), E = Context.end(); Inst != E; ++Inst) {
if (Inst->isDeleted())
continue;
- // Don't consider a FakeKill instruction, because (currently) it
- // is only used to kill all scratch registers at a call site, and
- // we don't want to black-list all scratch registers during the
- // call lowering. This could become a problem since it relies on
- // the lowering sequence not keeping any infinite-weight variables
- // live across a call. TODO(stichnot): Consider replacing this
- // whole postLower() implementation with a robust local register
- // allocator, for example compute live ranges only for pre-colored
- // and infinite-weight variables and run the existing linear-scan
- // allocator.
- assert(!llvm::isa<InstFakeKill>(Inst) || Inst->getSrcSize() == 0);
- for (SizeT SrcNum = 0; SrcNum < Inst->getSrcSize(); ++SrcNum) {
- Operand *Src = Inst->getSrc(SrcNum);
- SizeT NumVars = Src->getNumVars();
- for (SizeT J = 0; J < NumVars; ++J) {
- const Variable *Var = Src->getVar(J);
- // Track last uses of all variables, regardless of whether
- // they are pre-colored or infinite-weight.
- LastUses[Var] = Inst;
- if (!Var->hasReg())
- continue;
- WhiteList[Var->getRegNum()] = false;
- }
+ if (Variable *Dest = Inst->getDest()) {
+ // TODO(stichnot): We may need to consider all source
+ // operands, not just the first one, if using 3-address
+ // instructions.
+ if (Inst->getSrcSize() > 0 && Inst->getSrc(0) == Dest)
+ Inst->setDestNonKillable();
}
}
- // The second pass colors infinite-weight variables.
- llvm::SmallBitVector AvailableRegisters = WhiteList;
- llvm::SmallBitVector FreedRegisters(WhiteList.size());
- for (auto Inst = Context.begin(), E = Context.end(); Inst != E; ++Inst) {
- FreedRegisters.reset();
- if (Inst->isDeleted())
- continue;
- // Iterate over all variables referenced in the instruction,
- // including the Dest variable (if any). If the variable is
- // marked as infinite-weight, find it a register. If this
- // instruction is the last use of the variable in the lowered
- // sequence, release the register to the free list after this
- // instruction is completely processed. Note that the first pass
- // ignores the Dest operand, under the assumption that a
- // pre-colored Dest will appear as a source operand in some
- // subsequent instruction in the lowered sequence.
- Variable *Dest = Inst->getDest();
- SizeT NumSrcs = Inst->getSrcSize();
- if (Dest)
- ++NumSrcs;
- if (NumSrcs == 0)
- continue;
- OperandList Srcs(NumSrcs);
- for (SizeT i = 0; i < Inst->getSrcSize(); ++i)
- Srcs[i] = Inst->getSrc(i);
- if (Dest)
- Srcs[NumSrcs - 1] = Dest;
- for (SizeT SrcNum = 0; SrcNum < NumSrcs; ++SrcNum) {
- Operand *Src = Srcs[SrcNum];
- SizeT NumVars = Src->getNumVars();
- for (SizeT J = 0; J < NumVars; ++J) {
- Variable *Var = Src->getVar(J);
- if (!Var->hasReg() && Var->getWeight().isInf()) {
- llvm::SmallBitVector AvailableTypedRegisters =
- AvailableRegisters & getRegisterSetForType(Var->getType());
- assert(AvailableTypedRegisters.any());
- int32_t RegNum = AvailableTypedRegisters.find_first();
- Var->setRegNum(RegNum);
- AvailableRegisters[RegNum] = false;
- }
- if (Var->hasReg()) {
- int32_t RegNum = Var->getRegNum();
- assert(!AvailableRegisters[RegNum]);
- if (LastUses[Var] == Inst) {
- if (WhiteList[RegNum])
- FreedRegisters[RegNum] = true;
- }
- }
- }
- }
- AvailableRegisters |= FreedRegisters;
- }
}
template <> void ConstantInteger32::emit(GlobalContext *Ctx) const {
diff --git a/src/IceTranslator.cpp b/src/IceTranslator.cpp
index edaa74f..08e5697 100644
--- a/src/IceTranslator.cpp
+++ b/src/IceTranslator.cpp
@@ -83,10 +83,12 @@
ErrorStatus = true;
}
- if (Ctx->getFlags().UseIntegratedAssembler) {
- Func->emitIAS();
- } else {
- Func->emit();
+ if (!ErrorStatus) {
+ if (Ctx->getFlags().UseIntegratedAssembler) {
+ Func->emitIAS();
+ } else {
+ Func->emit();
+ }
}
Ctx->dumpStats(Func->getFunctionName());
}
diff --git a/tests_lit/llvm2ice_tests/ebp_args.ll b/tests_lit/llvm2ice_tests/ebp_args.ll
index 4b7c928..09709b0 100644
--- a/tests_lit/llvm2ice_tests/ebp_args.ll
+++ b/tests_lit/llvm2ice_tests/ebp_args.ll
@@ -38,10 +38,10 @@
; CHECK: movzx eax, byte ptr [ebp - 4]
; CHECK: mov dword ptr [ebp - 16], eax
; CHECK: sub esp, 16
-; CHECK: mov ecx, dword ptr [ebp + 8]
-; CHECK: mov dword ptr [esp], ecx
-; CHECK: mov ecx, dword ptr [ebp - 12]
-; CHECK: mov dword ptr [esp + 4], ecx
-; CHECK: mov ecx, dword ptr [ebp - 16]
-; CHECK: mov dword ptr [esp + 8], ecx
+; CHECK: mov eax, dword ptr [ebp + 8]
+; CHECK: mov dword ptr [esp], eax
+; CHECK: mov eax, dword ptr [ebp - 12]
+; CHECK: mov dword ptr [esp + 4], eax
+; CHECK: mov eax, dword ptr [ebp - 16]
+; CHECK: mov dword ptr [esp + 8], eax
; CHECK: call -4
diff --git a/tests_lit/llvm2ice_tests/nop-insertion.ll b/tests_lit/llvm2ice_tests/nop-insertion.ll
index 835ac76..0c252ca 100644
--- a/tests_lit/llvm2ice_tests/nop-insertion.ll
+++ b/tests_lit/llvm2ice_tests/nop-insertion.ll
@@ -31,9 +31,9 @@
; PROB50: pmuludq %xmm2, %xmm1
; PROB50: nop # variant = 0
; PROB50: shufps $136, %xmm1, %xmm0
-; PROB50: pshufd $216, %xmm0, %xmm1
+; PROB50: pshufd $216, %xmm0, %xmm0
; PROB50: nop # variant = 2
-; PROB50: movups %xmm1, (%esp)
+; PROB50: movups %xmm0, (%esp)
; PROB50: movups (%esp), %xmm0
; PROB50: addl $60, %esp
; PROB50: nop # variant = 0
@@ -59,9 +59,9 @@
; PROB90: nop # variant = 3
; PROB90: shufps $136, %xmm1, %xmm0
; PROB90: nop # variant = 4
-; PROB90: pshufd $216, %xmm0, %xmm1
+; PROB90: pshufd $216, %xmm0, %xmm0
; PROB90: nop # variant = 2
-; PROB90: movups %xmm1, (%esp)
+; PROB90: movups %xmm0, (%esp)
; PROB90: nop # variant = 4
; PROB90: movups (%esp), %xmm0
; PROB90: nop # variant = 2
@@ -86,9 +86,9 @@
; MAXNOPS2: nop # variant = 3
; MAXNOPS2: pmuludq %xmm2, %xmm1
; MAXNOPS2: shufps $136, %xmm1, %xmm0
-; MAXNOPS2: pshufd $216, %xmm0, %xmm1
+; MAXNOPS2: pshufd $216, %xmm0, %xmm0
; MAXNOPS2: nop # variant = 3
-; MAXNOPS2: movups %xmm1, (%esp)
+; MAXNOPS2: movups %xmm0, (%esp)
; MAXNOPS2: nop # variant = 0
; MAXNOPS2: movups (%esp), %xmm0
; MAXNOPS2: nop # variant = 2