Add support for passing and returning vectors in accordance with the x86 calling convention.

- Add TargetLowering::lowerArguments() as a new stage in TargetLowering.
- Add support for passing arguments/return values in XMM registers in the x86 target.

BUG=none
R=jvoung@chromium.org, stichnot@chromium.org

Review URL: https://codereview.chromium.org/372113005

commit: 45a06236119e496fe2df2469122a15a2e05d8f8e [log] [tgz]
author: Matt Wala <wala@chromium.org> Wed Jul 09 16:33:22 2014 -0700
committer: Matt Wala <wala@chromium.org> Wed Jul 09 16:33:22 2014 -0700
tree: 18bd3511f19b2450ced116d5918886865fc35e26
parent: f37fbbe924edccd340b7448a7482c45a9fccda5b [diff]
diff --git a/src/IceCfg.cpp b/src/IceCfg.cpp
index 6bd9bf7..bc1bf63 100644
--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp

@@ -110,6 +110,10 @@
   }
 }
 
+void Cfg::doArgLowering() {
+  getTarget()->lowerArguments();
+}
+
 void Cfg::doAddressOpt() {
   for (NodeList::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
     (*I)->doAddressOpt();

diff --git a/src/IceCfg.h b/src/IceCfg.h
index dbf08c6..856b145 100644
--- a/src/IceCfg.h
+++ b/src/IceCfg.h

@@ -69,6 +69,7 @@
   // Manage arguments to the function.
   void addArg(Variable *Arg);
   const VarList &getArgs() const { return Args; }
+  VarList &getArgs() { return Args; }
 
   // Miscellaneous accessors.
   TargetLowering *getTarget() const { return Target.get(); }
@@ -86,6 +87,7 @@
   void placePhiStores();
   void deletePhis();
   void doAddressOpt();
+  void doArgLowering();
   void genCode();
   void genFrame();
   void livenessLightweight();

diff --git a/src/IceDefs.h b/src/IceDefs.h
index 4c616f8..6aee09a 100644
--- a/src/IceDefs.h
+++ b/src/IceDefs.h

@@ -62,6 +62,7 @@
 typedef std::list<Inst *> InstList;
 typedef std::list<InstPhi *> PhiList;
 typedef std::vector<Variable *> VarList;
+typedef std::vector<Operand *> OperandList;
 typedef std::vector<CfgNode *> NodeList;
 typedef std::vector<Constant *> ConstantList;
 

diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index 63e2cd7..a3f0786 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp

@@ -734,6 +734,14 @@
   Str << "\n";
 }
 
+void InstX8632Movp::dump(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "movups." << getDest()->getType() << " ";
+  dumpDest(Func);
+  Str << ", ";
+  dumpSources(Func);
+}
+
 void InstX8632Movq::emit(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrEmit();
   assert(getSrcSize() == 1);
@@ -746,14 +754,6 @@
   Str << "\n";
 }
 
-void InstX8632Movp::dump(const Cfg *Func) const {
-  Ostream &Str = Func->getContext()->getStrDump();
-  Str << "movups." << getDest()->getType() << " ";
-  dumpDest(Func);
-  Str << ", ";
-  dumpSources(Func);
-}
-
 void InstX8632Movq::dump(const Cfg *Func) const {
   Ostream &Str = Func->getContext()->getStrDump();
   Str << "movq." << getDest()->getType() << " ";
@@ -882,14 +882,19 @@
   assert(getSrcSize() == 1);
   Type Ty = getSrc(0)->getType();
   Variable *Var = llvm::dyn_cast<Variable>(getSrc(0));
-  if ((Ty == IceType_f32 || Ty == IceType_f64) && Var && Var->hasReg()) {
+  if ((isVectorType(Ty) || Ty == IceType_f32 || Ty == IceType_f64) && Var &&
+      Var->hasReg()) {
     // The xmm registers can't be directly pushed, so we fake it by
     // decrementing esp and then storing to [esp].
     Str << "\tsub\tesp, " << typeWidthInBytes(Ty) << "\n";
     if (!SuppressStackAdjustment)
       Func->getTarget()->updateStackAdjustment(typeWidthInBytes(Ty));
-    Str << "\tmov" << TypeX8632Attributes[Ty].SdSsString << "\t"
-        << TypeX8632Attributes[Ty].WidthString << " [esp], ";
+    if (isVectorType(Ty)) {
+      Str << "\tmovups\txmmword ptr [esp], ";
+    } else {
+      Str << "\tmov" << TypeX8632Attributes[Ty].SdSsString << "\t"
+          << TypeX8632Attributes[Ty].WidthString << " [esp], ";
+    }
     getSrc(0)->emit(Func);
     Str << "\n";
   } else if (Ty == IceType_f64 && (!Var || !Var->hasReg())) {

diff --git a/src/IceOperand.cpp b/src/IceOperand.cpp
index 02e85c7..d8a754b 100644
--- a/src/IceOperand.cpp
+++ b/src/IceOperand.cpp

@@ -159,14 +159,18 @@
   setDefinition(Inst, Node);
 }
 
-void Variable::setIsArg(Cfg *Func) {
-  IsArgument = true;
-  if (DefNode == NULL)
-    return;
-  CfgNode *Entry = Func->getEntryNode();
-  if (DefNode == Entry)
-    return;
-  DefNode = NULL;
+void Variable::setIsArg(Cfg *Func, bool IsArg) {
+  if (IsArg) {
+    IsArgument = true;
+    if (DefNode == NULL)
+      return;
+    CfgNode *Entry = Func->getEntryNode();
+    if (DefNode == Entry)
+      return;
+    DefNode = NULL;
+  } else {
+    IsArgument = false;
+  }
 }
 
 IceString Variable::getName() const {

diff --git a/src/IceOperand.h b/src/IceOperand.h
index 018445a..bbd6688 100644
--- a/src/IceOperand.h
+++ b/src/IceOperand.h

@@ -339,7 +339,7 @@
   void setUse(const Inst *Inst, const CfgNode *Node);
 
   bool getIsArg() const { return IsArgument; }
-  void setIsArg(Cfg *Func);
+  void setIsArg(Cfg *Func, bool IsArg = true);
 
   int32_t getStackOffset() const { return StackOffset; }
   void setStackOffset(int32_t Offset) { StackOffset = Offset; }

diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h
index ddb66fa..ed5389c 100644
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h

@@ -148,6 +148,9 @@
 
   virtual void emitVariable(const Variable *Var, const Cfg *Func) const = 0;
 
+  // Performs target-specific argument lowering.
+  virtual void lowerArguments() = 0;
+
   virtual void addProlog(CfgNode *Node) = 0;
   virtual void addEpilog(CfgNode *Node) = 0;
 

diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 62a525c..c2782f6 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp

@@ -85,6 +85,9 @@
   return TableIcmp32[Index].Mapping;
 }
 
+// The maximum number of arguments to pass in XMM registers
+const unsigned X86_MAX_XMM_ARGS = 4;
+
 // In some cases, there are x-macros tables for both high-level and
 // low-level instructions/operands that use the same enum key value.
 // The tables are kept separate to maintain a proper separation
@@ -247,6 +250,11 @@
   Func->doAddressOpt();
   T_doAddressOpt.printElapsedUs(Context, "doAddressOpt()");
 
+  // Argument lowering
+  Timer T_argLowering;
+  Func->doArgLowering();
+  T_argLowering.printElapsedUs(Context, "lowerArguments()");
+
   // Target lowering.  This requires liveness analysis for some parts
   // of the lowering decisions, such as compare/branch fusing.  If
   // non-lightweight liveness analysis is used, the instructions need
@@ -258,6 +266,7 @@
   if (Func->hasError())
     return;
   T_renumber1.printElapsedUs(Context, "renumberInstructions()");
+
   // TODO: It should be sufficient to use the fastest liveness
   // calculation, i.e. livenessLightweight().  However, for some
   // reason that slows down the rest of the translation.  Investigate.
@@ -267,6 +276,7 @@
     return;
   T_liveness1.printElapsedUs(Context, "liveness()");
   Func->dump("After x86 address mode opt");
+
   Timer T_genCode;
   Func->genCode();
   if (Func->hasError())
@@ -329,6 +339,10 @@
   T_deletePhis.printElapsedUs(Context, "deletePhis()");
   Func->dump("After Phi lowering");
 
+  Timer T_argLowering;
+  Func->doArgLowering();
+  T_argLowering.printElapsedUs(Context, "lowerArguments()");
+
   Timer T_genCode;
   Func->genCode();
   if (Func->hasError())
@@ -412,34 +426,74 @@
   Str << "]";
 }
 
-// Helper function for addProlog().  Sets the frame offset for Arg,
-// updates InArgsSizeBytes according to Arg's width, and generates an
-// instruction to copy Arg into its assigned register if applicable.
-// For an I64 arg that has been split into Lo and Hi components, it
-// calls itself recursively on the components, taking care to handle
-// Lo first because of the little-endian architecture.
-void TargetX8632::setArgOffsetAndCopy(Variable *Arg, Variable *FramePtr,
-                                      size_t BasicFrameOffset,
-                                      size_t &InArgsSizeBytes) {
+void TargetX8632::lowerArguments() {
+  VarList &Args = Func->getArgs();
+  // The first four arguments of vector type, regardless of their
+  // position relative to the other arguments in the argument list, are
+  // passed in registers xmm0 - xmm3.
+  unsigned NumXmmArgs = 0;
+
+  Context.init(Func->getEntryNode());
+  Context.setInsertPoint(Context.getCur());
+
+  for (SizeT I = 0, E = Args.size(); I < E && NumXmmArgs < X86_MAX_XMM_ARGS;
+       ++I) {
+    Variable *Arg = Args[I];
+    Type Ty = Arg->getType();
+    if (!isVectorType(Ty))
+      continue;
+    // Replace Arg in the argument list with the home register.  Then
+    // generate an instruction in the prolog to copy the home register
+    // to the assigned location of Arg.
+    int32_t RegNum = Reg_xmm0 + NumXmmArgs;
+    ++NumXmmArgs;
+    IceString Name = "home_reg:" + Arg->getName();
+    const CfgNode *DefNode = NULL;
+    Variable *RegisterArg = Func->makeVariable(Ty, DefNode, Name);
+    RegisterArg->setRegNum(RegNum);
+    RegisterArg->setIsArg(Func);
+    Arg->setIsArg(Func, false);
+
+    Args[I] = RegisterArg;
+    Context.insert(InstAssign::create(Func, Arg, RegisterArg));
+  }
+}
+
+// Helper function for addProlog().
+//
+// This assumes Arg is an argument passed on the stack.  This sets the
+// frame offset for Arg and updates InArgsSizeBytes according to Arg's
+// width.  For an I64 arg that has been split into Lo and Hi components,
+// it calls itself recursively on the components, taking care to handle
+// Lo first because of the little-endian architecture.  Lastly, this
+// function generates an instruction to copy Arg into its assigned
+// register if applicable.
+void TargetX8632::finishArgumentLowering(Variable *Arg, Variable *FramePtr,
+                                         size_t BasicFrameOffset,
+                                         size_t &InArgsSizeBytes) {
   Variable *Lo = Arg->getLo();
   Variable *Hi = Arg->getHi();
   Type Ty = Arg->getType();
   if (Lo && Hi && Ty == IceType_i64) {
     assert(Lo->getType() != IceType_i64); // don't want infinite recursion
     assert(Hi->getType() != IceType_i64); // don't want infinite recursion
-    setArgOffsetAndCopy(Lo, FramePtr, BasicFrameOffset, InArgsSizeBytes);
-    setArgOffsetAndCopy(Hi, FramePtr, BasicFrameOffset, InArgsSizeBytes);
+    finishArgumentLowering(Lo, FramePtr, BasicFrameOffset, InArgsSizeBytes);
+    finishArgumentLowering(Hi, FramePtr, BasicFrameOffset, InArgsSizeBytes);
     return;
   }
   Arg->setStackOffset(BasicFrameOffset + InArgsSizeBytes);
+  InArgsSizeBytes += typeWidthInBytesOnStack(Ty);
   if (Arg->hasReg()) {
     assert(Ty != IceType_i64);
     OperandX8632Mem *Mem = OperandX8632Mem::create(
         Func, Ty, FramePtr,
         Ctx->getConstantInt(IceType_i32, Arg->getStackOffset()));
-    _mov(Arg, Mem);
+    if (isVectorType(Arg->getType())) {
+      _movp(Arg, Mem);
+    } else {
+      _mov(Arg, Mem);
+    }
   }
-  InArgsSizeBytes += typeWidthInBytesOnStack(Ty);
 }
 
 Type TargetX8632::stackSlotType() { return IceType_i32; }
@@ -489,7 +543,8 @@
       RegsUsed[Var->getRegNum()] = true;
       continue;
     }
-    // An argument passed on the stack already has a stack slot.
+    // An argument either does not need a stack slot (if passed in a
+    // register) or already has one (if passed on the stack).
     if (Var->getIsArg())
       continue;
     // An unreferenced variable doesn't need a stack slot.
@@ -547,23 +602,23 @@
 
   resetStackAdjustment();
 
-  // Fill in stack offsets for args, and copy args into registers for
-  // those that were register-allocated.  Args are pushed right to
+  // Fill in stack offsets for stack args, and copy args into registers
+  // for those that were register-allocated.  Args are pushed right to
   // left, so Arg[0] is closest to the stack/frame pointer.
-  //
-  // TODO: Make this right for different width args, calling
-  // conventions, etc.  For one thing, args passed in registers will
-  // need to be copied/shuffled to their home registers (the
-  // RegManager code may have some permutation logic to leverage),
-  // and if they have no home register, home space will need to be
-  // allocated on the stack to copy into.
   Variable *FramePtr = getPhysicalRegister(getFrameOrStackReg());
   size_t BasicFrameOffset = PreservedRegsSizeBytes + RetIpSizeBytes;
   if (!IsEbpBasedFrame)
     BasicFrameOffset += LocalsSizeBytes;
+
+  unsigned NumXmmArgs = 0;
   for (SizeT i = 0; i < Args.size(); ++i) {
     Variable *Arg = Args[i];
-    setArgOffsetAndCopy(Arg, FramePtr, BasicFrameOffset, InArgsSizeBytes);
+    // Skip arguments passed in registers.
+    if (isVectorType(Arg->getType()) && NumXmmArgs < X86_MAX_XMM_ARGS) {
+      ++NumXmmArgs;
+      continue;
+    }
+    finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, InArgsSizeBytes);
   }
 
   // Fill in stack offsets for locals.
@@ -1253,7 +1308,10 @@
     const bool AllowOverlap = true;
     // RI is either a physical register or an immediate.
     Operand *RI = legalize(Src0, Legal_Reg | Legal_Imm, AllowOverlap);
-    _mov(Dest, RI);
+    if (isVectorType(Dest->getType()))
+      _movp(Dest, RI);
+    else
+      _mov(Dest, RI);
   }
 }
 
@@ -1269,31 +1327,44 @@
 }
 
 void TargetX8632::lowerCall(const InstCall *Instr) {
-  // Generate a sequence of push instructions, pushing right to left,
-  // keeping track of stack offsets in case a push involves a stack
-  // operand and we are using an esp-based frame.
+  // Classify each argument operand according to the location where the
+  // argument is passed.
+  OperandList XmmArgs;
+  OperandList StackArgs;
+  for (SizeT i = 0, NumArgs = Instr->getNumArgs(); i < NumArgs; ++i) {
+    Operand *Arg = Instr->getArg(i);
+    if (isVectorType(Arg->getType()) && XmmArgs.size() < X86_MAX_XMM_ARGS) {
+      XmmArgs.push_back(Arg);
+    } else {
+      StackArgs.push_back(Arg);
+    }
+  }
+  // For stack arguments, generate a sequence of push instructions,
+  // pushing right to left, keeping track of stack offsets in case a
+  // push involves a stack operand and we are using an esp-based frame.
   uint32_t StackOffset = 0;
+  // TODO: Consolidate the stack adjustment for function calls by
+  // reserving enough space for the arguments only once.
+  //
   // TODO: If for some reason the call instruction gets dead-code
   // eliminated after lowering, we would need to ensure that the
   // pre-call push instructions and the post-call esp adjustment get
   // eliminated as well.
-  for (SizeT NumArgs = Instr->getNumArgs(), i = 0; i < NumArgs; ++i) {
-    Operand *Arg = legalize(Instr->getArg(NumArgs - i - 1));
+  for (OperandList::reverse_iterator I = StackArgs.rbegin(),
+           E = StackArgs.rend(); I != E; ++I) {
+    Operand *Arg = legalize(*I);
     if (Arg->getType() == IceType_i64) {
       _push(hiOperand(Arg));
       _push(loOperand(Arg));
-    } else if (Arg->getType() == IceType_f64) {
-      // If the Arg turns out to be a memory operand, we need to push
-      // 8 bytes, which requires two push instructions.  This ends up
-      // being somewhat clumsy in the current IR, so we use a
-      // workaround.  Force the operand into a (xmm) register, and
-      // then push the register.  An xmm register push is actually not
-      // possible in x86, but the Push instruction emitter handles
-      // this by decrementing the stack pointer and directly writing
-      // the xmm register value.
-      Variable *T = NULL;
-      _mov(T, Arg);
-      _push(T);
+    } else if (Arg->getType() == IceType_f64 || isVectorType(Arg->getType())) {
+      // If the Arg turns out to be a memory operand, more than one push
+      // instruction is required.  This ends up being somewhat clumsy in
+      // the current IR, so we use a workaround.  Force the operand into
+      // a (xmm) register, and then push the register.  An xmm register
+      // push is actually not possible in x86, but the Push instruction
+      // emitter handles this by decrementing the stack pointer and
+      // directly writing the xmm register value.
+      _push(legalize(Arg, Legal_Reg));
     } else {
       // Otherwise PNaCl requires parameter types to be at least 32-bits.
       assert(Arg->getType() == IceType_f32 || Arg->getType() == IceType_i32);
@@ -1301,11 +1372,28 @@
     }
     StackOffset += typeWidthInBytesOnStack(Arg->getType());
   }
+  // Copy arguments to be passed in registers to the appropriate
+  // registers.
+  // TODO: Investigate the impact of lowering arguments passed in
+  // registers after lowering stack arguments as opposed to the other
+  // way around.  Lowering register arguments after stack arguments may
+  // reduce register pressure.  On the other hand, lowering register
+  // arguments first (before stack arguments) may result in more compact
+  // code, as the memory operand displacements may end up being smaller
+  // before any stack adjustment is done.
+  for (SizeT i = 0, NumXmmArgs = XmmArgs.size(); i < NumXmmArgs; ++i) {
+    Variable *Reg = legalizeToVar(XmmArgs[i], false, Reg_xmm0 + i);
+    // Generate a FakeUse of register arguments so that they do not get
+    // dead code eliminated as a result of the FakeKill of scratch
+    // registers after the call.
+    Context.insert(InstFakeUse::create(Func, Reg));
+  }
   // Generate the call instruction.  Assign its result to a temporary
   // with high register allocation weight.
   Variable *Dest = Instr->getDest();
-  Variable *eax = NULL; // doubles as RegLo as necessary
-  Variable *edx = NULL;
+  // ReturnReg doubles as ReturnRegLo as necessary.
+  Variable *ReturnReg = NULL;
+  Variable *ReturnRegHi = NULL;
   if (Dest) {
     switch (Dest->getType()) {
     case IceType_NUM:
@@ -1317,16 +1405,16 @@
     case IceType_i8:
     case IceType_i16:
     case IceType_i32:
-      eax = makeReg(Dest->getType(), Reg_eax);
+      ReturnReg = makeReg(Dest->getType(), Reg_eax);
       break;
     case IceType_i64:
-      eax = makeReg(IceType_i32, Reg_eax);
-      edx = makeReg(IceType_i32, Reg_edx);
+      ReturnReg = makeReg(IceType_i32, Reg_eax);
+      ReturnRegHi = makeReg(IceType_i32, Reg_edx);
       break;
     case IceType_f32:
     case IceType_f64:
-      // Leave eax==edx==NULL, and capture the result with the fstp
-      // instruction.
+      // Leave ReturnReg==ReturnRegHi==NULL, and capture the result with
+      // the fstp instruction.
       break;
     case IceType_v4i1:
     case IceType_v8i1:
@@ -1334,24 +1422,18 @@
     case IceType_v16i8:
     case IceType_v8i16:
     case IceType_v4i32:
-    case IceType_v4f32: {
-      // TODO(wala): Handle return values of vector type in the caller.
-      IceString Ty;
-      llvm::raw_string_ostream BaseOS(Ty);
-      Ostream OS(&BaseOS);
-      OS << Dest->getType();
-      Func->setError("Unhandled dest type: " + BaseOS.str());
-      return;
-    }
+    case IceType_v4f32:
+      ReturnReg = makeReg(Dest->getType(), Reg_xmm0);
+      break;
     }
   }
   // TODO(stichnot): LEAHACK: remove Legal_All (and use default) once
   // a proper emitter is used.
   Operand *CallTarget = legalize(Instr->getCallTarget(), Legal_All);
-  Inst *NewCall = InstX8632Call::create(Func, eax, CallTarget);
+  Inst *NewCall = InstX8632Call::create(Func, ReturnReg, CallTarget);
   Context.insert(NewCall);
-  if (edx)
-    Context.insert(InstFakeDef::create(Func, edx));
+  if (ReturnRegHi)
+    Context.insert(InstFakeDef::create(Func, ReturnRegHi));
 
   // Add the appropriate offset to esp.
   if (StackOffset) {
@@ -1368,34 +1450,42 @@
   Context.insert(InstFakeKill::create(Func, KilledRegs, NewCall));
 
   // Generate a FakeUse to keep the call live if necessary.
-  if (Instr->hasSideEffects() && eax) {
-    Inst *FakeUse = InstFakeUse::create(Func, eax);
+  if (Instr->hasSideEffects() && ReturnReg) {
+    Inst *FakeUse = InstFakeUse::create(Func, ReturnReg);
     Context.insert(FakeUse);
   }
+  
+  if (!Dest)
+    return;
 
-  // Generate Dest=eax assignment.
-  if (Dest && eax) {
-    if (edx) {
+  // Assign the result of the call to Dest.
+  if (ReturnReg) {
+    if (ReturnRegHi) {
+      assert(Dest->getType() == IceType_i64);
       split64(Dest);
       Variable *DestLo = Dest->getLo();
       Variable *DestHi = Dest->getHi();
-      DestLo->setPreferredRegister(eax, false);
-      DestHi->setPreferredRegister(edx, false);
-      _mov(DestLo, eax);
-      _mov(DestHi, edx);
+      DestLo->setPreferredRegister(ReturnReg, false);
+      DestHi->setPreferredRegister(ReturnRegHi, false);
+      _mov(DestLo, ReturnReg);
+      _mov(DestHi, ReturnRegHi);
     } else {
-      Dest->setPreferredRegister(eax, false);
-      _mov(Dest, eax);
+      assert(Dest->getType() == IceType_i32 || Dest->getType() == IceType_i16 ||
+             Dest->getType() == IceType_i8 || Dest->getType() == IceType_i1 ||
+             isVectorType(Dest->getType()));
+      Dest->setPreferredRegister(ReturnReg, false);
+      if (isVectorType(Dest->getType())) {
+        _movp(Dest, ReturnReg);
+      } else {
+        _mov(Dest, ReturnReg);
+      }
     }
-  }
-
-  // Special treatment for an FP function which returns its result in
-  // st(0).
-  if (Dest &&
-      (Dest->getType() == IceType_f32 || Dest->getType() == IceType_f64)) {
+  } else if (Dest->getType() == IceType_f32 || Dest->getType() == IceType_f64) {
+    // Special treatment for an FP function which returns its result in
+    // st(0).
     _fstp(Dest);
-    // If Dest ends up being a physical xmm register, the fstp emit
-    // code will route st(0) through a temporary stack slot.
+    // If Dest ends up being a physical xmm register, the fstp emit code
+    // will route st(0) through a temporary stack slot.
   }
 }
 

diff --git a/src/IceTargetLoweringX8632.h b/src/IceTargetLoweringX8632.h
index f709704..b2871c0 100644
--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h

@@ -46,6 +46,7 @@
     return (typeWidthInBytes(Ty) + 3) & ~3;
   }
   virtual void emitVariable(const Variable *Var, const Cfg *Func) const;
+  virtual void lowerArguments();
   virtual void addProlog(CfgNode *Node);
   virtual void addEpilog(CfgNode *Node);
   virtual void emitConstants() const;
@@ -56,8 +57,8 @@
   // function calls using the 32-bit push instruction (though the
   // latter could be done by directly writing to the stack).
   void split64(Variable *Var);
-  void setArgOffsetAndCopy(Variable *Arg, Variable *FramePtr,
-                           size_t BasicFrameOffset, size_t &InArgsSizeBytes);
+  void finishArgumentLowering(Variable *Arg, Variable *FramePtr,
+                              size_t BasicFrameOffset, size_t &InArgsSizeBytes);
   Operand *loOperand(Operand *Operand);
   Operand *hiOperand(Operand *Operand);
 

diff --git a/tests_lit/llvm2ice_tests/vector-arg.ll b/tests_lit/llvm2ice_tests/vector-arg.ll
new file mode 100644
index 0000000..ada7dbe
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/vector-arg.ll

@@ -0,0 +1,226 @@
+; This file checks that Subzero generates code in accordance with the
+; calling convention for vectors.
+
+; NOTE: CHECK / OPTM1 lines containing the following strings may be
+; subject to change:
+;
+; * movups: The movups instruction may be changed to movaps when the
+; load / store operation is 16 byte aligned.
+;
+; * stack offsets: These may need to be changed if stack alignment
+; support is implemented.
+;
+; * stack adjustment operations
+
+; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
+; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck --check-prefix=OPTM1 %s
+; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
+; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s
+; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \
+; RUN:                           | FileCheck --check-prefix=DUMP %s
+
+; The first five functions test that vectors are moved from their
+; correct argument location to xmm0.
+
+define <4 x float> @test_returning_arg0(<4 x float> %arg0, <4 x float> %arg1, <4 x float> %arg2, <4 x float> %arg3, <4 x float> %arg4, <4 x float> %arg5) {
+entry:
+  ret <4 x float> %arg0
+; CHECK-LABEL: test_returning_arg0:
+; CHECK-NOT: mov
+; CHECK: ret
+
+; OPTM1-LABEL: test_returning_arg0:
+; OPTM1: movups xmmword ptr [[LOC:.*]], xmm0
+; OPTM1: movups xmm0, xmmword ptr [[LOC]]
+; OPTM1: ret
+}
+
+define <4 x float> @test_returning_arg1(<4 x float> %arg0, <4 x float> %arg1, <4 x float> %arg2, <4 x float> %arg3, <4 x float> %arg4, <4 x float> %arg5) {
+entry:
+  ret <4 x float> %arg1
+; CHECK-LABEL: test_returning_arg1:
+; CHECK: movups xmm0, xmm1
+; CHECK: ret
+
+; OPTM1-LABEL: test_returning_arg1:
+; OPTM1: movups xmmword ptr [[LOC:.*]], xmm1
+; OPTM1: movups xmm0, xmmword ptr [[LOC]]
+; OPTM1: ret
+}
+
+define <4 x float> @test_returning_arg2(<4 x float> %arg0, <4 x float> %arg1, <4 x float> %arg2, <4 x float> %arg3, <4 x float> %arg4, <4 x float> %arg5) {
+entry:
+  ret <4 x float> %arg2
+; CHECK-LABEL: test_returning_arg2:
+; CHECK: movups xmm0, xmm2
+; CHECK: ret
+
+; OPTM1-LABEL: test_returning_arg2:
+; OPTM1: movups xmmword ptr [[LOC:.*]], xmm2
+; OPTM1: movups xmm0, xmmword ptr [[LOC]]
+; OPTM1: ret
+}
+
+define <4 x float> @test_returning_arg3(<4 x float> %arg0, <4 x float> %arg1, <4 x float> %arg2, <4 x float> %arg3, <4 x float> %arg4, <4 x float> %arg5) {
+entry:
+  ret <4 x float> %arg3
+; CHECK-LABEL: test_returning_arg3:
+; CHECK: movups xmm0, xmm3
+; CHECK: ret
+
+; OPTM1-LABEL: test_returning_arg3:
+; OPTM1: movups xmmword ptr [[LOC:.*]], xmm3
+; OPTM1: movups xmm0, xmmword ptr [[LOC]]
+; OPTM1: ret
+}
+
+define <4 x float> @test_returning_arg4(<4 x float> %arg0, <4 x float> %arg1, <4 x float> %arg2, <4 x float> %arg3, <4 x float> %arg4, <4 x float> %arg5) {
+entry:
+  ret <4 x float> %arg4
+; CHECK-LABEL: test_returning_arg4:
+; CHECK: movups xmm0, xmmword ptr [esp+4]
+; CHECK: ret
+
+; OPTM1-LABEL: test_returning_arg4:
+; OPTM1: movups xmm0, xmmword ptr {{.*}}
+; OPTM1: ret
+}
+
+; The next five functions check that xmm arguments are handled
+; correctly when interspersed with stack arguments in the argument
+; list.
+
+define <4 x float> @test_returning_interspersed_arg0(i32 %i32arg0, double %doublearg0, <4 x float> %arg0, <4 x float> %arg1, i32 %i32arg1, <4 x float> %arg2, double %doublearg1, <4 x float> %arg3, i32 %i32arg2, double %doublearg2, float %floatarg0, <4 x float> %arg4, <4 x float> %arg5, float %floatarg1) {
+entry:
+  ret <4 x float> %arg0
+; CHECK-LABEL: test_returning_interspersed_arg0:
+; CHECK-NOT: mov
+; CHECK: ret
+
+; OPTM1-LABEL: test_returning_interspersed_arg0:
+; OPTM1: movups xmmword ptr [[LOC:.*]], xmm0
+; OPTM1: movups xmm0, xmmword ptr [[LOC]]
+; OPTM1: ret
+}
+
+define <4 x float> @test_returning_interspersed_arg1(i32 %i32arg0, double %doublearg0, <4 x float> %arg0, <4 x float> %arg1, i32 %i32arg1, <4 x float> %arg2, double %doublearg1, <4 x float> %arg3, i32 %i32arg2, double %doublearg2, float %floatarg0, <4 x float> %arg4, <4 x float> %arg5, float %floatarg1) {
+entry:
+  ret <4 x float> %arg1
+; CHECK-LABEL: test_returning_interspersed_arg1:
+; CHECK: movups xmm0, xmm1
+; CHECK: ret
+
+; OPTM1-LABEL: test_returning_interspersed_arg1:
+; OPTM1: movups xmmword ptr [[LOC:.*]], xmm1
+; OPTM1: movups xmm0, xmmword ptr [[LOC]]
+; OPTM1: ret
+}
+
+define <4 x float> @test_returning_interspersed_arg2(i32 %i32arg0, double %doublearg0, <4 x float> %arg0, <4 x float> %arg1, i32 %i32arg1, <4 x float> %arg2, double %doublearg1, <4 x float> %arg3, i32 %i32arg2, double %doublearg2, float %floatarg0, <4 x float> %arg4, <4 x float> %arg5, float %floatarg1) {
+entry:
+  ret <4 x float> %arg2
+; CHECK-LABEL: test_returning_interspersed_arg2:
+; CHECK: movups xmm0, xmm2
+; CHECK: ret
+
+; OPTM1-LABEL: test_returning_interspersed_arg2:
+; OPTM1: movups xmmword ptr [[LOC:.*]], xmm2
+; OPTM1: movups xmm0, xmmword ptr [[LOC]]
+; OPTM1: ret
+}
+
+define <4 x float> @test_returning_interspersed_arg3(i32 %i32arg0, double %doublearg0, <4 x float> %arg0, <4 x float> %arg1, i32 %i32arg1, <4 x float> %arg2, double %doublearg1, <4 x float> %arg3, i32 %i32arg2, double %doublearg2, float %floatarg0, <4 x float> %arg4, <4 x float> %arg5, float %floatarg1) {
+entry:
+  ret <4 x float> %arg3
+; CHECK-LABEL: test_returning_interspersed_arg3:
+; CHECK: movups xmm0, xmm3
+; CHECK: ret
+
+; OPTM1-LABEL: test_returning_interspersed_arg3:
+; OPTM1: movups xmmword ptr [[LOC:.*]], xmm3
+; OPTM1: movups xmm0, xmmword ptr [[LOC]]
+; OPTM1: ret
+}
+
+define <4 x float> @test_returning_interspersed_arg4(i32 %i32arg0, double %doublearg0, <4 x float> %arg0, <4 x float> %arg1, i32 %i32arg1, <4 x float> %arg2, double %doublearg1, <4 x float> %arg3, i32 %i32arg2, double %doublearg2, float %floatarg0, <4 x float> %arg4, <4 x float> %arg5, float %floatarg1) {
+entry:
+  ret <4 x float> %arg4
+; CHECK-LABEL: test_returning_interspersed_arg4:
+; CHECK: movups xmm0, xmmword ptr [esp+44]
+; CHECK: ret
+
+; OPTM1-LABEL: test_returning_interspersed_arg4:
+; OPTM1: movups xmm0, xmmword ptr {{.*}}
+; OPTM1: ret
+}
+
+; Test that vectors are passed correctly as arguments to a function.
+
+declare void @VectorArgs(<4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>)
+
+declare void @killXmmRegisters()
+
+define void @test_passing_vectors(<4 x float> %arg0, <4 x float> %arg1, <4 x float> %arg2, <4 x float> %arg3, <4 x float> %arg4, <4 x float> %arg5, <4 x float> %arg6, <4 x float> %arg7, <4 x float> %arg8, <4 x float> %arg9) {
+entry:
+  ; Kills XMM registers so that no in-arg lowering code interferes
+  ; with the test.
+  call void @killXmmRegisters()
+  call void @VectorArgs(<4 x float> %arg9, <4 x float> %arg8, <4 x float> %arg7, <4 x float> %arg6, <4 x float> %arg5, <4 x float> %arg4)
+  ret void
+; CHECK-LABEL: test_passing_vectors:
+; CHECK: movups  [[ARG6:.*]], xmmword ptr [esp+4]
+; CHECK: sub esp, 16
+; CHECK-NEXT: movups xmmword ptr [esp], [[ARG6]]
+; CHECK: movups  [[ARG5:.*]], xmmword ptr [esp+36]
+; CHECK: sub esp, 16
+; CHECK-NEXT: movups xmmword ptr [esp], [[ARG5]]
+; CHECK: movups  xmm0, xmmword ptr [esp+116]
+; CHECK: movups  xmm1, xmmword ptr [esp+100]
+; CHECK: movups  xmm2, xmmword ptr [esp+84]
+; CHECK: movups  xmm3, xmmword ptr [esp+68]
+; CHECK: call VectorArgs
+; CHECK-NEXT: add esp, 32
+; CHECK: ret
+
+; OPTM1-LABEL: test_passing_vectors:
+; OPTM1: movups  [[ARG6:.*]], xmmword ptr {{.*}}
+; OPTM1: sub esp, 16
+; OPTM1: movups xmmword ptr [esp], [[ARG6]]
+; OPTM1: movups  [[ARG5:.*]], xmmword ptr {{.*}}
+; OPTM1: sub esp, 16
+; OPTM1-NEXT: movups xmmword ptr [esp], [[ARG5]]
+; OPTM1: movups  xmm0, xmmword ptr {{.*}}
+; OPTM1: movups  xmm1, xmmword ptr {{.*}}
+; OPTM1: movups  xmm2, xmmword ptr {{.*}}
+; OPTM1: movups  xmm3, xmmword ptr {{.*}}
+; OPTM1: call VectorArgs
+; OPTM1: add esp, 32
+; OPTM1: ret
+}
+
+; Test that a vector returned from a function is recognized to be in
+; xmm0.
+
+declare <4 x float> @VectorReturn(<4 x float> %arg0)
+
+define void @test_receiving_vectors(<4 x float> %arg0) {
+entry:
+  %result = call <4 x float> @VectorReturn(<4 x float> %arg0)
+  %result2 = call <4 x float> @VectorReturn(<4 x float> %result)
+  ret void
+; CHECK-LABEL: test_receiving_vectors:
+; CHECK: call VectorReturn
+; CHECK-NOT: movups xmm0
+; CHECK: call VectorReturn
+; CHECK: ret
+
+; OPTM1-LABEL: test_receiving_vectors:
+; OPTM1: call VectorReturn
+; OPTM1: movups [[LOC:.*]], xmm0
+; OPTM1: movups xmm0, [[LOC]]
+; OPTM1: call VectorReturn
+; OPTM1: ret
+}
+
+; ERRORS-NOT: ICE translation error
+; DUMP-NOT: SZ
commit	45a06236119e496fe2df2469122a15a2e05d8f8e	[log] [tgz]
author	Matt Wala <wala@chromium.org>	Wed Jul 09 16:33:22 2014 -0700
committer	Matt Wala <wala@chromium.org>	Wed Jul 09 16:33:22 2014 -0700
tree	18bd3511f19b2450ced116d5918886865fc35e26
parent	f37fbbe924edccd340b7448a7482c45a9fccda5b [diff]