Support 16-byte aligned stack on 32-bit Windows.

On Windows x86-32, the ABI only guarantees the stack to be 4-byte
aligned. We therefore need the stack pointer to be explicitly
aligned when using vectors. This demands using a frame pointer (to
access function arguments). Also, spilled variables are now accessed
relative to the stack pointer instead of the frame pointer so that they
are also aligned. This change does not affect PNaCl. Projects using
the Microsoft ABI should define SUBZERO_USE_MICROSOFT_ABI.

BUG=swiftshader:29

Change-Id: I186ce9435244d6fa9494ec514a91122b6be130b3
Reviewed-on: https://chromium-review.googlesource.com/427348
Reviewed-by: Jim Stichnoth <stichnot@chromium.org>
diff --git a/src/IceCfg.cpp b/src/IceCfg.cpp
index 802abfc..f75ca29 100644
--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
@@ -1006,6 +1006,13 @@
   assert(EntryNode);
   // LLVM enforces power of 2 alignment.
   assert(llvm::isPowerOf2_32(StackAlignment));
+  // If the ABI's stack alignment is smaller than the vector size (16 bytes),
+  // conservatively use a frame pointer to allow for explicit alignment of the
+  // stack pointer. This needs to happen before register allocation so the frame
+  // pointer can be reserved.
+  if (getTarget()->needsStackPointerAlignment()) {
+    getTarget()->setHasFramePointer();
+  }
   // Determine if there are large alignment allocations in the entry block or
   // dynamic allocations (variable size in the entry block).
   bool HasLargeAlignment = false;
@@ -1083,7 +1090,7 @@
   // Add instructions to the head of the entry block in reverse order.
   InstList &Insts = getEntryNode()->getInsts();
   if (HasDynamicAllocation && HasLargeAlignment) {
-    // We are using a frame pointer, but fixed large-alignment alloca addresses,
+    // We are using a frame pointer, but fixed large-alignment alloca addresses
     // do not have a known offset from either the stack or frame pointer.
     // They grow up from a user pointer from an alloca.
     sortAndCombineAllocas(AlignedAllocas, MaxAlignment, Insts, BVT_UserPointer);
diff --git a/src/IceInstX8664.def b/src/IceInstX8664.def
index 5b8f121..57038d6 100644
--- a/src/IceInstX8664.def
+++ b/src/IceInstX8664.def
@@ -211,7 +211,7 @@
 //          sboxres, isGPR, is64, is32, is16, is8, isXmm, is64To8, is32To8,
 //          is16To8, isTrunc8Rcvr, isAhRcvr, aliases)
 
-#if defined(_WIN32) && defined(SUBZERO_USE_MICROSOFT_ABI)  // Microsoft x86-64 ABI
+#if defined(SUBZERO_USE_MICROSOFT_ABI)  // Microsoft x86-64 ABI
 #define REGX8664_BYTEREG_TABLE REGX8664_BYTEREG_TABLE2(0, 1)
 #define REGX8664_GPR_TABLE REGX8664_GPR_TABLE2(0, 1)
 #else  // System V AMD64 ABI
diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h
index 41108f8..bfe0a0e 100644
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -250,6 +250,7 @@
   virtual RegNumT getFrameOrStackReg() const = 0;
   virtual size_t typeWidthInBytesOnStack(Type Ty) const = 0;
   virtual uint32_t getStackAlignment() const = 0;
+  virtual bool needsStackPointerAlignment() const { return false; }
   virtual void reserveFixedAllocaArea(size_t Size, size_t Align) = 0;
   virtual int32_t getFrameFixedAllocaOffset() const = 0;
   virtual uint32_t maxOutArgsSizeBytes() const { return 0; }
diff --git a/src/IceTargetLoweringMIPS32.cpp b/src/IceTargetLoweringMIPS32.cpp
index 5110c86..827ef19 100644
--- a/src/IceTargetLoweringMIPS32.cpp
+++ b/src/IceTargetLoweringMIPS32.cpp
@@ -1391,7 +1391,7 @@
   // +------------------------+
   // | 8. padding             |
   // +------------------------+
-  // | 9. out args           |
+  // | 9. out args            |
   // +------------------------+ <--- StackPointer
   //
   // The following variables record the size in bytes of the given areas:
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 89adf3a..ef09d24 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -116,7 +116,12 @@
 const size_t TargetX8632Traits::TableTypeX8632AttributesSize =
     llvm::array_lengthof(TableTypeX8632Attributes);
 
+#if defined(SUBZERO_USE_MICROSOFT_ABI)
+// Windows 32-bit only guarantees 4-byte stack alignment.
+const uint32_t TargetX8632Traits::X86_STACK_ALIGNMENT_BYTES = 4;
+#else
 const uint32_t TargetX8632Traits::X86_STACK_ALIGNMENT_BYTES = 16;
+#endif
 const char *TargetX8632Traits::TargetName = "X8632";
 
 template <>
diff --git a/src/IceTargetLoweringX8664Traits.h b/src/IceTargetLoweringX8664Traits.h
index d0f1d91..2d7ea95 100644
--- a/src/IceTargetLoweringX8664Traits.h
+++ b/src/IceTargetLoweringX8664Traits.h
@@ -702,7 +702,7 @@
 
   static RegNumT getRdxOrDie() { return RegisterSet::Reg_rdx; }
 
-#if defined(_WIN32) && defined(SUBZERO_USE_MICROSOFT_ABI)
+#if defined(SUBZERO_USE_MICROSOFT_ABI)
   // Microsoft x86-64 calling convention:
   //
   // * The first four arguments of vector/fp type, regardless of their
diff --git a/src/IceTargetLoweringX86Base.h b/src/IceTargetLoweringX86Base.h
index db81c18..19e745f 100644
--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -153,6 +153,10 @@
   RegNumT getStackReg() const override { return Traits::StackPtr; }
   RegNumT getFrameReg() const override { return Traits::FramePtr; }
   RegNumT getFrameOrStackReg() const override {
+    // If the stack pointer needs to be aligned, then the frame pointer is
+    // unaligned, so always use the stack pointer.
+    if (needsStackPointerAlignment())
+      return getStackReg();
     return IsEbpBasedFrame ? getFrameReg() : getStackReg();
   }
   size_t typeWidthInBytesOnStack(Type Ty) const override {
@@ -163,6 +167,11 @@
   uint32_t getStackAlignment() const override {
     return Traits::X86_STACK_ALIGNMENT_BYTES;
   }
+  bool needsStackPointerAlignment() const override {
+    // If the ABI's stack alignment is smaller than the vector size (16 bytes),
+    // use the (realigned) stack pointer for addressing any stack variables.
+    return Traits::X86_STACK_ALIGNMENT_BYTES < 16;
+  }
   void reserveFixedAllocaArea(size_t Size, size_t Align) override {
     FixedAllocaSizeBytes = Size;
     assert(llvm::isPowerOf2_32(Align));
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index c368e5f..6931f42 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -998,7 +998,7 @@
   // | 1. return address      |
   // +------------------------+
   // | 2. preserved registers |
-  // +------------------------+
+  // +------------------------+ <--- BasePointer (if used)
   // | 3. padding             |
   // +------------------------+
   // | 4. global spill area   |
@@ -1017,14 +1017,16 @@
   // +------------------------+ <--- StackPointer
   //
   // The following variables record the size in bytes of the given areas:
-  //  * X86_RET_IP_SIZE_BYTES:  area 1
-  //  * PreservedRegsSizeBytes: area 2
-  //  * SpillAreaPaddingBytes:  area 3
-  //  * GlobalsSize:            area 4
+  //  * X86_RET_IP_SIZE_BYTES:   area 1
+  //  * PreservedRegsSizeBytes:  area 2
+  //  * SpillAreaPaddingBytes:   area 3
+  //  * GlobalsSize:             area 4
+  //  * LocalsSlotsPaddingBytes: area 5
   //  * GlobalsAndSubsequentPaddingSize: areas 4 - 5
-  //  * LocalsSpillAreaSize:    area 6
-  //  * SpillAreaSizeBytes:     areas 3 - 10
-  //  * maxOutArgsSizeBytes():  area 10
+  //  * LocalsSpillAreaSize:     area 6
+  //  * FixedAllocaSizeBytes:    areas 7 - 8
+  //  * SpillAreaSizeBytes:      areas 3 - 10
+  //  * maxOutArgsSizeBytes():   areas 9 - 10
 
   // Determine stack frame offsets for each Variable without a register
   // assignment. This can be done as one variable per stack slot. Or, do
@@ -1105,7 +1107,6 @@
   // after the preserved registers and before the spill areas.
   // LocalsSlotsPaddingBytes is the amount of padding between the globals and
   // locals area if they are separate.
-  assert(SpillAreaAlignmentBytes <= Traits::X86_STACK_ALIGNMENT_BYTES);
   assert(LocalsSlotsAlignmentBytes <= SpillAreaAlignmentBytes);
   uint32_t SpillAreaPaddingBytes = 0;
   uint32_t LocalsSlotsPaddingBytes = 0;
@@ -1177,8 +1178,8 @@
   // Fill in stack offsets for stack args, and copy args into registers for
   // those that were register-allocated. Args are pushed right to left, so
   // Arg[0] is closest to the stack/frame pointer.
-  Variable *FramePtr =
-      getPhysicalRegister(getFrameOrStackReg(), Traits::WordType);
+  RegNumT FrameOrStackReg = IsEbpBasedFrame ? getFrameReg() : getStackReg();
+  Variable *FramePtr = getPhysicalRegister(FrameOrStackReg, Traits::WordType);
   size_t BasicFrameOffset =
       PreservedRegsSizeBytes + Traits::X86_RET_IP_SIZE_BYTES;
   if (!IsEbpBasedFrame)
@@ -1226,7 +1227,7 @@
   // Fill in stack offsets for locals.
   assignVarStackSlots(SortedSpilledVariables, SpillAreaPaddingBytes,
                       SpillAreaSizeBytes, GlobalsAndSubsequentPaddingSize,
-                      IsEbpBasedFrame);
+                      IsEbpBasedFrame && !needsStackPointerAlignment());
   // Assign stack offsets to variables that have been linked to spilled
   // variables.
   for (Variable *Var : VariablesLinkedToSpillSlots) {