Fix push & pop of XMM registers.

Microsoft's x86-64 calling convention ABI requires registers XMM6-15 to
be preserved by the callee:
https://docs.microsoft.com/en-us/cpp/build/x64-calling-convention?view=vs-2017#calling-convention-defaults

Implement a _pop_reg() analog to _push_reg(), so we can specialize
the XMM treatment. Pass the RegNum all the way down to the specialized
class, because we don't know which registers should be 128-bit at the call site.

Bug chromium:931926
Bug swiftshader:22

Change-Id: I57637c852f0f3bb9a374d61a16a7aaa434ac908d
Reviewed-on: https://swiftshader-review.googlesource.com/c/25468
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Tested-by: Stephen White <senorblanco@chromium.org>
diff --git a/src/Common/Version.h b/src/Common/Version.h
index 423fcf1..58de47f 100644
--- a/src/Common/Version.h
+++ b/src/Common/Version.h
@@ -15,7 +15,7 @@
 #define MAJOR_VERSION 4
 #define MINOR_VERSION 1
 #define BUILD_VERSION 0
-#define BUILD_REVISION 6
+#define BUILD_REVISION 7
 
 #define STRINGIFY(x) #x
 #define MACRO_STRINGIFY(x) STRINGIFY(x)
diff --git a/src/Reactor/ReactorUnitTests.cpp b/src/Reactor/ReactorUnitTests.cpp
index 7cbde5d..1242d00 100644
--- a/src/Reactor/ReactorUnitTests.cpp
+++ b/src/Reactor/ReactorUnitTests.cpp
@@ -992,6 +992,94 @@
 	delete routine;
 }
 
+// Check that a complex generated function which utilizes all 8 or 16 XMM
+// registers computes the correct result.
+// (Note that due to MSC's lack of support for inline assembly in x64,
+// this test does not actually check that the register contents are 
+// preserved, just that the generated function computes the correct value.
+// It's necessary to inspect the registers in a debugger to actually verify.)
+TEST(ReactorUnitTests, PreserveXMMRegisters)
+{
+    Routine *routine = nullptr;
+
+    {
+        Function<Void(Pointer<Byte>, Pointer<Byte>)> function;
+        {
+            Pointer<Byte> in = function.Arg<0>();
+            Pointer<Byte> out = function.Arg<1>();
+
+            Float4 a = *Pointer<Float4>(in + 16 * 0);
+            Float4 b = *Pointer<Float4>(in + 16 * 1);
+            Float4 c = *Pointer<Float4>(in + 16 * 2);
+            Float4 d = *Pointer<Float4>(in + 16 * 3);
+            Float4 e = *Pointer<Float4>(in + 16 * 4);
+            Float4 f = *Pointer<Float4>(in + 16 * 5);
+            Float4 g = *Pointer<Float4>(in + 16 * 6);
+            Float4 h = *Pointer<Float4>(in + 16 * 7);
+            Float4 i = *Pointer<Float4>(in + 16 * 8);
+            Float4 j = *Pointer<Float4>(in + 16 * 9);
+            Float4 k = *Pointer<Float4>(in + 16 * 10);
+            Float4 l = *Pointer<Float4>(in + 16 * 11);
+            Float4 m = *Pointer<Float4>(in + 16 * 12);
+            Float4 n = *Pointer<Float4>(in + 16 * 13);
+            Float4 o = *Pointer<Float4>(in + 16 * 14);
+            Float4 p = *Pointer<Float4>(in + 16 * 15);
+
+            Float4 ab = a + b;
+            Float4 cd = c + d;
+            Float4 ef = e + f;
+            Float4 gh = g + h;
+            Float4 ij = i + j;
+            Float4 kl = k + l;
+            Float4 mn = m + n;
+            Float4 op = o + p;
+
+            Float4 abcd = ab + cd;
+            Float4 efgh = ef + gh;
+            Float4 ijkl = ij + kl;
+            Float4 mnop = mn + op;
+
+            Float4 abcdefgh = abcd + efgh;
+            Float4 ijklmnop = ijkl + mnop;
+            Float4 sum = abcdefgh + ijklmnop;
+            *Pointer<Float4>(out) = sum;
+            Return();
+        }
+
+        routine = function("one");
+        assert(routine);
+
+        float input[64] = { 1.0f,  0.0f,   0.0f, 0.0f,
+                           -1.0f,  1.0f,  -1.0f, 0.0f,
+                            1.0f,  2.0f,  -2.0f, 0.0f,
+                           -1.0f,  3.0f,  -3.0f, 0.0f,
+                            1.0f,  4.0f,  -4.0f, 0.0f,
+                           -1.0f,  5.0f,  -5.0f, 0.0f,
+                            1.0f,  6.0f,  -6.0f, 0.0f,
+                           -1.0f,  7.0f,  -7.0f, 0.0f,
+                            1.0f,  8.0f,  -8.0f, 0.0f,
+                           -1.0f,  9.0f,  -9.0f, 0.0f,
+                            1.0f, 10.0f, -10.0f, 0.0f,
+                           -1.0f, 11.0f, -11.0f, 0.0f,
+                            1.0f, 12.0f, -12.0f, 0.0f,
+                           -1.0f, 13.0f, -13.0f, 0.0f,
+                            1.0f, 14.0f, -14.0f, 0.0f,
+                           -1.0f, 15.0f, -15.0f, 0.0f };
+
+        float result[4];
+        void (*callable)(float*, float*) = (void(*)(float*,float*))routine->getEntry();
+
+        callable(input, result);
+
+        EXPECT_EQ(result[0], 0.0f);
+        EXPECT_EQ(result[1], 120.0f);
+        EXPECT_EQ(result[2], -120.0f);
+        EXPECT_EQ(result[3], 0.0f);
+    }
+
+    delete routine;
+}
+
 int main(int argc, char **argv)
 {
 	::testing::InitGoogleTest(&argc, argv);
diff --git a/third_party/subzero/src/IceInstX8664.def b/third_party/subzero/src/IceInstX8664.def
index 3d455a6..cc9b10b 100644
--- a/third_party/subzero/src/IceInstX8664.def
+++ b/third_party/subzero/src/IceInstX8664.def
@@ -222,7 +222,7 @@
 // Note: It would be more appropriate to list the xmm register aliases as
 // REGLIST0(), but the corresponding empty initializer gives a syntax error, so
 // we use REGLIST1() to redundantly assign the register itself as an alias.
-#define REGX8664_XMM_TABLE                                                     \
+#define REGX8664_XMM_TABLE2(U, W)                                              \
   /* val, encode, name, base, scratch,preserved,stackptr,frameptr,sboxres,     \
      isGPR,is64,is32,is16,is8, isXmm,                                          \
      is64To8,is32To8,is16To8,isTrunc8Rcvr,isAhRcvr, aliases */                 \
@@ -239,31 +239,37 @@
     NO_ALIASES())                                                              \
   X(Reg_xmm5,   5, "xmm5",  Reg_xmm5,  1,0,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0,     \
     NO_ALIASES())                                                              \
-  X(Reg_xmm6,   6, "xmm6",  Reg_xmm6,  0,0,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0,     \
+  X(Reg_xmm6,   6, "xmm6",  Reg_xmm6,  U,W,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0,     \
     NO_ALIASES())                                                              \
-  X(Reg_xmm7,   7, "xmm7",  Reg_xmm7,  0,0,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0,     \
+  X(Reg_xmm7,   7, "xmm7",  Reg_xmm7,  U,W,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0,     \
     NO_ALIASES())                                                              \
-  X(Reg_xmm8,   8, "xmm8",  Reg_xmm8,  0,0,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0,     \
+  X(Reg_xmm8,   8, "xmm8",  Reg_xmm8,  U,W,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0,     \
     NO_ALIASES())                                                              \
-  X(Reg_xmm9,   9, "xmm9",  Reg_xmm9,  0,0,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0,     \
+  X(Reg_xmm9,   9, "xmm9",  Reg_xmm9,  U,W,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0,     \
     NO_ALIASES())                                                              \
-  X(Reg_xmm10, 10, "xmm10", Reg_xmm10, 0,0,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0,     \
+  X(Reg_xmm10, 10, "xmm10", Reg_xmm10, U,W,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0,     \
     NO_ALIASES())                                                              \
-  X(Reg_xmm11, 11, "xmm11", Reg_xmm11, 0,0,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0,     \
+  X(Reg_xmm11, 11, "xmm11", Reg_xmm11, U,W,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0,     \
     NO_ALIASES())                                                              \
-  X(Reg_xmm12, 12, "xmm12", Reg_xmm12, 0,0,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0,     \
+  X(Reg_xmm12, 12, "xmm12", Reg_xmm12, U,W,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0,     \
     NO_ALIASES())                                                              \
-  X(Reg_xmm13, 13, "xmm13", Reg_xmm13, 0,0,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0,     \
+  X(Reg_xmm13, 13, "xmm13", Reg_xmm13, U,W,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0,     \
     NO_ALIASES())                                                              \
-  X(Reg_xmm14, 14, "xmm14", Reg_xmm14, 0,0,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0,     \
+  X(Reg_xmm14, 14, "xmm14", Reg_xmm14, U,W,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0,     \
     NO_ALIASES())                                                              \
-  X(Reg_xmm15, 15, "xmm15", Reg_xmm15, 0,0,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0,     \
+  X(Reg_xmm15, 15, "xmm15", Reg_xmm15, U,W,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0,     \
     NO_ALIASES())                                                              \
   /* End of xmm register set */
 //#define X(val, encode, name, base, scratch, preserved, stackptr, frameptr,
 //          sboxres, isGPR, is64, is32, is16, is8, isXmm, is64To8, is32To8,
 //          is16To8, isTrunc8Rcvr, isAhRcvr, aliases)
 
+#if defined(SUBZERO_USE_MICROSOFT_ABI)  // Microsoft x86-64 ABI
+#define REGX8664_XMM_TABLE REGX8664_XMM_TABLE2(0, 1)
+#else  // System V AMD64 ABI
+#define REGX8664_XMM_TABLE REGX8664_XMM_TABLE2(1, 0)
+#endif
+
 // We also provide a combined table, so that there is a namespace where
 // all of the registers are considered and have distinct numberings.
 // This is in contrast to the above, where the "encode" is based on how
diff --git a/third_party/subzero/src/IceTargetLoweringX8632.cpp b/third_party/subzero/src/IceTargetLoweringX8632.cpp
index ef09d24..0a7a56e 100644
--- a/third_party/subzero/src/IceTargetLoweringX8632.cpp
+++ b/third_party/subzero/src/IceTargetLoweringX8632.cpp
@@ -230,7 +230,13 @@
   _pop(ebp);
 }
 
-void TargetX8632::_push_reg(Variable *Reg) { _push(Reg); }
+void TargetX8632::_push_reg(RegNumT RegNum) {
+  _push(getPhysicalRegister(RegNum, Traits::WordType));
+}
+
+void TargetX8632::_pop_reg(RegNumT RegNum) {
+  _pop(getPhysicalRegister(RegNum, Traits::WordType));
+}
 
 void TargetX8632::emitGetIP(CfgNode *Node) {
   // If there is a non-deleted InstX86GetIP instruction, we need to move it to
diff --git a/third_party/subzero/src/IceTargetLoweringX8632.h b/third_party/subzero/src/IceTargetLoweringX8632.h
index 8a0db78..2715b0f 100644
--- a/third_party/subzero/src/IceTargetLoweringX8632.h
+++ b/third_party/subzero/src/IceTargetLoweringX8632.h
@@ -52,7 +52,8 @@
   void _sub_sp(Operand *Adjustment);
   void _link_bp();
   void _unlink_bp();
-  void _push_reg(Variable *Reg);
+  void _push_reg(RegNumT RegNum);
+  void _pop_reg(RegNumT RegNum);
 
   void initRebasePtr();
   void initSandbox();
diff --git a/third_party/subzero/src/IceTargetLoweringX8664.cpp b/third_party/subzero/src/IceTargetLoweringX8664.cpp
index de2d996..9cfab50 100644
--- a/third_party/subzero/src/IceTargetLoweringX8664.cpp
+++ b/third_party/subzero/src/IceTargetLoweringX8664.cpp
@@ -292,16 +292,36 @@
   }
 }
 
-void TargetX8664::_push_reg(Variable *Reg) {
-  Variable *rbp =
-      getPhysicalRegister(Traits::RegisterSet::Reg_rbp, Traits::WordType);
-  if (Reg != rbp || !NeedSandboxing) {
-    _push(Reg);
+void TargetX8664::_push_reg(RegNumT RegNum) {
+  if (Traits::isXmm(RegNum)) {
+    Variable *reg =
+        getPhysicalRegister(RegNum, IceType_v4f32);
+    Variable *rsp =
+        getPhysicalRegister(Traits::RegisterSet::Reg_rsp, Traits::WordType);
+    auto* address = Traits::X86OperandMem::create(Func, reg->getType(), rsp, nullptr);
+    _sub_sp(Ctx->getConstantInt32(16)); // TODO(capn): accumulate all the offsets and adjust the stack pointer once.
+    _storep(reg, address);
+  } else if (RegNum != Traits::RegisterSet::Reg_rbp || !NeedSandboxing) {
+    _push(getPhysicalRegister(RegNum, Traits::WordType));
   } else {
     _push_rbp();
   }
 }
 
+void TargetX8664::_pop_reg(RegNumT RegNum) {
+  if (Traits::isXmm(RegNum)) {
+    Variable *reg =
+        getPhysicalRegister(RegNum, IceType_v4f32);
+    Variable *rsp =
+        getPhysicalRegister(Traits::RegisterSet::Reg_rsp, Traits::WordType);
+    auto* address = Traits::X86OperandMem::create(Func, reg->getType(), rsp, nullptr);
+    _movp(reg, address);
+    _add_sp(Ctx->getConstantInt32(16)); // TODO(capn): accumulate all the offsets and adjust the stack pointer once.
+  } else {
+    _pop(getPhysicalRegister(RegNum, Traits::WordType));
+  }
+}
+
 void TargetX8664::emitGetIP(CfgNode *Node) {
   // No IP base register is needed on X86-64.
   (void)Node;
diff --git a/third_party/subzero/src/IceTargetLoweringX8664.h b/third_party/subzero/src/IceTargetLoweringX8664.h
index e90f66a..ec24df6 100644
--- a/third_party/subzero/src/IceTargetLoweringX8664.h
+++ b/third_party/subzero/src/IceTargetLoweringX8664.h
@@ -55,7 +55,8 @@
   void _sub_sp(Operand *Adjustment);
   void _link_bp();
   void _unlink_bp();
-  void _push_reg(Variable *Reg);
+  void _push_reg(RegNumT RegNum);
+  void _pop_reg(RegNumT RegNum);
 
   void initRebasePtr();
   void initSandbox();
diff --git a/third_party/subzero/src/IceTargetLoweringX8664Traits.h b/third_party/subzero/src/IceTargetLoweringX8664Traits.h
index 2d7ea95..0f152e5 100644
--- a/third_party/subzero/src/IceTargetLoweringX8664Traits.h
+++ b/third_party/subzero/src/IceTargetLoweringX8664Traits.h
@@ -342,6 +342,18 @@
     return ByteRegs[RegNum];
   }
 
+  static bool isXmm(RegNumT RegNum) {
+    static const bool IsXmm [RegisterSet::Reg_NUM] = {
+#define X(val, encode, name, base, scratch, preserved, stackptr, frameptr,     \
+          sboxres, isGPR, is64, is32, is16, is8, isXmm, is64To8, is32To8,      \
+          is16To8, isTrunc8Rcvr, isAhRcvr, aliases)                            \
+        isXmm,
+        REGX8664_TABLE
+#undef X
+    };
+    return IsXmm[RegNum];
+  }
+
   static XmmRegister getEncodedXmm(RegNumT RegNum) {
     static const XmmRegister XmmRegs[RegisterSet::Reg_NUM] = {
 #define X(val, encode, name, base, scratch, preserved, stackptr, frameptr,     \
diff --git a/third_party/subzero/src/IceTargetLoweringX86Base.h b/third_party/subzero/src/IceTargetLoweringX86Base.h
index 2c7d8f5..fbb7d75 100644
--- a/third_party/subzero/src/IceTargetLoweringX86Base.h
+++ b/third_party/subzero/src/IceTargetLoweringX86Base.h
@@ -693,8 +693,11 @@
     Context.insert<typename Traits::Insts::Lea>(Dest, Src0);
   }
   void _link_bp() { dispatchToConcrete(&Traits::ConcreteTarget::_link_bp); }
-  void _push_reg(Variable *Reg) {
-    dispatchToConcrete(&Traits::ConcreteTarget::_push_reg, std::move(Reg));
+  void _push_reg(RegNumT RegNum) {
+    dispatchToConcrete(&Traits::ConcreteTarget::_push_reg, std::move(RegNum));
+  }
+  void _pop_reg(RegNumT RegNum) {
+    dispatchToConcrete(&Traits::ConcreteTarget::_pop_reg, std::move(RegNum));
   }
   void _mfence() { Context.insert<typename Traits::Insts::Mfence>(); }
   /// Moves can be used to redefine registers, creating "partial kills" for
diff --git a/third_party/subzero/src/IceTargetLoweringX86BaseImpl.h b/third_party/subzero/src/IceTargetLoweringX86BaseImpl.h
index c63a6e6..dbc0ee2 100644
--- a/third_party/subzero/src/IceTargetLoweringX86BaseImpl.h
+++ b/third_party/subzero/src/IceTargetLoweringX86BaseImpl.h
@@ -1093,7 +1093,7 @@
     assert(RegNum == Traits::getBaseReg(RegNum));
     ++NumCallee;
     PreservedRegsSizeBytes += typeWidthInBytes(Traits::WordType);
-    _push_reg(getPhysicalRegister(RegNum, Traits::WordType));
+    _push_reg(RegNum);
   }
   Ctx->statsUpdateRegistersSaved(NumCallee);
 
@@ -1359,7 +1359,7 @@
       continue;
     const auto RegNum = RegNumT::fromInt(i);
     assert(RegNum == Traits::getBaseReg(RegNum));
-    _pop(getPhysicalRegister(RegNum, Traits::WordType));
+    _pop_reg(RegNum);
   }
 
   if (!NeedSandboxing) {