Fix push & pop of XMM registers.
Microsoft's x86-64 calling convention ABI requires registers XMM6-15 to
be preserved by the callee:
https://docs.microsoft.com/en-us/cpp/build/x64-calling-convention?view=vs-2017#calling-convention-defaults
Implement a _pop_reg() analog to _push_reg(), so we can specialize
the XMM treatment. Pass the RegNum all the way down to the specialized
class, because we don't know which registers should be 128-bit at the call site.
Bug chromium:931926
Bug swiftshader:22
Change-Id: I57637c852f0f3bb9a374d61a16a7aaa434ac908d
Reviewed-on: https://swiftshader-review.googlesource.com/c/25468
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Tested-by: Stephen White <senorblanco@chromium.org>
diff --git a/src/Common/Version.h b/src/Common/Version.h
index 423fcf1..58de47f 100644
--- a/src/Common/Version.h
+++ b/src/Common/Version.h
@@ -15,7 +15,7 @@
#define MAJOR_VERSION 4
#define MINOR_VERSION 1
#define BUILD_VERSION 0
-#define BUILD_REVISION 6
+#define BUILD_REVISION 7
#define STRINGIFY(x) #x
#define MACRO_STRINGIFY(x) STRINGIFY(x)
diff --git a/src/Reactor/ReactorUnitTests.cpp b/src/Reactor/ReactorUnitTests.cpp
index 7cbde5d..1242d00 100644
--- a/src/Reactor/ReactorUnitTests.cpp
+++ b/src/Reactor/ReactorUnitTests.cpp
@@ -992,6 +992,94 @@
delete routine;
}
+// Check that a complex generated function which utilizes all 8 or 16 XMM
+// registers computes the correct result.
+// (Note that due to MSC's lack of support for inline assembly in x64,
+// this test does not actually check that the register contents are
+// preserved, just that the generated function computes the correct value.
+// It's necessary to inspect the registers in a debugger to actually verify.)
+TEST(ReactorUnitTests, PreserveXMMRegisters)
+{
+ Routine *routine = nullptr;
+
+ {
+ Function<Void(Pointer<Byte>, Pointer<Byte>)> function;
+ {
+ Pointer<Byte> in = function.Arg<0>();
+ Pointer<Byte> out = function.Arg<1>();
+
+ Float4 a = *Pointer<Float4>(in + 16 * 0);
+ Float4 b = *Pointer<Float4>(in + 16 * 1);
+ Float4 c = *Pointer<Float4>(in + 16 * 2);
+ Float4 d = *Pointer<Float4>(in + 16 * 3);
+ Float4 e = *Pointer<Float4>(in + 16 * 4);
+ Float4 f = *Pointer<Float4>(in + 16 * 5);
+ Float4 g = *Pointer<Float4>(in + 16 * 6);
+ Float4 h = *Pointer<Float4>(in + 16 * 7);
+ Float4 i = *Pointer<Float4>(in + 16 * 8);
+ Float4 j = *Pointer<Float4>(in + 16 * 9);
+ Float4 k = *Pointer<Float4>(in + 16 * 10);
+ Float4 l = *Pointer<Float4>(in + 16 * 11);
+ Float4 m = *Pointer<Float4>(in + 16 * 12);
+ Float4 n = *Pointer<Float4>(in + 16 * 13);
+ Float4 o = *Pointer<Float4>(in + 16 * 14);
+ Float4 p = *Pointer<Float4>(in + 16 * 15);
+
+ Float4 ab = a + b;
+ Float4 cd = c + d;
+ Float4 ef = e + f;
+ Float4 gh = g + h;
+ Float4 ij = i + j;
+ Float4 kl = k + l;
+ Float4 mn = m + n;
+ Float4 op = o + p;
+
+ Float4 abcd = ab + cd;
+ Float4 efgh = ef + gh;
+ Float4 ijkl = ij + kl;
+ Float4 mnop = mn + op;
+
+ Float4 abcdefgh = abcd + efgh;
+ Float4 ijklmnop = ijkl + mnop;
+ Float4 sum = abcdefgh + ijklmnop;
+ *Pointer<Float4>(out) = sum;
+ Return();
+ }
+
+ routine = function("one");
+ assert(routine);
+
+ float input[64] = { 1.0f, 0.0f, 0.0f, 0.0f,
+ -1.0f, 1.0f, -1.0f, 0.0f,
+ 1.0f, 2.0f, -2.0f, 0.0f,
+ -1.0f, 3.0f, -3.0f, 0.0f,
+ 1.0f, 4.0f, -4.0f, 0.0f,
+ -1.0f, 5.0f, -5.0f, 0.0f,
+ 1.0f, 6.0f, -6.0f, 0.0f,
+ -1.0f, 7.0f, -7.0f, 0.0f,
+ 1.0f, 8.0f, -8.0f, 0.0f,
+ -1.0f, 9.0f, -9.0f, 0.0f,
+ 1.0f, 10.0f, -10.0f, 0.0f,
+ -1.0f, 11.0f, -11.0f, 0.0f,
+ 1.0f, 12.0f, -12.0f, 0.0f,
+ -1.0f, 13.0f, -13.0f, 0.0f,
+ 1.0f, 14.0f, -14.0f, 0.0f,
+ -1.0f, 15.0f, -15.0f, 0.0f };
+
+ float result[4];
+ void (*callable)(float*, float*) = (void(*)(float*,float*))routine->getEntry();
+
+ callable(input, result);
+
+ EXPECT_EQ(result[0], 0.0f);
+ EXPECT_EQ(result[1], 120.0f);
+ EXPECT_EQ(result[2], -120.0f);
+ EXPECT_EQ(result[3], 0.0f);
+ }
+
+ delete routine;
+}
+
int main(int argc, char **argv)
{
::testing::InitGoogleTest(&argc, argv);
diff --git a/third_party/subzero/src/IceInstX8664.def b/third_party/subzero/src/IceInstX8664.def
index 3d455a6..cc9b10b 100644
--- a/third_party/subzero/src/IceInstX8664.def
+++ b/third_party/subzero/src/IceInstX8664.def
@@ -222,7 +222,7 @@
// Note: It would be more appropriate to list the xmm register aliases as
// REGLIST0(), but the corresponding empty initializer gives a syntax error, so
// we use REGLIST1() to redundantly assign the register itself as an alias.
-#define REGX8664_XMM_TABLE \
+#define REGX8664_XMM_TABLE2(U, W) \
/* val, encode, name, base, scratch,preserved,stackptr,frameptr,sboxres, \
isGPR,is64,is32,is16,is8, isXmm, \
is64To8,is32To8,is16To8,isTrunc8Rcvr,isAhRcvr, aliases */ \
@@ -239,31 +239,37 @@
NO_ALIASES()) \
X(Reg_xmm5, 5, "xmm5", Reg_xmm5, 1,0,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0, \
NO_ALIASES()) \
- X(Reg_xmm6, 6, "xmm6", Reg_xmm6, 0,0,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0, \
+ X(Reg_xmm6, 6, "xmm6", Reg_xmm6, U,W,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0, \
NO_ALIASES()) \
- X(Reg_xmm7, 7, "xmm7", Reg_xmm7, 0,0,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0, \
+ X(Reg_xmm7, 7, "xmm7", Reg_xmm7, U,W,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0, \
NO_ALIASES()) \
- X(Reg_xmm8, 8, "xmm8", Reg_xmm8, 0,0,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0, \
+ X(Reg_xmm8, 8, "xmm8", Reg_xmm8, U,W,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0, \
NO_ALIASES()) \
- X(Reg_xmm9, 9, "xmm9", Reg_xmm9, 0,0,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0, \
+ X(Reg_xmm9, 9, "xmm9", Reg_xmm9, U,W,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0, \
NO_ALIASES()) \
- X(Reg_xmm10, 10, "xmm10", Reg_xmm10, 0,0,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0, \
+ X(Reg_xmm10, 10, "xmm10", Reg_xmm10, U,W,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0, \
NO_ALIASES()) \
- X(Reg_xmm11, 11, "xmm11", Reg_xmm11, 0,0,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0, \
+ X(Reg_xmm11, 11, "xmm11", Reg_xmm11, U,W,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0, \
NO_ALIASES()) \
- X(Reg_xmm12, 12, "xmm12", Reg_xmm12, 0,0,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0, \
+ X(Reg_xmm12, 12, "xmm12", Reg_xmm12, U,W,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0, \
NO_ALIASES()) \
- X(Reg_xmm13, 13, "xmm13", Reg_xmm13, 0,0,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0, \
+ X(Reg_xmm13, 13, "xmm13", Reg_xmm13, U,W,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0, \
NO_ALIASES()) \
- X(Reg_xmm14, 14, "xmm14", Reg_xmm14, 0,0,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0, \
+ X(Reg_xmm14, 14, "xmm14", Reg_xmm14, U,W,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0, \
NO_ALIASES()) \
- X(Reg_xmm15, 15, "xmm15", Reg_xmm15, 0,0,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0, \
+ X(Reg_xmm15, 15, "xmm15", Reg_xmm15, U,W,0,0,0, 0,0,0,0,0, 1, 0,0,0,0,0, \
NO_ALIASES()) \
/* End of xmm register set */
//#define X(val, encode, name, base, scratch, preserved, stackptr, frameptr,
// sboxres, isGPR, is64, is32, is16, is8, isXmm, is64To8, is32To8,
// is16To8, isTrunc8Rcvr, isAhRcvr, aliases)
+#if defined(SUBZERO_USE_MICROSOFT_ABI) // Microsoft x86-64 ABI
+#define REGX8664_XMM_TABLE REGX8664_XMM_TABLE2(0, 1)
+#else // System V AMD64 ABI
+#define REGX8664_XMM_TABLE REGX8664_XMM_TABLE2(1, 0)
+#endif
+
// We also provide a combined table, so that there is a namespace where
// all of the registers are considered and have distinct numberings.
// This is in contrast to the above, where the "encode" is based on how
diff --git a/third_party/subzero/src/IceTargetLoweringX8632.cpp b/third_party/subzero/src/IceTargetLoweringX8632.cpp
index ef09d24..0a7a56e 100644
--- a/third_party/subzero/src/IceTargetLoweringX8632.cpp
+++ b/third_party/subzero/src/IceTargetLoweringX8632.cpp
@@ -230,7 +230,13 @@
_pop(ebp);
}
-void TargetX8632::_push_reg(Variable *Reg) { _push(Reg); }
+void TargetX8632::_push_reg(RegNumT RegNum) {
+ _push(getPhysicalRegister(RegNum, Traits::WordType));
+}
+
+void TargetX8632::_pop_reg(RegNumT RegNum) {
+ _pop(getPhysicalRegister(RegNum, Traits::WordType));
+}
void TargetX8632::emitGetIP(CfgNode *Node) {
// If there is a non-deleted InstX86GetIP instruction, we need to move it to
diff --git a/third_party/subzero/src/IceTargetLoweringX8632.h b/third_party/subzero/src/IceTargetLoweringX8632.h
index 8a0db78..2715b0f 100644
--- a/third_party/subzero/src/IceTargetLoweringX8632.h
+++ b/third_party/subzero/src/IceTargetLoweringX8632.h
@@ -52,7 +52,8 @@
void _sub_sp(Operand *Adjustment);
void _link_bp();
void _unlink_bp();
- void _push_reg(Variable *Reg);
+ void _push_reg(RegNumT RegNum);
+ void _pop_reg(RegNumT RegNum);
void initRebasePtr();
void initSandbox();
diff --git a/third_party/subzero/src/IceTargetLoweringX8664.cpp b/third_party/subzero/src/IceTargetLoweringX8664.cpp
index de2d996..9cfab50 100644
--- a/third_party/subzero/src/IceTargetLoweringX8664.cpp
+++ b/third_party/subzero/src/IceTargetLoweringX8664.cpp
@@ -292,16 +292,36 @@
}
}
-void TargetX8664::_push_reg(Variable *Reg) {
- Variable *rbp =
- getPhysicalRegister(Traits::RegisterSet::Reg_rbp, Traits::WordType);
- if (Reg != rbp || !NeedSandboxing) {
- _push(Reg);
+void TargetX8664::_push_reg(RegNumT RegNum) {
+ if (Traits::isXmm(RegNum)) {
+ Variable *reg =
+ getPhysicalRegister(RegNum, IceType_v4f32);
+ Variable *rsp =
+ getPhysicalRegister(Traits::RegisterSet::Reg_rsp, Traits::WordType);
+ auto* address = Traits::X86OperandMem::create(Func, reg->getType(), rsp, nullptr);
+ _sub_sp(Ctx->getConstantInt32(16)); // TODO(capn): accumulate all the offsets and adjust the stack pointer once.
+ _storep(reg, address);
+ } else if (RegNum != Traits::RegisterSet::Reg_rbp || !NeedSandboxing) {
+ _push(getPhysicalRegister(RegNum, Traits::WordType));
} else {
_push_rbp();
}
}
+void TargetX8664::_pop_reg(RegNumT RegNum) {
+ if (Traits::isXmm(RegNum)) {
+ Variable *reg =
+ getPhysicalRegister(RegNum, IceType_v4f32);
+ Variable *rsp =
+ getPhysicalRegister(Traits::RegisterSet::Reg_rsp, Traits::WordType);
+ auto* address = Traits::X86OperandMem::create(Func, reg->getType(), rsp, nullptr);
+ _movp(reg, address);
+ _add_sp(Ctx->getConstantInt32(16)); // TODO(capn): accumulate all the offsets and adjust the stack pointer once.
+ } else {
+ _pop(getPhysicalRegister(RegNum, Traits::WordType));
+ }
+}
+
void TargetX8664::emitGetIP(CfgNode *Node) {
// No IP base register is needed on X86-64.
(void)Node;
diff --git a/third_party/subzero/src/IceTargetLoweringX8664.h b/third_party/subzero/src/IceTargetLoweringX8664.h
index e90f66a..ec24df6 100644
--- a/third_party/subzero/src/IceTargetLoweringX8664.h
+++ b/third_party/subzero/src/IceTargetLoweringX8664.h
@@ -55,7 +55,8 @@
void _sub_sp(Operand *Adjustment);
void _link_bp();
void _unlink_bp();
- void _push_reg(Variable *Reg);
+ void _push_reg(RegNumT RegNum);
+ void _pop_reg(RegNumT RegNum);
void initRebasePtr();
void initSandbox();
diff --git a/third_party/subzero/src/IceTargetLoweringX8664Traits.h b/third_party/subzero/src/IceTargetLoweringX8664Traits.h
index 2d7ea95..0f152e5 100644
--- a/third_party/subzero/src/IceTargetLoweringX8664Traits.h
+++ b/third_party/subzero/src/IceTargetLoweringX8664Traits.h
@@ -342,6 +342,18 @@
return ByteRegs[RegNum];
}
+ static bool isXmm(RegNumT RegNum) {
+ static const bool IsXmm [RegisterSet::Reg_NUM] = {
+#define X(val, encode, name, base, scratch, preserved, stackptr, frameptr, \
+ sboxres, isGPR, is64, is32, is16, is8, isXmm, is64To8, is32To8, \
+ is16To8, isTrunc8Rcvr, isAhRcvr, aliases) \
+ isXmm,
+ REGX8664_TABLE
+#undef X
+ };
+ return IsXmm[RegNum];
+ }
+
static XmmRegister getEncodedXmm(RegNumT RegNum) {
static const XmmRegister XmmRegs[RegisterSet::Reg_NUM] = {
#define X(val, encode, name, base, scratch, preserved, stackptr, frameptr, \
diff --git a/third_party/subzero/src/IceTargetLoweringX86Base.h b/third_party/subzero/src/IceTargetLoweringX86Base.h
index 2c7d8f5..fbb7d75 100644
--- a/third_party/subzero/src/IceTargetLoweringX86Base.h
+++ b/third_party/subzero/src/IceTargetLoweringX86Base.h
@@ -693,8 +693,11 @@
Context.insert<typename Traits::Insts::Lea>(Dest, Src0);
}
void _link_bp() { dispatchToConcrete(&Traits::ConcreteTarget::_link_bp); }
- void _push_reg(Variable *Reg) {
- dispatchToConcrete(&Traits::ConcreteTarget::_push_reg, std::move(Reg));
+ void _push_reg(RegNumT RegNum) {
+ dispatchToConcrete(&Traits::ConcreteTarget::_push_reg, std::move(RegNum));
+ }
+ void _pop_reg(RegNumT RegNum) {
+ dispatchToConcrete(&Traits::ConcreteTarget::_pop_reg, std::move(RegNum));
}
void _mfence() { Context.insert<typename Traits::Insts::Mfence>(); }
/// Moves can be used to redefine registers, creating "partial kills" for
diff --git a/third_party/subzero/src/IceTargetLoweringX86BaseImpl.h b/third_party/subzero/src/IceTargetLoweringX86BaseImpl.h
index c63a6e6..dbc0ee2 100644
--- a/third_party/subzero/src/IceTargetLoweringX86BaseImpl.h
+++ b/third_party/subzero/src/IceTargetLoweringX86BaseImpl.h
@@ -1093,7 +1093,7 @@
assert(RegNum == Traits::getBaseReg(RegNum));
++NumCallee;
PreservedRegsSizeBytes += typeWidthInBytes(Traits::WordType);
- _push_reg(getPhysicalRegister(RegNum, Traits::WordType));
+ _push_reg(RegNum);
}
Ctx->statsUpdateRegistersSaved(NumCallee);
@@ -1359,7 +1359,7 @@
continue;
const auto RegNum = RegNumT::fromInt(i);
assert(RegNum == Traits::getBaseReg(RegNum));
- _pop(getPhysicalRegister(RegNum, Traits::WordType));
+ _pop_reg(RegNum);
}
if (!NeedSandboxing) {