[SystemZ] Use MVC to spill loads and stores

Try to use MVC when spilling the destination of a simple load or the source
of a simple store.  As explained in the comment, this doesn't yet handle
the case where the load or store location is also a frame index, since
that could lead to two simultaneous scavenger spills, something the
backend can't handle yet.  spill-02.ll tests that this restriction kicks in,
but unfortunately I've not yet found a case that would fail without it.
The volatile trick I used for other scavenger tests doesn't work here
because we can't use MVC for volatile accesses anyway.
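
To illustrate the intent (register numbers, offsets and the 8-byte length
below are made up rather than taken from the new tests): spilling the
result of a simple load currently needs the intermediate register, e.g.

    lg   %r0, 0(%r3)             # original load
    stg  %r0, 160(%r15)          # spill %r0 to its stack slot

whereas folding the spill into an MVC copies the memory directly:

    mvc  160(8,%r15), 0(%r3)     # move 8 bytes straight into the slot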

I'm planning on relaxing the restriction later, hopefully with a test
that does trigger the problem...

Tests @f8 and @f9 also showed that L(G)RL and ST(G)RL were wrongly
classified as SimpleBDX{Load,Store}.  It wouldn't be easy to test for
that bug separately, which is why I didn't split out the fix as a
separate patch.
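
For context on that flag change: the SimpleBDX{Load,Store} flags imply a
base + displacement (+ optional index) operand layout, which is what
isSimpleMove and the new isSimpleBD12Move inspect.  The PC-relative forms
have a register and a PC-relative target instead (operands below are
purely illustrative):

    l    %r1, 0(%r2)      # BDX form: base register and displacement
    lrl  %r1, .Lsym       # RIL form: PC-relative target, no base/index

so LRL/LGRL keep canFoldAsLoad but drop SimpleBDXLoad, and STRL32/STGRL
drop SimpleBDXStore.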

llvm-svn: 185434
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 0d30432..af3b711 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -13,6 +13,7 @@
 
 #include "SystemZInstrInfo.h"
 #include "SystemZInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/Target/TargetMachine.h"
 
 #define GET_INSTRINFO_CTOR
@@ -80,7 +81,8 @@
 // Return 0 otherwise.
 //
 // Flag is SimpleBDXLoad for loads and SimpleBDXStore for stores.
-static int isSimpleMove(const MachineInstr *MI, int &FrameIndex, int Flag) {
+static int isSimpleMove(const MachineInstr *MI, int &FrameIndex,
+                        unsigned Flag) {
   const MCInstrDesc &MCID = MI->getDesc();
   if ((MCID.TSFlags & Flag) &&
       MI->getOperand(1).isFI() &&
@@ -315,6 +317,96 @@
                     FrameIdx);
 }
 
+// Return true if MI is a simple load or store with a 12-bit displacement
+// and no index.  Flag is SimpleBDXLoad for loads and SimpleBDXStore for stores.
+static bool isSimpleBD12Move(const MachineInstr *MI, unsigned Flag) {
+  const MCInstrDesc &MCID = MI->getDesc();
+  return ((MCID.TSFlags & Flag) &&
+          isUInt<12>(MI->getOperand(2).getImm()) &&
+          MI->getOperand(3).getReg() == 0);
+}
+
+// Return a MachineMemOperand for FrameIndex with flags MMOFlags.
+// Offset is the byte offset from the start of FrameIndex.
+static MachineMemOperand *getFrameMMO(MachineFunction &MF, int FrameIndex,
+                                      uint64_t &Offset, unsigned MMOFlags) {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const Value *V = PseudoSourceValue::getFixedStack(FrameIndex);
+  return MF.getMachineMemOperand(MachinePointerInfo(V, Offset), MMOFlags,
+                                 MFI->getObjectSize(FrameIndex),
+                                 MFI->getObjectAlignment(FrameIndex));
+}
+
+MachineInstr *
+SystemZInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+                                        MachineInstr *MI,
+                                        const SmallVectorImpl<unsigned> &Ops,
+                                        int FrameIndex) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  unsigned Size = MFI->getObjectSize(FrameIndex);
+
+  // Early exit for cases we don't care about.
+  if (Ops.size() != 1)
+    return 0;
+
+  unsigned OpNum = Ops[0];
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+  unsigned RegSize = MF.getRegInfo().getRegClass(Reg)->getSize();
+  assert(Size == RegSize && "Invalid size combination");
+
+  // Look for cases where the source of a simple store or the destination
+  // of a simple load is being spilled.  Try to use MVC instead.
+  //
+  // Although MVC is in practice a fast choice in these cases, it is still
+  // logically a bytewise copy.  This means that we cannot use it if the
+  // load or store is volatile.  It also means that the transformation is
+  // not valid in cases where the two memories partially overlap; however,
+  // that is not a problem here, because we know that one of the memories
+  // is a full frame index.
+  //
+  // For now we punt if the load or store is also to a frame index.
+  // In that case we might end up eliminating both of them to out-of-range
+  // offsets, which might then force the register scavenger to spill two
+  // other registers.  The backend can only handle one such scavenger spill
+  // at a time.
+  if (OpNum == 0 && MI->hasOneMemOperand()) {
+    MachineMemOperand *MMO = *MI->memoperands_begin();
+    if (MMO->getSize() == Size && !MMO->isVolatile()) {
+      // Handle conversion of loads.
+      if (isSimpleBD12Move(MI, SystemZII::SimpleBDXLoad) &&
+          !MI->getOperand(1).isFI()) {
+        uint64_t Offset = 0;
+        MachineMemOperand *FrameMMO = getFrameMMO(MF, FrameIndex, Offset,
+                                                  MachineMemOperand::MOStore);
+        return BuildMI(MF, MI->getDebugLoc(), get(SystemZ::MVC))
+          .addFrameIndex(FrameIndex).addImm(Offset).addImm(Size)
+          .addOperand(MI->getOperand(1)).addImm(MI->getOperand(2).getImm())
+          .addMemOperand(FrameMMO).addMemOperand(MMO);
+      }
+      // Handle conversion of stores.
+      if (isSimpleBD12Move(MI, SystemZII::SimpleBDXStore) &&
+          !MI->getOperand(1).isFI()) {
+        uint64_t Offset = 0;
+        MachineMemOperand *FrameMMO = getFrameMMO(MF, FrameIndex, Offset,
+                                                  MachineMemOperand::MOLoad);
+        return BuildMI(MF, MI->getDebugLoc(), get(SystemZ::MVC))
+          .addOperand(MI->getOperand(1)).addImm(MI->getOperand(2).getImm())
+          .addImm(Size).addFrameIndex(FrameIndex).addImm(Offset)
+          .addMemOperand(MMO).addMemOperand(FrameMMO);
+      }
+    }
+  }
+
+  return 0;
+}
+
+MachineInstr *
+SystemZInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
+                                        const SmallVectorImpl<unsigned> &Ops,
+                                        MachineInstr *LoadMI) const {
+  return 0;
+}
+
 bool
 SystemZInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
   switch (MI->getOpcode()) {
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
index d6980f7..8d9a3ea 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -111,6 +111,14 @@
                          unsigned DestReg, int FrameIdx,
                          const TargetRegisterClass *RC,
                          const TargetRegisterInfo *TRI) const LLVM_OVERRIDE;
+  virtual MachineInstr *
+    foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
+                          const SmallVectorImpl<unsigned> &Ops,
+                          int FrameIndex) const;
+  virtual MachineInstr *
+    foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
+                          const SmallVectorImpl<unsigned> &Ops,
+                          MachineInstr *LoadMI) const;
   virtual bool
     expandPostRAPseudo(MachineBasicBlock::iterator MBBI) const LLVM_OVERRIDE;
   virtual bool
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
index 3af41e5..1b53eb0 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -242,11 +242,8 @@
 
 // Register loads.
 let canFoldAsLoad = 1, SimpleBDXLoad = 1 in {
-  defm L   : UnaryRXPair<"l", 0x58, 0xE358, load, GR32>;
-  def  LRL : UnaryRILPC<"lrl", 0xC4D, aligned_load, GR32>;
-
-  def LG   : UnaryRXY<"lg", 0xE304, load, GR64>;
-  def LGRL : UnaryRILPC<"lgrl", 0xC48, aligned_load, GR64>;
+  defm L : UnaryRXPair<"l", 0x58, 0xE358, load, GR32>;
+  def LG : UnaryRXY<"lg", 0xE304, load, GR64>;
 
   // These instructions are split after register allocation, so we don't
   // want a custom inserter.
@@ -255,16 +252,16 @@
                       [(set GR128:$dst, (load bdxaddr20only128:$src))]>;
   }
 }
+let canFoldAsLoad = 1 in {
+  def LRL  : UnaryRILPC<"lrl",  0xC4D, aligned_load, GR32>;
+  def LGRL : UnaryRILPC<"lgrl", 0xC48, aligned_load, GR64>;
+}
 
 // Register stores.
 let SimpleBDXStore = 1 in {
-  let isCodeGenOnly = 1 in {
-    defm ST32   : StoreRXPair<"st", 0x50, 0xE350, store, GR32>;
-    def  STRL32 : StoreRILPC<"strl", 0xC4F, aligned_store, GR32>;
-  }
-
-  def STG   : StoreRXY<"stg", 0xE324, store, GR64>;
-  def STGRL : StoreRILPC<"stgrl", 0xC4B, aligned_store, GR64>;
+  let isCodeGenOnly = 1 in
+    defm ST32 : StoreRXPair<"st", 0x50, 0xE350, store, GR32>;
+  def STG : StoreRXY<"stg", 0xE324, store, GR64>;
 
   // These instructions are split after register allocation, so we don't
   // want a custom inserter.
@@ -273,6 +270,9 @@
                        [(store GR128:$src, bdxaddr20only128:$dst)]>;
   }
 }
+let isCodeGenOnly = 1 in
+  def STRL32 : StoreRILPC<"strl", 0xC4F, aligned_store, GR32>;
+def STGRL : StoreRILPC<"stgrl", 0xC4B, aligned_store, GR64>;
 
 // 8-bit immediate stores to 8-bit fields.
 defm MVI : StoreSIPair<"mvi", 0x92, 0xEB52, truncstorei8, imm32zx8trunc>;