[AArch64] Static (de)allocation of SVE stack objects.

Adds support to AArch64FrameLowering to allocate fixed-stack SVE objects.

The focus of this patch is purely to allow the stack frame to
allocate/deallocate space for scalable SVE objects. Dynamic
allocation (i.e. determining the placement of SVE objects on the
stack at compile time) and resolving frame-index references that
include scalable-sized offsets are left for subsequent patches.

SVE objects are allocated in the stack frame as a separate region
below the callee-save area and above the alignment gap. This is done
so that the SVE objects can be accessed directly from the FP at
(runtime) VL-based offsets, benefiting from the VL-scaled addressing
modes.
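
For example (a sketch, not part of this patch): a purely scalable
offset of one data vector below the FP can be expressed with the
StackOffset type extended below, where MVT::nxv16i8 contributes 16
scalable bytes per unit:

     // Hypothetical: the first SVE object, one full data vector below FP.
     StackOffset Offset(-1, MVT::nxv16i8);
     assert(Offset.getScalableBytes() == -16 && Offset.getBytes() == 0);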

The layout looks as follows:

     +-------------+
     | stack arg   |
     +-------------+
     | Callee Saves|
     |   X29, X30  |       (if available)
     |-------------| <- FP (if available)
     |     :       |
     |  SVE area   |
     |     :       |
     +-------------+
     |/////////////| alignment gap.
     |     :       |
     | Stack objs  |
     |     :       |
     +-------------+ <- SP after call and frame-setup

SVE and non-SVE stack objects are distinguished using different
StackIDs. The offsets for objects with TargetStackID::SVEVector should be
interpreted as purely scalable offsets within their respective SVE region.
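
For illustration, a fixed-stack SVE object could be created and tagged
as follows (a hedged sketch; the hypothetical size and offset values
denote scalable bytes):

     // Reserve one SVE data vector (16 scalable bytes) in the SVE region;
     // its negative offset is interpreted as scalable within that region.
     int FI = MFI.CreateFixedObject(/*Size=*/16, /*SPOffset=*/-16,
                                    /*IsImmutable=*/false);
     MFI.setStackID(FI, TargetStackID::SVEVector);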

Reviewers: thegameg, rovka, t.p.northover, efriedma, rengolin, greened

Reviewed By: efriedma

Differential Revision: https://reviews.llvm.org/D61437

llvm-svn: 373585
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 8357b76..c42c16b 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -55,6 +55,10 @@
 // | callee-saved fp/simd/SVE regs     |
 // |                                   |
 // |-----------------------------------|
+// |                                   |
+// |        SVE stack objects          |
+// |                                   |
+// |-----------------------------------|
 // |.empty.space.to.make.part.below....|
 // |.aligned.in.case.it.needs.more.than| (size of this area is unknown at
 // |.the.standard.16-byte.alignment....|  compile time; if present)
@@ -202,6 +206,12 @@
   return DefaultSafeSPDisplacement;
 }
 
+/// Returns the size of the entire SVE stack frame (callee-saves + spills).
+static StackOffset getSVEStackSize(const MachineFunction &MF) {
+  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  return {(int64_t)AFI->getStackSizeSVE(), MVT::nxv1i8};
+}
+
 bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
   if (!EnableRedZone)
     return false;
@@ -214,7 +224,8 @@
   const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   unsigned NumBytes = AFI->getLocalStackSize();
 
-  return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128);
+  return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128 ||
+           getSVEStackSize(MF));
 }
 
 /// hasFP - Return true if the specified function should have a dedicated frame
@@ -456,6 +467,11 @@
   if (canUseRedZone(MF))
     return false;
 
+  // When there is an SVE area on the stack, always allocate the
+  // callee-saves and spills/locals separately.
+  if (getSVEStackSize(MF))
+    return false;
+
   return true;
 }
 
@@ -870,6 +886,8 @@
   // Ideally it should match SP value after prologue.
   AFI->setTaggedBasePointerOffset(MFI.getStackSize());
 
+  const StackOffset &SVEStackSize = getSVEStackSize(MF);
+
   // getStackSize() includes all the locals in its size calculation. We don't
   // include these locals when computing the stack size of a funclet, as they
   // are allocated in the parent's stack frame and accessed via the frame
@@ -880,6 +898,8 @@
                            : (int)MFI.getStackSize();
   if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) {
     assert(!HasFP && "unexpected function without stack frame but with FP");
+    assert(!SVEStackSize &&
+           "unexpected function without stack frame but with SVE objects");
     // All of the stack allocation is for locals.
     AFI->setLocalStackSize(NumBytes);
     if (!NumBytes)
@@ -926,6 +946,7 @@
   AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
   bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
   if (CombineSPBump) {
+    assert(!SVEStackSize && "Cannot combine SP bump with SVE");
     emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
                     {-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup, false,
                     NeedsWinCFI, &HasWinCFI);
@@ -1083,6 +1104,9 @@
     NumBytes = 0;
   }
 
+  emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -SVEStackSize, TII,
+                  MachineInstr::FrameSetup);
+
   // Allocate space for the rest of the frame.
   if (NumBytes) {
     const bool NeedsRealignment = RegInfo->needsStackRealignment(MF);
@@ -1431,8 +1455,11 @@
         .setMIFlag(MachineInstr::FrameDestroy);
   }
 
+  const StackOffset &SVEStackSize = getSVEStackSize(MF);
+
   // If there is a single SP update, insert it before the ret and we're done.
   if (CombineSPBump) {
+    assert(!SVEStackSize && "Cannot combine SP bump with SVE");
     emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
                     {NumBytes + (int64_t)AfterCSRPopSize, MVT::i8}, TII,
                     MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
@@ -1446,6 +1473,12 @@
   NumBytes -= PrologueSaveSize;
   assert(NumBytes >= 0 && "Negative stack allocation size!?");
 
+  // Deallocate the SVE area.
+  if (SVEStackSize && !AFI->isStackRealigned())
+    emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, SVEStackSize,
+                    TII, MachineInstr::FrameDestroy);
+
   if (!hasFP(MF)) {
     bool RedZone = canUseRedZone(MF);
     // If this was a redzone leaf function, we don't need to restore the
@@ -1595,6 +1628,11 @@
   bool isCSR =
       !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize());
 
+  const StackOffset &SVEStackSize = getSVEStackSize(MF);
+  if (SVEStackSize)
+    llvm_unreachable("Accessing frame indices in presence of SVE "
+                     "not yet supported");
+
   // Use frame pointer to reference fixed objects. Use it for locals if
   // there are VLAs or a dynamically realigned SP (and thus the SP isn't
   // reliable as a base). Make sure useFPForScavengingIndex() does the
@@ -2175,8 +2213,19 @@
              << ' ' << printReg(Reg, RegInfo);
              dbgs() << "\n";);
 
+  bool HasSVEStackObjects = [&MFI]() {
+    for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
+      if (MFI.getStackID(I) == TargetStackID::SVEVector &&
+          MFI.getObjectOffset(I) < 0)
+        return true;
+    // Note: We don't take allocatable stack objects into
+    // account yet, because allocation for those is not yet
+    // implemented.
+    return false;
+  }();
+
   // If any callee-saved registers are used, the frame cannot be eliminated.
-  bool CanEliminateFrame = SavedRegs.count() == 0;
+  bool CanEliminateFrame = (SavedRegs.count() == 0) && !HasSVEStackObjects;
 
   // The CSR spill slots have not been allocated yet, so estimateStackSize
   // won't include them.
@@ -2239,12 +2288,34 @@
 
 void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
     MachineFunction &MF, RegScavenger *RS) const {
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown &&
+         "Upwards growing stack unsupported");
+
+  // Process all fixed stack SVE objects.
+  int64_t Offset = 0;
+  for (int I = MFI.getObjectIndexBegin(); I != 0; ++I) {
+    unsigned StackID = MFI.getStackID(I);
+    if (StackID == TargetStackID::SVEVector) {
+      int64_t FixedOffset = -MFI.getObjectOffset(I);
+      if (FixedOffset > Offset)
+        Offset = FixedOffset;
+    }
+  }
+
+  unsigned MaxAlign = getStackAlignment();
+  uint64_t SVEStackSize = alignTo(Offset, MaxAlign);
+
+  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  AFI->setStackSizeSVE(SVEStackSize);
+  assert(MaxAlign <= 16 && "Cannot align scalable vectors more than 16 bytes");
+
   // If this function isn't doing Win64-style C++ EH, we don't need to do
   // anything.
   if (!MF.hasEHFunclets())
     return;
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
-  MachineFrameInfo &MFI = MF.getFrameInfo();
   WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();
 
   MachineBasicBlock &MBB = MF.front();
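
To illustrate the SVE size computation in
processFunctionBeforeFrameFinalized above (a sketch with hypothetical
values): two fixed SVE objects at scalable offsets -16 and -32 leave
the largest negated offset, rounded up to the stack alignment:

     // Mirrors the loop above with illustrative values.
     int64_t Offset = 0;
     Offset = std::max<int64_t>(Offset, 16); // object at scalable offset -16
     Offset = std::max<int64_t>(Offset, 32); // object at scalable offset -32
     uint64_t SVEStackSize = alignTo(Offset, 16); // 32 scalable bytes
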
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
index 7ed20d2..99d868a 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -87,6 +87,17 @@
                                int FI) const override;
   int getSEHFrameIndexOffset(const MachineFunction &MF, int FI) const;
 
+  bool isSupportedStackID(TargetStackID::Value ID) const override {
+    switch (ID) {
+    default:
+      return false;
+    case TargetStackID::Default:
+    case TargetStackID::SVEVector:
+    case TargetStackID::NoAlloc:
+      return true;
+    }
+  }
+
 private:
   bool shouldCombineCSRLocalStackBump(MachineFunction &MF,
                                       unsigned StackBumpBytes) const;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 097a8ba..1cc3177 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -3046,6 +3046,16 @@
     MaxEncoding = 0xfff;
     ShiftSize = 12;
     break;
+  case AArch64::ADDVL_XXI:
+  case AArch64::ADDPL_XXI:
+    MaxEncoding = 31;
+    ShiftSize = 0;
+    if (Offset < 0) {
+      MaxEncoding = 32;
+      Sign = -1;
+      Offset = -Offset;
+    }
+    break;
   default:
     llvm_unreachable("Unsupported opcode");
   }
@@ -3117,8 +3127,8 @@
                            StackOffset Offset, const TargetInstrInfo *TII,
                            MachineInstr::MIFlag Flag, bool SetNZCV,
                            bool NeedsWinCFI, bool *HasWinCFI) {
-  int64_t Bytes;
-  Offset.getForFrameOffset(Bytes);
+  int64_t Bytes, NumPredicateVectors, NumDataVectors;
+  Offset.getForFrameOffset(Bytes, NumPredicateVectors, NumDataVectors);
 
   // First emit non-scalable frame offsets, or a simple 'mov'.
   if (Bytes || (!Offset && SrcReg != DestReg)) {
@@ -3133,6 +3143,23 @@
                        NeedsWinCFI, HasWinCFI);
     SrcReg = DestReg;
   }
+
+  assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) &&
+         "SetNZCV not supported with SVE vectors");
+  assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) &&
+         "WinCFI not supported with SVE vectors");
+
+  if (NumDataVectors) {
+    emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors,
+                       AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr);
+    SrcReg = DestReg;
+  }
+
+  if (NumPredicateVectors) {
+    assert(DestReg != AArch64::SP && "Unaligned access to SP");
+    emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors,
+                       AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr);
+  }
 }
 
 MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
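
With this change, emitFrameOffset materializes a mixed offset in up to
three steps: the byte-sized part first, then ADDVL for whole data
vectors, then ADDPL for any predicate-sized remainder. A hedged sketch
of a call (registers and values are illustrative):

     // 16 fixed bytes plus one scalable data vector:
     StackOffset Offset = StackOffset(16, MVT::i8) + StackOffset(1, MVT::nxv16i8);
     emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, Offset, TII,
                     MachineInstr::FrameSetup);
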
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 0efeeb2..a7d0a74 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -95,6 +95,13 @@
   /// returned struct in a register. This field holds the virtual register into
   /// which the sret argument is passed.
   unsigned SRetReturnReg = 0;
+  /// The SVE stack size (for predicates and data vectors) is maintained
+  /// here rather than in FrameInfo, as the placement and Stack IDs are
+  /// target specific.
+  uint64_t StackSizeSVE = 0;
+
+  /// HasCalculatedStackSizeSVE indicates whether StackSizeSVE is valid.
+  bool HasCalculatedStackSizeSVE = false;
 
   /// Has a value when it is known whether or not the function uses a
   /// redzone, and no value otherwise.
@@ -131,6 +138,15 @@
     ArgumentStackToRestore = bytes;
   }
 
+  bool hasCalculatedStackSizeSVE() const { return HasCalculatedStackSizeSVE; }
+
+  void setStackSizeSVE(uint64_t S) {
+    HasCalculatedStackSizeSVE = true;
+    StackSizeSVE = S;
+  }
+
+  uint64_t getStackSizeSVE() const { return StackSizeSVE; }
+
   bool hasStackFrame() const { return HasStackFrame; }
   void setHasStackFrame(bool s) { HasStackFrame = s; }
 
diff --git a/llvm/lib/Target/AArch64/AArch64StackOffset.h b/llvm/lib/Target/AArch64/AArch64StackOffset.h
index 5f5cdfa..13f12a6 100644
--- a/llvm/lib/Target/AArch64/AArch64StackOffset.h
+++ b/llvm/lib/Target/AArch64/AArch64StackOffset.h
@@ -35,32 +35,38 @@
 /// vector and a 64bit GPR.
 class StackOffset {
   int64_t Bytes;
+  int64_t ScalableBytes;
 
   explicit operator int() const;
 
 public:
   using Part = std::pair<int64_t, MVT>;
 
-  StackOffset() : Bytes(0) {}
+  StackOffset() : Bytes(0), ScalableBytes(0) {}
 
   StackOffset(int64_t Offset, MVT::SimpleValueType T) : StackOffset() {
-    assert(!MVT(T).isScalableVector() && "Scalable types not supported");
+    assert(MVT(T).getSizeInBits() % 8 == 0 &&
+           "Offset type is not a multiple of bytes");
     *this += Part(Offset, T);
   }
 
-  StackOffset(const StackOffset &Other) : Bytes(Other.Bytes) {}
+  StackOffset(const StackOffset &Other)
+      : Bytes(Other.Bytes), ScalableBytes(Other.ScalableBytes) {}
 
   StackOffset &operator=(const StackOffset &) = default;
 
   StackOffset &operator+=(const StackOffset::Part &Other) {
-    assert(Other.second.getSizeInBits() % 8 == 0 &&
-           "Offset type is not a multiple of bytes");
-    Bytes += Other.first * (Other.second.getSizeInBits() / 8);
+    int64_t OffsetInBytes = Other.first * (Other.second.getSizeInBits() / 8);
+    if (Other.second.isScalableVector())
+      ScalableBytes += OffsetInBytes;
+    else
+      Bytes += OffsetInBytes;
     return *this;
   }
 
   StackOffset &operator+=(const StackOffset &Other) {
     Bytes += Other.Bytes;
+    ScalableBytes += Other.ScalableBytes;
     return *this;
   }
 
@@ -72,6 +78,7 @@
 
   StackOffset &operator-=(const StackOffset &Other) {
     Bytes -= Other.Bytes;
+    ScalableBytes -= Other.ScalableBytes;
     return *this;
   }
 
@@ -88,16 +95,42 @@
     return Res;
   }
 
+  /// Returns the scalable part of the offset in bytes.
+  int64_t getScalableBytes() const { return ScalableBytes; }
+
   /// Returns the non-scalable part of the offset in bytes.
   int64_t getBytes() const { return Bytes; }
 
   /// Returns the offset in parts to which this frame offset can be
   /// decomposed for the purpose of describing a frame offset.
   /// For non-scalable offsets this is simply its byte size.
-  void getForFrameOffset(int64_t &ByteSized) const { ByteSized = Bytes; }
+  void getForFrameOffset(int64_t &NumBytes, int64_t &NumPredicateVectors,
+                         int64_t &NumDataVectors) const {
+    assert(isValid() && "Invalid frame offset");
+
+    NumBytes = Bytes;
+    NumDataVectors = 0;
+    NumPredicateVectors = ScalableBytes / 2;
+    // This method decomposes the offset into the parts needed to adjust the
+    // frame offset. If materializing the offset would take more than two
+    // ADDPL instructions, or if it is an exact multiple of a full data
+    // vector, the vector-sized part is folded into NumDataVectors so that
+    // ADDVL covers it, reducing the number of ADDPL instructions needed.
+    if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
+        NumPredicateVectors > 62) {
+      NumDataVectors = NumPredicateVectors / 8;
+      NumPredicateVectors -= NumDataVectors * 8;
+    }
+  }
 
   /// Returns whether the offset is known zero.
-  explicit operator bool() const { return Bytes; }
+  explicit operator bool() const { return Bytes || ScalableBytes; }
+
+  bool isValid() const {
+    // The smallest scalable element supported by scaled SVE addressing
+    // modes are predicates, which are 2 scalable bytes in size. So the scalable
+    // byte offset must always be a multiple of 2.
+    return ScalableBytes % 2 == 0;
+  }
 };
 
 } // end namespace llvm
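
As a worked example of getForFrameOffset (hypothetical values): 64
scalable bytes yield NumPredicateVectors = 32, a multiple of 8, so the
whole amount folds into NumDataVectors; a single predicate stays in
the ADDPL part:

     int64_t Bytes, PLs, VLs;
     StackOffset(4, MVT::nxv16i8).getForFrameOffset(Bytes, PLs, VLs);
     // Bytes == 0, PLs == 0, VLs == 4: a single ADDVL suffices.
     StackOffset(1, MVT::nxv16i1).getForFrameOffset(Bytes, PLs, VLs);
     // Bytes == 0, PLs == 1, VLs == 0: one ADDPL covers the predicate.
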
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 22f035e..ed07ed1 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -673,6 +673,8 @@
   case TargetStackID::NoAlloc:
   case TargetStackID::SGPRSpill:
     return true;
+  case TargetStackID::SVEVector:
+    return false;
   }
   llvm_unreachable("Invalid TargetStackID::Value");
 }