[AArch64][GlobalISel] Support sibling calls with outgoing arguments

This adds support for lowering sibling calls with outgoing arguments.

e.g

```
define void @foo(i32 %a)
```

Support is ported from AArch64ISelLowering's `isEligibleForTailCallOptimization`.
The only thing that is missing is a full port of
`TargetLowering::parametersInCSRMatch`. So, if we're using swiftself,
we'll never tail call.

- Rename `analyzeCallResult` to `analyzeArgInfo`, since the function is now used
  for both outgoing and incoming arguments
- Teach `OutgoingArgHandler` about tail calls. Tail calls use frame indices for
  stack arguments.
- Teach `lowerFormalArguments` to set the bytes in the caller's stack argument
  area. This is used later to check if the tail call's parameters will fit on
  the caller's stack.
- Add `areCalleeOutgoingArgsTailCallable` to perform the eligibility check on
  the callee's outgoing arguments.

For testing:

- Update call-translator-tail-call to verify that we can now tail call with
  outgoing arguments, use G_FRAME_INDEX for stack arguments, and respect the
  size of the caller's stack
- Remove GISel-specific check lines from speculation-hardening.ll, since GISel
  now tail calls like the other selectors
- Add a GISel test line to tailcall-string-rvo.ll since we can tail call in that
  test now
- Add a GISel test line to tailcall_misched_graph.ll since we tail call there
  now. Add specific check lines for GISel, since the debug output from the
  machine-scheduler differs with GlobalISel. The dependency still holds, but
  the output comes out in a different order.

Differential Revision: https://reviews.llvm.org/D67471

llvm-svn: 371780
diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index ccbe56d..9280c9e 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -378,7 +378,7 @@
   return true;
 }
 
-bool CallLowering::analyzeCallResult(CCState &CCState,
+bool CallLowering::analyzeArgInfo(CCState &CCState,
                                      SmallVectorImpl<ArgInfo> &Args,
                                      CCAssignFn &Fn) const {
   for (unsigned i = 0, e = Args.size(); i < e; ++i) {
@@ -407,12 +407,12 @@
 
   SmallVector<CCValAssign, 16> ArgLocs1;
   CCState CCInfo1(CalleeCC, false, MF, ArgLocs1, F.getContext());
-  if (!analyzeCallResult(CCInfo1, InArgs, CalleeAssignFn))
+  if (!analyzeArgInfo(CCInfo1, InArgs, CalleeAssignFn))
     return false;
 
   SmallVector<CCValAssign, 16> ArgLocs2;
   CCState CCInfo2(CallerCC, false, MF, ArgLocs2, F.getContext());
-  if (!analyzeCallResult(CCInfo2, InArgs, CallerAssignFn))
+  if (!analyzeArgInfo(CCInfo2, InArgs, CallerAssignFn))
     return false;
 
   // We need the argument locations to match up exactly. If there's more in
diff --git a/llvm/lib/Target/AArch64/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/AArch64CallLowering.cpp
index c5d4b18..bad8fb3 100644
--- a/llvm/lib/Target/AArch64/AArch64CallLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CallLowering.cpp
@@ -130,14 +130,26 @@
 struct OutgoingArgHandler : public CallLowering::ValueHandler {
   OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                      MachineInstrBuilder MIB, CCAssignFn *AssignFn,
-                     CCAssignFn *AssignFnVarArg)
+                     CCAssignFn *AssignFnVarArg, bool IsTailCall = false)
       : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB),
-        AssignFnVarArg(AssignFnVarArg), StackSize(0) {}
+        AssignFnVarArg(AssignFnVarArg), IsTailCall(IsTailCall), StackSize(0) {}
 
   Register getStackAddress(uint64_t Size, int64_t Offset,
                            MachinePointerInfo &MPO) override {
+    MachineFunction &MF = MIRBuilder.getMF();
     LLT p0 = LLT::pointer(0, 64);
     LLT s64 = LLT::scalar(64);
+
+    if (IsTailCall) {
+      // TODO: For -tailcallopt tail calls, Offset will need FPDiff like in
+      // ISelLowering.
+      int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
+      Register FIReg = MRI.createGenericVirtualRegister(p0);
+      MIRBuilder.buildFrameIndex(FIReg, FI);
+      MPO = MachinePointerInfo::getFixedStack(MF, FI);
+      return FIReg;
+    }
+
     Register SPReg = MRI.createGenericVirtualRegister(p0);
     MIRBuilder.buildCopy(SPReg, Register(AArch64::SP));
 
@@ -147,7 +159,7 @@
     Register AddrReg = MRI.createGenericVirtualRegister(p0);
     MIRBuilder.buildGEP(AddrReg, SPReg, OffsetReg);
 
-    MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset);
+    MPO = MachinePointerInfo::getStack(MF, Offset);
     return AddrReg;
   }
 
@@ -188,6 +200,7 @@
 
   MachineInstrBuilder MIB;
   CCAssignFn *AssignFnVarArg;
+  bool IsTailCall;
   uint64_t StackSize;
 };
 } // namespace
@@ -378,6 +391,8 @@
   if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
     return false;
 
+  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+  uint64_t StackOffset = Handler.StackUsed;
   if (F.isVarArg()) {
     auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
     if (!Subtarget.isTargetDarwin()) {
@@ -387,14 +402,20 @@
     }
 
     // We currently pass all varargs at 8-byte alignment, or 4 in ILP32.
-    uint64_t StackOffset =
-        alignTo(Handler.StackUsed, Subtarget.isTargetILP32() ? 4 : 8);
+    StackOffset = alignTo(Handler.StackUsed, Subtarget.isTargetILP32() ? 4 : 8);
 
     auto &MFI = MIRBuilder.getMF().getFrameInfo();
-    AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
     FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
   }
 
+  // TODO: Port checks for stack to restore for -tailcallopt from ISelLowering.
+  // We need to keep track of the size of function stacks for tail call
+  // optimization. When we tail call, we need to check if the callee's arguments
+  // will fit on the caller's stack. So, whenever we lower formal arguments,
+  // we should keep track of this information, since we might lower a tail call
+  // in this function later.
+  FuncInfo->setBytesInStackArgArea(StackOffset);
+
   auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
   if (Subtarget.hasCustomCallingConv())
     Subtarget.getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
@@ -454,9 +475,67 @@
   return TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved);
 }
 
+bool AArch64CallLowering::areCalleeOutgoingArgsTailCallable(
+    CallLoweringInfo &Info, MachineFunction &MF,
+    SmallVectorImpl<ArgInfo> &OutArgs) const {
+  // If there are no outgoing arguments, then we are done.
+  if (OutArgs.empty())
+    return true;
+
+  const Function &CallerF = MF.getFunction();
+  CallingConv::ID CalleeCC = Info.CallConv;
+  CallingConv::ID CallerCC = CallerF.getCallingConv();
+  const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
+
+  // We have outgoing arguments. Make sure that we can tail call with them.
+  SmallVector<CCValAssign, 16> OutLocs;
+  CCState OutInfo(CalleeCC, false, MF, OutLocs, CallerF.getContext());
+
+  if (!analyzeArgInfo(OutInfo, OutArgs,
+                      *TLI.CCAssignFnForCall(CalleeCC, Info.IsVarArg))) {
+    LLVM_DEBUG(dbgs() << "... Could not analyze call operands.\n");
+    return false;
+  }
+
+  // Make sure that they can fit on the caller's stack.
+  const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+  if (OutInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea()) {
+    LLVM_DEBUG(dbgs() << "... Cannot fit call operands on caller's stack.\n");
+    return false;
+  }
+
+  // Verify that the parameters in callee-saved registers match.
+  // TODO: Port this over to CallLowering as general code once swiftself is
+  // supported.
+  auto TRI = MF.getSubtarget<AArch64Subtarget>().getRegisterInfo();
+  const uint32_t *CallerPreservedMask = TRI->getCallPreservedMask(MF, CallerCC);
+
+  for (auto &ArgLoc : OutLocs) {
+    // If it's not a register, it's fine.
+    if (!ArgLoc.isRegLoc())
+      continue;
+
+    Register Reg = ArgLoc.getLocReg();
+
+    // Only look at callee-saved registers.
+    if (MachineOperand::clobbersPhysReg(CallerPreservedMask, Reg))
+      continue;
+
+    // TODO: Port the remainder of this check from TargetLowering to support
+    // tail calling swiftself.
+    LLVM_DEBUG(
+        dbgs()
+        << "... Cannot handle callee-saved registers in outgoing args yet.\n");
+    return false;
+  }
+
+  return true;
+}
+
 bool AArch64CallLowering::isEligibleForTailCallOptimization(
     MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
-    SmallVectorImpl<ArgInfo> &InArgs) const {
+    SmallVectorImpl<ArgInfo> &InArgs,
+    SmallVectorImpl<ArgInfo> &OutArgs) const {
   CallingConv::ID CalleeCC = Info.CallConv;
   MachineFunction &MF = MIRBuilder.getMF();
   const Function &CallerF = MF.getFunction();
@@ -535,7 +614,8 @@
   assert((!Info.IsVarArg || CalleeCC == CallingConv::C) &&
          "Unexpected variadic calling convention");
 
-  // Look at the incoming values.
+  // Verify that the incoming and outgoing arguments from the callee are
+  // safe to tail call.
   if (!doCallerAndCalleePassArgsTheSameWay(Info, MF, InArgs)) {
     LLVM_DEBUG(
         dbgs()
@@ -543,13 +623,8 @@
     return false;
   }
 
-  // For now, only handle callees that take no arguments.
-  if (!Info.OrigArgs.empty()) {
-    LLVM_DEBUG(
-        dbgs()
-        << "... Cannot tail call callees with outgoing arguments yet.\n");
+  if (!areCalleeOutgoingArgsTailCallable(Info, MF, OutArgs))
     return false;
-  }
 
   LLVM_DEBUG(
       dbgs() << "... Call is eligible for tail call optimization.\n");
@@ -592,20 +667,20 @@
     return false;
   }
 
-  SmallVector<ArgInfo, 8> SplitArgs;
+  SmallVector<ArgInfo, 8> OutArgs;
   for (auto &OrigArg : Info.OrigArgs) {
-    splitToValueTypes(OrigArg, SplitArgs, DL, MRI, Info.CallConv);
+    splitToValueTypes(OrigArg, OutArgs, DL, MRI, Info.CallConv);
     // AAPCS requires that we zero-extend i1 to 8 bits by the caller.
     if (OrigArg.Ty->isIntegerTy(1))
-      SplitArgs.back().Flags[0].setZExt();
+      OutArgs.back().Flags[0].setZExt();
   }
 
   SmallVector<ArgInfo, 8> InArgs;
   if (!Info.OrigRet.Ty->isVoidTy())
     splitToValueTypes(Info.OrigRet, InArgs, DL, MRI, F.getCallingConv());
 
-  bool IsSibCall = Info.IsTailCall &&
-                   isEligibleForTailCallOptimization(MIRBuilder, Info, InArgs);
+  bool IsSibCall = Info.IsTailCall && isEligibleForTailCallOptimization(
+                                          MIRBuilder, Info, InArgs, OutArgs);
   if (IsSibCall)
     MF.getFrameInfo().setHasTailCall();
 
@@ -655,8 +730,8 @@
   // Do the actual argument marshalling.
   SmallVector<unsigned, 8> PhysRegs;
   OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
-                             AssignFnVarArg);
-  if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
+                             AssignFnVarArg, IsSibCall);
+  if (!handleAssignments(MIRBuilder, OutArgs, Handler))
     return false;
 
   // Now we can add the actual call instruction to the correct basic block.
diff --git a/llvm/lib/Target/AArch64/AArch64CallLowering.h b/llvm/lib/Target/AArch64/AArch64CallLowering.h
index 696d4d8..4777a16 100644
--- a/llvm/lib/Target/AArch64/AArch64CallLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64CallLowering.h
@@ -47,7 +47,8 @@
   bool
   isEligibleForTailCallOptimization(MachineIRBuilder &MIRBuilder,
                                     CallLoweringInfo &Info,
-                                    SmallVectorImpl<ArgInfo> &InArgs) const;
+                                    SmallVectorImpl<ArgInfo> &InArgs,
+                                    SmallVectorImpl<ArgInfo> &OutArgs) const;
 
   bool supportSwiftError() const override { return true; }
 
@@ -67,6 +68,10 @@
   doCallerAndCalleePassArgsTheSameWay(CallLoweringInfo &Info,
                                       MachineFunction &MF,
                                       SmallVectorImpl<ArgInfo> &InArgs) const;
+
+  bool
+  areCalleeOutgoingArgsTailCallable(CallLoweringInfo &Info, MachineFunction &MF,
+                                    SmallVectorImpl<ArgInfo> &OutArgs) const;
 };
 
 } // end namespace llvm