[ppc64] Enable sibling call optimization on ppc64 ELFv1/ELFv2 abi

This patch enables sibling call optimization on the ppc64 ELFv1/ELFv2 ABIs and
adds a couple of test cases. It also passed the llvm/clang bootstrap test and
SPEC2006 build/run/result validation.

Original issue: https://llvm.org/bugs/show_bug.cgi?id=25617

Many thanks to Tom (tjablin) for his help; he contributed a lot to this patch.
Thanks also to Hal and Kit for their invaluable opinions!

Reviewers: hfinkel kbarton

http://reviews.llvm.org/D16315

llvm-svn: 265506
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index c645e07..6f649dc 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -19,6 +19,7 @@
 #include "PPCTargetMachine.h"
 #include "PPCTargetObjectFile.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/CodeGen/CallingConvLower.h"
@@ -36,12 +37,15 @@
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetOptions.h"
 
 using namespace llvm;
 
+#define DEBUG_TYPE "ppc-lowering"
+
 static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
 cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);
 
@@ -51,6 +55,12 @@
 static cl::opt<bool> DisablePPCUnaligned("disable-ppc-unaligned",
 cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);
 
+static cl::opt<bool> DisableSCO("disable-ppc-sco",
+cl::desc("disable sibling call optimization on ppc"), cl::Hidden);
+
+STATISTIC(NumTailCalls, "Number of tail calls");
+STATISTIC(NumSiblingCalls, "Number of sibling calls");
+
 // FIXME: Remove this once the bug has been fixed!
 extern cl::opt<bool> ANDIGlueBug;
 
@@ -3842,6 +3852,176 @@
   return SPDiff;
 }
 
+static bool isFunctionGlobalAddress(SDValue Callee);
+
+/// Return true if \p Callee is a global definition that, given its linkage,
+/// visibility and \p RelMod, can be treated as residing in this module.
+static bool resideInSameModule(SDValue Callee, Reloc::Model RelMod) {
+  // A callee that is not a global address may be an external symbol.
+  const auto *GlobalNode = dyn_cast<GlobalAddressSDNode>(Callee);
+  if (!GlobalNode)
+    return false;
+
+  const GlobalValue *Global = GlobalNode->getGlobal();
+  if (Global->isDeclaration())
+    return false;
+
+  const bool DefaultVisibility = Global->hasDefaultVisibility();
+  switch (Global->getLinkage()) {
+  case GlobalValue::ExternalLinkage:
+  case GlobalValue::InternalLinkage:
+  case GlobalValue::PrivateLinkage:
+    break;
+
+  // Weak definitions qualify only with hidden or protected visibility.
+  case GlobalValue::LinkOnceAnyLinkage:
+  case GlobalValue::LinkOnceODRLinkage: // e.g. c++ inline functions
+  case GlobalValue::WeakAnyLinkage:
+  case GlobalValue::WeakODRLinkage:     // e.g. c++ template instantiation
+    if (DefaultVisibility)
+      return false;
+    break;
+
+  case GlobalValue::AvailableExternallyLinkage:
+  case GlobalValue::ExternalWeakLinkage:
+    return false;
+  default:
+    llvm_unreachable("unknow linkage type");
+  }
+
+  // Under '-fPIC' a call to a default-visibility function needs a 'nop' after
+  // it whether or not the callee is local, so treat it as another module.
+  return !(RelMod == Reloc::PIC_ && DefaultVisibility);
+}
+
+/// Return true if passing the outgoing arguments \p Outs of a 64-bit SVR4 call
+/// would use a stack slot, as reported by CalculateStackSlotUsed.
+static bool
+needStackSlotPassParameters(const PPCSubtarget &Subtarget,
+                            const SmallVectorImpl<ISD::OutputArg> &Outs) {
+  assert(Subtarget.isSVR4ABI() && Subtarget.isPPC64());
+
+  // Argument registers; only the sizes of these tables are used below.
+  static const MCPhysReg GPR[] = {
+    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+  };
+  static const MCPhysReg VR[] = {
+    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+  };
+
+  const unsigned PtrByteSize = 8;
+  const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
+  const unsigned ParamAreaSize = array_lengthof(GPR) * PtrByteSize;
+
+  // Mutable bookkeeping handed to CalculateStackSlotUsed for each argument.
+  unsigned StackBytes = LinkageSize;
+  unsigned FreeFPRs = 13;
+  unsigned FreeVRs = array_lengthof(VR);
+
+  for (const ISD::OutputArg &Arg : Outs) {
+    // The 'nest' argument is skipped.
+    if (!Arg.Flags.isNest() &&
+        CalculateStackSlotUsed(Arg.VT, Arg.ArgVT, Arg.Flags, PtrByteSize,
+                               LinkageSize, ParamAreaSize, StackBytes,
+                               FreeFPRs, FreeVRs, Subtarget.hasQPX()))
+      return true;
+  }
+
+  return false;
+}
+
+/// Return true if the call at \p CS forwards \p CallerFn's own formal arguments
+/// in the same order (an undef of the matching type may stand in for any of
+/// them). Without a call site to inspect the answer is conservatively false.
+static bool
+hasSameArgumentList(const Function *CallerFn, ImmutableCallSite *CS) {
+  // CS may be null (LowerCall null-checks it too); nothing can be proven then.
+  if (!CS || CS->arg_size() != CallerFn->getArgumentList().size())
+    return false;
+
+  ImmutableCallSite::arg_iterator CalleeArgIter = CS->arg_begin();
+  ImmutableCallSite::arg_iterator CalleeArgEnd = CS->arg_end();
+  Function::const_arg_iterator CallerArgIter = CallerFn->arg_begin();
+
+  for (; CalleeArgIter != CalleeArgEnd; ++CalleeArgIter, ++CallerArgIter) {
+    const Value *CalleeArg = *CalleeArgIter;
+    const Value *CallerArg = &*CallerArgIter;
+
+    // An undef of the caller argument's type is as good as the argument
+    // itself, e.g. @caller([4 x i64] %a, [4 x i64] %b) making
+    //   tail call @callee([4 x i64] undef, [4 x i64] %b)
+    const bool SameTypeUndef = isa<UndefValue>(CalleeArg) &&
+                               CalleeArg->getType() == CallerArg->getType();
+    if (CalleeArg != CallerArg && !SameTypeUndef)
+      return false;
+  }
+
+  return true;
+}
+
+/// Return true if this call may be lowered as a tail call (TCO) or sibling
+/// call (SCO) on 64-bit SVR4: a non-variadic direct call to a same-module
+/// callee with the caller's calling convention and no byval arguments.
+bool PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
+    SDValue Callee, CallingConv::ID CalleeCC, ImmutableCallSite *CS,
+    bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs,
+    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
+  bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
+
+  if (DisableSCO && !TailCallOpt) return false;
+
+  // Variadic argument functions are not supported.
+  if (isVarArg) return false;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  CallingConv::ID CallerCC = MF.getFunction()->getCallingConv();
+
+  // Tail or sibling call optimization (TCO/SCO) needs the callee and the
+  // caller to have the same calling convention.
+  if (CallerCC != CalleeCC) return false;
+
+  // Only the C and fast calling conventions are supported.
+  if (CalleeCC != CallingConv::Fast && CalleeCC != CallingConv::C)
+    return false;
+
+  // A caller that itself receives a byval parameter is not supported.
+  if (std::any_of(Ins.begin(), Ins.end(),
+                  [](const ISD::InputArg& IA) { return IA.Flags.isByVal(); }))
+    return false;
+
+  // Passing a byval argument is not supported either (conservative): the
+  // sibling-call path emits no CALLSEQ_START to make room for the byval copy.
+  if (std::any_of(Outs.begin(), Outs.end(),
+                  [](const ISD::OutputArg& OA) { return OA.Flags.isByVal(); }))
+    return false;
+
+  // No TCO/SCO on an indirect call because the caller has to restore its TOC.
+  if (!isFunctionGlobalAddress(Callee) &&
+      !isa<ExternalSymbolSDNode>(Callee))
+    return false;
+
+  // Check if Callee resides in the same module, because for now, PPC64 SVR4 ABI
+  // (ELFv1/ELFv2) doesn't allow tail calls to a symbol that resides in another
+  // module.
+  // ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977
+  if (!resideInSameModule(Callee, getTargetMachine().getRelocationModel()))
+    return false;
+
+  // TCO allows altering callee ABI, so we don't have to check further.
+  if (CalleeCC == CallingConv::Fast && TailCallOpt)
+    return true;
+
+  if (DisableSCO) return false;
+
+  // SCO applies when the callee is handed exactly the caller's own argument
+  // list. Otherwise it applies only if no argument needs a stack slot, which
+  // is checked (lazily) by needStackSlotPassParameters.
+  return hasSameArgumentList(MF.getFunction(), CS) ||
+         !needStackSlotPassParameters(Subtarget, Outs);
+}
+
 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
 /// for tail call optimization. Targets which want to do tail call
 /// optimization should implement this function.
@@ -4479,9 +4659,32 @@
   bool IsPatchPoint                     = CLI.IsPatchPoint;
   ImmutableCallSite *CS                 = CLI.CS;
 
-  if (isTailCall)
-    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
-                                                   Ins, DAG);
+  if (isTailCall) {
+    if (Subtarget.isSVR4ABI() && Subtarget.isPPC64())
+      isTailCall =
+        IsEligibleForTailCallOptimization_64SVR4(Callee, CallConv, CS,
+                                                 isVarArg, Outs, Ins, DAG);
+    else
+      isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
+                                                     Ins, DAG);
+    if (isTailCall) {
+      ++NumTailCalls;
+      if (!getTargetMachine().Options.GuaranteedTailCallOpt)
+        ++NumSiblingCalls;
+
+      assert(isa<GlobalAddressSDNode>(Callee) &&
+             "Callee should be an llvm::Function object.");
+      DEBUG(
+        const GlobalValue *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
+        const unsigned Width = 80 - strlen("TCO caller: ")
+                                  - strlen(", callee linkage: 0, 0");
+        dbgs() << "TCO caller: "
+               << left_justify(DAG.getMachineFunction().getName(), Width)
+               << ", callee linkage: "
+               << GV->getVisibility() << ", " << GV->getLinkage() << "\n"
+      );
+    }
+  }
 
   if (!isTailCall && CS && CS->isMustTailCall())
     report_fatal_error("failed to perform tail call elimination on a call "
@@ -4760,12 +4963,16 @@
   bool isLittleEndian = Subtarget.isLittleEndian();
   unsigned NumOps = Outs.size();
   bool hasNest = false;
+  bool IsSibCall = false;
 
   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
   unsigned PtrByteSize = 8;
 
   MachineFunction &MF = DAG.getMachineFunction();
 
+  if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt)
+    IsSibCall = true;
+
   // Mark this function as potentially containing a function that contains a
   // tail call. As a consequence the frame pointer will be used for dynamicalloc
   // and restoring the callers stack pointer in this functions epilog. This is
@@ -4885,9 +5092,12 @@
       CallConv == CallingConv::Fast)
     NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes);
 
+  int SPDiff = 0;
+
   // Calculate by how many bytes the stack has to be adjusted in case of tail
   // call optimization.
-  int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
+  if (!IsSibCall)
+    SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
 
   // To protect arguments on the stack from being clobbered in a tail call,
   // force all the loads to happen before doing any other lowering.
@@ -4896,8 +5106,9 @@
 
   // Adjust the stack pointer for the new arguments...
   // These operations are automatically eliminated by the prolog/epilog pass
-  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
-                               dl);
+  if (!IsSibCall)
+    Chain = DAG.getCALLSEQ_START(Chain,
+                                 DAG.getIntPtrConstant(NumBytes, dl, true), dl);
   SDValue CallSeqStart = Chain;
 
   // Load the return address and frame pointer so it can be move somewhere else
@@ -5366,7 +5577,7 @@
     InFlag = Chain.getValue(1);
   }
 
-  if (isTailCall)
+  if (isTailCall && !IsSibCall)
     PrepareTailCall(DAG, InFlag, Chain, dl, true, SPDiff, NumBytes, LROp,
                     FPOp, true, TailCallArguments);
 
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index c83c86f..2c4409d 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -713,6 +713,16 @@
                                       const SmallVectorImpl<ISD::InputArg> &Ins,
                                       SelectionDAG& DAG) const;
 
+    bool
+    IsEligibleForTailCallOptimization_64SVR4(
+                                    SDValue Callee,
+                                    CallingConv::ID CalleeCC,
+                                    ImmutableCallSite *CS,
+                                    bool isVarArg,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    SelectionDAG& DAG) const;
+
     SDValue EmitTailCallLoadFPAndRetAddr(SelectionDAG & DAG,
                                          int SPDiff,
                                          SDValue Chain,
diff --git a/llvm/test/CodeGen/PowerPC/ppc64-calls.ll b/llvm/test/CodeGen/PowerPC/ppc64-calls.ll
index 23a14e6..c134b66 100644
--- a/llvm/test/CodeGen/PowerPC/ppc64-calls.ll
+++ b/llvm/test/CodeGen/PowerPC/ppc64-calls.ll
@@ -14,7 +14,8 @@
 define void @test_direct() nounwind readnone {
 ; CHECK-LABEL: test_direct:
   tail call void @foo() nounwind
-; CHECK: bl foo
+; With tail call optimization the call may be emitted as a plain 'b' branch.
+; CHECK: [[BR:b[l]?]] foo
 ; CHECK-NOT: nop
   ret void
 }
diff --git a/llvm/test/CodeGen/PowerPC/ppc64-sibcall-shrinkwrap.ll b/llvm/test/CodeGen/PowerPC/ppc64-sibcall-shrinkwrap.ll
new file mode 100644
index 0000000..744ca07
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/ppc64-sibcall-shrinkwrap.ll
@@ -0,0 +1,46 @@
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu --enable-shrink-wrap=false | FileCheck %s -check-prefix=CHECK-SCO-ONLY
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu --enable-shrink-wrap=true | FileCheck %s -check-prefix=CHECK-SCO-SHRK
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu --enable-shrink-wrap=false | FileCheck %s -check-prefix=CHECK-SCO-ONLY
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu --enable-shrink-wrap=true | FileCheck %s -check-prefix=CHECK-SCO-SHRK
+
+%"class.clang::NamedDecl" = type { i32 }
+declare void @__assert_fail();
+
+define i8 @_ZNK5clang9NamedDecl23getLinkageAndVisibilityEv(
+    %"class.clang::NamedDecl"* %this) {
+entry:
+  %tobool = icmp eq %"class.clang::NamedDecl"* %this, null
+  br i1 %tobool, label %cond.false, label %exit
+
+cond.false:
+  tail call void @__assert_fail()
+  unreachable
+
+exit:
+  %DeclKind = getelementptr inbounds
+                            %"class.clang::NamedDecl",
+                            %"class.clang::NamedDecl"* %this, i64 0, i32 0
+  %bf.load = load i32, i32* %DeclKind, align 4
+  %call.i = tail call i8 @LVComputationKind(
+    %"class.clang::NamedDecl"* %this,
+    i32 %bf.load)
+  ret i8 %call.i
+
+; CHECK-SCO-SHRK-LABEL: _ZNK5clang9NamedDecl23getLinkageAndVisibilityEv:
+; CHECK-SCO-SHRK: b LVComputationKind
+; CHECK-SCO-SHRK: #TC_RETURNd8
+; CHECK-SCO-SHRK: stdu 1, -{{[0-9]+}}(1)
+; CHECK-SCO-SHRK: bl __assert_fail
+;
+; CHECK-SCO-ONLY-LABEL: _ZNK5clang9NamedDecl23getLinkageAndVisibilityEv:
+; CHECK-SCO-ONLY: stdu 1, -{{[0-9]+}}(1)
+; CHECK-SCO-ONLY: b LVComputationKind
+; CHECK-SCO-ONLY: #TC_RETURNd8
+; CHECK-SCO-ONLY: bl __assert_fail
+}
+
+define fastcc i8 @LVComputationKind(
+    %"class.clang::NamedDecl"* %D,
+    i32 %computation) {
+  ret i8 0
+}
diff --git a/llvm/test/CodeGen/PowerPC/ppc64-sibcall.ll b/llvm/test/CodeGen/PowerPC/ppc64-sibcall.ll
new file mode 100644
index 0000000..38018b6
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/ppc64-sibcall.ll
@@ -0,0 +1,191 @@
+; RUN: llc < %s -O1 -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s -check-prefix=CHECK-SCO
+; RUN: llc < %s -O1 -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-SCO-HASQPX
+; RUN: llc < %s -O1 -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 | FileCheck %s -check-prefix=CHECK-SCO-HASQPX
+
+; There is no RUN line combining "powerpc64le-unknown-linux-gnu" with
+; CHECK-SCO, because only Power8 (and later) fully supports LE.
+
+%S_56 = type { [13 x i32], i32 }
+%S_64 = type { [15 x i32], i32 }
+%S_32 = type { [7 x i32], i32 }
+
+; Function Attrs: noinline nounwind
+define void @callee_56_copy([7 x i64] %a, %S_56* %b) #0 { ret void }
+define void @callee_64_copy([8 x i64] %a, %S_64* %b) #0 { ret void }
+
+; Function Attrs: nounwind
+define void @caller_56_reorder_copy(%S_56* %b, [7 x i64] %a) #1 {
+  tail call void @callee_56_copy([7 x i64] %a, %S_56* %b)
+  ret void
+
+; CHECK-SCO-LABEL: caller_56_reorder_copy:
+; CHECK-SCO-NOT: stdu 1
+; CHECK-SCO: TC_RETURNd8 callee_56_copy
+}
+
+define void @caller_64_reorder_copy(%S_64* %b, [8 x i64] %a) #1 {
+  tail call void @callee_64_copy([8 x i64] %a, %S_64* %b)
+  ret void
+
+; CHECK-SCO-LABEL: caller_64_reorder_copy:
+; CHECK-SCO: bl callee_64_copy
+}
+
+define void @callee_64_64_copy([8 x i64] %a, [8 x i64] %b) #0 { ret void }
+define void @caller_64_64_copy([8 x i64] %a, [8 x i64] %b) #1 {
+  tail call void @callee_64_64_copy([8 x i64] %a, [8 x i64] %b)
+  ret void
+
+; CHECK-SCO-LABEL: caller_64_64_copy:
+; CHECK-SCO: b callee_64_64_copy
+}
+
+define void @caller_64_64_reorder_copy([8 x i64] %a, [8 x i64] %b) #1 {
+  tail call void @callee_64_64_copy([8 x i64] %b, [8 x i64] %a)
+  ret void
+
+; CHECK-SCO-LABEL: caller_64_64_reorder_copy:
+; CHECK-SCO: bl callee_64_64_copy
+}
+
+define void @caller_64_64_undef_copy([8 x i64] %a, [8 x i64] %b) #1 {
+  tail call void @callee_64_64_copy([8 x i64] %a, [8 x i64] undef)
+  ret void
+
+; CHECK-SCO-LABEL: caller_64_64_undef_copy:
+; CHECK-SCO: b callee_64_64_copy
+}
+
+define void @arg8_callee(
+  float %a, i32 signext %b, float %c, i32* %d,
+  i8 zeroext %e, float %f, i32* %g, i32 signext %h)
+{
+  ret void
+}
+
+define void @arg8_caller(float %a, i32 signext %b, i8 zeroext %c, i32* %d) {
+entry:
+  tail call void @arg8_callee(float undef, i32 signext undef, float undef,
+                              i32* %d, i8 zeroext undef, float undef,
+                              i32* undef, i32 signext undef)
+  ret void
+
+; CHECK-SCO-LABEL: arg8_caller:
+; CHECK-SCO: b arg8_callee
+}
+
+; Struct return test
+
+; Function Attrs: noinline nounwind
+define void @callee_sret_56(%S_56* noalias sret %agg.result) #0 { ret void }
+define void @callee_sret_32(%S_32* noalias sret %agg.result) #0 { ret void }
+
+; Function Attrs: nounwind
+define void @caller_do_something_sret_32(%S_32* noalias sret %agg.result) #1 {
+  %1 = alloca %S_56, align 4
+  %2 = bitcast %S_56* %1 to i8*
+  call void @callee_sret_56(%S_56* nonnull sret %1)
+  tail call void @callee_sret_32(%S_32* sret %agg.result)
+  ret void
+
+; CHECK-SCO-LABEL: caller_do_something_sret_32:
+; CHECK-SCO: stdu 1
+; CHECK-SCO: bl callee_sret_56
+; CHECK-SCO: addi 1
+; CHECK-SCO: TC_RETURNd8 callee_sret_32
+}
+
+define void @caller_local_sret_32(%S_32* %a) #1 {
+  %tmp = alloca %S_32, align 4
+  tail call void @callee_sret_32(%S_32* nonnull sret %tmp)
+  ret void
+
+; CHECK-SCO-LABEL: caller_local_sret_32:
+; CHECK-SCO: bl callee_sret_32
+}
+
+attributes #0 = { noinline nounwind  }
+attributes #1 = { nounwind }
+
+; vector <4 x i1> test
+
+define void @callee_v4i1(i8 %a, <4 x i1> %b, <4 x i1> %c) { ret void }
+define void @caller_v4i1_reorder(i8 %a, <4 x i1> %b, <4 x i1> %c) {
+  tail call void @callee_v4i1(i8 %a, <4 x i1> %c, <4 x i1> %b)
+  ret void
+
+; <4 x i1> is 32-byte aligned. If the subtarget doesn't support QPX, b and c
+; can't be placed in QPX registers, so we can't do SCO on caller_v4i1_reorder.
+
+; CHECK-SCO-LABEL: caller_v4i1_reorder:
+; CHECK-SCO: bl callee_v4i1
+
+; CHECK-SCO-HASQPX-LABEL: caller_v4i1_reorder:
+; CHECK-SCO-HASQPX: b callee_v4i1
+}
+
+define void @f128_callee(i32* %ptr, ppc_fp128 %a, ppc_fp128 %b) { ret void }
+define void @f128_caller(i32* %ptr, ppc_fp128 %a, ppc_fp128 %b) {
+  tail call void @f128_callee(i32* %ptr, ppc_fp128 %a, ppc_fp128 %b)
+  ret void
+
+; CHECK-SCO-LABEL: f128_caller:
+; CHECK-SCO: b f128_callee
+}
+
+; weak linkage test
+%class.T = type { [2 x i8] }
+
+define weak_odr hidden void @wo_hcallee(%class.T* %this, i8* %c) { ret void }
+define void @wo_hcaller(%class.T* %this, i8* %c) {
+  tail call void @wo_hcallee(%class.T* %this, i8* %c)
+  ret void
+
+; CHECK-SCO-LABEL: wo_hcaller:
+; CHECK-SCO: b wo_hcallee
+}
+
+define weak_odr protected void @wo_pcallee(%class.T* %this, i8* %c) { ret void }
+define void @wo_pcaller(%class.T* %this, i8* %c) {
+  tail call void @wo_pcallee(%class.T* %this, i8* %c)
+  ret void
+
+; CHECK-SCO-LABEL: wo_pcaller:
+; CHECK-SCO: b wo_pcallee
+}
+
+define weak_odr void @wo_callee(%class.T* %this, i8* %c) { ret void }
+define void @wo_caller(%class.T* %this, i8* %c) {
+  tail call void @wo_callee(%class.T* %this, i8* %c)
+  ret void
+
+; CHECK-SCO-LABEL: wo_caller:
+; CHECK-SCO: bl wo_callee
+}
+
+define weak protected void @w_pcallee(i8* %ptr) { ret void }
+define void @w_pcaller(i8* %ptr) {
+  tail call void @w_pcallee(i8* %ptr)
+  ret void
+
+; CHECK-SCO-LABEL: w_pcaller:
+; CHECK-SCO: b w_pcallee
+}
+
+define weak hidden void @w_hcallee(i8* %ptr) { ret void }
+define void @w_hcaller(i8* %ptr) {
+  tail call void @w_hcallee(i8* %ptr)
+  ret void
+
+; CHECK-SCO-LABEL: w_hcaller:
+; CHECK-SCO: b w_hcallee
+}
+
+define weak void @w_callee(i8* %ptr) { ret void }
+define void @w_caller(i8* %ptr) {
+  tail call void @w_callee(i8* %ptr)
+  ret void
+
+; CHECK-SCO-LABEL: w_caller:
+; CHECK-SCO: bl w_callee
+}