[CGP] form usub with overflow from sub+icmp

The motivating x86 cases for forming the intrinsic are shown in PR31754 and PR40487:
https://bugs.llvm.org/show_bug.cgi?id=31754
https://bugs.llvm.org/show_bug.cgi?id=40487
...and those are shown in the IR test file and x86 codegen test file.

Matching the usubo pattern is harder than uaddo because we have 2 independent values rather than a
def-use chain: the subtract and the compare do not use one another, so we have to walk the users of
a common operand to pair them up.
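
For illustration, the two IR shapes that the matching loop looks for are roughly (a hand-written
sketch with invented value names; the checked-in tests are in the files noted above):

  ; Direct form - the subtract and the compare share both operands:
  %s1  = sub i32 %a, %b
  %ov1 = icmp ult i32 %a, %b        ; unsigned borrow out of the subtract

  ; Canonicalized constant form - instcombine turns (sub %a, 42) into an add,
  ; so the add constant is the negation of the compare constant:
  %s2  = add i32 %a, -42
  %ov2 = icmp ult i32 %a, 42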

This adds a TLI hook (shouldFormOverflowOp) that preserves the existing behavior for uaddo
formation but disables usubo formation by default. Only x86 overrides that setting for now,
although other targets will likely benefit from forming usubo too.
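
When formation is enabled, the subtract and compare are rewritten into a call to the intrinsic
plus a pair of extracts, roughly (sketch; the "math" and "ov" names come from
replaceMathCmpWithIntrinsic below):

  %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
  %math = extractvalue { i32, i1 } %usub, 0
  %ov   = extractvalue { i32, i1 } %usub, 1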

Differential Revision: https://reviews.llvm.org/D57789

llvm-svn: 354298
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 14f56279..20a2ae1 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -1162,9 +1162,18 @@
 static void replaceMathCmpWithIntrinsic(BinaryOperator *BO, CmpInst *Cmp,
                                         Instruction *InsertPt,
                                         Intrinsic::ID IID) {
+  Value *Arg0 = BO->getOperand(0);
+  Value *Arg1 = BO->getOperand(1);
+
+  // We allow matching the canonical IR (add X, C) back to (usubo X, -C).
+  if (BO->getOpcode() == Instruction::Add &&
+      IID == Intrinsic::usub_with_overflow) {
+    assert(isa<Constant>(Arg1) && "Unexpected input for usubo");
+    Arg1 = ConstantExpr::getNeg(cast<Constant>(Arg1));
+  }
+
   IRBuilder<> Builder(InsertPt);
-  Value *MathOV = Builder.CreateBinaryIntrinsic(IID, BO->getOperand(0),
-                                                BO->getOperand(1));
+  Value *MathOV = Builder.CreateBinaryIntrinsic(IID, Arg0, Arg1);
   Value *Math = Builder.CreateExtractValue(MathOV, 0, "math");
   Value *OV = Builder.CreateExtractValue(MathOV, 1, "ov");
   BO->replaceAllUsesWith(Math);
@@ -1182,13 +1191,8 @@
   if (!match(Cmp, m_UAddWithOverflow(m_Value(A), m_Value(B), m_BinOp(Add))))
     return false;
 
-  // Allow the transform as long as we have an integer type that is not
-  // obviously illegal and unsupported.
-  Type *Ty = Add->getType();
-  if (!isa<IntegerType>(Ty))
-    return false;
-  EVT CodegenVT = TLI.getValueType(DL, Ty);
-  if (!CodegenVT.isSimple() && TLI.isOperationExpand(ISD::UADDO, CodegenVT))
+  if (!TLI.shouldFormOverflowOp(ISD::UADDO,
+                                TLI.getValueType(DL, Add->getType())))
     return false;
 
   // We don't want to move around uses of condition values this late, so we
@@ -1210,6 +1214,64 @@
   return true;
 }
 
+static bool combineToUSubWithOverflow(CmpInst *Cmp, const TargetLowering &TLI,
+                                      const DataLayout &DL, bool &ModifiedDT) {
+  // Convert (A u> B) to (B u< A) to simplify pattern matching.
+  Value *A = Cmp->getOperand(0), *B = Cmp->getOperand(1);
+  ICmpInst::Predicate Pred = Cmp->getPredicate();
+  if (Pred == ICmpInst::ICMP_UGT) {
+    std::swap(A, B);
+    Pred = ICmpInst::ICMP_ULT;
+  }
+  // Convert the special case: (A == 0) is the same as (A u< 1).
+  if (Pred == ICmpInst::ICMP_EQ && match(B, m_ZeroInt())) {
+    B = ConstantInt::get(B->getType(), 1);
+    Pred = ICmpInst::ICMP_ULT;
+  }
+  if (Pred != ICmpInst::ICMP_ULT)
+    return false;
+
+  // Walk the users of a variable operand of a compare looking for a subtract or
+  // add with that same operand. Also match the 2nd operand of the compare to
+  // the add/sub, but that may be a negated constant operand of an add.
+  Value *CmpVariableOperand = isa<Constant>(A) ? B : A;
+  BinaryOperator *Sub = nullptr;
+  for (User *U : CmpVariableOperand->users()) {
+    // A - B, A u< B --> usubo(A, B)
+    if (match(U, m_Sub(m_Specific(A), m_Specific(B)))) {
+      Sub = cast<BinaryOperator>(U);
+      break;
+    }
+
+    // A + (-C), A u< C (canonicalized form of (sub A, C))
+    const APInt *CmpC, *AddC;
+    if (match(U, m_Add(m_Specific(A), m_APInt(AddC))) &&
+        match(B, m_APInt(CmpC)) && *AddC == -(*CmpC)) {
+      Sub = cast<BinaryOperator>(U);
+      break;
+    }
+  }
+  if (!Sub)
+    return false;
+
+  if (!TLI.shouldFormOverflowOp(ISD::USUBO,
+                                TLI.getValueType(DL, Sub->getType())))
+    return false;
+
+  // Pattern matched and profitability checked. Check dominance to determine the
+  // insertion point for an intrinsic that replaces the subtract and compare.
+  DominatorTree DT(*Sub->getFunction());
+  bool SubDominates = DT.dominates(Sub, Cmp);
+  if (!SubDominates && !DT.dominates(Cmp, Sub))
+    return false;
+  Instruction *InPt = SubDominates ? cast<Instruction>(Sub)
+                                   : cast<Instruction>(Cmp);
+  replaceMathCmpWithIntrinsic(Sub, Cmp, InPt, Intrinsic::usub_with_overflow);
+  // Reset callers - do not crash by iterating over a dead instruction.
+  ModifiedDT = true;
+  return true;
+}
+
 /// Sink the given CmpInst into user blocks to reduce the number of virtual
 /// registers that must be created and coalesced. This is a clear win except on
 /// targets with multiple condition code registers (PowerPC), where it might
@@ -1276,14 +1338,17 @@
   return MadeChange;
 }
 
-static bool optimizeCmpExpression(CmpInst *Cmp, const TargetLowering &TLI,
-                                  const DataLayout &DL) {
+static bool optimizeCmp(CmpInst *Cmp, const TargetLowering &TLI,
+                        const DataLayout &DL, bool &ModifiedDT) {
   if (sinkCmpExpression(Cmp, TLI))
     return true;
 
   if (combineToUAddWithOverflow(Cmp, TLI, DL))
     return true;
 
+  if (combineToUSubWithOverflow(Cmp, TLI, DL, ModifiedDT))
+    return true;
+
   return false;
 }
 
@@ -6770,8 +6835,8 @@
     return false;
   }
 
-  if (CmpInst *CI = dyn_cast<CmpInst>(I))
-    if (TLI && optimizeCmpExpression(CI, *TLI, *DL))
+  if (auto *Cmp = dyn_cast<CmpInst>(I))
+    if (TLI && optimizeCmp(Cmp, *TLI, *DL, ModifiedDT))
       return true;
 
   if (LoadInst *LI = dyn_cast<LoadInst>(I)) {