[ARM][CGP] Negative constant operand handling

While mutating instructions, we sign extended negative constant
operands for binary operators that can safely overflow. This was to
allow instructions, such as add nuw i8 %a, -2, to still be able to
perform a subtraction. However, the code to handle constants doesn't
take into consideration that instructions, such as sub nuw i8 -2, %a,
require the i8 -2 to be converted into i32 254.

This is a relatively simple fix, but I've taken the time to
reorganise the code a bit - mainly that instructions that can be
promoted are cached and splitting up the Mutate function.

Differential Revision: https://reviews.llvm.org/D53972

llvm-svn: 345840
diff --git a/llvm/lib/Target/ARM/ARMCodeGenPrepare.cpp b/llvm/lib/Target/ARM/ARMCodeGenPrepare.cpp
index fb9fad4..2403b9e 100644
--- a/llvm/lib/Target/ARM/ARMCodeGenPrepare.cpp
+++ b/llvm/lib/Target/ARM/ARMCodeGenPrepare.cpp
@@ -110,11 +110,26 @@
 class IRPromoter {
   SmallPtrSet<Value*, 8> NewInsts;
   SmallVector<Instruction*, 4> InstsToRemove;
+  DenseMap<Value*, Type*> TruncTysMap;
+  SmallPtrSet<Value*, 8> Promoted;
   Module *M = nullptr;
   LLVMContext &Ctx;
+  Type *ExtTy = nullptr;
+  Type *OrigTy = nullptr;
+
+  void PrepareConstants(SmallPtrSetImpl<Value*> &Visited,
+                         SmallPtrSetImpl<Instruction*> &SafeToPromote);
+  void ExtendSources(SmallPtrSetImpl<Value*> &Sources);
+  void PromoteTree(SmallPtrSetImpl<Value*> &Visited,
+                   SmallPtrSetImpl<Value*> &Sources,
+                   SmallPtrSetImpl<Instruction*> &Sinks,
+                   SmallPtrSetImpl<Instruction*> &SafeToPromote);
+  void TruncateSinks(SmallPtrSetImpl<Value*> &Sources,
+                     SmallPtrSetImpl<Instruction*> &Sinks);
 
 public:
-  IRPromoter(Module *M) : M(M), Ctx(M->getContext()) { }
+  IRPromoter(Module *M) : M(M), Ctx(M->getContext()),
+                          ExtTy(Type::getInt32Ty(Ctx)) { }
 
   void Cleanup() {
     for (auto *I : InstsToRemove) {
@@ -129,14 +144,17 @@
   void Mutate(Type *OrigTy,
               SmallPtrSetImpl<Value*> &Visited,
               SmallPtrSetImpl<Value*> &Sources,
-              SmallPtrSetImpl<Instruction*> &Sinks);
+              SmallPtrSetImpl<Instruction*> &Sinks,
+              SmallPtrSetImpl<Instruction*> &SafeToPromote);
 };
 
 class ARMCodeGenPrepare : public FunctionPass {
   const ARMSubtarget *ST = nullptr;
   IRPromoter *Promoter = nullptr;
   std::set<Value*> AllVisited;
+  SmallPtrSet<Instruction*, 8> SafeToPromote;
 
+  bool isSafeOverflow(Instruction *I);
   bool isSupportedValue(Value *V);
   bool isLegalToPromote(Value *V);
   bool TryToPromote(Value *V);
@@ -241,8 +259,8 @@
 }
 
 /// Return whether the instruction can be promoted within any modifications to
-/// it's operands or result.
-static bool isSafeOverflow(Instruction *I) {
+/// its operands or result.
+bool ARMCodeGenPrepare::isSafeOverflow(Instruction *I) {
   // FIXME Do we need NSW too?
   if (isa<OverflowingBinaryOperator>(I) && I->hasNoUnsignedWrap())
     return true;
@@ -386,11 +404,13 @@
   // If I is only being used by something that will require its value to be
   // truncated, then we don't care about the promoted result.
   auto *I = cast<Instruction>(V);
-  if (I->hasOneUse() && isSink(*I->use_begin()))
+  if (I->hasOneUse() && isSink(*I->use_begin())) {
+    LLVM_DEBUG(dbgs() << "ARM CGP: Only use is a sink: " << *V << "\n");
     return true;
+  }
 
   if (isa<OverflowingBinaryOperator>(I))
-    return isSafeOverflow(I);
+    return false;
   return true;
 }
 
@@ -414,56 +434,84 @@
   llvm_unreachable("unhandled opcode for narrow intrinsic");
 }
 
-void IRPromoter::Mutate(Type *OrigTy,
-                        SmallPtrSetImpl<Value*> &Visited,
-                        SmallPtrSetImpl<Value*> &Sources,
-                        SmallPtrSetImpl<Instruction*> &Sinks) {
+static void ReplaceAllUsersOfWith(Value *From, Value *To) {
+  SmallVector<Instruction*, 4> Users;
+  Instruction *InstTo = dyn_cast<Instruction>(To);
+  for (Use &U : From->uses()) {
+    auto *User = cast<Instruction>(U.getUser());
+    if (InstTo && User->isIdenticalTo(InstTo))
+      continue;
+    Users.push_back(User);
+  }
+
+  for (auto *U : Users)
+    U->replaceUsesOfWith(From, To);
+}
+
+void
+IRPromoter::PrepareConstants(SmallPtrSetImpl<Value*> &Visited,
+                             SmallPtrSetImpl<Instruction*> &SafeToPromote) {
   IRBuilder<> Builder{Ctx};
-  Type *ExtTy = Type::getInt32Ty(M->getContext());
-  SmallPtrSet<Value*, 8> Promoted;
-  LLVM_DEBUG(dbgs() << "ARM CGP: Promoting use-def chains to from "
-             << ARMCodeGenPrepare::TypeSize << " to 32-bits\n");
+  // First step is to prepare the instructions for mutation. Most constants
+  // just need to be zero extended into their new type, but complications arise
+  // because:
+  // - For nuw binary operators, negative immediates would need sign extending;
+  //   however, instead we'll change them to positive and zext them. We can do
+  //   this because:
+  //   > The operators that can wrap are: add, sub, mul and shl.
+  //   > shl interprets its second operand as unsigned and if the first operand
+  //     is an immediate, it will need zext to be nuw.
+  //   > I'm assuming mul cannot be nuw while using a negative immediate...
+  //   > Which leaves the nuw add and sub to be handled; as with shl, if an
+  //     immediate is used as operand 0, it will need zext to be nuw.
+  // - We also allow add and sub to safely overflow in certain circumstances
+  //   and only when the value (operand 0) is being decreased.
+  //
+  // For adds and subs, that are either nuw or safely wrap and use a negative
+  // immediate as operand 1, we create an equivalent instruction using a
+  // positive immediate. That positive immediate can then be zext along with
+  // all the other immediates later.
+  for (auto *V : Visited) {
+    if (!isa<Instruction>(V))
+      continue;
 
-  // Cache original types.
-  DenseMap<Value*, Type*> TruncTysMap;
-  for (auto *V : Visited)
-    TruncTysMap[V] = V->getType();
+    auto *I = cast<Instruction>(V);
+    if (SafeToPromote.count(I)) {
 
-  auto ReplaceAllUsersOfWith = [&](Value *From, Value *To) {
-    SmallVector<Instruction*, 4> Users;
-    Instruction *InstTo = dyn_cast<Instruction>(To);
-    for (Use &U : From->uses()) {
-      auto *User = cast<Instruction>(U.getUser());
-      if (InstTo && User->isIdenticalTo(InstTo))
+      if (!isa<OverflowingBinaryOperator>(I))
         continue;
-      Users.push_back(User);
+
+      if (auto *Const = dyn_cast<ConstantInt>(I->getOperand(1))) {
+        if (!Const->isNegative())
+          break;
+
+        unsigned Opc = I->getOpcode();
+        assert((Opc == Instruction::Add || Opc == Instruction::Sub) &&
+               "expected only an add or sub to use a negative imm");
+
+        LLVM_DEBUG(dbgs() << "ARM CGP: Adjusting " << *I << "\n");
+        auto *NewConst = ConstantInt::get(Ctx, Const->getValue().abs());
+        Builder.SetInsertPoint(I);
+        Value *NewVal = Opc == Instruction::Sub ?
+          Builder.CreateAdd(I->getOperand(0), NewConst) :
+          Builder.CreateSub(I->getOperand(0), NewConst);
+        LLVM_DEBUG(dbgs() << "ARM CGP: New equivalent: " << *NewVal << "\n");
+
+        if (auto *NewInst = dyn_cast<Instruction>(NewVal)) {
+          NewInst->copyIRFlags(I);
+          NewInsts.insert(NewInst);
+        }
+        InstsToRemove.push_back(I);
+        I->replaceAllUsesWith(NewVal);
+      }
     }
+  }
+  for (auto *I : NewInsts)
+    Visited.insert(I);
+}
 
-    for (auto *U : Users)
-      U->replaceUsesOfWith(From, To);
-  };
-
-  auto FixConst = [&](ConstantInt *Const, Instruction *I) {
-    Constant *NewConst = isSafeOverflow(I) && Const->isNegative() ?
-      ConstantExpr::getSExt(Const, ExtTy) :
-      ConstantExpr::getZExt(Const, ExtTy);
-    I->replaceUsesOfWith(Const, NewConst);
-  };
-
-  auto InsertDSPIntrinsic = [&](Instruction *I) {
-    LLVM_DEBUG(dbgs() << "ARM CGP: Inserting DSP intrinsic for "
-               << *I << "\n");
-    Function *DSPInst =
-      Intrinsic::getDeclaration(M, getNarrowIntrinsic(I));
-    Builder.SetInsertPoint(I);
-    Builder.SetCurrentDebugLocation(I->getDebugLoc());
-    Value *Args[] = { I->getOperand(0), I->getOperand(1) };
-    CallInst *Call = Builder.CreateCall(DSPInst, Args);
-    ReplaceAllUsersOfWith(I, Call);
-    InstsToRemove.push_back(I);
-    NewInsts.insert(Call);
-    TruncTysMap[Call] = OrigTy;
-  };
+void IRPromoter::ExtendSources(SmallPtrSetImpl<Value*> &Sources) {
+  IRBuilder<> Builder{Ctx};
 
   auto InsertZExt = [&](Value *V, Instruction *InsertPt) {
     LLVM_DEBUG(dbgs() << "ARM CGP: Inserting ZExt for " << *V << "\n");
@@ -480,7 +528,8 @@
     TruncTysMap[ZExt] = TruncTysMap[V];
   };
 
-  // First, insert extending instructions between the sources and their users.
+
+  // Now, insert extending instructions between the sources and their users.
   LLVM_DEBUG(dbgs() << "ARM CGP: Promoting sources:\n");
   for (auto V : Sources) {
     LLVM_DEBUG(dbgs() << " - " << *V << "\n");
@@ -494,9 +543,17 @@
     }
     Promoted.insert(V);
   }
+}
 
+void IRPromoter::PromoteTree(SmallPtrSetImpl<Value*> &Visited,
+                             SmallPtrSetImpl<Value*> &Sources,
+                             SmallPtrSetImpl<Instruction*> &Sinks,
+                             SmallPtrSetImpl<Instruction*> &SafeToPromote) {
   LLVM_DEBUG(dbgs() << "ARM CGP: Mutating the tree..\n");
-  // Then mutate the types of the instructions within the tree. Here we handle
+
+  IRBuilder<> Builder{Ctx};
+
+  // Mutate the types of the instructions within the tree. Here we handle
   // constant operands.
   for (auto *V : Visited) {
     if (Sources.count(V))
@@ -511,9 +568,10 @@
       if ((Op->getType() == ExtTy) || !isa<IntegerType>(Op->getType()))
         continue;
 
-      if (auto *Const = dyn_cast<ConstantInt>(Op))
-        FixConst(Const, I);
-      else if (isa<UndefValue>(Op))
+      if (auto *Const = dyn_cast<ConstantInt>(Op)) {
+        Constant *NewConst = ConstantExpr::getZExt(Const, ExtTy);
+        I->setOperand(i, NewConst);
+      } else if (isa<UndefValue>(Op))
         I->setOperand(i, UndefValue::get(ExtTy));
     }
 
@@ -523,20 +581,42 @@
     }
   }
 
-  // Now we need to remove any zexts that have become unnecessary, as well
-  // as insert any intrinsics.
+  // Finally, any instructions that should be promoted but haven't yet been,
+  // need to be handled using intrinsics.
   for (auto *V : Visited) {
-    if (Sources.count(V))
+    auto *I = dyn_cast<Instruction>(V);
+    if (!I)
       continue;
 
-    if (!shouldPromote(V) || isPromotedResultSafe(V))
+    if (Sources.count(I) || Sinks.count(I))
       continue;
 
+    if (!shouldPromote(I) || SafeToPromote.count(I) || NewInsts.count(I))
+      continue;
+  
     assert(EnableDSP && "DSP intrinisc insertion not enabled!");
 
     // Replace unsafe instructions with appropriate intrinsic calls.
-    InsertDSPIntrinsic(cast<Instruction>(V));
+    LLVM_DEBUG(dbgs() << "ARM CGP: Inserting DSP intrinsic for "
+               << *I << "\n");
+    Function *DSPInst =
+      Intrinsic::getDeclaration(M, getNarrowIntrinsic(I));
+    Builder.SetInsertPoint(I);
+    Builder.SetCurrentDebugLocation(I->getDebugLoc());
+    Value *Args[] = { I->getOperand(0), I->getOperand(1) };
+    CallInst *Call = Builder.CreateCall(DSPInst, Args);
+    ReplaceAllUsersOfWith(I, Call);
+    InstsToRemove.push_back(I);
+    NewInsts.insert(Call);
+    TruncTysMap[Call] = OrigTy;
   }
+}
+
+void IRPromoter::TruncateSinks(SmallPtrSetImpl<Value*> &Sources,
+                               SmallPtrSetImpl<Instruction*> &Sinks) {
+  LLVM_DEBUG(dbgs() << "ARM CGP: Fixing up the sinks:\n");
+
+  IRBuilder<> Builder{Ctx};
 
   auto InsertTrunc = [&](Value *V) -> Instruction* {
     if (!isa<Instruction>(V) || !isa<IntegerType>(V->getType()))
@@ -558,7 +638,6 @@
     return Trunc;
   };
 
-  LLVM_DEBUG(dbgs() << "ARM CGP: Fixing up the sinks:\n");
   // Fix up any stores or returns that use the results of the promoted
   // chain.
   for (auto I : Sinks) {
@@ -584,6 +663,36 @@
       }
     }
   }
+}
+
+void IRPromoter::Mutate(Type *OrigTy,
+                        SmallPtrSetImpl<Value*> &Visited,
+                        SmallPtrSetImpl<Value*> &Sources,
+                        SmallPtrSetImpl<Instruction*> &Sinks,
+                        SmallPtrSetImpl<Instruction*> &SafeToPromote) {
+  LLVM_DEBUG(dbgs() << "ARM CGP: Promoting use-def chains to from "
+             << ARMCodeGenPrepare::TypeSize << " to 32-bits\n");
+  this->OrigTy = OrigTy;
+
+  // Cache original types.
+  for (auto *V : Visited)
+    TruncTysMap[V] = V->getType();
+
+  // Convert adds and subs using negative immediates to equivalent instructions
+  // that use positive constants.
+  PrepareConstants(Visited, SafeToPromote);
+
+  // Insert zext instructions between sources and their users.
+  ExtendSources(Sources);
+
+  // Promote visited instructions, mutating their types in place. Also insert
+  // DSP intrinsics, if enabled, for adds and subs which would be unsafe to
+  // promote.
+  PromoteTree(Visited, Sources, Sinks, SafeToPromote);
+
+  // Finally, insert trunc instructions for use by calls, stores etc...
+  TruncateSinks(Sources, Sinks);
+
   LLVM_DEBUG(dbgs() << "ARM CGP: Mutation complete:\n");
   LLVM_DEBUG(dbgs();
              for (auto *V : Sources)
@@ -651,11 +760,20 @@
 /// smaller than the targeted promoted type. Check that we're not trying to
 /// promote something larger than our base 'TypeSize' type.
 bool ARMCodeGenPrepare::isLegalToPromote(Value *V) {
-  if (isPromotedResultSafe(V))
-    return true;
 
   auto *I = dyn_cast<Instruction>(V);
   if (!I)
+    return true;
+
+  if (SafeToPromote.count(I))
+   return true;
+
+  if (isPromotedResultSafe(V) || isSafeOverflow(I)) {
+    SafeToPromote.insert(I);
+    return true;
+  }
+
+  if (I->getOpcode() != Instruction::Add && I->getOpcode() != Instruction::Sub)
     return false;
 
   // If promotion is not safe, can we use a DSP instruction to natively
@@ -666,9 +784,6 @@
   if (ST->isThumb() && !ST->hasThumb2())
     return false;
 
-  if (I->getOpcode() != Instruction::Add && I->getOpcode() != Instruction::Sub)
-    return false;
-
   // TODO
   // Would it be profitable? For Thumb code, these parallel DSP instructions
   // are only Thumb-2, so we wouldn't be able to dual issue on Cortex-M33. For
@@ -680,6 +795,7 @@
         return false;
     }
   }
+  LLVM_DEBUG(dbgs() << "ARM CGP: Will use an intrinsic for: " << *I << "\n");
   return true;
 }
 
@@ -689,6 +805,8 @@
   if (TypeSize > 16 || TypeSize < 8)
     return false;
 
+  SafeToPromote.clear();
+
   if (!isSupportedValue(V) || !shouldPromote(V) || !isLegalToPromote(V))
     return false;
 
@@ -698,9 +816,8 @@
   SetVector<Value*> WorkList;
   SmallPtrSet<Value*, 8> Sources;
   SmallPtrSet<Instruction*, 4> Sinks;
-  WorkList.insert(V);
   SmallPtrSet<Value*, 16> CurrentVisited;
-  CurrentVisited.clear();
+  WorkList.insert(V);
 
   // Return true if V was added to the worklist as a supported instruction,
   // if it was already visited, or if we don't need to explore it (e.g.
@@ -783,7 +900,7 @@
   if (ToPromote < 2)
     return false;
 
-  Promoter->Mutate(OrigTy, CurrentVisited, Sources, Sinks);
+  Promoter->Mutate(OrigTy, CurrentVisited, Sources, Sinks, SafeToPromote);
   return true;
 }