[PPC]: Peephole optimize small accesss to aligned globals.

Access to aligned globals gives us a chance to peephole optimize nonzero
offsets. If a struct is 4 byte aligned, then accesses to bytes 0-3 won't
overflow the available displacement. For example:
        addis 3, 2, b4v@toc@ha
        addi 4, 3, b4v@toc@l
        lbz 5, b4v@toc@l(3) ; This is the result of the current peephole
        lbz 6, 1(4)         ; optimizer
        lbz 7, 2(4)
        lbz 8, 3(4)
If b4v is 4-byte aligned, we can skip using register 4 because we know
that b4v@toc@l+{1,2,3} won't overflow 32K, and instead generate:
        addis 3, 2, b4v@toc@ha
        lbz 4, b4v@toc@l(3)
        lbz 5, b4v@toc@l+1(3)
        lbz 6, b4v@toc@l+2(3)
        lbz 7, b4v@toc@l+3(3)
Saving a register and an addition.
Larger alignments allow larger structures/arrays to be optimized.

llvm-svn: 255319
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 2261b71..8bc4102 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -4180,16 +4180,24 @@
       break;
     }
 
-    // If this is a load or store with a zero offset, we may be able to
-    // fold an add-immediate into the memory operation.
-    if (!isa<ConstantSDNode>(N->getOperand(FirstOp)) ||
-        N->getConstantOperandVal(FirstOp) != 0)
+    // If this is a load or store with a zero offset, or within the alignment,
+    // we may be able to fold an add-immediate into the memory operation.
+    // The check against alignment is below, as it can't occur until we check
+    // the arguments to N
+    if (!isa<ConstantSDNode>(N->getOperand(FirstOp)))
       continue;
 
     SDValue Base = N->getOperand(FirstOp + 1);
     if (!Base.isMachineOpcode())
       continue;
 
+    // On targets with fusion, we don't want this to fire and remove a fusion
+    // opportunity, unless a) it results in another fusion opportunity or
+    // b) optimizing for size.
+    if (PPCSubTarget->hasFusion() &&
+        (!MF->getFunction()->optForSize() && !Base.hasOneUse())
+      continue;
+
     unsigned Flags = 0;
     bool ReplaceFlags = true;
 
@@ -4233,6 +4241,17 @@
       break;
     }
 
+    SDValue ImmOpnd = Base.getOperand(1);
+    int MaxDisplacement = 0;
+    if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd)) {
+      const GlobalValue *GV = GA->getGlobal();
+      MaxDisplacement = GV->getAlignment() - 1;
+    }
+
+    int Offset = N->getConstantOperandVal(FirstOp);
+    if (Offset < 0 || Offset > MaxDisplacement)
+      continue;
+
     // We found an opportunity.  Reverse the operands from the add
     // immediate and substitute them into the load or store.  If
     // needed, update the target flags for the immediate operand to
@@ -4243,8 +4262,6 @@
     DEBUG(N->dump(CurDAG));
     DEBUG(dbgs() << "\n");
 
-    SDValue ImmOpnd = Base.getOperand(1);
-
     // If the relocation information isn't already present on the
     // immediate operand, add it now.
     if (ReplaceFlags) {
@@ -4255,17 +4272,17 @@
         // is insufficient for the instruction encoding.
         if (GV->getAlignment() < 4 &&
             (StorageOpcode == PPC::LD || StorageOpcode == PPC::STD ||
-             StorageOpcode == PPC::LWA)) {
+             StorageOpcode == PPC::LWA || (Offset % 4) != 0)) {
           DEBUG(dbgs() << "Rejected this candidate for alignment.\n\n");
           continue;
         }
-        ImmOpnd = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, 0, Flags);
+        ImmOpnd = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, Offset, Flags);
       } else if (ConstantPoolSDNode *CP =
                  dyn_cast<ConstantPoolSDNode>(ImmOpnd)) {
         const Constant *C = CP->getConstVal();
         ImmOpnd = CurDAG->getTargetConstantPool(C, MVT::i64,
                                                 CP->getAlignment(),
-                                                0, Flags);
+                                                Offset, Flags);
       }
     }