[SystemZ] Use BRCT and BRCTG to eliminate add-&-compare sequences

This patch just uses a peephole test for "add; compare; branch" sequences
within a single block.  The IR optimizers already convert loops to
decrement-and-branch-on-nonzero form in some cases, so even this
simplistic test triggers many times during a clang bootstrap and
projects/test-suite run.  It looks like there are still cases where we
need to more strongly prefer branches on nonzero though.  E.g. I saw a
case where a loop that started out with a check for 0 ended up with a
check for -1.  I'll try to look at that sometime.

I ended up adding the Reference class because MachineInstr::readsRegister()
doesn't check for subregisters (by design, as far as I could tell).

llvm-svn: 187723
diff --git a/llvm/test/CodeGen/SystemZ/loop-01.ll b/llvm/test/CodeGen/SystemZ/loop-01.ll
index 025a34e..5800801 100644
--- a/llvm/test/CodeGen/SystemZ/loop-01.ll
+++ b/llvm/test/CodeGen/SystemZ/loop-01.ll
@@ -5,7 +5,7 @@
 ; Test that strength reduction is applied to addresses with a scale factor,
 ; but that indexed addressing can still be used.
 define void @f1(i32 *%dest, i32 %a) {
-; CHECK-LABEL: f1
+; CHECK-LABEL: f1:
 ; CHECK-NOT: sllg
 ; CHECK: st %r3, 0({{%r[1-5],%r[1-5]}})
 ; CHECK: br %r14
@@ -23,3 +23,102 @@
 exit:
   ret void
 }
+
+; Test that a 0-to-100 i32 loop is put in decrement-and-branch form and uses BRCT.
+define void @f2(i32 *%src, i32 *%dest) {
+; CHECK-LABEL: f2:
+; CHECK: lhi [[REG:%r[0-5]]], 100
+; CHECK: [[LABEL:\.[^:]*]]:{{.*}} %loop
+; CHECK: brct [[REG]], [[LABEL]]
+; CHECK: br %r14
+entry:
+  br label %loop
+
+loop:
+  %count = phi i32 [ 0, %entry ], [ %next, %loop.next ]
+  %next = add i32 %count, 1
+  %val = load volatile i32 *%src
+  %cmp = icmp eq i32 %val, 0
+  br i1 %cmp, label %loop.next, label %loop.store
+
+loop.store:
+  %add = add i32 %val, 1
+  store volatile i32 %add, i32 *%dest
+  br label %loop.next
+
+loop.next:
+  %cont = icmp ne i32 %next, 100
+  br i1 %cont, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; Like f2, but with an i64 counter, so the loop should end in BRCTG.
+define void @f3(i64 *%src, i64 *%dest) {
+; CHECK-LABEL: f3:
+; CHECK: lghi [[REG:%r[0-5]]], 100
+; CHECK: [[LABEL:\.[^:]*]]:{{.*}} %loop
+; CHECK: brctg [[REG]], [[LABEL]]
+; CHECK: br %r14
+entry:
+  br label %loop
+
+loop:
+  %count = phi i64 [ 0, %entry ], [ %next, %loop.next ]
+  %next = add i64 %count, 1
+  %val = load volatile i64 *%src
+  %cmp = icmp eq i64 %val, 0
+  br i1 %cmp, label %loop.next, label %loop.store
+
+loop.store:
+  %add = add i64 %val, 1
+  store volatile i64 %add, i64 *%dest
+  br label %loop.next
+
+loop.next:
+  %cont = icmp ne i64 %next, 100
+  br i1 %cont, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+; Test a loop whose 64-bit counter is decremented and whose low 32 bits are
+; then merged into a stored value.  That subregister use of the decremented
+; counter is the only thing blocking BRCTG, so AGHI and JNE must be kept.
+define void @f4(i32 *%src, i32 *%dest, i64 *%dest2, i64 %count) {
+; CHECK-LABEL: f4:
+; CHECK: aghi [[REG:%r[0-5]]], -1
+; CHECK: lr [[REG2:%r[0-5]]], [[REG]]
+; CHECK: stg [[REG2]],
+; CHECK: jne {{\..*}}
+; CHECK: br %r14
+entry:
+  br label %loop
+
+loop:
+  %left = phi i64 [ %count, %entry ], [ %next, %loop.next ]
+  store volatile i64 %left, i64 *%dest2
+  %val = load volatile i32 *%src
+  %cmp = icmp eq i32 %val, 0
+  br i1 %cmp, label %loop.next, label %loop.store
+
+loop.store:
+  %add = add i32 %val, 1
+  store volatile i32 %add, i32 *%dest
+  br label %loop.next
+
+loop.next:
+  %next = add i64 %left, -1
+  %ext = zext i32 %val to i64
+  %shl = shl i64 %ext, 32
+  %and = and i64 %next, 4294967295
+  %or = or i64 %shl, %and
+  store volatile i64 %or, i64 *%dest2
+  %cont = icmp ne i64 %next, 0
+  br i1 %cont, label %loop, label %exit
+
+exit:
+  ret void
+}