Re-implement the main strength-reduction portion of LoopStrengthReduction.
This new version is much more aggressive about performing "full" reduction,
and about rewriting induction variables to count down (or up) to zero, in
cases where doing so reduces register pressure.
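
For illustration, here is a hand-written sketch (not code from this patch)
of the count-down rewrite: the exit test compares the decremented value
against zero, so the trip count need not stay live in a register across
the loop.

  define void @count_down(i32* %P, i32 %A, i32 %n) nounwind {
  entry:
    ; Guard against a zero trip count.
    %guard = icmp eq i32 %n, 0
    br i1 %guard, label %exit, label %loop

  loop:
    ; The IV starts at %n and counts down; %n has no other use inside
    ; the loop, so its register is free after the preheader.
    %iv = phi i32 [ %n, %entry ], [ %iv.next, %loop ]
    %iv.next = add i32 %iv, -1
    %addr = getelementptr i32* %P, i32 %iv.next
    store i32 %A, i32* %addr
    ; Comparing with zero lets targets reuse the flags from the decrement.
    %done = icmp eq i32 %iv.next, 0
    br i1 %done, label %exit, label %loop

  exit:
    ret void
  }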

It currently uses fairly simplistic algorithms for finding reuse
opportunities, but it introduces a new framework that allows it to combine
multiple strategies at once to form hybrid solutions, instead of doing
all full reduction or all base+index.
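
As a rough sketch of what "full" reduction means here (again hand-written,
not taken from the patch): in base+index form each address is computed from
a shared counter, e.g. %Ai = getelementptr double* %A, i64 %i, whereas full
reduction replaces that arithmetic with a pointer induction variable. The
new framework can mix the two forms within one loop when that is cheaper.

  define void @fully_reduced(double* %A, i64 %n) nounwind {
  entry:
    %guard = icmp sgt i64 %n, 0
    br i1 %guard, label %preheader, label %exit

  preheader:
    ; Precompute the end pointer so no integer counter is needed.
    %end = getelementptr double* %A, i64 %n
    br label %loop

  loop:
    ; Pointer IV advanced by the stride; no per-iteration base+index math.
    %p = phi double* [ %A, %preheader ], [ %p.next, %loop ]
    %v = load double* %p
    %v2 = fadd double %v, %v
    store double %v2, double* %p
    %p.next = getelementptr double* %p, i64 1
    %done = icmp eq double* %p.next, %end
    br i1 %done, label %exit, label %loop

  exit:
    ret void
  }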


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@94061 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/test/CodeGen/ARM/arm-negative-stride.ll b/test/CodeGen/ARM/arm-negative-stride.ll
index 72ec8ef..52ab871 100644
--- a/test/CodeGen/ARM/arm-negative-stride.ll
+++ b/test/CodeGen/ARM/arm-negative-stride.ll
@@ -1,7 +1,32 @@
 ; RUN: llc < %s -march=arm | FileCheck %s
 
+; This loop is rewritten with an indvar that counts down, which
+; frees up a register that would otherwise hold the trip count.
+
 define void @test(i32* %P, i32 %A, i32 %i) nounwind {
 entry:
+; CHECK: str r1, [{{r.*}}, +{{r.*}}, lsl #2]
+        icmp eq i32 %i, 0               ; <i1>:0 [#uses=1]
+        br i1 %0, label %return, label %bb
+
+bb:             ; preds = %bb, %entry
+        %indvar = phi i32 [ 0, %entry ], [ %indvar.next, %bb ]          ; <i32> [#uses=2]
+        %i_addr.09.0 = sub i32 %i, %indvar              ; <i32> [#uses=1]
+        %tmp2 = getelementptr i32* %P, i32 %i_addr.09.0         ; <i32*> [#uses=1]
+        store i32 %A, i32* %tmp2
+        %indvar.next = add i32 %indvar, 1               ; <i32> [#uses=2]
+        icmp eq i32 %indvar.next, %i            ; <i1>:1 [#uses=1]
+        br i1 %1, label %return, label %bb
+
+return:         ; preds = %bb, %entry
+        ret void
+}
+
+; This loop has a non-address use of the count-up indvar, so the
+; indvar remains. The original store now uses a negative-stride address.
+
+define void @test_with_forced_iv(i32* %P, i32 %A, i32 %i) nounwind {
+entry:
 ; CHECK: str r1, [{{r.*}}, -{{r.*}}, lsl #2]
         icmp eq i32 %i, 0               ; <i1>:0 [#uses=1]
         br i1 %0, label %return, label %bb
@@ -11,6 +36,7 @@
         %i_addr.09.0 = sub i32 %i, %indvar              ; <i32> [#uses=1]
         %tmp2 = getelementptr i32* %P, i32 %i_addr.09.0         ; <i32*> [#uses=1]
         store i32 %A, i32* %tmp2
+        store i32 %indvar, i32* null
         %indvar.next = add i32 %indvar, 1               ; <i32> [#uses=2]
         icmp eq i32 %indvar.next, %i            ; <i1>:1 [#uses=1]
         br i1 %1, label %return, label %bb
diff --git a/test/CodeGen/ARM/lsr-code-insertion.ll b/test/CodeGen/ARM/lsr-code-insertion.ll
index 507ec2c..1bbb96d 100644
--- a/test/CodeGen/ARM/lsr-code-insertion.ll
+++ b/test/CodeGen/ARM/lsr-code-insertion.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -stats |& grep {40.*Number of machine instrs printed}
-; RUN: llc < %s -stats |& grep {.*Number of re-materialization}
+; RUN: llc < %s -stats |& grep {39.*Number of machine instrs printed}
+; RUN: llc < %s -stats |& not grep {.*Number of re-materialization}
 ; This test really wants to check that the resultant "cond_true" block only 
 ; has a single store in it, and that cond_true55 only has code to materialize 
 ; the constant and do a store.  We do *not* want something like this:
diff --git a/test/CodeGen/ARM/remat.ll b/test/CodeGen/ARM/remat.ll
index 9565c8b..9072bcb 100644
--- a/test/CodeGen/ARM/remat.ll
+++ b/test/CodeGen/ARM/remat.ll
@@ -1,5 +1,4 @@
-; RUN: llc < %s -mtriple=arm-apple-darwin 
-; RUN: llc < %s -mtriple=arm-apple-darwin -stats -info-output-file - | grep "Number of re-materialization" | grep 3
+; RUN: llc < %s -mtriple=arm-apple-darwin -stats -info-output-file - | not grep "Number of re-materialization"
 
 	%struct.CONTENTBOX = type { i32, i32, i32, i32, i32 }
 	%struct.LOCBOX = type { i32, i32, i32, i32 }
diff --git a/test/CodeGen/Thumb2/lsr-deficiency.ll b/test/CodeGen/Thumb2/lsr-deficiency.ll
index 7b1b57a..ac2cd34 100644
--- a/test/CodeGen/Thumb2/lsr-deficiency.ll
+++ b/test/CodeGen/Thumb2/lsr-deficiency.ll
@@ -1,25 +1,29 @@
 ; RUN: llc < %s -mtriple=thumbv7-apple-darwin10 -relocation-model=pic | FileCheck %s
 ; rdar://7387640
 
-; FIXME: We still need to rewrite array reference iv of stride -4 with loop
-; count iv of stride -1.
+; This now reduces to a single induction variable.
+
+; TODO: It still gets a GPR shuffle at the end of the loop.
+; This is because something in instruction selection has decided
+; that comparing the pre-incremented value with zero is better
+; than comparing the post-incremented value with -4.
 
 @G = external global i32                          ; <i32*> [#uses=2]
 @array = external global i32*                     ; <i32**> [#uses=1]
 
 define arm_apcscc void @t() nounwind optsize {
 ; CHECK: t:
-; CHECK: mov.w r2, #4000
-; CHECK: movw r3, #1001
+; CHECK: mov.w r2, #1000
 entry:
   %.pre = load i32* @G, align 4                   ; <i32> [#uses=1]
   br label %bb
 
 bb:                                               ; preds = %bb, %entry
 ; CHECK: LBB1_1:
-; CHECK: subs r3, #1
-; CHECK: cmp r3, #0
-; CHECK: sub.w r2, r2, #4
+; CHECK: cmp r2, #0
+; CHECK: sub.w r9, r2, #1
+; CHECK: mov r2, r9
+
   %0 = phi i32 [ %.pre, %entry ], [ %3, %bb ]     ; <i32> [#uses=1]
   %indvar = phi i32 [ 0, %entry ], [ %indvar.next, %bb ] ; <i32> [#uses=2]
   %tmp5 = sub i32 1000, %indvar                   ; <i32> [#uses=1]
diff --git a/test/CodeGen/Thumb2/thumb2-ifcvt1.ll b/test/CodeGen/Thumb2/thumb2-ifcvt1.ll
index 71199ab..1d26756 100644
--- a/test/CodeGen/Thumb2/thumb2-ifcvt1.ll
+++ b/test/CodeGen/Thumb2/thumb2-ifcvt1.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -mtriple=thumbv7-apple-darwin | FileCheck %s
 
-define i32 @t1(i32 %a, i32 %b, i32 %c, i32 %d) {
+define i32 @t1(i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
 ; CHECK: t1:
 ; CHECK: it ne
 ; CHECK: cmpne
@@ -20,12 +20,12 @@
 }
 
 ; FIXME: Check for # of unconditional branch after adding branch folding post ifcvt.
-define i32 @t2(i32 %a, i32 %b) {
+define i32 @t2(i32 %a, i32 %b) nounwind {
 entry:
 ; CHECK: t2:
-; CHECK: ite le
-; CHECK: suble
+; CHECK: ite gt
 ; CHECK: subgt
+; CHECK: suble
 	%tmp1434 = icmp eq i32 %a, %b		; <i1> [#uses=1]
 	br i1 %tmp1434, label %bb17, label %bb.outer
 
@@ -60,14 +60,14 @@
 
 @x = external global i32*		; <i32**> [#uses=1]
 
-define void @foo(i32 %a) {
+define void @foo(i32 %a) nounwind {
 entry:
 	%tmp = load i32** @x		; <i32*> [#uses=1]
 	store i32 %a, i32* %tmp
 	ret void
 }
 
-define void @t3(i32 %a, i32 %b) {
+define void @t3(i32 %a, i32 %b) nounwind {
 entry:
 ; CHECK: t3:
 ; CHECK: it lt
diff --git a/test/CodeGen/X86/2006-05-11-InstrSched.ll b/test/CodeGen/X86/2006-05-11-InstrSched.ll
index bdbe713..56d6aa9 100644
--- a/test/CodeGen/X86/2006-05-11-InstrSched.ll
+++ b/test/CodeGen/X86/2006-05-11-InstrSched.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=x86 -mattr=+sse2 -stats -realign-stack=0 |&\
-; RUN:     grep {asm-printer} | grep 31
+; RUN:     grep {asm-printer} | grep 34
 
 target datalayout = "e-p:32:32"
 define void @foo(i32* %mc, i32* %bp, i32* %ms, i32* %xmb, i32* %mpp, i32* %tpmm, i32* %ip, i32* %tpim, i32* %dpp, i32* %tpdm, i32* %bpi, i32 %M) nounwind {
@@ -40,7 +40,7 @@
 	%tmp137.upgrd.7 = bitcast i32* %tmp137 to <2 x i64>*		; <<2 x i64>*> [#uses=1]
 	store <2 x i64> %tmp131, <2 x i64>* %tmp137.upgrd.7
 	%tmp147 = add nsw i32 %tmp.10, 8		; <i32> [#uses=1]
-	%tmp.upgrd.8 = icmp slt i32 %tmp147, %M		; <i1> [#uses=1]
+	%tmp.upgrd.8 = icmp ne i32 %tmp147, %M		; <i1> [#uses=1]
 	%indvar.next = add i32 %indvar, 1		; <i32> [#uses=1]
 	br i1 %tmp.upgrd.8, label %cond_true, label %return
 
diff --git a/test/CodeGen/X86/2007-08-13-SpillerReuse.ll b/test/CodeGen/X86/2007-08-13-SpillerReuse.ll
deleted file mode 100644
index d6ea510..0000000
--- a/test/CodeGen/X86/2007-08-13-SpillerReuse.ll
+++ /dev/null
@@ -1,102 +0,0 @@
-; RUN: llc < %s -mtriple=i686-apple-darwin | grep "48(%esp)" | count 5
-
-	%struct..0anon = type { i32 }
-	%struct.rtvec_def = type { i32, [1 x %struct..0anon] }
-	%struct.rtx_def = type { i16, i8, i8, [1 x %struct..0anon] }
-@rtx_format = external global [116 x i8*]		; <[116 x i8*]*> [#uses=1]
-@rtx_length = external global [117 x i32]		; <[117 x i32]*> [#uses=1]
-
-declare %struct.rtx_def* @fixup_memory_subreg(%struct.rtx_def*, %struct.rtx_def*, i32)
-
-define %struct.rtx_def* @walk_fixup_memory_subreg(%struct.rtx_def* %x, %struct.rtx_def* %insn) {
-entry:
-	%tmp2 = icmp eq %struct.rtx_def* %x, null		; <i1> [#uses=1]
-	br i1 %tmp2, label %UnifiedReturnBlock, label %cond_next
-
-cond_next:		; preds = %entry
-	%tmp6 = getelementptr %struct.rtx_def* %x, i32 0, i32 0		; <i16*> [#uses=1]
-	%tmp7 = load i16* %tmp6		; <i16> [#uses=2]
-	%tmp78 = zext i16 %tmp7 to i32		; <i32> [#uses=2]
-	%tmp10 = icmp eq i16 %tmp7, 54		; <i1> [#uses=1]
-	br i1 %tmp10, label %cond_true13, label %cond_next32
-
-cond_true13:		; preds = %cond_next
-	%tmp15 = getelementptr %struct.rtx_def* %x, i32 0, i32 3		; <[1 x %struct..0anon]*> [#uses=1]
-	%tmp1718 = bitcast [1 x %struct..0anon]* %tmp15 to %struct.rtx_def**		; <%struct.rtx_def**> [#uses=1]
-	%tmp19 = load %struct.rtx_def** %tmp1718		; <%struct.rtx_def*> [#uses=1]
-	%tmp20 = getelementptr %struct.rtx_def* %tmp19, i32 0, i32 0		; <i16*> [#uses=1]
-	%tmp21 = load i16* %tmp20		; <i16> [#uses=1]
-	%tmp22 = icmp eq i16 %tmp21, 57		; <i1> [#uses=1]
-	br i1 %tmp22, label %cond_true25, label %cond_next32
-
-cond_true25:		; preds = %cond_true13
-	%tmp29 = tail call %struct.rtx_def* @fixup_memory_subreg( %struct.rtx_def* %x, %struct.rtx_def* %insn, i32 1 )		; <%struct.rtx_def*> [#uses=1]
-	ret %struct.rtx_def* %tmp29
-
-cond_next32:		; preds = %cond_true13, %cond_next
-	%tmp34 = getelementptr [116 x i8*]* @rtx_format, i32 0, i32 %tmp78		; <i8**> [#uses=1]
-	%tmp35 = load i8** %tmp34, align 4		; <i8*> [#uses=1]
-	%tmp37 = getelementptr [117 x i32]* @rtx_length, i32 0, i32 %tmp78		; <i32*> [#uses=1]
-	%tmp38 = load i32* %tmp37, align 4		; <i32> [#uses=1]
-	%i.011 = add i32 %tmp38, -1		; <i32> [#uses=2]
-	%tmp12513 = icmp sgt i32 %i.011, -1		; <i1> [#uses=1]
-	br i1 %tmp12513, label %bb, label %UnifiedReturnBlock
-
-bb:		; preds = %bb123, %cond_next32
-	%indvar = phi i32 [ %indvar.next26, %bb123 ], [ 0, %cond_next32 ]		; <i32> [#uses=2]
-	%i.01.0 = sub i32 %i.011, %indvar		; <i32> [#uses=5]
-	%tmp42 = getelementptr i8* %tmp35, i32 %i.01.0		; <i8*> [#uses=2]
-	%tmp43 = load i8* %tmp42		; <i8> [#uses=1]
-	switch i8 %tmp43, label %bb123 [
-		 i8 101, label %cond_true47
-		 i8 69, label %bb105.preheader
-	]
-
-cond_true47:		; preds = %bb
-	%tmp52 = getelementptr %struct.rtx_def* %x, i32 0, i32 3, i32 %i.01.0		; <%struct..0anon*> [#uses=1]
-	%tmp5354 = bitcast %struct..0anon* %tmp52 to %struct.rtx_def**		; <%struct.rtx_def**> [#uses=1]
-	%tmp55 = load %struct.rtx_def** %tmp5354		; <%struct.rtx_def*> [#uses=1]
-	%tmp58 = tail call  %struct.rtx_def* @walk_fixup_memory_subreg( %struct.rtx_def* %tmp55, %struct.rtx_def* %insn )		; <%struct.rtx_def*> [#uses=1]
-	%tmp62 = getelementptr %struct.rtx_def* %x, i32 0, i32 3, i32 %i.01.0, i32 0		; <i32*> [#uses=1]
-	%tmp58.c = ptrtoint %struct.rtx_def* %tmp58 to i32		; <i32> [#uses=1]
-	store i32 %tmp58.c, i32* %tmp62
-	%tmp6816 = load i8* %tmp42		; <i8> [#uses=1]
-	%tmp6917 = icmp eq i8 %tmp6816, 69		; <i1> [#uses=1]
-	br i1 %tmp6917, label %bb105.preheader, label %bb123
-
-bb105.preheader:		; preds = %cond_true47, %bb
-	%tmp11020 = getelementptr %struct.rtx_def* %x, i32 0, i32 3, i32 %i.01.0		; <%struct..0anon*> [#uses=1]
-	%tmp11111221 = bitcast %struct..0anon* %tmp11020 to %struct.rtvec_def**		; <%struct.rtvec_def**> [#uses=3]
-	%tmp11322 = load %struct.rtvec_def** %tmp11111221		; <%struct.rtvec_def*> [#uses=1]
-	%tmp11423 = getelementptr %struct.rtvec_def* %tmp11322, i32 0, i32 0		; <i32*> [#uses=1]
-	%tmp11524 = load i32* %tmp11423		; <i32> [#uses=1]
-	%tmp11625 = icmp eq i32 %tmp11524, 0		; <i1> [#uses=1]
-	br i1 %tmp11625, label %bb123, label %bb73
-
-bb73:		; preds = %bb73, %bb105.preheader
-	%j.019 = phi i32 [ %tmp104, %bb73 ], [ 0, %bb105.preheader ]		; <i32> [#uses=3]
-	%tmp81 = load %struct.rtvec_def** %tmp11111221		; <%struct.rtvec_def*> [#uses=2]
-	%tmp92 = getelementptr %struct.rtvec_def* %tmp81, i32 0, i32 1, i32 %j.019		; <%struct..0anon*> [#uses=1]
-	%tmp9394 = bitcast %struct..0anon* %tmp92 to %struct.rtx_def**		; <%struct.rtx_def**> [#uses=1]
-	%tmp95 = load %struct.rtx_def** %tmp9394		; <%struct.rtx_def*> [#uses=1]
-	%tmp98 = tail call  %struct.rtx_def* @walk_fixup_memory_subreg( %struct.rtx_def* %tmp95, %struct.rtx_def* %insn )		; <%struct.rtx_def*> [#uses=1]
-	%tmp101 = getelementptr %struct.rtvec_def* %tmp81, i32 0, i32 1, i32 %j.019, i32 0		; <i32*> [#uses=1]
-	%tmp98.c = ptrtoint %struct.rtx_def* %tmp98 to i32		; <i32> [#uses=1]
-	store i32 %tmp98.c, i32* %tmp101
-	%tmp104 = add i32 %j.019, 1		; <i32> [#uses=2]
-	%tmp113 = load %struct.rtvec_def** %tmp11111221		; <%struct.rtvec_def*> [#uses=1]
-	%tmp114 = getelementptr %struct.rtvec_def* %tmp113, i32 0, i32 0		; <i32*> [#uses=1]
-	%tmp115 = load i32* %tmp114		; <i32> [#uses=1]
-	%tmp116 = icmp ult i32 %tmp104, %tmp115		; <i1> [#uses=1]
-	br i1 %tmp116, label %bb73, label %bb123
-
-bb123:		; preds = %bb73, %bb105.preheader, %cond_true47, %bb
-	%i.0 = add i32 %i.01.0, -1		; <i32> [#uses=1]
-	%tmp125 = icmp sgt i32 %i.0, -1		; <i1> [#uses=1]
-	%indvar.next26 = add i32 %indvar, 1		; <i32> [#uses=1]
-	br i1 %tmp125, label %bb, label %UnifiedReturnBlock
-
-UnifiedReturnBlock:		; preds = %bb123, %cond_next32, %entry
-	%UnifiedRetVal = phi %struct.rtx_def* [ null, %entry ], [ %x, %cond_next32 ], [ %x, %bb123 ]		; <%struct.rtx_def*> [#uses=1]
-	ret %struct.rtx_def* %UnifiedRetVal
-}
diff --git a/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll b/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll
index 721d4c9..8e315f4 100644
--- a/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll
+++ b/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll
@@ -35,7 +35,7 @@
 bb.i28.i:		; preds = %bb.i28.i, %cond_next36.i
 ; CHECK: %bb.i28.i
 ; CHECK: addl $2
-; CHECK: addl $2
+; CHECK: addl $-2
 	%j.0.reg2mem.0.i16.i = phi i32 [ 0, %cond_next36.i ], [ %indvar.next39.i, %bb.i28.i ]		; <i32> [#uses=2]
 	%din_addr.1.reg2mem.0.i17.i = phi double [ 0.000000e+00, %cond_next36.i ], [ %tmp16.i25.i, %bb.i28.i ]		; <double> [#uses=1]
 	%tmp1.i18.i = fptosi double %din_addr.1.reg2mem.0.i17.i to i32		; <i32> [#uses=2]
diff --git a/test/CodeGen/X86/2009-09-10-SpillComments.ll b/test/CodeGen/X86/2009-09-10-SpillComments.ll
index 1dd9990..f9ca861 100644
--- a/test/CodeGen/X86/2009-09-10-SpillComments.ll
+++ b/test/CodeGen/X86/2009-09-10-SpillComments.ll
@@ -1,5 +1,11 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-linux | FileCheck %s
 
+; This test shouldn't require spills.
+
+; CHECK: subq  $8, %rsp
+; CHECK-NOT: $rsp
+; CHECK: addq  $8, %rsp
+
 	%struct..0anon = type { i32 }
 	%struct.rtvec_def = type { i32, [1 x %struct..0anon] }
 	%struct.rtx_def = type { i16, i8, i8, [1 x %struct..0anon] }
@@ -10,9 +16,6 @@
 
 define %struct.rtx_def* @walk_fixup_memory_subreg(%struct.rtx_def* %x, %struct.rtx_def* %insn) {
 entry:
-; CHECK: Spill
-; CHECK: Folded Spill
-; CHECK: Reload
 	%tmp2 = icmp eq %struct.rtx_def* %x, null		; <i1> [#uses=1]
 	br i1 %tmp2, label %UnifiedReturnBlock, label %cond_next
 
diff --git a/test/CodeGen/X86/full-lsr.ll b/test/CodeGen/X86/full-lsr.ll
index 68575bc..3bd58b6 100644
--- a/test/CodeGen/X86/full-lsr.ll
+++ b/test/CodeGen/X86/full-lsr.ll
@@ -1,6 +1,12 @@
-; RUN: llc < %s -march=x86 -enable-full-lsr >%t
-; RUN: grep {addl	\\\$4,} %t | count 3
-; RUN: not grep {,%} %t
+; RUN: llc < %s -march=x86 >%t
+
+; TODO: Enhance full lsr mode to get this:
+; RUNX: grep {addl	\\\$4,} %t | count 3
+; RUNX: not grep {,%} %t
+
+; For now, it should find this, which is still pretty good:
+; RUN: not grep {addl	\\\$4,} %t
+; RUN: grep {,%} %t | count 6
 
 define void @foo(float* nocapture %A, float* nocapture %B, float* nocapture %C, i32 %N) nounwind {
 entry:
diff --git a/test/CodeGen/X86/iv-users-in-other-loops.ll b/test/CodeGen/X86/iv-users-in-other-loops.ll
index c695c29..0410bc0 100644
--- a/test/CodeGen/X86/iv-users-in-other-loops.ll
+++ b/test/CodeGen/X86/iv-users-in-other-loops.ll
@@ -1,11 +1,11 @@
 ; RUN: llc < %s -march=x86-64 -o %t
-; RUN: grep inc %t | count 1
+; RUN: not grep inc %t
 ; RUN: grep dec %t | count 2
-; RUN: grep addq %t | count 13
+; RUN: grep addq %t | count 10
 ; RUN: not grep addb %t
 ; RUN: grep leaq %t | count 9
-; RUN: grep leal %t | count 3
-; RUN: grep movq %t | count 5
+; RUN: grep leal %t | count 2
+; RUN: grep movq %t | count 10
 
 ; IV users in each of the loops from other loops shouldn't cause LSR
 ; to insert new induction variables. Previously it would create a
diff --git a/test/CodeGen/X86/loop-strength-reduce4.ll b/test/CodeGen/X86/loop-strength-reduce4.ll
index 07e46ec..6c0eb8c 100644
--- a/test/CodeGen/X86/loop-strength-reduce4.ll
+++ b/test/CodeGen/X86/loop-strength-reduce4.ll
@@ -1,5 +1,19 @@
-; RUN: llc < %s -march=x86 | grep cmp | grep 64
-; RUN: llc < %s -march=x86 | not grep inc
+; RUN: llc < %s -march=x86 -relocation-model=static -mtriple=i686-apple-darwin | FileCheck %s -check-prefix=STATIC
+; RUN: llc < %s -march=x86 -relocation-model=pic | FileCheck %s -check-prefix=PIC
+
+; By starting the IV at -64 instead of 0, a cmp is eliminated,
+; as the flags from the add can be used directly.
+
+; STATIC: movl    $-64, %ecx
+
+; STATIC: movl    %eax, _state+76(%ecx)
+; STATIC: addl    $16, %ecx
+; STATIC: jne
+
+; In PIC mode the symbol can't be folded, so the change-compare-stride
+; trick applies.
+
+; PIC: cmpl $64
 
 @state = external global [0 x i32]		; <[0 x i32]*> [#uses=4]
 @S = external global [0 x i32]		; <[0 x i32]*> [#uses=4]
diff --git a/test/CodeGen/X86/loop-strength-reduce8.ll b/test/CodeGen/X86/loop-strength-reduce8.ll
index e14cd8a..6b2247d 100644
--- a/test/CodeGen/X86/loop-strength-reduce8.ll
+++ b/test/CodeGen/X86/loop-strength-reduce8.ll
@@ -1,4 +1,10 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin | grep leal | not grep 16
+; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s
+
+; CHECK: leal 16(%eax), %edx
+; CHECK: align
+; CHECK: addl    $4, %edx
+; CHECK: decl    %ecx
+; CHECK: jne     LBB1_2
 
 	%struct.CUMULATIVE_ARGS = type { i32, i32, i32, i32, i32, i32, i32 }
 	%struct.bitmap_element = type { %struct.bitmap_element*, %struct.bitmap_element*, i32, [2 x i64] }
diff --git a/test/CodeGen/X86/lsr-reuse.ll b/test/CodeGen/X86/lsr-reuse.ll
new file mode 100644
index 0000000..a1919ba
--- /dev/null
+++ b/test/CodeGen/X86/lsr-reuse.ll
@@ -0,0 +1,159 @@
+; RUN: llc < %s -march=x86-64 | FileCheck %s
+target datalayout = "e-p:64:64:64"
+target triple = "x86_64-unknown-unknown"
+
+; Full strength reduction reduces register pressure from 5 to 4 here.
+
+; CHECK: full_me:
+; CHECK: movsd   (%rsi), %xmm0
+; CHECK: mulsd   (%rdx), %xmm0
+; CHECK: movsd   %xmm0, (%rdi)
+; CHECK: addq    $8, %rsi
+; CHECK: addq    $8, %rdx
+; CHECK: addq    $8, %rdi
+; CHECK: decq    %rcx
+; CHECK: jne
+
+define void @full_me(double* nocapture %A, double* nocapture %B, double* nocapture %C, i64 %n) nounwind {
+entry:
+  %t0 = icmp sgt i64 %n, 0
+  br i1 %t0, label %loop, label %return
+
+loop:
+  %i = phi i64 [ %i.next, %loop ], [ 0, %entry ]
+  %Ai = getelementptr inbounds double* %A, i64 %i
+  %Bi = getelementptr inbounds double* %B, i64 %i
+  %Ci = getelementptr inbounds double* %C, i64 %i
+  %t1 = load double* %Bi
+  %t2 = load double* %Ci
+  %m = fmul double %t1, %t2
+  store double %m, double* %Ai
+  %i.next = add nsw i64 %i, 1
+  %exitcond = icmp eq i64 %i.next, %n
+  br i1 %exitcond, label %return, label %loop
+
+return:
+  ret void
+}
+
+; In this test, the counting IV exit value is used, so full strength reduction
+; would not reduce register pressure. IndVarSimplify ought to simplify such
+; cases away, but it's useful here to verify that LSR's register pressure
+; heuristics are working as expected.
+
+; CHECK: count_me_0:
+; CHECK: movsd   (%rsi,%rax,8), %xmm0
+; CHECK: mulsd   (%rdx,%rax,8), %xmm0
+; CHECK: movsd   %xmm0, (%rdi,%rax,8)
+; CHECK: incq    %rax
+; CHECK: cmpq    %rax, %rcx
+; CHECK: jne
+
+define i64 @count_me_0(double* nocapture %A, double* nocapture %B, double* nocapture %C, i64 %n) nounwind {
+entry:
+  %t0 = icmp sgt i64 %n, 0
+  br i1 %t0, label %loop, label %return
+
+loop:
+  %i = phi i64 [ %i.next, %loop ], [ 0, %entry ]
+  %Ai = getelementptr inbounds double* %A, i64 %i
+  %Bi = getelementptr inbounds double* %B, i64 %i
+  %Ci = getelementptr inbounds double* %C, i64 %i
+  %t1 = load double* %Bi
+  %t2 = load double* %Ci
+  %m = fmul double %t1, %t2
+  store double %m, double* %Ai
+  %i.next = add nsw i64 %i, 1
+  %exitcond = icmp eq i64 %i.next, %n
+  br i1 %exitcond, label %return, label %loop
+
+return:
+  %q = phi i64 [ 0, %entry ], [ %i.next, %loop ]
+  ret i64 %q
+}
+
+; In this test, the trip count value is used, so full strength reduction
+; would not reduce register pressure.
+; (though it would reduce register pressure inside the loop...)
+
+; CHECK: count_me_1:
+; CHECK: movsd   (%rsi,%rax,8), %xmm0
+; CHECK: mulsd   (%rdx,%rax,8), %xmm0
+; CHECK: movsd   %xmm0, (%rdi,%rax,8)
+; CHECK: incq    %rax
+; CHECK: cmpq    %rax, %rcx
+; CHECK: jne
+
+define i64 @count_me_1(double* nocapture %A, double* nocapture %B, double* nocapture %C, i64 %n) nounwind {
+entry:
+  %t0 = icmp sgt i64 %n, 0
+  br i1 %t0, label %loop, label %return
+
+loop:
+  %i = phi i64 [ %i.next, %loop ], [ 0, %entry ]
+  %Ai = getelementptr inbounds double* %A, i64 %i
+  %Bi = getelementptr inbounds double* %B, i64 %i
+  %Ci = getelementptr inbounds double* %C, i64 %i
+  %t1 = load double* %Bi
+  %t2 = load double* %Ci
+  %m = fmul double %t1, %t2
+  store double %m, double* %Ai
+  %i.next = add nsw i64 %i, 1
+  %exitcond = icmp eq i64 %i.next, %n
+  br i1 %exitcond, label %return, label %loop
+
+return:
+  %q = phi i64 [ 0, %entry ], [ %n, %loop ]
+  ret i64 %q
+}
+
+; This should be fully strength-reduced to reduce register pressure; however,
+; the current heuristics get distracted by all the reuse with the stride-1
+; induction variable first.
+
+; But even so, be clever and start the stride-1 variable at a non-zero value
+; to eliminate an in-loop immediate value.
+
+; CHECK: count_me_2:
+; CHECK: movl    $5, %eax
+; CHECK: align
+; CHECK: BB4_1:
+; CHECK: movsd   (%rdi,%rax,8), %xmm0
+; CHECK: addsd   (%rsi,%rax,8), %xmm0
+; CHECK: movsd   %xmm0, (%rdx,%rax,8)
+; CHECK: movsd   40(%rdi,%rax,8), %xmm0
+; CHECK: addsd   40(%rsi,%rax,8), %xmm0
+; CHECK: movsd   %xmm0, 40(%rdx,%rax,8)
+; CHECK: incq    %rax
+; CHECK: cmpq    $5005, %rax
+; CHECK: jne
+
+define void @count_me_2(double* nocapture %A, double* nocapture %B, double* nocapture %C) nounwind {
+entry:
+  br label %loop
+
+loop:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
+  %i5 = add i64 %i, 5
+  %Ai = getelementptr double* %A, i64 %i5
+  %t2 = load double* %Ai
+  %Bi = getelementptr double* %B, i64 %i5
+  %t4 = load double* %Bi
+  %t5 = fadd double %t2, %t4
+  %Ci = getelementptr double* %C, i64 %i5
+  store double %t5, double* %Ci
+  %i10 = add i64 %i, 10
+  %Ai10 = getelementptr double* %A, i64 %i10
+  %t9 = load double* %Ai10
+  %Bi10 = getelementptr double* %B, i64 %i10
+  %t11 = load double* %Bi10
+  %t12 = fadd double %t9, %t11
+  %Ci10 = getelementptr double* %C, i64 %i10
+  store double %t12, double* %Ci10
+  %i.next = add i64 %i, 1
+  %exitcond = icmp eq i64 %i.next, 5000
+  br i1 %exitcond, label %return, label %loop
+
+return:
+  ret void
+}
diff --git a/test/CodeGen/X86/masked-iv-safe.ll b/test/CodeGen/X86/masked-iv-safe.ll
index bc493bd..7111d68 100644
--- a/test/CodeGen/X86/masked-iv-safe.ll
+++ b/test/CodeGen/X86/masked-iv-safe.ll
@@ -4,9 +4,9 @@
 ; RUN: not grep sar %t
 ; RUN: not grep shl %t
 ; RUN: grep add %t | count 2
-; RUN: grep inc %t | count 4
+; RUN: grep inc %t | count 3
 ; RUN: grep dec %t | count 2
-; RUN: grep lea %t | count 2
+; RUN: grep lea %t | count 3
 
 ; Optimize away zext-inreg and sext-inreg on the loop induction
 ; variable using trip-count information.
@@ -127,6 +127,9 @@
 	ret void
 }
 
+; TODO: If we could handle all the loads and stores as post-inc users, we could
+; use {-1,+,1} in the induction variable register, and we'd get another inc,
+; one fewer add, and a comparison with zero.
 define void @another_count_up(double* %d, i64 %n) nounwind {
 entry:
 	br label %loop
diff --git a/test/CodeGen/X86/pr3495-2.ll b/test/CodeGen/X86/pr3495-2.ll
index 1372a15..71aa5a0 100644
--- a/test/CodeGen/X86/pr3495-2.ll
+++ b/test/CodeGen/X86/pr3495-2.ll
@@ -1,5 +1,6 @@
 ; RUN: llc < %s -march=x86 -relocation-model=pic -disable-fp-elim -stats |& grep {Number of reloads omited}
 
+target datalayout = "e-p:32:32:32"
 target triple = "i386-apple-darwin9.6"
 	%struct.constraintVCGType = type { i32, i32, i32, i32 }
 	%struct.nodeVCGType = type { %struct.constraintVCGType*, i32, i32, i32, %struct.constraintVCGType*, i32, i32, i32 }
diff --git a/test/CodeGen/X86/remat-mov-0.ll b/test/CodeGen/X86/remat-mov-0.ll
index c4f768c..5fb445c 100644
--- a/test/CodeGen/X86/remat-mov-0.ll
+++ b/test/CodeGen/X86/remat-mov-0.ll
@@ -1,13 +1,33 @@
-; RUN: llc < %s -march=x86-64 | grep {xorl	%edi, %edi} | count 4
+; RUN: llc < %s -march=x86-64 | FileCheck %s
 
 ; CodeGen should remat the zero instead of spilling it.
 
 declare void @foo(i64 %p)
 
+; CHECK: bar:
+; CHECK: xorl %edi, %edi
+; CHECK: xorl %edi, %edi
 define void @bar() nounwind {
   call void @foo(i64 0)
   call void @foo(i64 0)
-  call void @foo(i64 0)
-  call void @foo(i64 0)
   ret void
 }
+
+; CHECK: bat:
+; CHECK: movq $-1, %rdi
+; CHECK: movq $-1, %rdi
+define void @bat() nounwind {
+  call void @foo(i64 -1)
+  call void @foo(i64 -1)
+  ret void
+}
+
+; CHECK: bau:
+; CHECK: movl $1, %edi
+; CHECK: movl $1, %edi
+define void @bau() nounwind {
+  call void @foo(i64 1)
+  call void @foo(i64 1)
+  ret void
+}
+
diff --git a/test/CodeGen/X86/remat-mov-1.ll b/test/CodeGen/X86/remat-mov-1.ll
deleted file mode 100644
index d71b7a5..0000000
--- a/test/CodeGen/X86/remat-mov-1.ll
+++ /dev/null
@@ -1,40 +0,0 @@
-; RUN: llc < %s -march=x86 | grep -- -1 | grep mov | count 2
-
-	%struct.FILE = type { i8*, i32, i32, i16, i16, %struct.__sbuf, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.__sbuf, %struct.__sFILEX*, i32, [3 x i8], [1 x i8], %struct.__sbuf, i32, i64 }
-	%struct.ImgT = type { i8, i8*, i8*, %struct.FILE*, i32, i32, i32, i32, i8*, double*, float*, float*, float*, i32*, double, double, i32*, double*, i32*, i32* }
-	%struct._CompT = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, i8, %struct._PixT*, %struct._CompT*, i8, %struct._CompT* }
-	%struct._PixT = type { i32, i32, %struct._PixT* }
-	%struct.__sFILEX = type opaque
-	%struct.__sbuf = type { i8*, i32 }
-
-declare fastcc void @MergeComponents(%struct._CompT*, %struct._CompT*, %struct._CompT*, %struct._CompT**, %struct.ImgT*) nounwind 
-
-define fastcc void @MergeToLeft(%struct._CompT* %comp, %struct._CompT** %head, %struct.ImgT* %img) nounwind  {
-entry:
-	br label %bb208
-
-bb105:		; preds = %bb200
-	br i1 false, label %bb197, label %bb149
-
-bb149:		; preds = %bb105
-	%tmp151 = getelementptr %struct._CompT* %comp, i32 0, i32 0		; <i32*> [#uses=1]
-	br label %bb193
-
-bb193:		; preds = %bb184, %bb149
-	%tmp196 = load i32* %tmp151, align 4		; <i32> [#uses=1]
-	br label %bb197
-
-bb197:		; preds = %bb193, %bb105
-	%last_comp.0 = phi i32 [ %tmp196, %bb193 ], [ 0, %bb105 ]		; <i32> [#uses=0]
-	%indvar.next = add i32 %indvar, 1		; <i32> [#uses=1]
-	br label %bb200
-
-bb200:		; preds = %bb208, %bb197
-	%indvar = phi i32 [ 0, %bb208 ], [ %indvar.next, %bb197 ]		; <i32> [#uses=2]
-	%xm.0 = sub i32 %indvar, 0		; <i32> [#uses=1]
-	%tmp202 = icmp slt i32 %xm.0, 1		; <i1> [#uses=1]
-	br i1 %tmp202, label %bb105, label %bb208
-
-bb208:		; preds = %bb200, %entry
-	br label %bb200
-}
diff --git a/test/CodeGen/X86/subreg-to-reg-5.ll b/test/CodeGen/X86/subreg-to-reg-5.ll
deleted file mode 100644
index ba4c307..0000000
--- a/test/CodeGen/X86/subreg-to-reg-5.ll
+++ /dev/null
@@ -1,35 +0,0 @@
-; RUN: llc < %s -march=x86-64 > %t
-; RUN: grep addl %t
-; RUN: not egrep {movl|movq} %t
-
-define float @foo(float* %B) nounwind {
-entry:
-	br label %bb2
-
-bb2:		; preds = %bb3, %entry
-	%B_addr.0.rec = phi i64 [ %indvar.next154, %bb3 ], [ 0, %entry ]		; <i64> [#uses=2]
-        %z = icmp slt i64 %B_addr.0.rec, 20000
-	br i1 %z, label %bb3, label %bb4
-
-bb3:		; preds = %bb2
-	%indvar.next154 = add i64 %B_addr.0.rec, 1		; <i64> [#uses=1]
-	br label %bb2
-
-bb4:		; preds = %bb2
-	%B_addr.0 = getelementptr float* %B, i64 %B_addr.0.rec		; <float*> [#uses=1]
-	%t1 = ptrtoint float* %B_addr.0 to i64		; <i64> [#uses=1]
-	%t2 = and i64 %t1, 4294967295		; <i64> [#uses=1]
-	%t3 = icmp eq i64 %t2, 0		; <i1> [#uses=1]
-	br i1 %t3, label %bb5, label %bb10.preheader
-
-bb10.preheader:		; preds = %bb4
-	br label %bb9
-
-bb5:		; preds = %bb4
-	ret float 7.0
-
-bb9:		; preds = %bb10.preheader
-	%t5 = getelementptr float* %B, i64 0		; <float*> [#uses=1]
-	%t7 = load float* %t5		; <float> [#uses=1]
-	ret float %t7
-}
diff --git a/test/Transforms/IndVarSimplify/gep-with-mul-base.ll b/test/Transforms/IndVarSimplify/gep-with-mul-base.ll
index 7809594..19d54ff 100644
--- a/test/Transforms/IndVarSimplify/gep-with-mul-base.ll
+++ b/test/Transforms/IndVarSimplify/gep-with-mul-base.ll
@@ -1,6 +1,7 @@
 ; RUN: opt < %s -indvars -S > %t
-; RUN: grep add %t | count 8
-; RUN: grep mul %t | count 7
+; RUN: grep add %t | count 6
+; RUN: grep sub %t | count 2
+; RUN: grep mul %t | count 6
 
 define void @foo(i64 %n, i64 %m, i64 %o, double* nocapture %p) nounwind {
 entry:
diff --git a/test/Transforms/LoopStrengthReduce/2008-08-06-CmpStride.ll b/test/Transforms/LoopStrengthReduce/2008-08-06-CmpStride.ll
index 7c7a21c..99cb856 100644
--- a/test/Transforms/LoopStrengthReduce/2008-08-06-CmpStride.ll
+++ b/test/Transforms/LoopStrengthReduce/2008-08-06-CmpStride.ll
@@ -1,5 +1,4 @@
-; RUN: opt < %s -loop-reduce -S | grep ugt
-; PR2535
+; RUN: llc -march=x86-64 < %s -o - | grep {cmpl	\\$\[1\], %}
 
 @.str = internal constant [4 x i8] c"%d\0A\00"
 
@@ -16,7 +15,7 @@
         %add166 = or i32 %mul15, 1              ; <i32> [#uses=1] *
         call i32 (i8*, ...)* @printf( i8* noalias  getelementptr ([4 x i8]* @.str, i32 0, i32 0), i32 %add166 ) nounwind
         %inc = add i32 %i.0, 1          ; <i32> [#uses=3]
-        %cmp = icmp ult i32 %inc, 1027          ; <i1> [#uses=1]
+        %cmp = icmp ne i32 %inc, 1027          ; <i1> [#uses=1]
         br i1 %cmp, label %forbody, label %afterfor
 
 afterfor:               ; preds = %forcond
diff --git a/test/Transforms/LoopStrengthReduce/change-compare-stride-trickiness-0.ll b/test/Transforms/LoopStrengthReduce/change-compare-stride-trickiness-0.ll
index 36941ad..d9abc8b 100644
--- a/test/Transforms/LoopStrengthReduce/change-compare-stride-trickiness-0.ll
+++ b/test/Transforms/LoopStrengthReduce/change-compare-stride-trickiness-0.ll
@@ -1,10 +1,9 @@
-; RUN: llc %s -o - --x86-asm-syntax=att | grep {cmpl	\$4}
+; RUN: llc < %s -o - | grep {testl	%ecx, %ecx}
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 target triple = "x86_64-apple-darwin9"
 
-; This is like change-compare-stride-trickiness-1.ll except the comparison
-; happens before the relevant use, so the comparison stride can't be
-; easily changed.
+; The comparison happens before the relevant use, but it can still be rewritten
+; to compare with zero.
 
 define void @foo() nounwind {
 entry:
diff --git a/test/Transforms/LoopStrengthReduce/change-compare-stride-trickiness-1.ll b/test/Transforms/LoopStrengthReduce/change-compare-stride-trickiness-1.ll
index 8a3978b..ea8a259 100644
--- a/test/Transforms/LoopStrengthReduce/change-compare-stride-trickiness-1.ll
+++ b/test/Transforms/LoopStrengthReduce/change-compare-stride-trickiness-1.ll
@@ -1,10 +1,10 @@
-; RUN: llc %s -o - --x86-asm-syntax=att | grep {cmpq	\$8}
+; RUN: llc %s -o - --x86-asm-syntax=att | grep {cmp.	\$8}
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 target triple = "x86_64-apple-darwin9"
 
-; This is like change-compare-stride-trickiness-0.ll except the comparison
-; happens after the relevant use, so the comparison stride can be
-; easily changed.
+; The comparison happens after the relevant use, so the stride can easily
+; be changed. The comparison can be done in a narrower mode than the
+; induction variable.
 
 define void @foo() nounwind {
 entry:
diff --git a/test/Transforms/LoopStrengthReduce/count-to-zero.ll b/test/Transforms/LoopStrengthReduce/count-to-zero.ll
index 8cc3b5c..feb79f8 100644
--- a/test/Transforms/LoopStrengthReduce/count-to-zero.ll
+++ b/test/Transforms/LoopStrengthReduce/count-to-zero.ll
@@ -19,7 +19,7 @@
   %tmp4 = add i32 %c_addr.1, -1                   ; <i32> [#uses=1]
   %c_addr.1.be = select i1 %tmp2, i32 %tmp3, i32 %tmp4 ; <i32> [#uses=1]
   %indvar.next = add i32 %indvar, 1               ; <i32> [#uses=1]
-; CHECK: sub i32 %lsr.iv, 1
+; CHECK: add i32 %lsr.iv, -1
   br label %bb6
 
 bb6:                                              ; preds = %bb3, %entry
diff --git a/test/Transforms/LoopStrengthReduce/icmp_use_postinc.ll b/test/Transforms/LoopStrengthReduce/icmp_use_postinc.ll
deleted file mode 100644
index 4ad5d14..0000000
--- a/test/Transforms/LoopStrengthReduce/icmp_use_postinc.ll
+++ /dev/null
@@ -1,27 +0,0 @@
-; RUN: opt < %s -loop-reduce -S | FileCheck %s
-
-define i32 @main(i32 %argc, i8** nocapture %argv) nounwind ssp {
-entry:
-  br i1 undef, label %bb4.preheader, label %bb.nph8
-
-bb4.preheader:                                    ; preds = %entry
-  br label %bb4
-
-bb1:                                              ; preds = %bb4
-  br i1 undef, label %bb.nph8, label %bb3
-
-bb3:                                              ; preds = %bb1
-  %phitmp = add i32 %indvar, 1                    ; <i32> [#uses=1]
-  br label %bb4
-
-bb4:                                              ; preds = %bb3, %bb4.preheader
-; CHECK: %lsr.iv = phi
-; CHECK: %lsr.iv.next = add i32 %lsr.iv, 1
-; CHECK: %0 = icmp slt i32 %lsr.iv.next, %argc
-  %indvar = phi i32 [ 1, %bb4.preheader ], [ %phitmp, %bb3 ] ; <i32> [#uses=2]
-  %0 = icmp slt i32 %indvar, %argc                ; <i1> [#uses=1]
-  br i1 %0, label %bb1, label %bb.nph8
-
-bb.nph8:                                          ; preds = %bb4, %bb1, %entry
-  unreachable
-}