[LoopStrengthReduce, x86] don't add cost for a cmp that will be macro-fused (PR35681)

In the motivating case from PR35681 (https://bugs.llvm.org/show_bug.cgi?id=35681),
which is represented by the macro-fuse-cmp test, there's a 37 -> 31 byte size win
for the loop because we eliminate the big base address offsets. The loop gains an
explicit cmp, but that cmp is expected to macro-fuse with the following branch.

SPEC2017 on Ryzen shows no significant perf difference.

Differential Revision: https://reviews.llvm.org/D42607

llvm-svn: 324289
diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/macro-fuse-cmp.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/macro-fuse-cmp.ll
index 219aed4..ba0da16 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/X86/macro-fuse-cmp.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/X86/macro-fuse-cmp.ll
@@ -43,27 +43,22 @@
 ;
 ; HSW-LABEL: @maxArray(
 ; HSW-NEXT:  entry:
-; HSW-NEXT:    [[Y1:%.*]] = bitcast double* [[Y:%.*]] to i8*
-; HSW-NEXT:    [[X3:%.*]] = bitcast double* [[X:%.*]] to i8*
 ; HSW-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; HSW:       vector.body:
-; HSW-NEXT:    [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[VECTOR_BODY]] ], [ -524288, [[ENTRY:%.*]] ]
-; HSW-NEXT:    [[UGLYGEP7:%.*]] = getelementptr i8, i8* [[X3]], i64 [[LSR_IV]]
-; HSW-NEXT:    [[UGLYGEP78:%.*]] = bitcast i8* [[UGLYGEP7]] to <2 x double>*
-; HSW-NEXT:    [[SCEVGEP9:%.*]] = getelementptr <2 x double>, <2 x double>* [[UGLYGEP78]], i64 32768
-; HSW-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, i8* [[Y1]], i64 [[LSR_IV]]
-; HSW-NEXT:    [[UGLYGEP2:%.*]] = bitcast i8* [[UGLYGEP]] to <2 x double>*
-; HSW-NEXT:    [[SCEVGEP:%.*]] = getelementptr <2 x double>, <2 x double>* [[UGLYGEP2]], i64 32768
-; HSW-NEXT:    [[XVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP9]], align 8
-; HSW-NEXT:    [[YVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP]], align 8
+; HSW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; HSW-NEXT:    [[SCEVGEP4:%.*]] = getelementptr double, double* [[X:%.*]], i64 [[INDEX]]
+; HSW-NEXT:    [[SCEVGEP45:%.*]] = bitcast double* [[SCEVGEP4]] to <2 x double>*
+; HSW-NEXT:    [[SCEVGEP:%.*]] = getelementptr double, double* [[Y:%.*]], i64 [[INDEX]]
+; HSW-NEXT:    [[SCEVGEP1:%.*]] = bitcast double* [[SCEVGEP]] to <2 x double>*
+; HSW-NEXT:    [[XVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP45]], align 8
+; HSW-NEXT:    [[YVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP1]], align 8
 ; HSW-NEXT:    [[CMP:%.*]] = fcmp ogt <2 x double> [[YVAL]], [[XVAL]]
 ; HSW-NEXT:    [[MAX:%.*]] = select <2 x i1> [[CMP]], <2 x double> [[YVAL]], <2 x double> [[XVAL]]
-; HSW-NEXT:    [[UGLYGEP4:%.*]] = getelementptr i8, i8* [[X3]], i64 [[LSR_IV]]
-; HSW-NEXT:    [[UGLYGEP45:%.*]] = bitcast i8* [[UGLYGEP4]] to <2 x double>*
-; HSW-NEXT:    [[SCEVGEP6:%.*]] = getelementptr <2 x double>, <2 x double>* [[UGLYGEP45]], i64 32768
-; HSW-NEXT:    store <2 x double> [[MAX]], <2 x double>* [[SCEVGEP6]], align 8
-; HSW-NEXT:    [[LSR_IV_NEXT]] = add nsw i64 [[LSR_IV]], 16
-; HSW-NEXT:    [[DONE:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
+; HSW-NEXT:    [[SCEVGEP2:%.*]] = getelementptr double, double* [[X]], i64 [[INDEX]]
+; HSW-NEXT:    [[SCEVGEP23:%.*]] = bitcast double* [[SCEVGEP2]] to <2 x double>*
+; HSW-NEXT:    store <2 x double> [[MAX]], <2 x double>* [[SCEVGEP23]], align 8
+; HSW-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; HSW-NEXT:    [[DONE:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
 ; HSW-NEXT:    br i1 [[DONE]], label [[EXIT:%.*]], label [[VECTOR_BODY]]
 ; HSW:       exit:
 ; HSW-NEXT:    ret void
@@ -85,15 +80,16 @@
 ;
 ; FUSE-LABEL: maxArray:
 ; FUSE:       # %bb.0: # %entry
-; FUSE-NEXT:    movq $-524288, %rax # imm = 0xFFF80000
+; FUSE-NEXT:    xorl %eax, %eax
 ; FUSE-NEXT:    .p2align 4, 0x90
 ; FUSE-NEXT:  .LBB0_1: # %vector.body
 ; FUSE-NEXT:    # =>This Inner Loop Header: Depth=1
-; FUSE-NEXT:    movupd 524288(%rdi,%rax), %xmm0
-; FUSE-NEXT:    movupd 524288(%rsi,%rax), %xmm1
+; FUSE-NEXT:    movupd (%rdi,%rax,8), %xmm0
+; FUSE-NEXT:    movupd (%rsi,%rax,8), %xmm1
 ; FUSE-NEXT:    maxpd %xmm0, %xmm1
-; FUSE-NEXT:    movupd %xmm1, 524288(%rdi,%rax)
-; FUSE-NEXT:    addq $16, %rax
+; FUSE-NEXT:    movupd %xmm1, (%rdi,%rax,8)
+; FUSE-NEXT:    addq $2, %rax
+; FUSE-NEXT:    cmpq $65536, %rax # imm = 0x10000
 ; FUSE-NEXT:    jne .LBB0_1
 ; FUSE-NEXT:  # %bb.2: # %exit
 ; FUSE-NEXT:    retq