Sanjay Patel | ffb37a2 | 2018-01-30 19:17:38 +0000 | [diff] [blame^] | 1 | ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| 2 | ; RUN: opt < %s -loop-reduce -mcpu=btver2 -S | FileCheck %s --check-prefix=JAG |
| 3 | ; RUN: opt < %s -loop-reduce -mcpu=haswell -S | FileCheck %s --check-prefix=HSW |
| 4 | |
| 5 | ; RUN: llc < %s | FileCheck %s --check-prefix=BASE |
| 6 | ; RUN: llc < %s -mattr=macrofusion | FileCheck %s --check-prefix=FUSE |
| 7 | |
| 8 | target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" |
| 9 | target triple = "x86_64-unknown-unknown" |
| 10 | |
| 11 | ; PR35681 - https://bugs.llvm.org/show_bug.cgi?id=35681 |
| 12 | ; FIXME: If a CPU can macro-fuse a compare and branch, then we should discount that |
| 13 | ; cost in LSR and avoid generating large offsets in each memory access. |
| 14 | ; This reduces code size and may improve decode throughput. |
| 15 | |
| 16 | define void @maxArray(double* noalias nocapture %x, double* noalias nocapture readonly %y) { |
| 17 | ; JAG-LABEL: @maxArray( |
| 18 | ; JAG-NEXT: entry: |
| 19 | ; JAG-NEXT: [[Y1:%.*]] = bitcast double* [[Y:%.*]] to <2 x double>* |
| 20 | ; JAG-NEXT: [[X4:%.*]] = bitcast double* [[X:%.*]] to <2 x double>* |
| 21 | ; JAG-NEXT: [[X45:%.*]] = bitcast <2 x double>* [[X4]] to i8* |
| 22 | ; JAG-NEXT: [[Y12:%.*]] = bitcast <2 x double>* [[Y1]] to i8* |
| 23 | ; JAG-NEXT: br label [[VECTOR_BODY:%.*]] |
| 24 | ; JAG: vector.body: |
| 25 | ; JAG-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[VECTOR_BODY]] ], [ -524288, [[ENTRY:%.*]] ] |
| 26 | ; JAG-NEXT: [[UGLYGEP9:%.*]] = getelementptr i8, i8* [[X45]], i64 [[LSR_IV]] |
| 27 | ; JAG-NEXT: [[UGLYGEP910:%.*]] = bitcast i8* [[UGLYGEP9]] to <2 x double>* |
| 28 | ; JAG-NEXT: [[SCEVGEP11:%.*]] = getelementptr <2 x double>, <2 x double>* [[UGLYGEP910]], i64 32768 |
| 29 | ; JAG-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[Y12]], i64 [[LSR_IV]] |
| 30 | ; JAG-NEXT: [[UGLYGEP3:%.*]] = bitcast i8* [[UGLYGEP]] to <2 x double>* |
| 31 | ; JAG-NEXT: [[SCEVGEP:%.*]] = getelementptr <2 x double>, <2 x double>* [[UGLYGEP3]], i64 32768 |
| 32 | ; JAG-NEXT: [[XVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP11]], align 8 |
| 33 | ; JAG-NEXT: [[YVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP]], align 8 |
| 34 | ; JAG-NEXT: [[CMP:%.*]] = fcmp ogt <2 x double> [[YVAL]], [[XVAL]] |
| 35 | ; JAG-NEXT: [[MAX:%.*]] = select <2 x i1> [[CMP]], <2 x double> [[YVAL]], <2 x double> [[XVAL]] |
| 36 | ; JAG-NEXT: [[UGLYGEP6:%.*]] = getelementptr i8, i8* [[X45]], i64 [[LSR_IV]] |
| 37 | ; JAG-NEXT: [[UGLYGEP67:%.*]] = bitcast i8* [[UGLYGEP6]] to <2 x double>* |
| 38 | ; JAG-NEXT: [[SCEVGEP8:%.*]] = getelementptr <2 x double>, <2 x double>* [[UGLYGEP67]], i64 32768 |
| 39 | ; JAG-NEXT: store <2 x double> [[MAX]], <2 x double>* [[SCEVGEP8]], align 8 |
| 40 | ; JAG-NEXT: [[LSR_IV_NEXT]] = add nsw i64 [[LSR_IV]], 16 |
| 41 | ; JAG-NEXT: [[DONE:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0 |
| 42 | ; JAG-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[VECTOR_BODY]] |
| 43 | ; JAG: exit: |
| 44 | ; JAG-NEXT: ret void |
| 45 | ; |
| 46 | ; HSW-LABEL: @maxArray( |
| 47 | ; HSW-NEXT: entry: |
| 48 | ; HSW-NEXT: [[Y1:%.*]] = bitcast double* [[Y:%.*]] to <2 x double>* |
| 49 | ; HSW-NEXT: [[X4:%.*]] = bitcast double* [[X:%.*]] to <2 x double>* |
| 50 | ; HSW-NEXT: [[X45:%.*]] = bitcast <2 x double>* [[X4]] to i8* |
| 51 | ; HSW-NEXT: [[Y12:%.*]] = bitcast <2 x double>* [[Y1]] to i8* |
| 52 | ; HSW-NEXT: br label [[VECTOR_BODY:%.*]] |
| 53 | ; HSW: vector.body: |
| 54 | ; HSW-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[VECTOR_BODY]] ], [ -524288, [[ENTRY:%.*]] ] |
| 55 | ; HSW-NEXT: [[UGLYGEP9:%.*]] = getelementptr i8, i8* [[X45]], i64 [[LSR_IV]] |
| 56 | ; HSW-NEXT: [[UGLYGEP910:%.*]] = bitcast i8* [[UGLYGEP9]] to <2 x double>* |
| 57 | ; HSW-NEXT: [[SCEVGEP11:%.*]] = getelementptr <2 x double>, <2 x double>* [[UGLYGEP910]], i64 32768 |
| 58 | ; HSW-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[Y12]], i64 [[LSR_IV]] |
| 59 | ; HSW-NEXT: [[UGLYGEP3:%.*]] = bitcast i8* [[UGLYGEP]] to <2 x double>* |
| 60 | ; HSW-NEXT: [[SCEVGEP:%.*]] = getelementptr <2 x double>, <2 x double>* [[UGLYGEP3]], i64 32768 |
| 61 | ; HSW-NEXT: [[XVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP11]], align 8 |
| 62 | ; HSW-NEXT: [[YVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP]], align 8 |
| 63 | ; HSW-NEXT: [[CMP:%.*]] = fcmp ogt <2 x double> [[YVAL]], [[XVAL]] |
| 64 | ; HSW-NEXT: [[MAX:%.*]] = select <2 x i1> [[CMP]], <2 x double> [[YVAL]], <2 x double> [[XVAL]] |
| 65 | ; HSW-NEXT: [[UGLYGEP6:%.*]] = getelementptr i8, i8* [[X45]], i64 [[LSR_IV]] |
| 66 | ; HSW-NEXT: [[UGLYGEP67:%.*]] = bitcast i8* [[UGLYGEP6]] to <2 x double>* |
| 67 | ; HSW-NEXT: [[SCEVGEP8:%.*]] = getelementptr <2 x double>, <2 x double>* [[UGLYGEP67]], i64 32768 |
| 68 | ; HSW-NEXT: store <2 x double> [[MAX]], <2 x double>* [[SCEVGEP8]], align 8 |
| 69 | ; HSW-NEXT: [[LSR_IV_NEXT]] = add nsw i64 [[LSR_IV]], 16 |
| 70 | ; HSW-NEXT: [[DONE:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0 |
| 71 | ; HSW-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[VECTOR_BODY]] |
| 72 | ; HSW: exit: |
| 73 | ; HSW-NEXT: ret void |
| 74 | ; |
| 75 | ; BASE-LABEL: maxArray: |
| 76 | ; BASE: # %bb.0: # %entry |
| 77 | ; BASE-NEXT: movq $-524288, %rax # imm = 0xFFF80000 |
| 78 | ; BASE-NEXT: .p2align 4, 0x90 |
| 79 | ; BASE-NEXT: .LBB0_1: # %vector.body |
| 80 | ; BASE-NEXT: # =>This Inner Loop Header: Depth=1 |
| 81 | ; BASE-NEXT: movupd 524288(%rdi,%rax), %xmm0 |
| 82 | ; BASE-NEXT: movupd 524288(%rsi,%rax), %xmm1 |
| 83 | ; BASE-NEXT: maxpd %xmm0, %xmm1 |
| 84 | ; BASE-NEXT: movupd %xmm1, 524288(%rdi,%rax) |
| 85 | ; BASE-NEXT: addq $16, %rax |
| 86 | ; BASE-NEXT: jne .LBB0_1 |
| 87 | ; BASE-NEXT: # %bb.2: # %exit |
| 88 | ; BASE-NEXT: retq |
| 89 | ; |
| 90 | ; FUSE-LABEL: maxArray: |
| 91 | ; FUSE: # %bb.0: # %entry |
| 92 | ; FUSE-NEXT: movq $-524288, %rax # imm = 0xFFF80000 |
| 93 | ; FUSE-NEXT: .p2align 4, 0x90 |
| 94 | ; FUSE-NEXT: .LBB0_1: # %vector.body |
| 95 | ; FUSE-NEXT: # =>This Inner Loop Header: Depth=1 |
| 96 | ; FUSE-NEXT: movupd 524288(%rdi,%rax), %xmm0 |
| 97 | ; FUSE-NEXT: movupd 524288(%rsi,%rax), %xmm1 |
| 98 | ; FUSE-NEXT: maxpd %xmm0, %xmm1 |
| 99 | ; FUSE-NEXT: movupd %xmm1, 524288(%rdi,%rax) |
| 100 | ; FUSE-NEXT: addq $16, %rax |
| 101 | ; FUSE-NEXT: jne .LBB0_1 |
| 102 | ; FUSE-NEXT: # %bb.2: # %exit |
| 103 | ; FUSE-NEXT: retq |
; Summary of the assertions above: the JAG and HSW prefixes check the IR
; produced by `opt -loop-reduce` (-mcpu=btver2 and -mcpu=haswell); the BASE and
; FUSE prefixes check llc assembly without and with -mattr=macrofusion.
; All four expect the same loop shape: the induction variable starts at
; -524288 and steps by 16 up to 0, and every memory access carries a +524288
; offset (written as element index 32768 of <2 x double> in the IR checks).
;
; Input IR: a single loop that reads <2 x double> vectors from %x and %y at
; the same index and writes the lane-wise selected value back to %x.
| 104 | entry: |
| 105 | br label %vector.body |
| 106 | |
| 107 | vector.body: |
; %index is a count of doubles: it starts at 0, advances by 2 per iteration
; (one <2 x double> vector), and the loop exits once it equals 65536.
| 108 | %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] |
| 109 | %gepx = getelementptr inbounds double, double* %x, i64 %index |
| 110 | %gepy = getelementptr inbounds double, double* %y, i64 %index |
| 111 | %xptr = bitcast double* %gepx to <2 x double>* |
| 112 | %yptr = bitcast double* %gepy to <2 x double>* |
| 113 | %xval = load <2 x double>, <2 x double>* %xptr, align 8 |
| 114 | %yval = load <2 x double>, <2 x double>* %yptr, align 8 |
; Per lane: keep %yval where it compares ordered-greater-than %xval,
; otherwise keep %xval.
| 115 | %cmp = fcmp ogt <2 x double> %yval, %xval |
| 116 | %max = select <2 x i1> %cmp, <2 x double> %yval, <2 x double> %xval |
; A second bitcast of the same %gepx address, used only for the store to %x.
| 117 | %xptr_again = bitcast double* %gepx to <2 x double>* |
| 118 | store <2 x double> %max, <2 x double>* %xptr_again, align 8 |
| 119 | %index.next = add i64 %index, 2 |
| 120 | %done = icmp eq i64 %index.next, 65536 |
| 121 | br i1 %done, label %exit, label %vector.body |
| 122 | |
| 123 | exit: |
| 124 | ret void |
| 125 | } |
| 126 | |