; blob: b1909ed32995ccb965937ca885dccbe151606053
; blame: Sanjay Patel, ffb37a2, 2018-01-30 19:17:38 +0000
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; IR-level expectations: run -loop-reduce for -mcpu=btver2 (prefix JAG) and
; -mcpu=haswell (prefix HSW).
; RUN: opt < %s -loop-reduce -mcpu=btver2 -S | FileCheck %s --check-prefix=JAG
; RUN: opt < %s -loop-reduce -mcpu=haswell -S | FileCheck %s --check-prefix=HSW

; Codegen expectations: llc with default attributes (prefix BASE) and with
; -mattr=macrofusion (prefix FUSE).
; RUN: llc < %s | FileCheck %s --check-prefix=BASE
; RUN: llc < %s -mattr=macrofusion | FileCheck %s --check-prefix=FUSE

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; PR35681 - https://bugs.llvm.org/show_bug.cgi?id=35681
; FIXME: If a CPU can macro-fuse a compare and branch, then LSR should discount
; that cost and avoid generating large offsets in each memory access.
; This would reduce code size and may improve decode throughput.

; @maxArray walks %x and %y two doubles at a time (%index = 0, 2, ..., 65534).
; Each iteration loads one <2 x double> from each array, selects lane-wise
; "y if y > x (fcmp ogt), else x", and stores the result back through %x.
; 65536 doubles * 8 bytes = 524288 bytes, which is why the expected output
; below counts a byte offset from -524288 up to 0 and addresses memory at
; 524288(base, offset).
; The JAG and HSW expectations are currently identical to each other, and so
; are the BASE and FUSE expectations.
define void @maxArray(double* noalias nocapture %x, double* noalias nocapture readonly %y) {
; JAG-LABEL: @maxArray(
; JAG-NEXT: entry:
; JAG-NEXT: [[Y1:%.*]] = bitcast double* [[Y:%.*]] to <2 x double>*
; JAG-NEXT: [[X4:%.*]] = bitcast double* [[X:%.*]] to <2 x double>*
; JAG-NEXT: [[X45:%.*]] = bitcast <2 x double>* [[X4]] to i8*
; JAG-NEXT: [[Y12:%.*]] = bitcast <2 x double>* [[Y1]] to i8*
; JAG-NEXT: br label [[VECTOR_BODY:%.*]]
; JAG: vector.body:
; JAG-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[VECTOR_BODY]] ], [ -524288, [[ENTRY:%.*]] ]
; JAG-NEXT: [[UGLYGEP9:%.*]] = getelementptr i8, i8* [[X45]], i64 [[LSR_IV]]
; JAG-NEXT: [[UGLYGEP910:%.*]] = bitcast i8* [[UGLYGEP9]] to <2 x double>*
; JAG-NEXT: [[SCEVGEP11:%.*]] = getelementptr <2 x double>, <2 x double>* [[UGLYGEP910]], i64 32768
; JAG-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[Y12]], i64 [[LSR_IV]]
; JAG-NEXT: [[UGLYGEP3:%.*]] = bitcast i8* [[UGLYGEP]] to <2 x double>*
; JAG-NEXT: [[SCEVGEP:%.*]] = getelementptr <2 x double>, <2 x double>* [[UGLYGEP3]], i64 32768
; JAG-NEXT: [[XVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP11]], align 8
; JAG-NEXT: [[YVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP]], align 8
; JAG-NEXT: [[CMP:%.*]] = fcmp ogt <2 x double> [[YVAL]], [[XVAL]]
; JAG-NEXT: [[MAX:%.*]] = select <2 x i1> [[CMP]], <2 x double> [[YVAL]], <2 x double> [[XVAL]]
; JAG-NEXT: [[UGLYGEP6:%.*]] = getelementptr i8, i8* [[X45]], i64 [[LSR_IV]]
; JAG-NEXT: [[UGLYGEP67:%.*]] = bitcast i8* [[UGLYGEP6]] to <2 x double>*
; JAG-NEXT: [[SCEVGEP8:%.*]] = getelementptr <2 x double>, <2 x double>* [[UGLYGEP67]], i64 32768
; JAG-NEXT: store <2 x double> [[MAX]], <2 x double>* [[SCEVGEP8]], align 8
; JAG-NEXT: [[LSR_IV_NEXT]] = add nsw i64 [[LSR_IV]], 16
; JAG-NEXT: [[DONE:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
; JAG-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[VECTOR_BODY]]
; JAG: exit:
; JAG-NEXT: ret void
;
; HSW-LABEL: @maxArray(
; HSW-NEXT: entry:
; HSW-NEXT: [[Y1:%.*]] = bitcast double* [[Y:%.*]] to <2 x double>*
; HSW-NEXT: [[X4:%.*]] = bitcast double* [[X:%.*]] to <2 x double>*
; HSW-NEXT: [[X45:%.*]] = bitcast <2 x double>* [[X4]] to i8*
; HSW-NEXT: [[Y12:%.*]] = bitcast <2 x double>* [[Y1]] to i8*
; HSW-NEXT: br label [[VECTOR_BODY:%.*]]
; HSW: vector.body:
; HSW-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[VECTOR_BODY]] ], [ -524288, [[ENTRY:%.*]] ]
; HSW-NEXT: [[UGLYGEP9:%.*]] = getelementptr i8, i8* [[X45]], i64 [[LSR_IV]]
; HSW-NEXT: [[UGLYGEP910:%.*]] = bitcast i8* [[UGLYGEP9]] to <2 x double>*
; HSW-NEXT: [[SCEVGEP11:%.*]] = getelementptr <2 x double>, <2 x double>* [[UGLYGEP910]], i64 32768
; HSW-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[Y12]], i64 [[LSR_IV]]
; HSW-NEXT: [[UGLYGEP3:%.*]] = bitcast i8* [[UGLYGEP]] to <2 x double>*
; HSW-NEXT: [[SCEVGEP:%.*]] = getelementptr <2 x double>, <2 x double>* [[UGLYGEP3]], i64 32768
; HSW-NEXT: [[XVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP11]], align 8
; HSW-NEXT: [[YVAL:%.*]] = load <2 x double>, <2 x double>* [[SCEVGEP]], align 8
; HSW-NEXT: [[CMP:%.*]] = fcmp ogt <2 x double> [[YVAL]], [[XVAL]]
; HSW-NEXT: [[MAX:%.*]] = select <2 x i1> [[CMP]], <2 x double> [[YVAL]], <2 x double> [[XVAL]]
; HSW-NEXT: [[UGLYGEP6:%.*]] = getelementptr i8, i8* [[X45]], i64 [[LSR_IV]]
; HSW-NEXT: [[UGLYGEP67:%.*]] = bitcast i8* [[UGLYGEP6]] to <2 x double>*
; HSW-NEXT: [[SCEVGEP8:%.*]] = getelementptr <2 x double>, <2 x double>* [[UGLYGEP67]], i64 32768
; HSW-NEXT: store <2 x double> [[MAX]], <2 x double>* [[SCEVGEP8]], align 8
; HSW-NEXT: [[LSR_IV_NEXT]] = add nsw i64 [[LSR_IV]], 16
; HSW-NEXT: [[DONE:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
; HSW-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[VECTOR_BODY]]
; HSW: exit:
; HSW-NEXT: ret void
;
; BASE-LABEL: maxArray:
; BASE: # %bb.0: # %entry
; BASE-NEXT: movq $-524288, %rax # imm = 0xFFF80000
; BASE-NEXT: .p2align 4, 0x90
; BASE-NEXT: .LBB0_1: # %vector.body
; BASE-NEXT: # =>This Inner Loop Header: Depth=1
; BASE-NEXT: movupd 524288(%rdi,%rax), %xmm0
; BASE-NEXT: movupd 524288(%rsi,%rax), %xmm1
; BASE-NEXT: maxpd %xmm0, %xmm1
; BASE-NEXT: movupd %xmm1, 524288(%rdi,%rax)
; BASE-NEXT: addq $16, %rax
; BASE-NEXT: jne .LBB0_1
; BASE-NEXT: # %bb.2: # %exit
; BASE-NEXT: retq
;
; FUSE-LABEL: maxArray:
; FUSE: # %bb.0: # %entry
; FUSE-NEXT: movq $-524288, %rax # imm = 0xFFF80000
; FUSE-NEXT: .p2align 4, 0x90
; FUSE-NEXT: .LBB0_1: # %vector.body
; FUSE-NEXT: # =>This Inner Loop Header: Depth=1
; FUSE-NEXT: movupd 524288(%rdi,%rax), %xmm0
; FUSE-NEXT: movupd 524288(%rsi,%rax), %xmm1
; FUSE-NEXT: maxpd %xmm0, %xmm1
; FUSE-NEXT: movupd %xmm1, 524288(%rdi,%rax)
; FUSE-NEXT: addq $16, %rax
; FUSE-NEXT: jne .LBB0_1
; FUSE-NEXT: # %bb.2: # %exit
; FUSE-NEXT: retq
entry:
  br label %vector.body

vector.body:
  ; %index counts doubles; each iteration handles one <2 x double> pair.
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %gepx = getelementptr inbounds double, double* %x, i64 %index
  %gepy = getelementptr inbounds double, double* %y, i64 %index
  %xptr = bitcast double* %gepx to <2 x double>*
  %yptr = bitcast double* %gepy to <2 x double>*
  ; The vector accesses are only 8-byte aligned (align 8, not 16).
  %xval = load <2 x double>, <2 x double>* %xptr, align 8
  %yval = load <2 x double>, <2 x double>* %yptr, align 8
  ; Lane-wise "y if y > x, else x".
  %cmp = fcmp ogt <2 x double> %yval, %xval
  %max = select <2 x i1> %cmp, <2 x double> %yval, <2 x double> %xval
  ; Second bitcast of the same %gepx address, used only by the store. The
  ; expected LSR output expands the store address separately from the load.
  %xptr_again = bitcast double* %gepx to <2 x double>*
  store <2 x double> %max, <2 x double>* %xptr_again, align 8
  %index.next = add i64 %index, 2
  ; Exit once 65536 doubles (524288 bytes) have been processed.
  %done = icmp eq i64 %index.next, 65536
  br i1 %done, label %exit, label %vector.body

exit:
  ret void
}
