blob: f7a02d613af1b5de0e6a8cea6c45bfb709b3bb62 [file] [log] [blame]
Eric Christophercee313d2019-04-17 04:52:47 +00001; RUN: opt -S -loop-vectorize -instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true -runtime-memory-check-threshold=24 < %s | FileCheck %s
2
3target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
4
5; Check vectorization on an interleaved load group of factor 2 and an interleaved
6; store group of factor 2.
7
8; int AB[1024];
9; int CD[1024];
10; void test_array_load2_store2(int C, int D) {
11; for (int i = 0; i < 1024; i+=2) {
12; int A = AB[i];
13; int B = AB[i+1];
14; CD[i] = A + C;
15; CD[i+1] = B * D;
16; }
17; }
18
19; CHECK-LABEL: @test_array_load2_store2(
20; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
21; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
22; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
23; CHECK: add nsw <4 x i32>
24; CHECK: mul nsw <4 x i32>
25; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
26; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %{{.*}}, align 4
27
28@AB = common global [1024 x i32] zeroinitializer, align 4
29@CD = common global [1024 x i32] zeroinitializer, align 4
30
31define void @test_array_load2_store2(i32 %C, i32 %D) {
32entry:
33 br label %for.body
34
35for.body: ; preds = %for.body, %entry
36 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
37 %arrayidx0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %indvars.iv
38 %tmp = load i32, i32* %arrayidx0, align 4
39 %tmp1 = or i64 %indvars.iv, 1
40 %arrayidx1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @AB, i64 0, i64 %tmp1
41 %tmp2 = load i32, i32* %arrayidx1, align 4
42 %add = add nsw i32 %tmp, %C
43 %mul = mul nsw i32 %tmp2, %D
44 %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %indvars.iv
45 store i32 %add, i32* %arrayidx2, align 4
46 %arrayidx3 = getelementptr inbounds [1024 x i32], [1024 x i32]* @CD, i64 0, i64 %tmp1
47 store i32 %mul, i32* %arrayidx3, align 4
48 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
49 %cmp = icmp slt i64 %indvars.iv.next, 1024
50 br i1 %cmp, label %for.body, label %for.end
51
52for.end: ; preds = %for.body
53 ret void
54}
55
56; int A[3072];
57; struct ST S[1024];
58; void test_struct_st3() {
59; int *ptr = A;
60; for (int i = 0; i < 1024; i++) {
61; int X1 = *ptr++;
62; int X2 = *ptr++;
63; int X3 = *ptr++;
64; T[i].x = X1 + 1;
65; T[i].y = X2 + 2;
66; T[i].z = X3 + 3;
67; }
68; }
69
70; CHECK-LABEL: @test_struct_array_load3_store3(
71; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4
72; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
73; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
74; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
75; CHECK: add nsw <4 x i32> {{.*}}, <i32 1, i32 1, i32 1, i32 1>
76; CHECK: add nsw <4 x i32> {{.*}}, <i32 2, i32 2, i32 2, i32 2>
77; CHECK: add nsw <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
78; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
79; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
80; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
81; CHECK: store <12 x i32> %interleaved.vec, <12 x i32>* {{.*}}, align 4
82
83%struct.ST3 = type { i32, i32, i32 }
84@A = common global [3072 x i32] zeroinitializer, align 4
85@S = common global [1024 x %struct.ST3] zeroinitializer, align 4
86
87define void @test_struct_array_load3_store3() {
88entry:
89 br label %for.body
90
91for.body: ; preds = %for.body, %entry
92 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
93 %ptr.016 = phi i32* [ getelementptr inbounds ([3072 x i32], [3072 x i32]* @A, i64 0, i64 0), %entry ], [ %incdec.ptr2, %for.body ]
94 %incdec.ptr = getelementptr inbounds i32, i32* %ptr.016, i64 1
95 %tmp = load i32, i32* %ptr.016, align 4
96 %incdec.ptr1 = getelementptr inbounds i32, i32* %ptr.016, i64 2
97 %tmp1 = load i32, i32* %incdec.ptr, align 4
98 %incdec.ptr2 = getelementptr inbounds i32, i32* %ptr.016, i64 3
99 %tmp2 = load i32, i32* %incdec.ptr1, align 4
100 %add = add nsw i32 %tmp, 1
101 %x = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 0
102 store i32 %add, i32* %x, align 4
103 %add3 = add nsw i32 %tmp1, 2
104 %y = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 1
105 store i32 %add3, i32* %y, align 4
106 %add6 = add nsw i32 %tmp2, 3
107 %z = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 %indvars.iv, i32 2
108 store i32 %add6, i32* %z, align 4
109 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
110 %exitcond = icmp eq i64 %indvars.iv.next, 1024
111 br i1 %exitcond, label %for.end, label %for.body
112
113for.end: ; preds = %for.body
114 ret void
115}
116
117; Check vectorization on an interleaved load group of factor 4.
118
119; struct ST4{
120; int x;
121; int y;
122; int z;
123; int w;
124; };
125; int test_struct_load4(struct ST4 *S) {
126; int r = 0;
127; for (int i = 0; i < 1024; i++) {
128; r += S[i].x;
129; r -= S[i].y;
130; r += S[i].z;
131; r -= S[i].w;
132; }
133; return r;
134; }
135
Eric Christophercee313d2019-04-17 04:52:47 +0000136%struct.ST4 = type { i32, i32, i32, i32 }
137
138define i32 @test_struct_load4(%struct.ST4* nocapture readonly %S) {
Sanjay Patelf5cfcc42020-05-26 14:30:48 -0400139; CHECK-LABEL: @test_struct_load4(
140; CHECK-NEXT: entry:
141; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
142; CHECK: vector.ph:
143; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
144; CHECK: vector.body:
145; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
146; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
147; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], %struct.ST4* [[S:%.*]], i64 [[INDEX]], i32 0
148; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>*
149; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 4
150; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
151; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
152; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
153; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
154; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_PHI]]
Sanjay Patel1a2bffa2020-05-26 14:32:57 -0400155; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[STRIDED_VEC2]]
156; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC3]]
157; CHECK-NEXT: [[TMP5]] = sub <4 x i32> [[TMP3]], [[TMP4]]
Sanjay Patelf5cfcc42020-05-26 14:30:48 -0400158; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
159; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
160; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
161; CHECK: middle.block:
162; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
163; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP5]], [[RDX_SHUF]]
164; CHECK-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
165; CHECK-NEXT: [[BIN_RDX5:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF4]]
166; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[BIN_RDX5]], i32 0
167; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
168; CHECK: scalar.ph:
169; CHECK-NEXT: br label [[FOR_BODY:%.*]]
170; CHECK: for.body:
171; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !7
172; CHECK: for.end:
173; CHECK-NEXT: [[SUB8_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
174; CHECK-NEXT: ret i32 [[SUB8_LCSSA]]
175;
Eric Christophercee313d2019-04-17 04:52:47 +0000176entry:
177 br label %for.body
178
179for.body: ; preds = %for.body, %entry
180 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
181 %r.022 = phi i32 [ 0, %entry ], [ %sub8, %for.body ]
182 %x = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 0
183 %tmp = load i32, i32* %x, align 4
184 %add = add nsw i32 %tmp, %r.022
185 %y = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 1
186 %tmp1 = load i32, i32* %y, align 4
187 %sub = sub i32 %add, %tmp1
188 %z = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 2
189 %tmp2 = load i32, i32* %z, align 4
190 %add5 = add nsw i32 %sub, %tmp2
191 %w = getelementptr inbounds %struct.ST4, %struct.ST4* %S, i64 %indvars.iv, i32 3
192 %tmp3 = load i32, i32* %w, align 4
193 %sub8 = sub i32 %add5, %tmp3
194 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
195 %exitcond = icmp eq i64 %indvars.iv.next, 1024
196 br i1 %exitcond, label %for.end, label %for.body
197
198for.end: ; preds = %for.body
199 ret i32 %sub8
200}
201
202; Check vectorization on an interleaved store group of factor 4.
203
204; void test_struct_store4(int *A, struct ST4 *B) {
205; int *ptr = A;
206; for (int i = 0; i < 1024; i++) {
207; int X = *ptr++;
208; B[i].x = X + 1;
209; B[i].y = X * 2;
210; B[i].z = X + 3;
211; B[i].w = X + 4;
212; }
213; }
214
215; CHECK-LABEL: @test_struct_store4(
Sanjay Patelf5cfcc42020-05-26 14:30:48 -0400216; CHECK: %[[LD:.*]] = load <4 x i32>, <4 x i32>*
Eric Christophercee313d2019-04-17 04:52:47 +0000217; CHECK: add nsw <4 x i32> %[[LD]], <i32 1, i32 1, i32 1, i32 1>
218; CHECK: shl nsw <4 x i32> %[[LD]], <i32 1, i32 1, i32 1, i32 1>
219; CHECK: add nsw <4 x i32> %[[LD]], <i32 3, i32 3, i32 3, i32 3>
220; CHECK: add nsw <4 x i32> %[[LD]], <i32 4, i32 4, i32 4, i32 4>
221; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
222; CHECK: shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
223; CHECK: %interleaved.vec = shufflevector <8 x i32> {{.*}}, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
224; CHECK: store <16 x i32> %interleaved.vec, <16 x i32>* {{.*}}, align 4
225
226define void @test_struct_store4(i32* noalias nocapture readonly %A, %struct.ST4* noalias nocapture %B) {
227entry:
228 br label %for.body
229
230for.cond.cleanup: ; preds = %for.body
231 ret void
232
233for.body: ; preds = %for.body, %entry
234 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
235 %ptr.024 = phi i32* [ %A, %entry ], [ %incdec.ptr, %for.body ]
236 %incdec.ptr = getelementptr inbounds i32, i32* %ptr.024, i64 1
237 %tmp = load i32, i32* %ptr.024, align 4
238 %add = add nsw i32 %tmp, 1
239 %x = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 0
240 store i32 %add, i32* %x, align 4
241 %mul = shl nsw i32 %tmp, 1
242 %y = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 1
243 store i32 %mul, i32* %y, align 4
244 %add3 = add nsw i32 %tmp, 3
245 %z = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 2
246 store i32 %add3, i32* %z, align 4
247 %add6 = add nsw i32 %tmp, 4
248 %w = getelementptr inbounds %struct.ST4, %struct.ST4* %B, i64 %indvars.iv, i32 3
249 store i32 %add6, i32* %w, align 4
250 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
251 %exitcond = icmp eq i64 %indvars.iv.next, 1024
252 br i1 %exitcond, label %for.cond.cleanup, label %for.body
253}
254
255; Check vectorization on a reverse interleaved load group of factor 2 and
256; a reverse interleaved store group of factor 2.
257
258; struct ST2 {
259; int x;
260; int y;
261; };
262;
263; void test_reversed_load2_store2(struct ST2 *A, struct ST2 *B) {
264; for (int i = 1023; i >= 0; i--) {
265; int a = A[i].x + i; // interleaved load of index 0
266; int b = A[i].y - i; // interleaved load of index 1
267; B[i].x = a; // interleaved store of index 0
268; B[i].y = b; // interleaved store of index 1
269; }
270; }
271
272; CHECK-LABEL: @test_reversed_load2_store2(
273; CHECK: %[[G0:.+]] = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %offset.idx, i32 0
274; CHECK: %[[G1:.+]] = getelementptr inbounds i32, i32* %[[G0]], i64 -6
275; CHECK: %[[B0:.+]] = bitcast i32* %[[G1]] to <8 x i32>*
276; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %[[B0]], align 4
277; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
278; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
279; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
280; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
281; CHECK: add nsw <4 x i32>
282; CHECK: sub nsw <4 x i32>
283; CHECK: %[[G2:.+]] = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %offset.idx, i32 1
284; CHECK: %[[G3:.+]] = getelementptr inbounds i32, i32* %[[G2]], i64 -7
285; CHECK: %[[B1:.+]] = bitcast i32* %[[G3]] to <8 x i32>*
286; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
287; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
288; CHECK: %interleaved.vec = shufflevector <4 x i32> {{.*}}, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
289; CHECK: store <8 x i32> %interleaved.vec, <8 x i32>* %[[B1]], align 4
290
291%struct.ST2 = type { i32, i32 }
292
293define void @test_reversed_load2_store2(%struct.ST2* noalias nocapture readonly %A, %struct.ST2* noalias nocapture %B) {
294entry:
295 br label %for.body
296
297for.cond.cleanup: ; preds = %for.body
298 ret void
299
300for.body: ; preds = %for.body, %entry
301 %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ]
302 %x = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 0
303 %tmp = load i32, i32* %x, align 4
304 %tmp1 = trunc i64 %indvars.iv to i32
305 %add = add nsw i32 %tmp, %tmp1
306 %y = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %indvars.iv, i32 1
307 %tmp2 = load i32, i32* %y, align 4
308 %sub = sub nsw i32 %tmp2, %tmp1
309 %x5 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 0
310 store i32 %add, i32* %x5, align 4
311 %y8 = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %indvars.iv, i32 1
312 store i32 %sub, i32* %y8, align 4
313 %indvars.iv.next = add nsw i64 %indvars.iv, -1
314 %cmp = icmp sgt i64 %indvars.iv, 0
315 br i1 %cmp, label %for.body, label %for.cond.cleanup
316}
317
318; Check vectorization on an interleaved load group of factor 2 with 1 gap
319; (missing the load of odd elements). Because the vectorized loop would
320; speculatively access memory out-of-bounds, we must execute at least one
321; iteration of the scalar loop.
322
323; void even_load_static_tc(int *A, int *B) {
324; for (unsigned i = 0; i < 1024; i+=2)
325; B[i/2] = A[i] * 2;
326; }
327
328; CHECK-LABEL: @even_load_static_tc(
329; CHECK: vector.body:
330; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
331; CHECK: %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
332; CHECK: icmp eq i64 %index.next, 508
333; CHECK: middle.block:
334; CHECK: br i1 false, label %for.cond.cleanup, label %scalar.ph
335
336define void @even_load_static_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
337entry:
338 br label %for.body
339
340for.cond.cleanup: ; preds = %for.body
341 ret void
342
343for.body: ; preds = %for.body, %entry
344 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
345 %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
346 %tmp = load i32, i32* %arrayidx, align 4
347 %mul = shl nsw i32 %tmp, 1
348 %tmp1 = lshr exact i64 %indvars.iv, 1
349 %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1
350 store i32 %mul, i32* %arrayidx2, align 4
351 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
352 %cmp = icmp ult i64 %indvars.iv.next, 1024
353 br i1 %cmp, label %for.body, label %for.cond.cleanup
354}
355
356; Check vectorization on an interleaved load group of factor 2 with 1 gap
357; (missing the load of odd elements). Because the vectorized loop would
358; speculatively access memory out-of-bounds, we must execute at least one
359; iteration of the scalar loop.
360
361; void even_load_dynamic_tc(int *A, int *B, unsigned N) {
362; for (unsigned i = 0; i < N; i+=2)
363; B[i/2] = A[i] * 2;
364; }
365
366; CHECK-LABEL: @even_load_dynamic_tc(
367; CHECK: vector.ph:
368; CHECK: %n.mod.vf = and i64 %[[N:[a-zA-Z0-9]+]], 3
369; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
370; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
371; CHECK: %n.vec = sub i64 %[[N]], %[[R]]
372; CHECK: vector.body:
373; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
374; CHECK: %strided.vec = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
375; CHECK: icmp eq i64 %index.next, %n.vec
376; CHECK: middle.block:
377; CHECK: br i1 false, label %for.cond.cleanup, label %scalar.ph
378
379define void @even_load_dynamic_tc(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i64 %N) {
380entry:
381 br label %for.body
382
383for.cond.cleanup: ; preds = %for.body
384 ret void
385
386for.body: ; preds = %for.body, %entry
387 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
388 %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
389 %tmp = load i32, i32* %arrayidx, align 4
390 %mul = shl nsw i32 %tmp, 1
391 %tmp1 = lshr exact i64 %indvars.iv, 1
392 %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %tmp1
393 store i32 %mul, i32* %arrayidx2, align 4
394 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
395 %cmp = icmp ult i64 %indvars.iv.next, %N
396 br i1 %cmp, label %for.body, label %for.cond.cleanup
397}
398
399; Check vectorization on a reverse interleaved load group of factor 2 with 1
400; gap and a reverse interleaved store group of factor 2. The interleaved load
401; group should be removed since it has a gap and is reverse.
402
403; struct pair {
404; int x;
405; int y;
406; };
407;
408; void load_gap_reverse(struct pair *P1, struct pair *P2, int X) {
409; for (int i = 1023; i >= 0; i--) {
410; int a = X + i;
411; int b = A[i].y - i;
412; B[i].x = a;
413; B[i].y = b;
414; }
415; }
416
417; CHECK-LABEL: @load_gap_reverse(
418; CHECK-NOT: %wide.vec = load <8 x i64>, <8 x i64>* %{{.*}}, align 8
419; CHECK-NOT: %strided.vec = shufflevector <8 x i64> %wide.vec, <8 x i64> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
420
421%pair = type { i64, i64 }
422define void @load_gap_reverse(%pair* noalias nocapture readonly %P1, %pair* noalias nocapture readonly %P2, i64 %X) {
423entry:
424 br label %for.body
425
426for.body:
427 %i = phi i64 [ 1023, %entry ], [ %i.next, %for.body ]
428 %0 = add nsw i64 %X, %i
429 %1 = getelementptr inbounds %pair, %pair* %P1, i64 %i, i32 0
430 %2 = getelementptr inbounds %pair, %pair* %P2, i64 %i, i32 1
431 %3 = load i64, i64* %2, align 8
432 %4 = sub nsw i64 %3, %i
433 store i64 %0, i64* %1, align 8
434 store i64 %4, i64* %2, align 8
435 %i.next = add nsw i64 %i, -1
436 %cond = icmp sgt i64 %i, 0
437 br i1 %cond, label %for.body, label %for.exit
438
439for.exit:
440 ret void
441}
442
443; Check vectorization on interleaved access groups identified from mixed
444; loads/stores.
445; void mixed_load2_store2(int *A, int *B) {
446; for (unsigned i = 0; i < 1024; i+=2) {
447; B[i] = A[i] * A[i+1];
448; B[i+1] = A[i] + A[i+1];
449; }
450; }
451
452; CHECK-LABEL: @mixed_load2_store2(
453; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* {{.*}}, align 4
454; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
455; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
456; CHECK: %interleaved.vec = shufflevector <4 x i32> %{{.*}}, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
457; CHECK: store <8 x i32> %interleaved.vec
458
459define void @mixed_load2_store2(i32* noalias nocapture readonly %A, i32* noalias nocapture %B) {
460entry:
461 br label %for.body
462
463for.cond.cleanup: ; preds = %for.body
464 ret void
465
466for.body: ; preds = %for.body, %entry
467 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
468 %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
469 %tmp = load i32, i32* %arrayidx, align 4
470 %tmp1 = or i64 %indvars.iv, 1
471 %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %tmp1
472 %tmp2 = load i32, i32* %arrayidx2, align 4
473 %mul = mul nsw i32 %tmp2, %tmp
474 %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
475 store i32 %mul, i32* %arrayidx4, align 4
476 %tmp3 = load i32, i32* %arrayidx, align 4
477 %tmp4 = load i32, i32* %arrayidx2, align 4
478 %add10 = add nsw i32 %tmp4, %tmp3
479 %arrayidx13 = getelementptr inbounds i32, i32* %B, i64 %tmp1
480 store i32 %add10, i32* %arrayidx13, align 4
481 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
482 %cmp = icmp ult i64 %indvars.iv.next, 1024
483 br i1 %cmp, label %for.body, label %for.cond.cleanup
484}
485
486; Check vectorization on interleaved access groups identified from mixed
487; loads/stores.
488; void mixed_load3_store3(int *A) {
489; for (unsigned i = 0; i < 1024; i++) {
490; *A++ += i;
491; *A++ += i;
492; *A++ += i;
493; }
494; }
495
496; CHECK-LABEL: @mixed_load3_store3(
497; CHECK: %wide.vec = load <12 x i32>, <12 x i32>* {{.*}}, align 4
498; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
499; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
500; CHECK: shufflevector <12 x i32> %wide.vec, <12 x i32> undef, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
501; CHECK: %interleaved.vec = shufflevector <8 x i32> %{{.*}}, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
502; CHECK: store <12 x i32> %interleaved.vec, <12 x i32>* %{{.*}}, align 4
503
504define void @mixed_load3_store3(i32* nocapture %A) {
505entry:
506 br label %for.body
507
508for.cond.cleanup: ; preds = %for.body
509 ret void
510
511for.body: ; preds = %for.body, %entry
512 %i.013 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
513 %A.addr.012 = phi i32* [ %A, %entry ], [ %incdec.ptr3, %for.body ]
514 %incdec.ptr = getelementptr inbounds i32, i32* %A.addr.012, i64 1
515 %tmp = load i32, i32* %A.addr.012, align 4
516 %add = add i32 %tmp, %i.013
517 store i32 %add, i32* %A.addr.012, align 4
518 %incdec.ptr1 = getelementptr inbounds i32, i32* %A.addr.012, i64 2
519 %tmp1 = load i32, i32* %incdec.ptr, align 4
520 %add2 = add i32 %tmp1, %i.013
521 store i32 %add2, i32* %incdec.ptr, align 4
522 %incdec.ptr3 = getelementptr inbounds i32, i32* %A.addr.012, i64 3
523 %tmp2 = load i32, i32* %incdec.ptr1, align 4
524 %add4 = add i32 %tmp2, %i.013
525 store i32 %add4, i32* %incdec.ptr1, align 4
526 %inc = add nuw nsw i32 %i.013, 1
527 %exitcond = icmp eq i32 %inc, 1024
528 br i1 %exitcond, label %for.cond.cleanup, label %for.body
529}
530
531; Check vectorization on interleaved access groups with members having different
532; kinds of type.
533
534; struct IntFloat {
535; int a;
536; float b;
537; };
Sanjay Patelf5cfcc42020-05-26 14:30:48 -0400538;
Eric Christophercee313d2019-04-17 04:52:47 +0000539; int SA;
540; float SB;
541;
542; void int_float_struct(struct IntFloat *A) {
543; int SumA;
544; float SumB;
545; for (unsigned i = 0; i < 1024; i++) {
546; SumA += A[i].a;
547; SumB += A[i].b;
548; }
549; SA = SumA;
550; SB = SumB;
551; }
552
553; CHECK-LABEL: @int_float_struct(
554; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %{{.*}}, align 4
555; CHECK: %[[V0:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
556; CHECK: %[[V1:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
557; CHECK: bitcast <4 x i32> %[[V1]] to <4 x float>
Ayal Zakse498be52019-12-20 00:04:49 +0200558; CHECK: add <4 x i32>
Eric Christophercee313d2019-04-17 04:52:47 +0000559; CHECK: fadd fast <4 x float>
560
561%struct.IntFloat = type { i32, float }
562
563@SA = common global i32 0, align 4
564@SB = common global float 0.000000e+00, align 4
565
566define void @int_float_struct(%struct.IntFloat* nocapture readonly %A) #0 {
567entry:
568 br label %for.body
569
570for.cond.cleanup: ; preds = %for.body
571 store i32 %add, i32* @SA, align 4
572 store float %add3, float* @SB, align 4
573 ret void
574
575for.body: ; preds = %for.body, %entry
576 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
577 %SumB.014 = phi float [ undef, %entry ], [ %add3, %for.body ]
578 %SumA.013 = phi i32 [ undef, %entry ], [ %add, %for.body ]
579 %a = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 0
580 %tmp = load i32, i32* %a, align 4
581 %add = add nsw i32 %tmp, %SumA.013
582 %b = getelementptr inbounds %struct.IntFloat, %struct.IntFloat* %A, i64 %indvars.iv, i32 1
583 %tmp1 = load float, float* %b, align 4
584 %add3 = fadd fast float %SumB.014, %tmp1
585 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
586 %exitcond = icmp eq i64 %indvars.iv.next, 1024
587 br i1 %exitcond, label %for.cond.cleanup, label %for.body
588}
589
590; Check vectorization of interleaved access groups in the presence of
591; dependences (PR27626). The following tests check that we don't reorder
592; dependent loads and stores when generating code for interleaved access
593; groups. Stores should be scalarized because the required code motion would
594; break dependences, and the remaining interleaved load groups should have
595; gaps.
596
597; PR27626_0: Ensure a strided store is not moved after a dependent (zero
598; distance) strided load.
599
600; void PR27626_0(struct pair *p, int z, int n) {
601; for (int i = 0; i < n; i++) {
602; p[i].x = z;
603; p[i].y = p[i].x;
604; }
605; }
606
607; CHECK-LABEL: @PR27626_0(
608; CHECK: vector.ph:
609; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
610; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
611; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
612; CHECK: %n.vec = sub nsw i64 %[[N]], %[[R]]
613; CHECK: vector.body:
614; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
615; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
616; CHECK: store i32 %[[X1]], {{.*}}
617; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
618; CHECK: store i32 %[[X2]], {{.*}}
619; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
620; CHECK: store i32 %[[X3]], {{.*}}
621; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
622; CHECK: store i32 %[[X4]], {{.*}}
623
624%pair.i32 = type { i32, i32 }
625define void @PR27626_0(%pair.i32 *%p, i32 %z, i64 %n) {
626entry:
627 br label %for.body
628
629for.body:
630 %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
631 %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
632 %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
633 store i32 %z, i32* %p_i.x, align 4
634 %0 = load i32, i32* %p_i.x, align 4
635 store i32 %0, i32 *%p_i.y, align 4
636 %i.next = add nuw nsw i64 %i, 1
637 %cond = icmp slt i64 %i.next, %n
638 br i1 %cond, label %for.body, label %for.end
639
640for.end:
641 ret void
642}
643
644; PR27626_1: Ensure a strided load is not moved before a dependent (zero
645; distance) strided store.
646
647; void PR27626_1(struct pair *p, int n) {
648; int s = 0;
649; for (int i = 0; i < n; i++) {
650; p[i].y = p[i].x;
651; s += p[i].y
652; }
653; }
654
655; CHECK-LABEL: @PR27626_1(
656; CHECK: vector.ph:
657; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
658; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
659; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
660; CHECK: %n.vec = sub nsw i64 %[[N]], %[[R]]
661; CHECK: vector.body:
662; CHECK: %[[Phi:.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ {{.*}}, %vector.body ]
663; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
664; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 0
665; CHECK: store i32 %[[X1:.+]], {{.*}}
666; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 2
667; CHECK: store i32 %[[X2:.+]], {{.*}}
668; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 4
669; CHECK: store i32 %[[X3:.+]], {{.*}}
670; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 6
671; CHECK: store i32 %[[X4:.+]], {{.*}}
672; CHECK: %[[L2:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
673; CHECK: %[[S1:.+]] = shufflevector <8 x i32> %[[L2]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
Ayal Zakse498be52019-12-20 00:04:49 +0200674; CHECK: add <4 x i32> %[[S1]], %[[Phi]]
Eric Christophercee313d2019-04-17 04:52:47 +0000675
676define i32 @PR27626_1(%pair.i32 *%p, i64 %n) {
677entry:
678 br label %for.body
679
680for.body:
681 %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
682 %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
683 %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
684 %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
685 %0 = load i32, i32* %p_i.x, align 4
686 store i32 %0, i32* %p_i.y, align 4
687 %1 = load i32, i32* %p_i.y, align 4
688 %2 = add nsw i32 %1, %s
689 %i.next = add nuw nsw i64 %i, 1
690 %cond = icmp slt i64 %i.next, %n
691 br i1 %cond, label %for.body, label %for.end
692
693for.end:
694 %3 = phi i32 [ %2, %for.body ]
695 ret i32 %3
696}
697
698; PR27626_2: Ensure a strided store is not moved after a dependent (negative
699; distance) strided load.
700
701; void PR27626_2(struct pair *p, int z, int n) {
702; for (int i = 0; i < n; i++) {
703; p[i].x = z;
704; p[i].y = p[i - 1].x;
705; }
706; }
707
708; CHECK-LABEL: @PR27626_2(
709; CHECK: vector.ph:
710; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
711; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
712; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
713; CHECK: %n.vec = sub nsw i64 %[[N]], %[[R]]
714; CHECK: vector.body:
715; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
716; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1]], i32 0
717; CHECK: store i32 %[[X1]], {{.*}}
718; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1]], i32 2
719; CHECK: store i32 %[[X2]], {{.*}}
720; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1]], i32 4
721; CHECK: store i32 %[[X3]], {{.*}}
722; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1]], i32 6
723; CHECK: store i32 %[[X4]], {{.*}}
724
725define void @PR27626_2(%pair.i32 *%p, i64 %n, i32 %z) {
726entry:
727 br label %for.body
728
729for.body:
730 %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
731 %i_minus_1 = add nuw nsw i64 %i, -1
732 %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
733 %p_i_minus_1.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_minus_1, i32 0
734 %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
735 store i32 %z, i32* %p_i.x, align 4
736 %0 = load i32, i32* %p_i_minus_1.x, align 4
737 store i32 %0, i32 *%p_i.y, align 4
738 %i.next = add nuw nsw i64 %i, 1
739 %cond = icmp slt i64 %i.next, %n
740 br i1 %cond, label %for.body, label %for.end
741
742for.end:
743 ret void
744}
745
746; PR27626_3: Ensure a strided load is not moved before a dependent (negative
747; distance) strided store.
748
749; void PR27626_3(struct pair *p, int z, int n) {
750; for (int i = 0; i < n; i++) {
751; p[i + 1].y = p[i].x;
752; s += p[i].y;
753; }
754; }
755
756; CHECK-LABEL: @PR27626_3(
757; CHECK: vector.ph:
758; CHECK: %n.mod.vf = and i64 %[[N:.+]], 3
759; CHECK: %[[IsZero:[a-zA-Z0-9]+]] = icmp eq i64 %n.mod.vf, 0
760; CHECK: %[[R:[a-zA-Z0-9]+]] = select i1 %[[IsZero]], i64 4, i64 %n.mod.vf
761; CHECK: %n.vec = sub nsw i64 %[[N]], %[[R]]
762; CHECK: vector.body:
763; CHECK: %[[Phi:.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ {{.*}}, %vector.body ]
764; CHECK: %[[L1:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
765; CHECK: %[[X1:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 0
766; CHECK: store i32 %[[X1:.+]], {{.*}}
767; CHECK: %[[X2:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 2
768; CHECK: store i32 %[[X2:.+]], {{.*}}
769; CHECK: %[[X3:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 4
770; CHECK: store i32 %[[X3:.+]], {{.*}}
771; CHECK: %[[X4:.+]] = extractelement <8 x i32> %[[L1:.+]], i32 6
772; CHECK: store i32 %[[X4:.+]], {{.*}}
773; CHECK: %[[L2:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
774; CHECK: %[[S1:.+]] = shufflevector <8 x i32> %[[L2]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
Ayal Zakse498be52019-12-20 00:04:49 +0200775; CHECK: add <4 x i32> %[[S1]], %[[Phi]]
Eric Christophercee313d2019-04-17 04:52:47 +0000776
777define i32 @PR27626_3(%pair.i32 *%p, i64 %n, i32 %z) {
778entry:
779 br label %for.body
780
781for.body:
782 %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
783 %s = phi i32 [ %2, %for.body ], [ 0, %entry ]
784 %i_plus_1 = add nuw nsw i64 %i, 1
785 %p_i.x = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 0
786 %p_i.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i, i32 1
787 %p_i_plus_1.y = getelementptr inbounds %pair.i32, %pair.i32* %p, i64 %i_plus_1, i32 1
788 %0 = load i32, i32* %p_i.x, align 4
789 store i32 %0, i32* %p_i_plus_1.y, align 4
790 %1 = load i32, i32* %p_i.y, align 4
791 %2 = add nsw i32 %1, %s
792 %i.next = add nuw nsw i64 %i, 1
793 %cond = icmp slt i64 %i.next, %n
794 br i1 %cond, label %for.body, label %for.end
795
796for.end:
797 %3 = phi i32 [ %2, %for.body ]
798 ret i32 %3
799}
800
801; PR27626_4: Ensure we form an interleaved group for strided stores in the
802; presence of a write-after-write dependence. We create a group for
803; (2) and (3) while excluding (1).
804
805; void PR27626_4(int *a, int x, int y, int z, int n) {
806; for (int i = 0; i < n; i += 2) {
807; a[i] = x; // (1)
808; a[i] = y; // (2)
809; a[i + 1] = z; // (3)
810; }
811; }
812
813; CHECK-LABEL: @PR27626_4(
814; CHECK: vector.ph:
815; CHECK: %[[INS_Y:.+]] = insertelement <4 x i32> undef, i32 %y, i32 0
816; CHECK: %[[SPLAT_Y:.+]] = shufflevector <4 x i32> %[[INS_Y]], <4 x i32> undef, <4 x i32> zeroinitializer
817; CHECK: %[[INS_Z:.+]] = insertelement <4 x i32> undef, i32 %z, i32 0
818; CHECK: %[[SPLAT_Z:.+]] = shufflevector <4 x i32> %[[INS_Z]], <4 x i32> undef, <4 x i32> zeroinitializer
819; CHECK: vector.body:
820; CHECK: store i32 %x, {{.*}}
821; CHECK: store i32 %x, {{.*}}
822; CHECK: store i32 %x, {{.*}}
823; CHECK: store i32 %x, {{.*}}
824; CHECK: %[[VEC:.+]] = shufflevector <4 x i32> %[[SPLAT_Y]], <4 x i32> %[[SPLAT_Z]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
825; CHECK: store <8 x i32> %[[VEC]], {{.*}}
826
827define void @PR27626_4(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) {
828entry:
829 br label %for.body
830
831for.body:
832 %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
833 %i_plus_1 = add i64 %i, 1
834 %a_i = getelementptr inbounds i32, i32* %a, i64 %i
835 %a_i_plus_1 = getelementptr inbounds i32, i32* %a, i64 %i_plus_1
836 store i32 %x, i32* %a_i, align 4
837 store i32 %y, i32* %a_i, align 4
838 store i32 %z, i32* %a_i_plus_1, align 4
839 %i.next = add nuw nsw i64 %i, 2
840 %cond = icmp slt i64 %i.next, %n
841 br i1 %cond, label %for.body, label %for.end
842
843for.end:
844 ret void
845}
846
847; PR27626_5: Ensure we do not form an interleaved group for strided stores in
848; the presence of a write-after-write dependence.
849
850; void PR27626_5(int *a, int x, int y, int z, int n) {
851; for (int i = 3; i < n; i += 2) {
852; a[i - 1] = x;
853; a[i - 3] = y;
854; a[i] = z;
855; }
856; }
857
858; CHECK-LABEL: @PR27626_5(
859; CHECK: vector.body:
860; CHECK: store i32 %x, {{.*}}
861; CHECK: store i32 %x, {{.*}}
862; CHECK: store i32 %x, {{.*}}
863; CHECK: store i32 %x, {{.*}}
864; CHECK: store i32 %y, {{.*}}
865; CHECK: store i32 %y, {{.*}}
866; CHECK: store i32 %y, {{.*}}
867; CHECK: store i32 %y, {{.*}}
868; CHECK: store i32 %z, {{.*}}
869; CHECK: store i32 %z, {{.*}}
870; CHECK: store i32 %z, {{.*}}
871; CHECK: store i32 %z, {{.*}}
872
873define void @PR27626_5(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) {
874entry:
875 br label %for.body
876
877for.body:
878 %i = phi i64 [ %i.next, %for.body ], [ 3, %entry ]
879 %i_minus_1 = sub i64 %i, 1
880 %i_minus_3 = sub i64 %i_minus_1, 2
881 %a_i = getelementptr inbounds i32, i32* %a, i64 %i
882 %a_i_minus_1 = getelementptr inbounds i32, i32* %a, i64 %i_minus_1
883 %a_i_minus_3 = getelementptr inbounds i32, i32* %a, i64 %i_minus_3
884 store i32 %x, i32* %a_i_minus_1, align 4
885 store i32 %y, i32* %a_i_minus_3, align 4
886 store i32 %z, i32* %a_i, align 4
887 %i.next = add nuw nsw i64 %i, 2
888 %cond = icmp slt i64 %i.next, %n
889 br i1 %cond, label %for.body, label %for.end
890
891for.end:
892 ret void
893}
894
895; PR34743: Ensure that a cast which needs to sink after a load that belongs to
896; an interleaved group, indeeded gets sunk.
897
898; void PR34743(short *a, int *b, int n) {
899; for (int i = 0, iv = 0; iv < n; i++, iv += 2) {
900; b[i] = a[iv] * a[iv+1] * a[iv+2];
901; }
902; }
903
904; CHECK-LABEL: @PR34743(
905; CHECK: vector.body:
906; CHECK: %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ %[[VSHUF1:.+]], %vector.body ]
907; CHECK: %wide.vec = load <8 x i16>
908; CHECK: %[[VSHUF0:.+]] = shufflevector <8 x i16> %wide.vec, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
909; CHECK: %[[VSHUF1:.+]] = shufflevector <8 x i16> %wide.vec, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
910; CHECK: %[[VSHUF:.+]] = shufflevector <4 x i16> %vector.recur, <4 x i16> %[[VSHUF1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
911; CHECK: sext <4 x i16> %[[VSHUF0]] to <4 x i32>
912; CHECK: sext <4 x i16> %[[VSHUF]] to <4 x i32>
913; CHECK: sext <4 x i16> %[[VSHUF1]] to <4 x i32>
914; CHECK: mul nsw <4 x i32>
915; CHECK: mul nsw <4 x i32>
916
917define void @PR34743(i16* %a, i32* %b, i64 %n) {
918entry:
919 %.pre = load i16, i16* %a
920 br label %loop
921
922loop:
923 %0 = phi i16 [ %.pre, %entry ], [ %load2, %loop ]
924 %iv = phi i64 [ 0, %entry ], [ %iv2, %loop ]
925 %i = phi i64 [ 0, %entry ], [ %i1, %loop ]
926 %conv = sext i16 %0 to i32
927 %i1 = add nuw nsw i64 %i, 1
928 %iv1 = add nuw nsw i64 %iv, 1
929 %iv2 = add nuw nsw i64 %iv, 2
930 %gep1 = getelementptr inbounds i16, i16* %a, i64 %iv1
931 %load1 = load i16, i16* %gep1, align 4
932 %conv1 = sext i16 %load1 to i32
933 %gep2 = getelementptr inbounds i16, i16* %a, i64 %iv2
934 %load2 = load i16, i16* %gep2, align 4
935 %conv2 = sext i16 %load2 to i32
936 %mul01 = mul nsw i32 %conv, %conv1
937 %mul012 = mul nsw i32 %mul01, %conv2
938 %arrayidx5 = getelementptr inbounds i32, i32* %b, i64 %i
939 store i32 %mul012, i32* %arrayidx5
940 %exitcond = icmp eq i64 %iv, %n
941 br i1 %exitcond, label %end, label %loop
942
943end:
944 ret void
945}
946
947attributes #0 = { "unsafe-fp-math"="true" }