; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s

; Tests BuildUREMEqFold for 4 x i32 splat vectors with odd divisor.
; See urem-seteq.ll for justification behind constants emitted.
define <4 x i32> @test_urem_odd_vec_i32(<4 x i32> %X) nounwind readnone {
; CHECK-LABEL: test_urem_odd_vec_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, #52429
; CHECK-NEXT:    movk w8, #52428, lsl #16
; CHECK-NEXT:    dup v2.4s, w8
; CHECK-NEXT:    umull2 v3.2d, v0.4s, v2.4s
; CHECK-NEXT:    umull v2.2d, v0.2s, v2.2s
; CHECK-NEXT:    uzp2 v2.4s, v2.4s, v3.4s
; CHECK-NEXT:    movi v1.4s, #5
; CHECK-NEXT:    ushr v2.4s, v2.4s, #2
; CHECK-NEXT:    mls v0.4s, v2.4s, v1.4s
; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
; CHECK-NEXT:    movi v1.4s, #1
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
; IR: per-lane (X % 5) == 0, zero-extended from <4 x i1> to <4 x i32>.
  %urem = urem <4 x i32> %X, <i32 5, i32 5, i32 5, i32 5>
  %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
  %ret = zext <4 x i1> %cmp to <4 x i32>
  ret <4 x i32> %ret
}

; Like test_urem_odd_vec_i32, but with 4 x i16 vectors.
define <4 x i16> @test_urem_odd_vec_i16(<4 x i16> %X) nounwind readnone {
; CHECK-LABEL: test_urem_odd_vec_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w9, #52429
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    umov w8, v0.h[1]
; CHECK-NEXT:    movk w9, #52428, lsl #16
; CHECK-NEXT:    umull x12, w8, w9
; CHECK-NEXT:    lsr x12, x12, #34
; CHECK-NEXT:    umov w10, v0.h[0]
; CHECK-NEXT:    add w12, w12, w12, lsl #2
; CHECK-NEXT:    sub w8, w8, w12
; CHECK-NEXT:    umull x12, w10, w9
; CHECK-NEXT:    lsr x12, x12, #34
; CHECK-NEXT:    umov w11, v0.h[2]
; CHECK-NEXT:    add w12, w12, w12, lsl #2
; CHECK-NEXT:    sub w10, w10, w12
; CHECK-NEXT:    umull x12, w11, w9
; CHECK-NEXT:    lsr x12, x12, #34
; CHECK-NEXT:    add w12, w12, w12, lsl #2
; CHECK-NEXT:    sub w11, w11, w12
; CHECK-NEXT:    umov w12, v0.h[3]
; CHECK-NEXT:    umull x9, w12, w9
; CHECK-NEXT:    lsr x9, x9, #34
; CHECK-NEXT:    fmov s0, w10
; CHECK-NEXT:    add w9, w9, w9, lsl #2
; CHECK-NEXT:    mov v0.h[1], w8
; CHECK-NEXT:    sub w9, w12, w9
; CHECK-NEXT:    mov v0.h[2], w11
; CHECK-NEXT:    mov v0.h[3], w9
; CHECK-NEXT:    cmeq v0.4h, v0.4h, #0
; CHECK-NEXT:    movi v1.4h, #1
; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
; CHECK-NEXT:    ret
; IR: per-lane (X % 5) == 0 on i16 lanes, zero-extended to <4 x i16>.
  %urem = urem <4 x i16> %X, <i16 5, i16 5, i16 5, i16 5>
  %cmp = icmp eq <4 x i16> %urem, <i16 0, i16 0, i16 0, i16 0>
  %ret = zext <4 x i1> %cmp to <4 x i16>
  ret <4 x i16> %ret
}

; Tests BuildUREMEqFold for 4 x i32 splat vectors with even divisor.
; The expected behavior is that the fold is _not_ applied
; because it requires a ROTR in the even case, which has to be expanded.
define <4 x i32> @test_urem_even_vec_i32(<4 x i32> %X) nounwind readnone {
; CHECK-LABEL: test_urem_even_vec_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, #9363
; CHECK-NEXT:    movk w8, #37449, lsl #16
; CHECK-NEXT:    ushr v1.4s, v0.4s, #1
; CHECK-NEXT:    dup v3.4s, w8
; CHECK-NEXT:    umull2 v4.2d, v1.4s, v3.4s
; CHECK-NEXT:    umull v1.2d, v1.2s, v3.2s
; CHECK-NEXT:    uzp2 v1.4s, v1.4s, v4.4s
; CHECK-NEXT:    movi v2.4s, #14
; CHECK-NEXT:    ushr v1.4s, v1.4s, #2
; CHECK-NEXT:    mls v0.4s, v1.4s, v2.4s
; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
; CHECK-NEXT:    movi v1.4s, #1
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
; IR: per-lane (X % 14) == 0 with an even divisor, zero-extended to <4 x i32>.
  %urem = urem <4 x i32> %X, <i32 14, i32 14, i32 14, i32 14>
  %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
  %ret = zext <4 x i1> %cmp to <4 x i32>
  ret <4 x i32> %ret
}

; Like test_urem_even_vec_i32, but with 4 x i16 vectors.
; i16 is not legal for ROTR on AArch64, but ROTR also cannot be promoted to i32,
; so this would crash if BuildUREMEqFold was applied.
define <4 x i16> @test_urem_even_vec_i16(<4 x i16> %X) nounwind readnone {
; CHECK-LABEL: test_urem_even_vec_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    umov w8, v0.h[1]
; CHECK-NEXT:    mov w9, #9363
; CHECK-NEXT:    movk w9, #37449, lsl #16
; CHECK-NEXT:    umov w10, v0.h[0]
; CHECK-NEXT:    umov w11, v0.h[2]
; CHECK-NEXT:    umov w12, v0.h[3]
; CHECK-NEXT:    ubfx w13, w8, #1, #15
; CHECK-NEXT:    ubfx w14, w10, #1, #15
; CHECK-NEXT:    ubfx w15, w11, #1, #15
; CHECK-NEXT:    ubfx w16, w12, #1, #15
; CHECK-NEXT:    umull x13, w13, w9
; CHECK-NEXT:    umull x14, w14, w9
; CHECK-NEXT:    umull x15, w15, w9
; CHECK-NEXT:    umull x9, w16, w9
; CHECK-NEXT:    orr w16, wzr, #0xe
; CHECK-NEXT:    lsr x13, x13, #34
; CHECK-NEXT:    msub w8, w13, w16, w8
; CHECK-NEXT:    lsr x13, x14, #34
; CHECK-NEXT:    msub w10, w13, w16, w10
; CHECK-NEXT:    lsr x13, x15, #34
; CHECK-NEXT:    fmov s0, w10
; CHECK-NEXT:    msub w11, w13, w16, w11
; CHECK-NEXT:    lsr x9, x9, #34
; CHECK-NEXT:    mov v0.h[1], w8
; CHECK-NEXT:    msub w9, w9, w16, w12
; CHECK-NEXT:    mov v0.h[2], w11
; CHECK-NEXT:    mov v0.h[3], w9
; CHECK-NEXT:    cmeq v0.4h, v0.4h, #0
; CHECK-NEXT:    movi v1.4h, #1
; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
; CHECK-NEXT:    ret
; IR: per-lane (X % 14) == 0 on i16 lanes, zero-extended to <4 x i16>.
  %urem = urem <4 x i16> %X, <i16 14, i16 14, i16 14, i16 14>
  %cmp = icmp eq <4 x i16> %urem, <i16 0, i16 0, i16 0, i16 0>
  %ret = zext <4 x i1> %cmp to <4 x i16>
  ret <4 x i16> %ret
}

; We should not proceed with this fold if the divisor is 1 or -1
define <4 x i32> @test_urem_one_vec(<4 x i32> %X) nounwind readnone {
; CHECK-LABEL: test_urem_one_vec:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v0.4s, #1
; CHECK-NEXT:    ret
; IR: X % 1 is always 0, so the compare folds to a constant all-ones-lane splat.
  %urem = urem <4 x i32> %X, <i32 1, i32 1, i32 1, i32 1>
  %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
  %ret = zext <4 x i1> %cmp to <4 x i32>
  ret <4 x i32> %ret
}

; BuildUREMEqFold does not work when the only odd factor of the divisor is 1.
; This ensures we don't touch powers of two.
define <4 x i32> @test_urem_pow2_vec(<4 x i32> %X) nounwind readnone {
; CHECK-LABEL: test_urem_pow2_vec:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v1.4s, #15
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
; CHECK-NEXT:    movi v1.4s, #1
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
; IR: per-lane (X % 16) == 0; power-of-two divisor lowers to a mask-and-compare.
  %urem = urem <4 x i32> %X, <i32 16, i32 16, i32 16, i32 16>
  %cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
  %ret = zext <4 x i1> %cmp to <4 x i32>
  ret <4 x i32> %ret
}