blob: 9113f6c2e6385887343991b2089c7ac61d23f6d7 [file] [log] [blame]
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
3
; Kernel test: 64-bit remainder of the kernel arguments %x and %y, stored
; to %out.
; NOTE(review): despite the "srem" in the name, the IR below uses `urem`, and
; the expected output contains no sign handling (compare v_test_srem, which
; derives sign masks from both operands and re-signs the result). If a signed
; remainder is intended, change the opcode and regenerate the assertions with
; utils/update_llc_test_checks.py instead of editing them by hand.
4define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
5; GCN-LABEL: s_test_srem:
6; GCN: ; %bb.0:
7; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
8; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
9; GCN-NEXT: s_mov_b32 s7, 0xf000
10; GCN-NEXT: s_mov_b32 s6, -1
11; GCN-NEXT: v_mov_b32_e32 v0, 0
12; GCN-NEXT: v_mov_b32_e32 v1, 0
13; GCN-NEXT: s_waitcnt lgkmcnt(0)
14; GCN-NEXT: s_mov_b32 s4, s8
15; GCN-NEXT: s_mov_b32 s5, s9
16; GCN-NEXT: v_cvt_f32_u32_e32 v2, s2
17; GCN-NEXT: v_cvt_f32_u32_e32 v3, s3
18; GCN-NEXT: s_sub_u32 s8, 0, s2
19; GCN-NEXT: v_mov_b32_e32 v4, s3
20; GCN-NEXT: v_mov_b32_e32 v5, s11
21; GCN-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
22; GCN-NEXT: s_subb_u32 s9, 0, s3
23; GCN-NEXT: v_rcp_f32_e32 v2, v2
24; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
25; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
26; GCN-NEXT: v_trunc_f32_e32 v3, v3
27; GCN-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3
28; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3
29; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
30; GCN-NEXT: v_mul_lo_u32 v6, s8, v3
31; GCN-NEXT: v_mul_lo_u32 v7, s9, v2
32; GCN-NEXT: v_mul_hi_u32 v8, s8, v2
33; GCN-NEXT: v_mul_lo_u32 v9, s8, v2
34; GCN-NEXT: v_add_i32_e32 v6, vcc, v8, v6
35; GCN-NEXT: v_mul_hi_u32 v8, v2, v9
36; GCN-NEXT: v_mul_hi_u32 v10, v3, v9
37; GCN-NEXT: v_mul_lo_u32 v9, v3, v9
38; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7
39; GCN-NEXT: v_mul_hi_u32 v7, v2, v6
40; GCN-NEXT: v_mul_lo_u32 v11, v2, v6
41; GCN-NEXT: v_mul_hi_u32 v12, v3, v6
42; GCN-NEXT: v_mul_lo_u32 v6, v3, v6
43; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v11
44; GCN-NEXT: v_addc_u32_e32 v7, vcc, v1, v7, vcc
45; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8
46; GCN-NEXT: v_addc_u32_e32 v7, vcc, v7, v10, vcc
47; GCN-NEXT: v_addc_u32_e32 v8, vcc, v12, v0, vcc
48; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6
49; GCN-NEXT: v_addc_u32_e32 v7, vcc, v1, v8, vcc
50; GCN-NEXT: v_add_i32_e64 v2, s[0:1], v2, v6
51; GCN-NEXT: v_addc_u32_e64 v6, vcc, v3, v7, s[0:1]
52; GCN-NEXT: v_mul_hi_u32 v8, s8, v2
53; GCN-NEXT: v_mul_lo_u32 v9, s9, v2
54; GCN-NEXT: v_mul_lo_u32 v10, s8, v2
55; GCN-NEXT: v_mul_lo_u32 v11, s8, v6
56; GCN-NEXT: v_mul_hi_u32 v12, v6, v10
57; GCN-NEXT: v_mul_lo_u32 v13, v6, v10
58; GCN-NEXT: v_mul_hi_u32 v10, v2, v10
59; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v11
60; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8
61; GCN-NEXT: v_mul_hi_u32 v9, v6, v8
62; GCN-NEXT: v_mul_hi_u32 v11, v2, v8
63; GCN-NEXT: v_mul_lo_u32 v14, v2, v8
64; GCN-NEXT: v_mul_lo_u32 v6, v6, v8
65; GCN-NEXT: v_add_i32_e32 v8, vcc, v10, v14
66; GCN-NEXT: v_addc_u32_e32 v10, vcc, v1, v11, vcc
67; GCN-NEXT: v_add_i32_e32 v8, vcc, v13, v8
68; GCN-NEXT: v_addc_u32_e32 v8, vcc, v10, v12, vcc
69; GCN-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc
70; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7
71; GCN-NEXT: v_add_i32_e32 v6, vcc, v8, v6
72; GCN-NEXT: v_addc_u32_e32 v7, vcc, v1, v9, vcc
73; GCN-NEXT: v_addc_u32_e64 v3, vcc, v3, v7, s[0:1]
74; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6
75; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
76; GCN-NEXT: v_mul_hi_u32 v6, s10, v2
77; GCN-NEXT: v_mul_hi_u32 v7, s11, v2
78; GCN-NEXT: v_mul_lo_u32 v2, s11, v2
79; GCN-NEXT: v_mul_hi_u32 v8, s10, v3
80; GCN-NEXT: v_mul_lo_u32 v9, s10, v3
81; GCN-NEXT: v_mul_hi_u32 v10, s11, v3
82; GCN-NEXT: v_mul_lo_u32 v3, s11, v3
83; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v9
84; GCN-NEXT: v_addc_u32_e32 v8, vcc, v1, v8, vcc
85; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6
86; GCN-NEXT: v_addc_u32_e32 v2, vcc, v8, v7, vcc
87; GCN-NEXT: v_addc_u32_e32 v0, vcc, v10, v0, vcc
88; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3
89; GCN-NEXT: v_addc_u32_e32 v0, vcc, v1, v0, vcc
90; GCN-NEXT: v_mul_hi_u32 v1, s2, v2
91; GCN-NEXT: v_mul_lo_u32 v3, s3, v2
92; GCN-NEXT: v_mul_lo_u32 v2, s2, v2
93; GCN-NEXT: v_mul_lo_u32 v0, s2, v0
94; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0
95; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3
96; GCN-NEXT: v_sub_i32_e32 v1, vcc, s11, v0
97; GCN-NEXT: v_sub_i32_e32 v2, vcc, s10, v2
98; GCN-NEXT: v_subb_u32_e64 v1, s[0:1], v1, v4, vcc
99; GCN-NEXT: v_subb_u32_e32 v0, vcc, v5, v0, vcc
100; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v2
101; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
102; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s2, v2
103; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v1, v4, vcc
104; GCN-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
105; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v5
106; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
107; GCN-NEXT: v_subrev_i32_e32 v7, vcc, s2, v5
108; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v0
109; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1]
110; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1
111; GCN-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1]
112; GCN-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
113; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s3, v0
114; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc
115; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s3, v1
116; GCN-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc
117; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
118; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
119; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3
120; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1]
121; GCN-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc
122; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1]
123; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
124; GCN-NEXT: s_endpgm
125 %result = urem i64 %x, %y
126 store i64 %result, i64 addrspace(1)* %out
127 ret void
128}
129
; Non-kernel function test: signed 64-bit remainder of the arguments %x and
; %y, returned directly.
; The expected output derives a sign mask from the high dword of each operand
; (v_ashrrev_i32 by 31), applies it with add/xor before the unsigned
; expansion, and re-signs the result with xor/sub using the mask taken from
; %x (v7).
130define i64 @v_test_srem(i64 %x, i64 %y) {
131; GCN-LABEL: v_test_srem:
132; GCN: ; %bb.0:
133; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
134; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v3
135; GCN-NEXT: v_mov_b32_e32 v5, 0
136; GCN-NEXT: v_mov_b32_e32 v6, 0
137; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v1
138; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
139; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc
140; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v7
141; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc
142; GCN-NEXT: v_xor_b32_e32 v3, v3, v4
143; GCN-NEXT: v_xor_b32_e32 v2, v2, v4
144; GCN-NEXT: v_xor_b32_e32 v1, v1, v7
145; GCN-NEXT: v_xor_b32_e32 v0, v0, v7
146; GCN-NEXT: v_cvt_f32_u32_e32 v4, v2
147; GCN-NEXT: v_cvt_f32_u32_e32 v8, v3
148; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v2
149; GCN-NEXT: v_subb_u32_e32 v10, vcc, 0, v3, vcc
150; GCN-NEXT: v_mac_f32_e32 v4, 0x4f800000, v8
151; GCN-NEXT: v_rcp_f32_e32 v4, v4
152; GCN-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
153; GCN-NEXT: v_mul_f32_e32 v8, 0x2f800000, v4
154; GCN-NEXT: v_trunc_f32_e32 v8, v8
155; GCN-NEXT: v_mac_f32_e32 v4, 0xcf800000, v8
156; GCN-NEXT: v_cvt_u32_f32_e32 v8, v8
157; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4
158; GCN-NEXT: v_mul_lo_u32 v11, v9, v8
159; GCN-NEXT: v_mul_lo_u32 v12, v10, v4
160; GCN-NEXT: v_mul_hi_u32 v13, v9, v4
161; GCN-NEXT: v_mul_lo_u32 v14, v9, v4
162; GCN-NEXT: v_add_i32_e32 v11, vcc, v13, v11
163; GCN-NEXT: v_mul_hi_u32 v13, v4, v14
164; GCN-NEXT: v_mul_hi_u32 v15, v8, v14
165; GCN-NEXT: v_mul_lo_u32 v14, v8, v14
166; GCN-NEXT: v_add_i32_e32 v11, vcc, v11, v12
167; GCN-NEXT: v_mul_hi_u32 v12, v4, v11
168; GCN-NEXT: v_mul_lo_u32 v16, v4, v11
169; GCN-NEXT: v_mul_hi_u32 v17, v8, v11
170; GCN-NEXT: v_mul_lo_u32 v11, v8, v11
171; GCN-NEXT: v_add_i32_e32 v13, vcc, v13, v16
172; GCN-NEXT: v_addc_u32_e32 v12, vcc, v6, v12, vcc
173; GCN-NEXT: v_add_i32_e32 v13, vcc, v14, v13
174; GCN-NEXT: v_addc_u32_e32 v12, vcc, v12, v15, vcc
175; GCN-NEXT: v_addc_u32_e32 v13, vcc, v17, v5, vcc
176; GCN-NEXT: v_add_i32_e32 v11, vcc, v12, v11
177; GCN-NEXT: v_addc_u32_e32 v12, vcc, v6, v13, vcc
178; GCN-NEXT: v_add_i32_e64 v4, s[4:5], v4, v11
179; GCN-NEXT: v_addc_u32_e64 v11, vcc, v8, v12, s[4:5]
180; GCN-NEXT: v_mul_hi_u32 v13, v9, v4
181; GCN-NEXT: v_mul_lo_u32 v10, v10, v4
182; GCN-NEXT: v_mul_lo_u32 v14, v9, v4
183; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v12
184; GCN-NEXT: v_mul_lo_u32 v9, v9, v11
185; GCN-NEXT: v_mul_hi_u32 v12, v11, v14
186; GCN-NEXT: v_mul_lo_u32 v15, v11, v14
187; GCN-NEXT: v_mul_hi_u32 v14, v4, v14
188; GCN-NEXT: v_add_i32_e32 v9, vcc, v13, v9
189; GCN-NEXT: v_add_i32_e32 v9, vcc, v9, v10
190; GCN-NEXT: v_mul_hi_u32 v10, v11, v9
191; GCN-NEXT: v_mul_hi_u32 v13, v4, v9
192; GCN-NEXT: v_mul_lo_u32 v16, v4, v9
193; GCN-NEXT: v_mul_lo_u32 v9, v11, v9
194; GCN-NEXT: v_add_i32_e32 v11, vcc, v14, v16
195; GCN-NEXT: v_addc_u32_e32 v13, vcc, v6, v13, vcc
196; GCN-NEXT: v_add_i32_e32 v11, vcc, v15, v11
197; GCN-NEXT: v_addc_u32_e32 v11, vcc, v13, v12, vcc
198; GCN-NEXT: v_addc_u32_e32 v10, vcc, v10, v5, vcc
199; GCN-NEXT: v_add_i32_e32 v9, vcc, v11, v9
200; GCN-NEXT: v_addc_u32_e32 v10, vcc, v6, v10, vcc
201; GCN-NEXT: v_addc_u32_e64 v8, vcc, v8, v10, s[4:5]
202; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v9
203; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc
204; GCN-NEXT: v_mul_hi_u32 v9, v0, v4
205; GCN-NEXT: v_mul_hi_u32 v10, v1, v4
206; GCN-NEXT: v_mul_lo_u32 v4, v1, v4
207; GCN-NEXT: v_mul_hi_u32 v11, v0, v8
208; GCN-NEXT: v_mul_lo_u32 v12, v0, v8
209; GCN-NEXT: v_mul_hi_u32 v13, v1, v8
210; GCN-NEXT: v_mul_lo_u32 v8, v1, v8
211; GCN-NEXT: v_add_i32_e32 v9, vcc, v9, v12
212; GCN-NEXT: v_addc_u32_e32 v11, vcc, v6, v11, vcc
213; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v9
214; GCN-NEXT: v_addc_u32_e32 v4, vcc, v11, v10, vcc
215; GCN-NEXT: v_addc_u32_e32 v5, vcc, v13, v5, vcc
216; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v8
217; GCN-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc
218; GCN-NEXT: v_mul_hi_u32 v6, v2, v4
219; GCN-NEXT: v_mul_lo_u32 v8, v3, v4
220; GCN-NEXT: v_mul_lo_u32 v4, v2, v4
221; GCN-NEXT: v_mul_lo_u32 v5, v2, v5
222; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5
223; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v8
224; GCN-NEXT: v_sub_i32_e32 v6, vcc, v1, v5
225; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
226; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v6, v3, vcc
227; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
228; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
229; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
230; GCN-NEXT: v_sub_i32_e32 v6, vcc, v0, v2
231; GCN-NEXT: v_subb_u32_e64 v8, s[4:5], v4, v3, vcc
232; GCN-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
233; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v2
234; GCN-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc
235; GCN-NEXT: v_sub_i32_e32 v2, vcc, v6, v2
236; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3
237; GCN-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
238; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v3
239; GCN-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
240; GCN-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc
241; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3
242; GCN-NEXT: v_cndmask_b32_e32 v5, v10, v5, vcc
243; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
244; GCN-NEXT: v_cndmask_b32_e32 v3, v11, v9, vcc
245; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
246; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v8, vcc
247; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v5
248; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[4:5]
249; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
250; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5]
251; GCN-NEXT: v_xor_b32_e32 v1, v1, v7
252; GCN-NEXT: v_xor_b32_e32 v0, v0, v7
253; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v7
254; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc
255; GCN-NEXT: s_setpc_b64 s[30:31]
256 %result = srem i64 %x, %y
257 ret i64 %result
258}
259
; Kernel test: srem i64 where both operands are first arithmetic-shifted
; right by 41, leaving 23 significant bits each (64 - 41).
; The expected output uses the float-reciprocal sequence (v_cvt_f32_i32,
; v_rcp_iflag_f32, v_mad_f32) and sign-extends the remainder from 23 bits
; with v_bfe_i32 before the 64-bit store.
260define amdgpu_kernel void @s_test_srem23_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
261; GCN-LABEL: s_test_srem23_64:
262; GCN: ; %bb.0:
263; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
264; GCN-NEXT: s_load_dword s9, s[0:1], 0xe
265; GCN-NEXT: s_mov_b32 s3, 0xf000
266; GCN-NEXT: s_mov_b32 s2, -1
267; GCN-NEXT: s_waitcnt lgkmcnt(0)
268; GCN-NEXT: s_mov_b32 s0, s4
269; GCN-NEXT: s_mov_b32 s1, s5
270; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 41
271; GCN-NEXT: s_ashr_i64 s[6:7], s[8:9], 41
272; GCN-NEXT: s_xor_b32 s5, s4, s6
273; GCN-NEXT: v_cvt_f32_i32_e32 v0, s6
274; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4
275; GCN-NEXT: s_ashr_i32 s5, s5, 30
276; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0
277; GCN-NEXT: s_or_b32 s5, s5, 1
278; GCN-NEXT: v_mul_f32_e32 v2, v1, v2
279; GCN-NEXT: v_trunc_f32_e32 v2, v2
280; GCN-NEXT: v_mov_b32_e32 v3, s5
281; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1
282; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0|
283; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc
284; GCN-NEXT: v_cvt_i32_f32_e32 v1, v2
285; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
286; GCN-NEXT: v_mul_lo_u32 v0, v0, s6
287; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
288; GCN-NEXT: v_bfe_i32 v0, v0, 0, 23
289; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
290; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
291; GCN-NEXT: s_endpgm
292 %1 = ashr i64 %x, 41
293 %2 = ashr i64 %y, 41
294 %result = srem i64 %1, %2
295 store i64 %result, i64 addrspace(1)* %out
296 ret void
297}
298
; Kernel test: srem i64 where both operands are first arithmetic-shifted
; right by 40, leaving 24 significant bits each (64 - 40).
; The expected output has the same float-reciprocal shape as the 23-bit test
; (s_test_srem23_64), with the final v_bfe_i32 sign-extending from 24 bits.
299define amdgpu_kernel void @s_test_srem24_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
300; GCN-LABEL: s_test_srem24_64:
301; GCN: ; %bb.0:
302; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
303; GCN-NEXT: s_load_dword s9, s[0:1], 0xe
304; GCN-NEXT: s_mov_b32 s3, 0xf000
305; GCN-NEXT: s_mov_b32 s2, -1
306; GCN-NEXT: s_waitcnt lgkmcnt(0)
307; GCN-NEXT: s_mov_b32 s0, s4
308; GCN-NEXT: s_mov_b32 s1, s5
309; GCN-NEXT: s_ashr_i64 s[4:5], s[6:7], 40
310; GCN-NEXT: s_ashr_i64 s[6:7], s[8:9], 40
311; GCN-NEXT: s_xor_b32 s5, s4, s6
312; GCN-NEXT: v_cvt_f32_i32_e32 v0, s6
313; GCN-NEXT: v_cvt_f32_i32_e32 v1, s4
314; GCN-NEXT: s_ashr_i32 s5, s5, 30
315; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0
316; GCN-NEXT: s_or_b32 s5, s5, 1
317; GCN-NEXT: v_mul_f32_e32 v2, v1, v2
318; GCN-NEXT: v_trunc_f32_e32 v2, v2
319; GCN-NEXT: v_mov_b32_e32 v3, s5
320; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1
321; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0|
322; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc
323; GCN-NEXT: v_cvt_i32_f32_e32 v1, v2
324; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
325; GCN-NEXT: v_mul_lo_u32 v0, v0, s6
326; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
327; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24
328; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
329; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
330; GCN-NEXT: s_endpgm
331 %1 = ashr i64 %x, 40
332 %2 = ashr i64 %y, 40
333 %result = srem i64 %1, %2
334 store i64 %result, i64 addrspace(1)* %out
335 ret void
336}
337
; Non-kernel variant of the 24-bit case: both arguments are
; arithmetic-shifted right by 40 before the srem, and the result is returned
; instead of stored.
; The expected output performs the shifts with v_ashr_i64 and ends with
; v_bfe_i32 (24 bits) plus a v_ashrrev_i32 by 31 that fills the high dword.
338define i64 @v_test_srem24_64(i64 %x, i64 %y) {
339; GCN-LABEL: v_test_srem24_64:
340; GCN: ; %bb.0:
341; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
342; GCN-NEXT: v_ashr_i64 v[0:1], v[0:1], 40
343; GCN-NEXT: v_ashr_i64 v[1:2], v[2:3], 40
344; GCN-NEXT: v_xor_b32_e32 v2, v0, v1
345; GCN-NEXT: v_cvt_f32_i32_e32 v3, v0
346; GCN-NEXT: v_cvt_f32_i32_e32 v4, v1
347; GCN-NEXT: v_ashrrev_i32_e32 v2, 30, v2
348; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v4
349; GCN-NEXT: v_or_b32_e32 v2, 1, v2
350; GCN-NEXT: v_mul_f32_e32 v5, v3, v5
351; GCN-NEXT: v_trunc_f32_e32 v5, v5
352; GCN-NEXT: v_mad_f32 v3, -v5, v4, v3
353; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v4|
354; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
355; GCN-NEXT: v_cvt_i32_f32_e32 v3, v5
356; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2
357; GCN-NEXT: v_mul_lo_u32 v1, v2, v1
358; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
359; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24
360; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
361; GCN-NEXT: s_setpc_b64 s[30:31]
362 %1 = ashr i64 %x, 40
363 %2 = ashr i64 %y, 40
364 %result = srem i64 %1, %2
365 ret i64 %result
366}
367
; Kernel test: srem i64 where both operands are first arithmetic-shifted
; right by 39, leaving 25 significant bits each (64 - 39).
; Unlike the 23- and 24-bit tests, the expected output has no v_bfe_i32
; float path: it takes absolute values with ashr/add/xor and uses the 32-bit
; integer sequence (v_cvt_f32_u32, v_rcp_iflag_f32, v_mul_hi_u32 ...),
; then re-signs the result with the mask in s4 and sign-extends it to 64 bits.
368define amdgpu_kernel void @s_test_srem25_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
369; GCN-LABEL: s_test_srem25_64:
370; GCN: ; %bb.0:
371; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
372; GCN-NEXT: s_load_dword s3, s[0:1], 0xe
373; GCN-NEXT: s_mov_b32 s11, 0xf000
374; GCN-NEXT: s_mov_b32 s10, -1
375; GCN-NEXT: s_waitcnt lgkmcnt(0)
376; GCN-NEXT: s_mov_b32 s8, s4
377; GCN-NEXT: s_mov_b32 s9, s5
378; GCN-NEXT: s_ashr_i64 s[0:1], s[6:7], 39
379; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 39
380; GCN-NEXT: s_ashr_i32 s1, s2, 31
381; GCN-NEXT: s_ashr_i32 s4, s0, 31
382; GCN-NEXT: s_add_i32 s2, s2, s1
383; GCN-NEXT: s_add_i32 s0, s0, s4
384; GCN-NEXT: s_xor_b32 s5, s2, s1
385; GCN-NEXT: s_xor_b32 s2, s0, s4
386; GCN-NEXT: v_cvt_f32_u32_e32 v0, s5
387; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0
388; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
389; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
390; GCN-NEXT: v_mul_hi_u32 v1, v0, s5
391; GCN-NEXT: v_mul_lo_u32 v2, v0, s5
392; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2
393; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1
394; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1]
395; GCN-NEXT: v_mul_hi_u32 v1, v1, v0
396; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0
397; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0
398; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
399; GCN-NEXT: v_mul_hi_u32 v0, v0, s2
400; GCN-NEXT: v_mul_lo_u32 v0, v0, s5
401; GCN-NEXT: v_sub_i32_e32 v1, vcc, s2, v0
402; GCN-NEXT: v_add_i32_e32 v2, vcc, s5, v1
403; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s2, v0
404; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s5, v1
405; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s5, v1
406; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1]
407; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
408; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1]
409; GCN-NEXT: v_xor_b32_e32 v0, s4, v0
410; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0
411; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
412; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
413; GCN-NEXT: s_endpgm
414 %1 = ashr i64 %x, 39
415 %2 = ashr i64 %y, 39
416 %result = srem i64 %1, %2
417 store i64 %result, i64 addrspace(1)* %out
418 ret void
419}
420
; Kernel test: srem i64 where both operands are first arithmetic-shifted
; right by 33, leaving 31 significant bits each (64 - 33).
; The expected output has the same 32-bit integer shape as the 25-bit test
; (s_test_srem25_64); only the s_ashr_i64 shift amount differs.
421define amdgpu_kernel void @s_test_srem31_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
422; GCN-LABEL: s_test_srem31_64:
423; GCN: ; %bb.0:
424; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
425; GCN-NEXT: s_load_dword s3, s[0:1], 0xe
426; GCN-NEXT: s_mov_b32 s11, 0xf000
427; GCN-NEXT: s_mov_b32 s10, -1
428; GCN-NEXT: s_waitcnt lgkmcnt(0)
429; GCN-NEXT: s_mov_b32 s8, s4
430; GCN-NEXT: s_mov_b32 s9, s5
431; GCN-NEXT: s_ashr_i64 s[0:1], s[6:7], 33
432; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 33
433; GCN-NEXT: s_ashr_i32 s1, s2, 31
434; GCN-NEXT: s_ashr_i32 s4, s0, 31
435; GCN-NEXT: s_add_i32 s2, s2, s1
436; GCN-NEXT: s_add_i32 s0, s0, s4
437; GCN-NEXT: s_xor_b32 s5, s2, s1
438; GCN-NEXT: s_xor_b32 s2, s0, s4
439; GCN-NEXT: v_cvt_f32_u32_e32 v0, s5
440; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0
441; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
442; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
443; GCN-NEXT: v_mul_hi_u32 v1, v0, s5
444; GCN-NEXT: v_mul_lo_u32 v2, v0, s5
445; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2
446; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1
447; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1]
448; GCN-NEXT: v_mul_hi_u32 v1, v1, v0
449; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0
450; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0
451; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
452; GCN-NEXT: v_mul_hi_u32 v0, v0, s2
453; GCN-NEXT: v_mul_lo_u32 v0, v0, s5
454; GCN-NEXT: v_sub_i32_e32 v1, vcc, s2, v0
455; GCN-NEXT: v_add_i32_e32 v2, vcc, s5, v1
456; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s2, v0
457; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s5, v1
458; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s5, v1
459; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1]
460; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
461; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1]
462; GCN-NEXT: v_xor_b32_e32 v0, s4, v0
463; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0
464; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
465; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
466; GCN-NEXT: s_endpgm
467 %1 = ashr i64 %x, 33
468 %2 = ashr i64 %y, 33
469 %result = srem i64 %1, %2
470 store i64 %result, i64 addrspace(1)* %out
471 ret void
472}
473
474; 32 significant bits: ashr by 32 leaves 33 known sign bits, so the value fits in i32
; Kernel test: srem i64 where both operands are first arithmetic-shifted
; right by 32, so each operand is the sign-extended high dword of its
; argument.
; The expected output contains no 64-bit shift: it works directly on the high
; dwords (s7 for %x, the dword loaded from offset 0xe for %y) and then
; follows the same 32-bit integer sequence as the 25- and 31-bit tests.
475define amdgpu_kernel void @s_test_srem32_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
476; GCN-LABEL: s_test_srem32_64:
477; GCN: ; %bb.0:
478; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
479; GCN-NEXT: s_load_dword s2, s[0:1], 0xe
480; GCN-NEXT: s_mov_b32 s11, 0xf000
481; GCN-NEXT: s_mov_b32 s10, -1
482; GCN-NEXT: s_waitcnt lgkmcnt(0)
483; GCN-NEXT: s_mov_b32 s8, s4
484; GCN-NEXT: s_mov_b32 s9, s5
485; GCN-NEXT: s_ashr_i32 s0, s2, 31
486; GCN-NEXT: s_ashr_i32 s4, s7, 31
487; GCN-NEXT: s_add_i32 s2, s2, s0
488; GCN-NEXT: s_add_i32 s1, s7, s4
489; GCN-NEXT: s_xor_b32 s5, s2, s0
490; GCN-NEXT: s_xor_b32 s2, s1, s4
491; GCN-NEXT: v_cvt_f32_u32_e32 v0, s5
492; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0
493; GCN-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
494; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
495; GCN-NEXT: v_mul_hi_u32 v1, v0, s5
496; GCN-NEXT: v_mul_lo_u32 v2, v0, s5
497; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v2
498; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1
499; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1]
500; GCN-NEXT: v_mul_hi_u32 v1, v1, v0
501; GCN-NEXT: v_add_i32_e32 v2, vcc, v1, v0
502; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0
503; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
504; GCN-NEXT: v_mul_hi_u32 v0, v0, s2
505; GCN-NEXT: v_mul_lo_u32 v0, v0, s5
506; GCN-NEXT: v_sub_i32_e32 v1, vcc, s2, v0
507; GCN-NEXT: v_add_i32_e32 v2, vcc, s5, v1
508; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], s2, v0
509; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s5, v1
510; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s5, v1
511; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1]
512; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
513; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[0:1]
514; GCN-NEXT: v_xor_b32_e32 v0, s4, v0
515; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0
516; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
517; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
518; GCN-NEXT: s_endpgm
519 %1 = ashr i64 %x, 32
520 %2 = ashr i64 %y, 32
521 %result = srem i64 %1, %2
522 store i64 %result, i64 addrspace(1)* %out
523 ret void
524}
525
526; 33 significant bits: ashr by 31 leaves only 32 known sign bits, so the value does not fit in i32
; Kernel test: srem i64 where both operands are first arithmetic-shifted
; right by 31, leaving 33 significant bits each (64 - 31).
; The expected output is the full 64-bit expansion rather than the 32-bit
; sequence seen in s_test_srem32_64: both operands are shifted with
; s_ashr_i64, made non-negative with add/addc/xor against a sign mask, run
; through the long multiply/compare sequence, and the result is re-signed
; with the mask derived from %x's high dword (s8 / v3).
527define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
528; GCN-LABEL: s_test_srem33_64:
529; GCN: ; %bb.0:
530; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
531; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
532; GCN-NEXT: s_mov_b32 s7, 0xf000
533; GCN-NEXT: s_mov_b32 s6, -1
534; GCN-NEXT: v_mov_b32_e32 v0, 0
535; GCN-NEXT: v_mov_b32_e32 v1, 0
536; GCN-NEXT: s_waitcnt lgkmcnt(0)
537; GCN-NEXT: s_mov_b32 s4, s8
538; GCN-NEXT: s_mov_b32 s5, s9
539; GCN-NEXT: s_ashr_i64 s[0:1], s[10:11], 31
540; GCN-NEXT: s_ashr_i32 s8, s3, 31
541; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 31
542; GCN-NEXT: s_mov_b32 s9, s8
543; GCN-NEXT: s_add_u32 s2, s2, s8
544; GCN-NEXT: s_addc_u32 s3, s3, s8
545; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], s[8:9]
546; GCN-NEXT: v_cvt_f32_u32_e32 v2, s2
547; GCN-NEXT: v_cvt_f32_u32_e32 v3, s3
548; GCN-NEXT: s_sub_u32 s12, 0, s2
549; GCN-NEXT: v_mov_b32_e32 v4, s3
550; GCN-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
551; GCN-NEXT: s_subb_u32 s13, 0, s3
552; GCN-NEXT: s_ashr_i32 s8, s11, 31
553; GCN-NEXT: v_rcp_f32_e32 v2, v2
554; GCN-NEXT: s_mov_b32 s9, s8
555; GCN-NEXT: s_add_u32 s0, s0, s8
556; GCN-NEXT: v_mov_b32_e32 v3, s8
557; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
558; GCN-NEXT: s_addc_u32 s1, s1, s8
559; GCN-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2
560; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9]
561; GCN-NEXT: v_trunc_f32_e32 v5, v5
562; GCN-NEXT: v_mov_b32_e32 v6, s11
563; GCN-NEXT: v_mac_f32_e32 v2, 0xcf800000, v5
564; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5
565; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
566; GCN-NEXT: v_mul_lo_u32 v7, s12, v5
567; GCN-NEXT: v_mul_lo_u32 v8, s13, v2
568; GCN-NEXT: v_mul_hi_u32 v9, s12, v2
569; GCN-NEXT: v_mul_lo_u32 v10, s12, v2
570; GCN-NEXT: v_add_i32_e32 v7, vcc, v9, v7
571; GCN-NEXT: v_mul_hi_u32 v9, v2, v10
572; GCN-NEXT: v_mul_hi_u32 v11, v5, v10
573; GCN-NEXT: v_mul_lo_u32 v10, v5, v10
574; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v8
575; GCN-NEXT: v_mul_hi_u32 v8, v2, v7
576; GCN-NEXT: v_mul_lo_u32 v12, v2, v7
577; GCN-NEXT: v_mul_hi_u32 v13, v5, v7
578; GCN-NEXT: v_mul_lo_u32 v7, v5, v7
579; GCN-NEXT: v_add_i32_e32 v9, vcc, v9, v12
580; GCN-NEXT: v_addc_u32_e32 v8, vcc, v1, v8, vcc
581; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v9
582; GCN-NEXT: v_addc_u32_e32 v8, vcc, v8, v11, vcc
583; GCN-NEXT: v_addc_u32_e32 v9, vcc, v13, v0, vcc
584; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7
585; GCN-NEXT: v_addc_u32_e32 v8, vcc, v1, v9, vcc
586; GCN-NEXT: v_add_i32_e64 v2, s[0:1], v2, v7
587; GCN-NEXT: v_addc_u32_e64 v7, vcc, v5, v8, s[0:1]
588; GCN-NEXT: v_mul_hi_u32 v9, s12, v2
589; GCN-NEXT: v_mul_lo_u32 v10, s13, v2
590; GCN-NEXT: v_mul_lo_u32 v11, s12, v2
591; GCN-NEXT: v_mul_lo_u32 v12, s12, v7
592; GCN-NEXT: v_mul_hi_u32 v13, v7, v11
593; GCN-NEXT: v_mul_lo_u32 v14, v7, v11
594; GCN-NEXT: v_mul_hi_u32 v11, v2, v11
595; GCN-NEXT: v_add_i32_e32 v9, vcc, v9, v12
596; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v9
597; GCN-NEXT: v_mul_hi_u32 v10, v7, v9
598; GCN-NEXT: v_mul_hi_u32 v12, v2, v9
599; GCN-NEXT: v_mul_lo_u32 v15, v2, v9
600; GCN-NEXT: v_mul_lo_u32 v7, v7, v9
601; GCN-NEXT: v_add_i32_e32 v9, vcc, v11, v15
602; GCN-NEXT: v_addc_u32_e32 v11, vcc, v1, v12, vcc
603; GCN-NEXT: v_add_i32_e32 v9, vcc, v14, v9
604; GCN-NEXT: v_addc_u32_e32 v9, vcc, v11, v13, vcc
605; GCN-NEXT: v_addc_u32_e32 v10, vcc, v10, v0, vcc
606; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v8
607; GCN-NEXT: v_add_i32_e32 v7, vcc, v9, v7
608; GCN-NEXT: v_addc_u32_e32 v8, vcc, v1, v10, vcc
609; GCN-NEXT: v_addc_u32_e64 v5, vcc, v5, v8, s[0:1]
610; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v7
611; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
612; GCN-NEXT: v_mul_hi_u32 v7, s10, v2
613; GCN-NEXT: v_mul_hi_u32 v8, s11, v2
614; GCN-NEXT: v_mul_lo_u32 v2, s11, v2
615; GCN-NEXT: v_mul_hi_u32 v9, s10, v5
616; GCN-NEXT: v_mul_lo_u32 v10, s10, v5
617; GCN-NEXT: v_mul_hi_u32 v11, s11, v5
618; GCN-NEXT: v_mul_lo_u32 v5, s11, v5
619; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v10
620; GCN-NEXT: v_addc_u32_e32 v9, vcc, v1, v9, vcc
621; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v7
622; GCN-NEXT: v_addc_u32_e32 v2, vcc, v9, v8, vcc
623; GCN-NEXT: v_addc_u32_e32 v0, vcc, v11, v0, vcc
624; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5
625; GCN-NEXT: v_addc_u32_e32 v0, vcc, v1, v0, vcc
626; GCN-NEXT: v_mul_hi_u32 v1, s2, v2
627; GCN-NEXT: v_mul_lo_u32 v5, s3, v2
628; GCN-NEXT: v_mul_lo_u32 v2, s2, v2
629; GCN-NEXT: v_mul_lo_u32 v0, s2, v0
630; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0
631; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v5
632; GCN-NEXT: v_sub_i32_e32 v1, vcc, s11, v0
633; GCN-NEXT: v_sub_i32_e32 v2, vcc, s10, v2
634; GCN-NEXT: v_subb_u32_e64 v1, s[0:1], v1, v4, vcc
635; GCN-NEXT: v_subb_u32_e32 v0, vcc, v6, v0, vcc
636; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v2
637; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
638; GCN-NEXT: v_subrev_i32_e32 v6, vcc, s2, v2
639; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v1, v4, vcc
640; GCN-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
641; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v6
642; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc
643; GCN-NEXT: v_subrev_i32_e32 v8, vcc, s2, v6
644; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v0
645; GCN-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1]
646; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1
647; GCN-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1]
648; GCN-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
649; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s3, v0
650; GCN-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc
651; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s3, v1
652; GCN-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc
653; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
654; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
655; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5
656; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1]
657; GCN-NEXT: v_cndmask_b32_e32 v1, v6, v8, vcc
658; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[0:1]
659; GCN-NEXT: v_xor_b32_e32 v2, s8, v0
660; GCN-NEXT: v_xor_b32_e32 v0, s8, v1
661; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0
662; GCN-NEXT: v_subb_u32_e32 v1, vcc, v2, v3, vcc
663; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
664; GCN-NEXT: s_endpgm
665 %1 = ashr i64 %x, 31
666 %2 = ashr i64 %y, 31
667 %result = srem i64 %1, %2
668 store i64 %result, i64 addrspace(1)* %out
669 ret void
670}
671
; Kernel test: srem on the non-power-of-two integer type i48, with both
; operands first arithmetic-shifted right by 24 (24 significant bits each).
; The expected output rebuilds each shifted operand from two loaded dwords
; with s_sext_i32_i16 + v_alignbit_b32, uses the float-reciprocal sequence
; ending in v_bfe_i32 (24 bits), and writes the 48-bit result as a dword
; store plus a 16-bit store at offset 4.
672define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48 %y) {
673; GCN-LABEL: s_test_srem24_48:
674; GCN: ; %bb.0:
675; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
676; GCN-NEXT: s_load_dword s2, s[0:1], 0xb
677; GCN-NEXT: s_load_dword s3, s[0:1], 0xc
678; GCN-NEXT: s_load_dword s8, s[0:1], 0xd
679; GCN-NEXT: s_load_dword s9, s[0:1], 0xe
680; GCN-NEXT: s_mov_b32 s7, 0xf000
681; GCN-NEXT: s_mov_b32 s6, -1
682; GCN-NEXT: s_waitcnt lgkmcnt(0)
683; GCN-NEXT: s_sext_i32_i16 s0, s3
684; GCN-NEXT: s_sext_i32_i16 s1, s9
685; GCN-NEXT: v_mov_b32_e32 v0, s8
686; GCN-NEXT: v_mov_b32_e32 v1, s2
687; GCN-NEXT: v_alignbit_b32 v0, s1, v0, 24
688; GCN-NEXT: v_alignbit_b32 v1, s0, v1, 24
689; GCN-NEXT: v_xor_b32_e32 v2, v1, v0
690; GCN-NEXT: v_cvt_f32_i32_e32 v3, v1
691; GCN-NEXT: v_cvt_f32_i32_e32 v4, v0
692; GCN-NEXT: v_ashrrev_i32_e32 v2, 30, v2
693; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v4
694; GCN-NEXT: v_or_b32_e32 v2, 1, v2
695; GCN-NEXT: v_mul_f32_e32 v5, v3, v5
696; GCN-NEXT: v_trunc_f32_e32 v5, v5
697; GCN-NEXT: v_mad_f32 v3, -v5, v4, v3
698; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5
699; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v4|
700; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
701; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5
702; GCN-NEXT: v_mul_lo_u32 v0, v2, v0
703; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0
704; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24
705; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
706; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
707; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
708; GCN-NEXT: s_endpgm
709 %1 = ashr i48 %x, 24
710 %2 = ashr i48 %y, 24
711 %result = srem i48 %1, %2
712 store i48 %result, i48 addrspace(1)* %out
713 ret void
714}
715
716define amdgpu_kernel void @s_test_srem_k_num_i64(i64 addrspace(1)* %out, i64 %x) {
717; GCN-LABEL: s_test_srem_k_num_i64:
718; GCN: ; %bb.0:
719; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
720; GCN-NEXT: s_mov_b32 s7, 0xf000
721; GCN-NEXT: s_mov_b32 s6, -1
722; GCN-NEXT: v_mov_b32_e32 v0, 0
723; GCN-NEXT: s_waitcnt lgkmcnt(0)
724; GCN-NEXT: s_ashr_i32 s8, s3, 31
725; GCN-NEXT: v_mov_b32_e32 v1, 0
726; GCN-NEXT: s_mov_b32 s4, s0
727; GCN-NEXT: s_mov_b32 s5, s1
728; GCN-NEXT: s_mov_b32 s9, s8
729; GCN-NEXT: s_add_u32 s0, s2, s8
730; GCN-NEXT: s_addc_u32 s1, s3, s8
731; GCN-NEXT: s_xor_b64 s[2:3], s[0:1], s[8:9]
732; GCN-NEXT: v_cvt_f32_u32_e32 v2, s2
733; GCN-NEXT: v_cvt_f32_u32_e32 v3, s3
734; GCN-NEXT: s_sub_u32 s8, 0, s2
735; GCN-NEXT: v_mov_b32_e32 v4, s3
736; GCN-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
737; GCN-NEXT: s_subb_u32 s9, 0, s3
738; GCN-NEXT: v_rcp_f32_e32 v2, v2
739; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
740; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
741; GCN-NEXT: v_trunc_f32_e32 v3, v3
742; GCN-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3
743; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3
744; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
745; GCN-NEXT: v_mul_lo_u32 v5, s8, v3
746; GCN-NEXT: v_mul_lo_u32 v6, s9, v2
747; GCN-NEXT: v_mul_hi_u32 v7, s8, v2
748; GCN-NEXT: v_mul_lo_u32 v8, s8, v2
749; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5
750; GCN-NEXT: v_mul_hi_u32 v7, v2, v8
751; GCN-NEXT: v_mul_hi_u32 v9, v3, v8
752; GCN-NEXT: v_mul_lo_u32 v8, v3, v8
753; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6
754; GCN-NEXT: v_mul_hi_u32 v6, v2, v5
755; GCN-NEXT: v_mul_lo_u32 v10, v2, v5
756; GCN-NEXT: v_mul_hi_u32 v11, v3, v5
757; GCN-NEXT: v_mul_lo_u32 v5, v3, v5
758; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v10
759; GCN-NEXT: v_addc_u32_e32 v6, vcc, v1, v6, vcc
760; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7
761; GCN-NEXT: v_addc_u32_e32 v6, vcc, v6, v9, vcc
762; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v0, vcc
763; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5
764; GCN-NEXT: v_addc_u32_e32 v6, vcc, v1, v7, vcc
765; GCN-NEXT: v_add_i32_e64 v2, s[0:1], v2, v5
766; GCN-NEXT: v_addc_u32_e64 v5, vcc, v3, v6, s[0:1]
767; GCN-NEXT: v_mul_hi_u32 v7, s8, v2
768; GCN-NEXT: v_mul_lo_u32 v8, s9, v2
769; GCN-NEXT: v_mul_lo_u32 v9, s8, v2
770; GCN-NEXT: v_mul_lo_u32 v10, s8, v5
771; GCN-NEXT: v_mul_hi_u32 v11, v5, v9
772; GCN-NEXT: v_mul_lo_u32 v12, v5, v9
773; GCN-NEXT: v_mul_hi_u32 v9, v2, v9
774; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v10
775; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7
776; GCN-NEXT: v_mul_hi_u32 v8, v5, v7
777; GCN-NEXT: v_mul_hi_u32 v10, v2, v7
778; GCN-NEXT: v_mul_lo_u32 v13, v2, v7
779; GCN-NEXT: v_mul_lo_u32 v5, v5, v7
780; GCN-NEXT: v_add_i32_e32 v7, vcc, v9, v13
781; GCN-NEXT: v_addc_u32_e32 v9, vcc, v1, v10, vcc
782; GCN-NEXT: v_add_i32_e32 v7, vcc, v12, v7
783; GCN-NEXT: v_addc_u32_e32 v7, vcc, v9, v11, vcc
784; GCN-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc
785; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6
786; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5
787; GCN-NEXT: v_addc_u32_e32 v6, vcc, v1, v8, vcc
788; GCN-NEXT: v_addc_u32_e64 v3, vcc, v3, v6, s[0:1]
789; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5
790; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
791; GCN-NEXT: v_mul_hi_u32 v5, 24, v2
792; GCN-NEXT: v_mul_hi_u32 v2, 0, v2
793; GCN-NEXT: v_mul_hi_u32 v6, 24, v3
794; GCN-NEXT: v_mul_lo_u32 v7, v3, 24
795; GCN-NEXT: v_mul_hi_u32 v3, 0, v3
796; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7
797; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
798; GCN-NEXT: v_add_i32_e32 v5, vcc, 0, v5
799; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
800; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc
801; GCN-NEXT: v_mul_hi_u32 v2, s2, v1
802; GCN-NEXT: v_mul_lo_u32 v3, s3, v1
803; GCN-NEXT: v_mul_lo_u32 v1, s2, v1
804; GCN-NEXT: v_mul_lo_u32 v0, s2, v0
805; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0
806; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3
807; GCN-NEXT: v_sub_i32_e32 v2, vcc, 0, v0
808; GCN-NEXT: v_sub_i32_e32 v3, vcc, 24, v1
809; GCN-NEXT: v_subb_u32_e64 v1, s[0:1], v2, v4, vcc
810; GCN-NEXT: v_subb_u32_e32 v0, vcc, 0, v0, vcc
811; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v3
812; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
813; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s2, v3
814; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v1, v4, vcc
815; GCN-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
816; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v5
817; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
818; GCN-NEXT: v_subrev_i32_e32 v7, vcc, s2, v5
819; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v0
820; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1]
821; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1
822; GCN-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1]
823; GCN-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
824; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s3, v0
825; GCN-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
826; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s3, v1
827; GCN-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc
828; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
829; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
830; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2
831; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1]
832; GCN-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc
833; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v0, s[0:1]
834; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
835; GCN-NEXT: s_endpgm
836 %result = srem i64 24, %x
837 store i64 %result, i64 addrspace(1)* %out
838 ret void
839}
840
; v_test_srem_k_num_i64: signed 64-bit remainder with a constant numerator (24)
; and a variable denominator %x, in a plain (non-amdgpu_kernel) function that
; returns the i64 result. The autogenerated assertions below pin the complete
; instruction sequence expected from llc, which includes a v_rcp_f32-based
; expansion followed by v_cndmask selects of the final value.
841define i64 @v_test_srem_k_num_i64(i64 %x) {
842; GCN-LABEL: v_test_srem_k_num_i64:
843; GCN: ; %bb.0:
844; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
845; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v1
846; GCN-NEXT: v_mov_b32_e32 v3, 0
847; GCN-NEXT: v_mov_b32_e32 v4, 0
848; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
849; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
850; GCN-NEXT: v_xor_b32_e32 v1, v1, v2
851; GCN-NEXT: v_xor_b32_e32 v0, v0, v2
852; GCN-NEXT: v_cvt_f32_u32_e32 v2, v0
853; GCN-NEXT: v_cvt_f32_u32_e32 v5, v1
854; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v0
855; GCN-NEXT: v_subb_u32_e32 v7, vcc, 0, v1, vcc
856; GCN-NEXT: v_mac_f32_e32 v2, 0x4f800000, v5
857; GCN-NEXT: v_rcp_f32_e32 v2, v2
858; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
859; GCN-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2
860; GCN-NEXT: v_trunc_f32_e32 v5, v5
861; GCN-NEXT: v_mac_f32_e32 v2, 0xcf800000, v5
862; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5
863; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
864; GCN-NEXT: v_mul_lo_u32 v8, v6, v5
865; GCN-NEXT: v_mul_lo_u32 v9, v7, v2
866; GCN-NEXT: v_mul_hi_u32 v10, v6, v2
867; GCN-NEXT: v_mul_lo_u32 v11, v6, v2
868; GCN-NEXT: v_add_i32_e32 v8, vcc, v10, v8
869; GCN-NEXT: v_mul_hi_u32 v10, v2, v11
870; GCN-NEXT: v_mul_hi_u32 v12, v5, v11
871; GCN-NEXT: v_mul_lo_u32 v11, v5, v11
872; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v9
873; GCN-NEXT: v_mul_hi_u32 v9, v2, v8
874; GCN-NEXT: v_mul_lo_u32 v13, v2, v8
875; GCN-NEXT: v_mul_hi_u32 v14, v5, v8
876; GCN-NEXT: v_mul_lo_u32 v8, v5, v8
877; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v13
878; GCN-NEXT: v_addc_u32_e32 v9, vcc, v4, v9, vcc
879; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10
880; GCN-NEXT: v_addc_u32_e32 v9, vcc, v9, v12, vcc
881; GCN-NEXT: v_addc_u32_e32 v10, vcc, v14, v3, vcc
882; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8
883; GCN-NEXT: v_addc_u32_e32 v9, vcc, v4, v10, vcc
884; GCN-NEXT: v_add_i32_e64 v2, s[4:5], v2, v8
885; GCN-NEXT: v_addc_u32_e64 v8, vcc, v5, v9, s[4:5]
886; GCN-NEXT: v_mul_hi_u32 v10, v6, v2
887; GCN-NEXT: v_mul_lo_u32 v7, v7, v2
888; GCN-NEXT: v_mul_lo_u32 v11, v6, v2
889; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v9
890; GCN-NEXT: v_mul_lo_u32 v6, v6, v8
891; GCN-NEXT: v_mul_hi_u32 v9, v8, v11
892; GCN-NEXT: v_mul_lo_u32 v12, v8, v11
893; GCN-NEXT: v_mul_hi_u32 v11, v2, v11
894; GCN-NEXT: v_add_i32_e32 v6, vcc, v10, v6
895; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7
896; GCN-NEXT: v_mul_hi_u32 v7, v8, v6
897; GCN-NEXT: v_mul_hi_u32 v10, v2, v6
898; GCN-NEXT: v_mul_lo_u32 v13, v2, v6
899; GCN-NEXT: v_mul_lo_u32 v6, v8, v6
900; GCN-NEXT: v_add_i32_e32 v8, vcc, v11, v13
901; GCN-NEXT: v_addc_u32_e32 v10, vcc, v4, v10, vcc
902; GCN-NEXT: v_add_i32_e32 v8, vcc, v12, v8
903; GCN-NEXT: v_addc_u32_e32 v8, vcc, v10, v9, vcc
904; GCN-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc
905; GCN-NEXT: v_add_i32_e32 v6, vcc, v8, v6
906; GCN-NEXT: v_addc_u32_e32 v7, vcc, v4, v7, vcc
907; GCN-NEXT: v_addc_u32_e64 v5, vcc, v5, v7, s[4:5]
908; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6
909; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
910; GCN-NEXT: v_mul_hi_u32 v6, 24, v2
911; GCN-NEXT: v_mul_hi_u32 v2, 0, v2
912; GCN-NEXT: v_mul_hi_u32 v7, 24, v5
913; GCN-NEXT: v_mul_lo_u32 v8, v5, 24
914; GCN-NEXT: v_mul_hi_u32 v5, 0, v5
915; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v8
916; GCN-NEXT: v_addc_u32_e32 v4, vcc, v4, v7, vcc
917; GCN-NEXT: v_add_i32_e32 v6, vcc, 0, v6
918; GCN-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc
919; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
920; GCN-NEXT: v_mul_hi_u32 v4, v0, v2
921; GCN-NEXT: v_mul_lo_u32 v5, v1, v2
922; GCN-NEXT: v_mul_lo_u32 v2, v0, v2
923; GCN-NEXT: v_mul_lo_u32 v3, v0, v3
924; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3
925; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5
926; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v3
927; GCN-NEXT: v_sub_i32_e32 v2, vcc, 24, v2
928; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v4, v1, vcc
929; GCN-NEXT: v_subb_u32_e32 v3, vcc, 0, v3, vcc
930; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0
931; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
932; GCN-NEXT: v_sub_i32_e32 v6, vcc, v2, v0
933; GCN-NEXT: v_subb_u32_e64 v7, s[4:5], v4, v1, vcc
934; GCN-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
935; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v0
936; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
937; GCN-NEXT: v_sub_i32_e32 v0, vcc, v6, v0
938; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1
939; GCN-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
940; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1
941; GCN-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
942; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v1
943; GCN-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[4:5]
944; GCN-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc
945; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
946; GCN-NEXT: v_cndmask_b32_e32 v1, v10, v8, vcc
947; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
948; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
949; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v5
950; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
951; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc
952; GCN-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
953; GCN-NEXT: s_setpc_b64 s[30:31]
; The numerator is the immediate 24; only the denominator is a runtime value.
954 %result = srem i64 24, %x
955 ret i64 %result
956}
957
; v_test_srem_pow2_k_num_i64: signed 64-bit remainder with a power-of-two
; constant numerator (32768 = 0x8000) and a variable denominator %x, in a
; plain (non-amdgpu_kernel) function that returns the i64 result. The
; autogenerated assertions below still expect the long v_rcp_f32-based
; expansion; the constant shows up as s6 = 0x8000 and as a left shift by 15.
958define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
959; GCN-LABEL: v_test_srem_pow2_k_num_i64:
960; GCN: ; %bb.0:
961; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
962; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v1
963; GCN-NEXT: v_mov_b32_e32 v3, 0
964; GCN-NEXT: v_mov_b32_e32 v4, 0
965; GCN-NEXT: s_mov_b32 s6, 0x8000
966; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2
967; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
968; GCN-NEXT: v_xor_b32_e32 v1, v1, v2
969; GCN-NEXT: v_xor_b32_e32 v0, v0, v2
970; GCN-NEXT: v_cvt_f32_u32_e32 v2, v0
971; GCN-NEXT: v_cvt_f32_u32_e32 v5, v1
972; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v0
973; GCN-NEXT: v_subb_u32_e32 v7, vcc, 0, v1, vcc
974; GCN-NEXT: v_mac_f32_e32 v2, 0x4f800000, v5
975; GCN-NEXT: v_rcp_f32_e32 v2, v2
976; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
977; GCN-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2
978; GCN-NEXT: v_trunc_f32_e32 v5, v5
979; GCN-NEXT: v_mac_f32_e32 v2, 0xcf800000, v5
980; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5
981; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
982; GCN-NEXT: v_mul_lo_u32 v8, v6, v5
983; GCN-NEXT: v_mul_lo_u32 v9, v7, v2
984; GCN-NEXT: v_mul_hi_u32 v10, v6, v2
985; GCN-NEXT: v_mul_lo_u32 v11, v6, v2
986; GCN-NEXT: v_add_i32_e32 v8, vcc, v10, v8
987; GCN-NEXT: v_mul_hi_u32 v10, v2, v11
988; GCN-NEXT: v_mul_hi_u32 v12, v5, v11
989; GCN-NEXT: v_mul_lo_u32 v11, v5, v11
990; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v9
991; GCN-NEXT: v_mul_hi_u32 v9, v2, v8
992; GCN-NEXT: v_mul_lo_u32 v13, v2, v8
993; GCN-NEXT: v_mul_hi_u32 v14, v5, v8
994; GCN-NEXT: v_mul_lo_u32 v8, v5, v8
995; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v13
996; GCN-NEXT: v_addc_u32_e32 v9, vcc, v4, v9, vcc
997; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10
998; GCN-NEXT: v_addc_u32_e32 v9, vcc, v9, v12, vcc
999; GCN-NEXT: v_addc_u32_e32 v10, vcc, v14, v3, vcc
1000; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8
1001; GCN-NEXT: v_addc_u32_e32 v9, vcc, v4, v10, vcc
1002; GCN-NEXT: v_add_i32_e64 v2, s[4:5], v2, v8
1003; GCN-NEXT: v_addc_u32_e64 v8, vcc, v5, v9, s[4:5]
1004; GCN-NEXT: v_mul_hi_u32 v10, v6, v2
1005; GCN-NEXT: v_mul_lo_u32 v7, v7, v2
1006; GCN-NEXT: v_mul_lo_u32 v11, v6, v2
1007; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v9
1008; GCN-NEXT: v_mul_lo_u32 v6, v6, v8
1009; GCN-NEXT: v_mul_hi_u32 v9, v8, v11
1010; GCN-NEXT: v_mul_lo_u32 v12, v8, v11
1011; GCN-NEXT: v_mul_hi_u32 v11, v2, v11
1012; GCN-NEXT: v_add_i32_e32 v6, vcc, v10, v6
1013; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7
1014; GCN-NEXT: v_mul_hi_u32 v7, v8, v6
1015; GCN-NEXT: v_mul_hi_u32 v10, v2, v6
1016; GCN-NEXT: v_mul_lo_u32 v13, v2, v6
1017; GCN-NEXT: v_mul_lo_u32 v6, v8, v6
1018; GCN-NEXT: v_add_i32_e32 v8, vcc, v11, v13
1019; GCN-NEXT: v_addc_u32_e32 v10, vcc, v4, v10, vcc
1020; GCN-NEXT: v_add_i32_e32 v8, vcc, v12, v8
1021; GCN-NEXT: v_addc_u32_e32 v8, vcc, v10, v9, vcc
1022; GCN-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc
1023; GCN-NEXT: v_add_i32_e32 v6, vcc, v8, v6
1024; GCN-NEXT: v_addc_u32_e32 v7, vcc, v4, v7, vcc
1025; GCN-NEXT: v_addc_u32_e64 v5, vcc, v5, v7, s[4:5]
1026; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6
1027; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
1028; GCN-NEXT: v_mul_hi_u32 v6, s6, v2
1029; GCN-NEXT: v_mul_hi_u32 v2, 0, v2
1030; GCN-NEXT: v_mul_hi_u32 v7, s6, v5
1031; GCN-NEXT: v_lshlrev_b32_e32 v8, 15, v5
1032; GCN-NEXT: v_mul_hi_u32 v5, 0, v5
1033; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v8
1034; GCN-NEXT: v_addc_u32_e32 v4, vcc, v4, v7, vcc
1035; GCN-NEXT: v_add_i32_e32 v6, vcc, 0, v6
1036; GCN-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc
1037; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
1038; GCN-NEXT: v_mul_hi_u32 v4, v0, v2
1039; GCN-NEXT: v_mul_lo_u32 v5, v1, v2
1040; GCN-NEXT: v_mul_lo_u32 v2, v0, v2
1041; GCN-NEXT: v_mul_lo_u32 v3, v0, v3
1042; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3
1043; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5
1044; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v3
1045; GCN-NEXT: v_sub_i32_e32 v2, vcc, s6, v2
1046; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v4, v1, vcc
1047; GCN-NEXT: v_subb_u32_e32 v3, vcc, 0, v3, vcc
1048; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v0
1049; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
1050; GCN-NEXT: v_sub_i32_e32 v6, vcc, v2, v0
1051; GCN-NEXT: v_subb_u32_e64 v7, s[4:5], v4, v1, vcc
1052; GCN-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
1053; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v0
1054; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
1055; GCN-NEXT: v_sub_i32_e32 v0, vcc, v6, v0
1056; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1
1057; GCN-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
1058; GCN-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1
1059; GCN-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
1060; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v1
1061; GCN-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[4:5]
1062; GCN-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc
1063; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
1064; GCN-NEXT: v_cndmask_b32_e32 v1, v10, v8, vcc
1065; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
1066; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
1067; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v5
1068; GCN-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5]
1069; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v7, vcc
1070; GCN-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
1071; GCN-NEXT: s_setpc_b64 s[30:31]
; The numerator is the immediate 32768 (2^15); the denominator is a runtime value.
1072 %result = srem i64 32768, %x
1073 ret i64 %result
1074}
1075
; v_test_srem_pow2_k_den_i64: signed 64-bit remainder of a variable numerator
; %x by a power-of-two constant denominator (32768 = 2^15), in a plain
; (non-amdgpu_kernel) function that returns the i64 result. The autogenerated
; assertions below expect a short shift/add/and/subtract sequence (mask
; 0xffff8000) with no reciprocal or multiply instructions.
1076define i64 @v_test_srem_pow2_k_den_i64(i64 %x) {
1077; GCN-LABEL: v_test_srem_pow2_k_den_i64:
1078; GCN: ; %bb.0:
1079; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1080; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v1
1081; GCN-NEXT: v_lshrrev_b32_e32 v2, 17, v2
1082; GCN-NEXT: v_add_i32_e32 v2, vcc, v0, v2
1083; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
1084; GCN-NEXT: v_and_b32_e32 v2, 0xffff8000, v2
1085; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
1086; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
1087; GCN-NEXT: s_setpc_b64 s[30:31]
; The denominator is the immediate 32768; the numerator is a runtime value.
1088 %result = srem i64 %x, 32768
1089 ret i64 %result
1090}
1091
; s_test_srem24_k_num_i64: amdgpu_kernel variant where the denominator is
; %x arithmetically shifted right by 40, so it has at most 24 significant
; signed bits, and the numerator is the constant 24. The i64 result is stored
; to %out. The autogenerated assertions below expect a float-based sequence
; (v_cvt_f32_i32 / v_rcp_iflag_f32, with 0x41c00000 being 24.0 as an f32)
; followed by a 24-bit sign extension via v_bfe_i32.
1092define amdgpu_kernel void @s_test_srem24_k_num_i64(i64 addrspace(1)* %out, i64 %x) {
1093; GCN-LABEL: s_test_srem24_k_num_i64:
1094; GCN: ; %bb.0:
1095; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1096; GCN-NEXT: s_mov_b32 s7, 0xf000
1097; GCN-NEXT: s_mov_b32 s6, -1
1098; GCN-NEXT: s_waitcnt lgkmcnt(0)
1099; GCN-NEXT: s_mov_b32 s2, 0x41c00000
1100; GCN-NEXT: s_mov_b32 s4, s0
1101; GCN-NEXT: s_mov_b32 s5, s1
1102; GCN-NEXT: s_ashr_i64 s[0:1], s[2:3], 40
1103; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0
1104; GCN-NEXT: s_ashr_i32 s1, s0, 30
1105; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0
1106; GCN-NEXT: s_or_b32 s1, s1, 1
1107; GCN-NEXT: v_mul_f32_e32 v1, s2, v1
1108; GCN-NEXT: v_mov_b32_e32 v2, s1
1109; GCN-NEXT: v_trunc_f32_e32 v1, v1
1110; GCN-NEXT: v_mad_f32 v3, -v1, v0, s2
1111; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v0|
1112; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
1113; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1
1114; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
1115; GCN-NEXT: v_mul_lo_u32 v0, v0, s0
1116; GCN-NEXT: v_sub_i32_e32 v0, vcc, 24, v0
1117; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24
1118; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1119; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1120; GCN-NEXT: s_endpgm
; Shifting by 40 keeps only the top 24 bits of %x (sign-extended).
1121 %x.shr = ashr i64 %x, 40
1122 %result = srem i64 24, %x.shr
1123 store i64 %result, i64 addrspace(1)* %out
1124 ret void
1125}
1126
; s_test_srem24_k_den_i64: amdgpu_kernel variant where the numerator is %x
; arithmetically shifted right by 40, so it has at most 24 significant signed
; bits, and the denominator is the constant 23423. The i64 result is stored
; to %out. In the autogenerated assertions below, 0x46b6fe00 is 23423.0 as an
; f32 and 0x5b7f is 23423 as an integer; 0x38331158 is presumably the
; precomputed f32 reciprocal of the denominator (no v_rcp appears here).
1127define amdgpu_kernel void @s_test_srem24_k_den_i64(i64 addrspace(1)* %out, i64 %x) {
1128; GCN-LABEL: s_test_srem24_k_den_i64:
1129; GCN: ; %bb.0:
1130; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1131; GCN-NEXT: s_mov_b32 s7, 0xf000
1132; GCN-NEXT: s_mov_b32 s6, -1
1133; GCN-NEXT: s_waitcnt lgkmcnt(0)
1134; GCN-NEXT: s_mov_b32 s2, 0x46b6fe00
1135; GCN-NEXT: s_mov_b32 s4, s0
1136; GCN-NEXT: s_mov_b32 s5, s1
1137; GCN-NEXT: s_ashr_i64 s[0:1], s[2:3], 40
1138; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0
1139; GCN-NEXT: s_ashr_i32 s1, s0, 30
1140; GCN-NEXT: v_mul_f32_e32 v1, 0x38331158, v0
1141; GCN-NEXT: s_or_b32 s1, s1, 1
1142; GCN-NEXT: v_trunc_f32_e32 v1, v1
1143; GCN-NEXT: v_mov_b32_e32 v2, s1
1144; GCN-NEXT: v_mad_f32 v0, -v1, s2, v0
1145; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1
1146; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s2
1147; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc
1148; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1
1149; GCN-NEXT: s_movk_i32 s1, 0x5b7f
1150; GCN-NEXT: v_mul_lo_u32 v0, v0, s1
1151; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
1152; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24
1153; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1154; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1155; GCN-NEXT: s_endpgm
; Shifting by 40 keeps only the top 24 bits of %x (sign-extended).
1156 %x.shr = ashr i64 %x, 40
1157 %result = srem i64 %x.shr, 23423
1158 store i64 %result, i64 addrspace(1)* %out
1159 ret void
1160}
1161
; v_test_srem24_k_num_i64: plain (non-amdgpu_kernel) function computing the
; signed remainder of the constant 24 by %x arithmetically shifted right by
; 40, i.e. a denominator with at most 24 significant signed bits. The
; autogenerated assertions below expect a float-based sequence
; (v_cvt_f32_i32 / v_rcp_iflag_f32, with 0x41c00000 being 24.0 as an f32)
; followed by a 24-bit sign extension via v_bfe_i32.
1162define i64 @v_test_srem24_k_num_i64(i64 %x) {
1163; GCN-LABEL: v_test_srem24_k_num_i64:
1164; GCN: ; %bb.0:
1165; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1166; GCN-NEXT: v_ashr_i64 v[0:1], v[0:1], 40
1167; GCN-NEXT: s_mov_b32 s4, 0x41c00000
1168; GCN-NEXT: v_cvt_f32_i32_e32 v1, v0
1169; GCN-NEXT: v_ashrrev_i32_e32 v2, 30, v0
1170; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v1
1171; GCN-NEXT: v_or_b32_e32 v2, 1, v2
1172; GCN-NEXT: v_mul_f32_e32 v3, s4, v3
1173; GCN-NEXT: v_trunc_f32_e32 v3, v3
1174; GCN-NEXT: v_mad_f32 v4, -v3, v1, s4
1175; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v1|
1176; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
1177; GCN-NEXT: v_cvt_i32_f32_e32 v2, v3
1178; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1
1179; GCN-NEXT: v_mul_lo_u32 v0, v1, v0
1180; GCN-NEXT: v_sub_i32_e32 v0, vcc, 24, v0
1181; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24
1182; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1183; GCN-NEXT: s_setpc_b64 s[30:31]
; Shifting by 40 keeps only the top 24 bits of %x (sign-extended).
1184 %x.shr = ashr i64 %x, 40
1185 %result = srem i64 24, %x.shr
1186 ret i64 %result
1187}
1188
; v_test_srem24_pow2_k_num_i64: plain (non-amdgpu_kernel) function computing
; the signed remainder of the power-of-two constant 32768 (2^15) by %x
; arithmetically shifted right by 40, i.e. a denominator with at most 24
; significant signed bits. The autogenerated assertions below expect a
; float-based sequence (0x47000000 is 32768.0 as an f32) followed by a
; 24-bit sign extension via v_bfe_i32.
1189define i64 @v_test_srem24_pow2_k_num_i64(i64 %x) {
1190; GCN-LABEL: v_test_srem24_pow2_k_num_i64:
1191; GCN: ; %bb.0:
1192; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1193; GCN-NEXT: v_ashr_i64 v[0:1], v[0:1], 40
1194; GCN-NEXT: s_mov_b32 s4, 0x47000000
1195; GCN-NEXT: v_cvt_f32_i32_e32 v1, v0
1196; GCN-NEXT: v_ashrrev_i32_e32 v2, 30, v0
1197; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v1
1198; GCN-NEXT: v_or_b32_e32 v2, 1, v2
1199; GCN-NEXT: v_mul_f32_e32 v3, s4, v3
1200; GCN-NEXT: v_trunc_f32_e32 v3, v3
1201; GCN-NEXT: v_mad_f32 v4, -v3, v1, s4
1202; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v1|
1203; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
1204; GCN-NEXT: v_cvt_i32_f32_e32 v2, v3
1205; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1
1206; GCN-NEXT: v_mul_lo_u32 v0, v1, v0
1207; GCN-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0
1208; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24
1209; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
1210; GCN-NEXT: s_setpc_b64 s[30:31]
; Shifting by 40 keeps only the top 24 bits of %x (sign-extended).
1211 %x.shr = ashr i64 %x, 40
1212 %result = srem i64 32768, %x.shr
1213 ret i64 %result
1214}
1215
; v_test_srem24_pow2_k_den_i64: plain (non-amdgpu_kernel) function computing
; the signed remainder of %x arithmetically shifted right by 40 (at most 24
; significant signed bits) by the power-of-two constant 32768 (2^15). The
; autogenerated assertions below expect the 64-bit shift followed by a short
; shift/add/and/subtract sequence (mask 0xffff8000), with no reciprocal or
; multiply instructions.
1216define i64 @v_test_srem24_pow2_k_den_i64(i64 %x) {
1217; GCN-LABEL: v_test_srem24_pow2_k_den_i64:
1218; GCN: ; %bb.0:
1219; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1220; GCN-NEXT: v_ashr_i64 v[0:1], v[0:1], 40
1221; GCN-NEXT: v_lshrrev_b32_e32 v2, 17, v1
1222; GCN-NEXT: v_add_i32_e32 v2, vcc, v0, v2
1223; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
1224; GCN-NEXT: v_and_b32_e32 v2, 0xffff8000, v2
1225; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
1226; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
1227; GCN-NEXT: s_setpc_b64 s[30:31]
; Shifting by 40 keeps only the top 24 bits of %x (sign-extended).
1228 %x.shr = ashr i64 %x, 40
1229 %result = srem i64 %x.shr, 32768
1230 ret i64 %result
1231}