blob: c923d1c0bc79155119aeb39d4e7e35663e6be2e1 [file] [log] [blame]
Farhana Aleen3528c802018-08-21 16:21:15 +00001; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
Matt Arsenault28c16bd2018-08-31 14:34:22 +00003; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX8 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-NODL %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-DL %s
Farhana Aleen3528c802018-08-21 16:21:15 +00006
7; add(mul(S0.x, S1.x),
8; add (mul (S0.y, S1.y), S3)) -> v_dot2_{I|U}32_{I|U}16(S0, S1, S3)
9
; udot2: baseline unsigned case.
; Loads two <2 x i16> vectors, zero-extends both lanes of each to i32,
; multiplies lane 0 by lane 0 and lane 1 by lane 1, and adds both products to
; the i32 loaded from %dst, in the order (mul2 + s3) + mul1. The result is
; stored back to %dst.
; Expected codegen per the checks below: the GFX9-DL run uses a single
; v_dot2_u32_u16; the GFX7, GFX8 and GFX9-NODL runs use two v_mad_u32_u24.
10define amdgpu_kernel void @udot2(<2 x i16> addrspace(1)* %src1,
Matt Arsenault6f35f0c2018-08-31 15:05:06 +000011; GFX7-LABEL: udot2:
12; GFX7: ; %bb.0: ; %entry
13; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
14; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
15; GFX7-NEXT: s_mov_b32 s8, 0xffff
16; GFX7-NEXT: s_mov_b32 s3, 0xf000
17; GFX7-NEXT: s_mov_b32 s2, -1
18; GFX7-NEXT: s_waitcnt lgkmcnt(0)
19; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
20; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
21; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
22; GFX7-NEXT: s_waitcnt lgkmcnt(0)
23; GFX7-NEXT: s_lshr_b32 s7, s4, 16
24; GFX7-NEXT: s_lshr_b32 s9, s5, 16
25; GFX7-NEXT: s_and_b32 s4, s4, s8
26; GFX7-NEXT: v_mov_b32_e32 v0, s7
27; GFX7-NEXT: v_mov_b32_e32 v1, s6
28; GFX7-NEXT: v_mad_u32_u24 v0, s9, v0, v1
29; GFX7-NEXT: s_and_b32 s5, s5, s8
30; GFX7-NEXT: v_mov_b32_e32 v1, s4
31; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0
32; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
33; GFX7-NEXT: s_endpgm
34;
35; GFX8-LABEL: udot2:
36; GFX8: ; %bb.0: ; %entry
37; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
38; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
39; GFX8-NEXT: s_mov_b32 s2, 0xffff
40; GFX8-NEXT: s_waitcnt lgkmcnt(0)
41; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0
42; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0
43; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0
44; GFX8-NEXT: s_waitcnt lgkmcnt(0)
45; GFX8-NEXT: s_and_b32 s6, s3, s2
46; GFX8-NEXT: s_lshr_b32 s3, s3, 16
47; GFX8-NEXT: s_and_b32 s2, s4, s2
48; GFX8-NEXT: s_lshr_b32 s4, s4, 16
49; GFX8-NEXT: v_mov_b32_e32 v0, s5
50; GFX8-NEXT: v_mov_b32_e32 v1, s3
51; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0
52; GFX8-NEXT: v_mov_b32_e32 v1, s6
53; GFX8-NEXT: v_mad_u32_u24 v2, s2, v1, v0
54; GFX8-NEXT: v_mov_b32_e32 v0, s0
55; GFX8-NEXT: v_mov_b32_e32 v1, s1
56; GFX8-NEXT: flat_store_dword v[0:1], v2
57; GFX8-NEXT: s_endpgm
58;
59; GFX9-NODL-LABEL: udot2:
60; GFX9-NODL: ; %bb.0: ; %entry
61; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
62; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
63; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
64; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
65; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
66; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
67; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0
68; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
69; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2
70; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16
71; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
72; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16
73; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5
74; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
75; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
76; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6
77; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0
78; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
79; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
80; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
81; GFX9-NODL-NEXT: s_endpgm
82;
83; GFX9-DL-LABEL: udot2:
84; GFX9-DL: ; %bb.0: ; %entry
85; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
86; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
87; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
88; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
89; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
90; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0
91; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
92; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
93; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
94; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2
95; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
96; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s3, v2, v3
97; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
98; GFX9-DL-NEXT: s_endpgm
Farhana Aleen3528c802018-08-21 16:21:15 +000099 <2 x i16> addrspace(1)* %src2,
100 i32 addrspace(1)* nocapture %dst) {
101entry:
102 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
103 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
104
  ; Lane 0: zero-extend element 0 of each vector and multiply.
105 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
106 %conv = zext i16 %s1.elt1 to i32
107 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
108 %conv2 = zext i16 %s2.elt1 to i32
109 %mul1 = mul nuw i32 %conv2, %conv
110
  ; Lane 1: zero-extend element 1 of each vector and multiply.
111 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
112 %conv3 = zext i16 %s1.elt2 to i32
113 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
114 %conv4 = zext i16 %s2.elt2 to i32
115 %mul2 = mul nuw i32 %conv4, %conv3
116
  ; Accumulate: (mul2 + *dst) + mul1, written back to *dst.
117 %s3 = load i32, i32 addrspace(1)* %dst, align 4
118 %add = add i32 %mul2, %s3
119 %add6 = add i32 %add, %mul1
120 store i32 %add6, i32 addrspace(1)* %dst, align 4
121 ret void
122}
123
124; TODO: Support this pattern
125; add(S3,
126; add (mul (S0.x, S1.x), mul (S0.y, S1.y))) -> v_dot2_{I|U}32_{I|U}16(S0, S1, S3)
; udot2_MulMul: same zero-extended lane products as the plain unsigned case,
; but the two products are added to each other first (mul2 + mul1) and the
; accumulator loaded from %dst is added last.
; Expected codegen per the checks below: no dot2 instruction on any run line,
; including GFX9-DL; every target uses v_mul_u32_u24 + v_mad_u32_u24 followed
; by a separate add.
127define amdgpu_kernel void @udot2_MulMul(<2 x i16> addrspace(1)* %src1,
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000128; GFX7-LABEL: udot2_MulMul:
129; GFX7: ; %bb.0: ; %entry
130; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
131; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
132; GFX7-NEXT: s_mov_b32 s8, 0xffff
133; GFX7-NEXT: s_mov_b32 s3, 0xf000
134; GFX7-NEXT: s_mov_b32 s2, -1
135; GFX7-NEXT: s_waitcnt lgkmcnt(0)
136; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
137; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
138; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
139; GFX7-NEXT: s_waitcnt lgkmcnt(0)
140; GFX7-NEXT: s_lshr_b32 s7, s4, 16
141; GFX7-NEXT: s_and_b32 s4, s4, s8
142; GFX7-NEXT: s_lshr_b32 s9, s5, 16
143; GFX7-NEXT: s_and_b32 s5, s5, s8
144; GFX7-NEXT: v_mov_b32_e32 v0, s4
145; GFX7-NEXT: v_mul_u32_u24_e32 v0, s5, v0
146; GFX7-NEXT: v_mov_b32_e32 v1, s7
147; GFX7-NEXT: v_mad_u32_u24 v0, s9, v1, v0
148; GFX7-NEXT: v_add_i32_e32 v0, vcc, s6, v0
149; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
150; GFX7-NEXT: s_endpgm
151;
152; GFX8-LABEL: udot2_MulMul:
153; GFX8: ; %bb.0: ; %entry
154; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
155; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
156; GFX8-NEXT: s_mov_b32 s2, 0xffff
157; GFX8-NEXT: s_waitcnt lgkmcnt(0)
158; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0
159; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0
160; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0
161; GFX8-NEXT: s_waitcnt lgkmcnt(0)
162; GFX8-NEXT: s_and_b32 s6, s3, s2
163; GFX8-NEXT: s_and_b32 s2, s4, s2
164; GFX8-NEXT: v_mov_b32_e32 v0, s6
165; GFX8-NEXT: s_lshr_b32 s3, s3, 16
166; GFX8-NEXT: s_lshr_b32 s4, s4, 16
167; GFX8-NEXT: v_mov_b32_e32 v1, s3
168; GFX8-NEXT: v_mul_u32_u24_e32 v0, s2, v0
169; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0
170; GFX8-NEXT: v_add_u32_e32 v2, vcc, s5, v0
171; GFX8-NEXT: v_mov_b32_e32 v0, s0
172; GFX8-NEXT: v_mov_b32_e32 v1, s1
173; GFX8-NEXT: flat_store_dword v[0:1], v2
174; GFX8-NEXT: s_endpgm
175;
176; GFX9-NODL-LABEL: udot2_MulMul:
177; GFX9-NODL: ; %bb.0: ; %entry
178; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
179; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
180; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
181; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
182; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
183; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
184; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0
185; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
186; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2
187; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
188; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6
189; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16
190; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16
191; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
192; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v0, s2, v0
193; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
194; GFX9-NODL-NEXT: v_add_u32_e32 v2, s5, v0
195; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
196; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
197; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
198; GFX9-NODL-NEXT: s_endpgm
199;
200; GFX9-DL-LABEL: udot2_MulMul:
201; GFX9-DL: ; %bb.0: ; %entry
202; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
203; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
204; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff
205; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
206; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0
207; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
208; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0
209; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
210; GFX9-DL-NEXT: s_and_b32 s6, s3, s2
211; GFX9-DL-NEXT: s_and_b32 s2, s4, s2
212; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6
213; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16
214; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16
215; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
216; GFX9-DL-NEXT: v_mul_u32_u24_e32 v0, s2, v0
217; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
218; GFX9-DL-NEXT: v_add_u32_e32 v2, s5, v0
219; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
220; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
221; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
222; GFX9-DL-NEXT: s_endpgm
Farhana Aleen3528c802018-08-21 16:21:15 +0000223 <2 x i16> addrspace(1)* %src2,
224 i32 addrspace(1)* nocapture %dst) {
225entry:
226 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
227 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
228
  ; Lane 0: zero-extend element 0 of each vector and multiply.
229 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
230 %conv = zext i16 %s1.elt1 to i32
231 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
232 %conv2 = zext i16 %s2.elt1 to i32
233 %mul1 = mul nuw i32 %conv2, %conv
234
  ; Lane 1: zero-extend element 1 of each vector and multiply.
235 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
236 %conv3 = zext i16 %s1.elt2 to i32
237 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
238 %conv4 = zext i16 %s2.elt2 to i32
239 %mul2 = mul nuw i32 %conv4, %conv3
  ; Accumulate: (mul2 + mul1) + *dst -- the products are summed before the
  ; accumulator is added, which is what distinguishes this test.
240 %s3 = load i32, i32 addrspace(1)* %dst, align 4
241 %add = add i32 %mul2, %mul1
242 %add6 = add i32 %add, %s3
243 store i32 %add6, i32 addrspace(1)* %dst, align 4
244 ret void
245}
246
; idot2: baseline signed case.
; Loads two <2 x i16> vectors, sign-extends both lanes of each to i32,
; multiplies lane 0 by lane 0 and lane 1 by lane 1, and adds both products to
; the i32 loaded from %dst, in the order (mul2 + s3) + mul1.
; Expected codegen per the checks below: the GFX9-DL run uses a single
; v_dot2_i32_i16; the GFX7, GFX8 and GFX9-NODL runs use two v_mad_i32_i24.
; NOTE(review): the multiplies carry `nuw` although their operands are
; sign-extended; with a negative lane the unsigned product generally overflows,
; which `nuw` turns into poison. Confirm whether `nsw` (or no flag) was
; intended before relying on this IR as a semantic reference.
247define amdgpu_kernel void @idot2(<2 x i16> addrspace(1)* %src1,
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000248; GFX7-LABEL: idot2:
249; GFX7: ; %bb.0: ; %entry
250; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
251; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
252; GFX7-NEXT: s_mov_b32 s3, 0xf000
253; GFX7-NEXT: s_mov_b32 s2, -1
254; GFX7-NEXT: s_waitcnt lgkmcnt(0)
255; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
256; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
257; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
258; GFX7-NEXT: s_waitcnt lgkmcnt(0)
259; GFX7-NEXT: s_sext_i32_i16 s7, s4
260; GFX7-NEXT: s_ashr_i32 s4, s4, 16
261; GFX7-NEXT: s_sext_i32_i16 s8, s5
262; GFX7-NEXT: s_ashr_i32 s5, s5, 16
263; GFX7-NEXT: v_mov_b32_e32 v0, s4
264; GFX7-NEXT: v_mov_b32_e32 v1, s6
265; GFX7-NEXT: v_mad_i32_i24 v0, s5, v0, v1
266; GFX7-NEXT: v_mov_b32_e32 v1, s7
267; GFX7-NEXT: v_mad_i32_i24 v0, s8, v1, v0
268; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
269; GFX7-NEXT: s_endpgm
270;
271; GFX8-LABEL: idot2:
272; GFX8: ; %bb.0: ; %entry
273; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
274; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
275; GFX8-NEXT: s_waitcnt lgkmcnt(0)
276; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
277; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
278; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0
279; GFX8-NEXT: v_mov_b32_e32 v0, s0
280; GFX8-NEXT: v_mov_b32_e32 v1, s1
281; GFX8-NEXT: s_waitcnt lgkmcnt(0)
282; GFX8-NEXT: s_sext_i32_i16 s0, s2
283; GFX8-NEXT: s_ashr_i32 s2, s2, 16
284; GFX8-NEXT: s_sext_i32_i16 s1, s3
285; GFX8-NEXT: s_ashr_i32 s3, s3, 16
286; GFX8-NEXT: v_mov_b32_e32 v2, s4
287; GFX8-NEXT: v_mov_b32_e32 v3, s2
288; GFX8-NEXT: v_mad_i32_i24 v2, s3, v3, v2
289; GFX8-NEXT: v_mov_b32_e32 v3, s0
290; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2
291; GFX8-NEXT: flat_store_dword v[0:1], v2
292; GFX8-NEXT: s_endpgm
293;
294; GFX9-NODL-LABEL: idot2:
295; GFX9-NODL: ; %bb.0: ; %entry
296; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
297; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
298; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
299; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
300; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
301; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0
302; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
303; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
304; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
305; GFX9-NODL-NEXT: s_sext_i32_i16 s0, s2
306; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16
307; GFX9-NODL-NEXT: s_sext_i32_i16 s1, s3
308; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16
309; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4
310; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
311; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v3, v2
312; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0
313; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
314; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
315; GFX9-NODL-NEXT: s_endpgm
316;
317; GFX9-DL-LABEL: idot2:
318; GFX9-DL: ; %bb.0: ; %entry
319; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
320; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
321; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
322; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
323; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
324; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0
325; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
326; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
327; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
328; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2
329; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
330; GFX9-DL-NEXT: v_dot2_i32_i16 v2, s3, v2, v3
331; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
332; GFX9-DL-NEXT: s_endpgm
Farhana Aleen3528c802018-08-21 16:21:15 +0000333 <2 x i16> addrspace(1)* %src2,
334 i32 addrspace(1)* nocapture %dst) {
335entry:
336 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
337 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
338
  ; Lane 0: sign-extend element 0 of each vector and multiply.
339 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
340 %conv = sext i16 %s1.elt1 to i32
341 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
342 %conv2 = sext i16 %s2.elt1 to i32
343 %mul1 = mul nuw i32 %conv2, %conv
344
  ; Lane 1: sign-extend element 1 of each vector and multiply.
345 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
346 %conv3 = sext i16 %s1.elt2 to i32
347 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
348 %conv4 = sext i16 %s2.elt2 to i32
349 %mul2 = mul nuw i32 %conv4, %conv3
350
  ; Accumulate: (mul2 + *dst) + mul1, written back to *dst.
351 %s3 = load i32, i32 addrspace(1)* %dst, align 4
352 %add = add i32 %mul2, %s3
353 %add6 = add i32 %add, %mul1
354 store i32 %add6, i32 addrspace(1)* %dst, align 4
355 ret void
356}
357
; idot2_MixedTypedMul: negative test with differently-typed products.
; Lane 0 sign-extends both operands before multiplying, while lane 1
; zero-extends both operands; the products are accumulated into the i32
; loaded from %dst in the order (mul2 + s3) + mul1.
; Expected codegen per the checks below: no dot2 instruction on any run line,
; including GFX9-DL; every target uses v_mad_u32_u24 for the zero-extended
; lane and v_mad_i32_i24 for the sign-extended lane.
358define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1,
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000359; GFX7-LABEL: idot2_MixedTypedMul:
360; GFX7: ; %bb.0: ; %entry
361; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
362; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
363; GFX7-NEXT: s_mov_b32 s3, 0xf000
364; GFX7-NEXT: s_mov_b32 s2, -1
365; GFX7-NEXT: s_waitcnt lgkmcnt(0)
366; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
367; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
368; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
369; GFX7-NEXT: s_waitcnt lgkmcnt(0)
370; GFX7-NEXT: s_lshr_b32 s7, s4, 16
371; GFX7-NEXT: s_lshr_b32 s8, s5, 16
372; GFX7-NEXT: s_sext_i32_i16 s4, s4
373; GFX7-NEXT: v_mov_b32_e32 v0, s7
374; GFX7-NEXT: v_mov_b32_e32 v1, s6
375; GFX7-NEXT: v_mad_u32_u24 v0, s8, v0, v1
376; GFX7-NEXT: s_sext_i32_i16 s5, s5
377; GFX7-NEXT: v_mov_b32_e32 v1, s4
378; GFX7-NEXT: v_mad_i32_i24 v0, s5, v1, v0
379; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
380; GFX7-NEXT: s_endpgm
381;
382; GFX8-LABEL: idot2_MixedTypedMul:
383; GFX8: ; %bb.0: ; %entry
384; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
385; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
386; GFX8-NEXT: s_waitcnt lgkmcnt(0)
387; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
388; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
389; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0
390; GFX8-NEXT: v_mov_b32_e32 v0, s0
391; GFX8-NEXT: v_mov_b32_e32 v1, s1
392; GFX8-NEXT: s_waitcnt lgkmcnt(0)
393; GFX8-NEXT: s_sext_i32_i16 s0, s2
394; GFX8-NEXT: s_lshr_b32 s2, s2, 16
395; GFX8-NEXT: s_sext_i32_i16 s1, s3
396; GFX8-NEXT: s_lshr_b32 s3, s3, 16
397; GFX8-NEXT: v_mov_b32_e32 v2, s4
398; GFX8-NEXT: v_mov_b32_e32 v3, s2
399; GFX8-NEXT: v_mad_u32_u24 v2, s3, v3, v2
400; GFX8-NEXT: v_mov_b32_e32 v3, s0
401; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2
402; GFX8-NEXT: flat_store_dword v[0:1], v2
403; GFX8-NEXT: s_endpgm
404;
405; GFX9-NODL-LABEL: idot2_MixedTypedMul:
406; GFX9-NODL: ; %bb.0: ; %entry
407; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
408; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
409; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
410; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
411; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
412; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0
413; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
414; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
415; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
416; GFX9-NODL-NEXT: s_sext_i32_i16 s0, s2
417; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16
418; GFX9-NODL-NEXT: s_sext_i32_i16 s1, s3
419; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16
420; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4
421; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
422; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v3, v2
423; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0
424; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
425; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
426; GFX9-NODL-NEXT: s_endpgm
427;
428; GFX9-DL-LABEL: idot2_MixedTypedMul:
429; GFX9-DL: ; %bb.0: ; %entry
430; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
431; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
432; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
433; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
434; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
435; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0
436; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
437; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
438; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
439; GFX9-DL-NEXT: s_sext_i32_i16 s0, s2
440; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 16
441; GFX9-DL-NEXT: s_sext_i32_i16 s1, s3
442; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16
443; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4
444; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
445; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v3, v2
446; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0
447; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
448; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
449; GFX9-DL-NEXT: s_endpgm
Farhana Aleen3528c802018-08-21 16:21:15 +0000450 <2 x i16> addrspace(1)* %src2,
451 i32 addrspace(1)* nocapture %dst) {
452entry:
453 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
454 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
455
  ; Lane 0: signed product (both operands sign-extended).
456 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
457 %conv = sext i16 %s1.elt1 to i32
458 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
459 %conv2 = sext i16 %s2.elt1 to i32
460 %mul1 = mul nuw i32 %conv2, %conv
461
  ; Lane 1: unsigned product (both operands zero-extended).
462 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
463 %conv3 = zext i16 %s1.elt2 to i32
464 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
465 %conv4 = zext i16 %s2.elt2 to i32
466 %mul2 = mul nuw i32 %conv4, %conv3
467
  ; Accumulate: (mul2 + *dst) + mul1, written back to *dst.
468 %s3 = load i32, i32 addrspace(1)* %dst, align 4
469 %add = add i32 %mul2, %s3
470 %add6 = add i32 %add, %mul1
471 store i32 %add6, i32 addrspace(1)* %dst, align 4
472 ret void
473}
474
; udot2_alt_AddOperands: unsigned dot product with the add operands commuted.
; The lane products are formed exactly as in the plain unsigned case (both
; lanes zero-extended), but the accumulation is written as s3 + mul2 followed
; by mul1 + add, i.e. with the operands of both adds in the opposite order.
; Expected codegen per the checks below: the GFX9-DL run still uses a single
; v_dot2_u32_u16; the GFX7, GFX8 and GFX9-NODL runs use two v_mad_u32_u24.
475define amdgpu_kernel void @udot2_alt_AddOperands(<2 x i16> addrspace(1)* %src1,
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000476; GFX7-LABEL: udot2_alt_AddOperands:
477; GFX7: ; %bb.0: ; %entry
478; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
479; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
480; GFX7-NEXT: s_mov_b32 s8, 0xffff
481; GFX7-NEXT: s_mov_b32 s3, 0xf000
482; GFX7-NEXT: s_mov_b32 s2, -1
483; GFX7-NEXT: s_waitcnt lgkmcnt(0)
484; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
485; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
486; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
487; GFX7-NEXT: s_waitcnt lgkmcnt(0)
488; GFX7-NEXT: s_lshr_b32 s7, s4, 16
489; GFX7-NEXT: s_lshr_b32 s9, s5, 16
490; GFX7-NEXT: s_and_b32 s4, s4, s8
491; GFX7-NEXT: v_mov_b32_e32 v0, s7
492; GFX7-NEXT: v_mov_b32_e32 v1, s6
493; GFX7-NEXT: v_mad_u32_u24 v0, s9, v0, v1
494; GFX7-NEXT: s_and_b32 s5, s5, s8
495; GFX7-NEXT: v_mov_b32_e32 v1, s4
496; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0
497; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
498; GFX7-NEXT: s_endpgm
499;
500; GFX8-LABEL: udot2_alt_AddOperands:
501; GFX8: ; %bb.0: ; %entry
502; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
503; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
504; GFX8-NEXT: s_mov_b32 s2, 0xffff
505; GFX8-NEXT: s_waitcnt lgkmcnt(0)
506; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0
507; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0
508; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0
509; GFX8-NEXT: s_waitcnt lgkmcnt(0)
510; GFX8-NEXT: s_and_b32 s6, s3, s2
511; GFX8-NEXT: s_lshr_b32 s3, s3, 16
512; GFX8-NEXT: s_and_b32 s2, s4, s2
513; GFX8-NEXT: s_lshr_b32 s4, s4, 16
514; GFX8-NEXT: v_mov_b32_e32 v0, s5
515; GFX8-NEXT: v_mov_b32_e32 v1, s3
516; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0
517; GFX8-NEXT: v_mov_b32_e32 v1, s6
518; GFX8-NEXT: v_mad_u32_u24 v2, s2, v1, v0
519; GFX8-NEXT: v_mov_b32_e32 v0, s0
520; GFX8-NEXT: v_mov_b32_e32 v1, s1
521; GFX8-NEXT: flat_store_dword v[0:1], v2
522; GFX8-NEXT: s_endpgm
523;
524; GFX9-NODL-LABEL: udot2_alt_AddOperands:
525; GFX9-NODL: ; %bb.0: ; %entry
526; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
527; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
528; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
529; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
530; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
531; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
532; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0
533; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
534; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2
535; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16
536; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
537; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16
538; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5
539; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
540; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
541; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6
542; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0
543; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
544; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
545; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
546; GFX9-NODL-NEXT: s_endpgm
547;
548; GFX9-DL-LABEL: udot2_alt_AddOperands:
549; GFX9-DL: ; %bb.0: ; %entry
550; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
551; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
552; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
553; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
554; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
555; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0
556; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
557; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
558; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
559; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2
560; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
561; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s3, v2, v3
562; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
563; GFX9-DL-NEXT: s_endpgm
Farhana Aleen3528c802018-08-21 16:21:15 +0000564 <2 x i16> addrspace(1)* %src2,
565 i32 addrspace(1)* nocapture %dst) {
566entry:
567 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
568 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
569
  ; Lane 0: zero-extend element 0 of each vector and multiply.
570 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
571 %conv = zext i16 %s1.elt1 to i32
572 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
573 %conv2 = zext i16 %s2.elt1 to i32
574 %mul1 = mul nuw i32 %conv2, %conv
575
  ; Lane 1: zero-extend element 1 of each vector and multiply.
576 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
577 %conv3 = zext i16 %s1.elt2 to i32
578 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
579 %conv4 = zext i16 %s2.elt2 to i32
580 %mul2 = mul nuw i32 %conv4, %conv3
581
  ; Accumulate with commuted operands: mul1 + (*dst + mul2).
582 %s3 = load i32, i32 addrspace(1)* %dst, align 4
583 %add = add i32 %s3, %mul2
584 %add6 = add i32 %mul1, %add
585 store i32 %add6, i32 addrspace(1)* %dst, align 4
586 ret void
587}
588
; idot2_MixedExt: negative test with mixed extensions inside one product.
; In lane 0 the %vec1 element is sign-extended but the %vec2 element is
; zero-extended; in lane 1 both elements are sign-extended. The products are
; accumulated into the i32 loaded from %dst in the order (mul2 + s3) + mul1.
; Expected codegen per the checks below: no dot2 instruction on any run line,
; including GFX9-DL; every target uses two v_mad_i32_i24, with the
; zero-extended operand masked by s_and_b32 ..., 0xffff.
589define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1,
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000590; GFX7-LABEL: idot2_MixedExt:
591; GFX7: ; %bb.0: ; %entry
592; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
593; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
594; GFX7-NEXT: s_mov_b32 s3, 0xf000
595; GFX7-NEXT: s_mov_b32 s2, -1
596; GFX7-NEXT: s_waitcnt lgkmcnt(0)
597; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
598; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
599; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
600; GFX7-NEXT: s_waitcnt lgkmcnt(0)
601; GFX7-NEXT: s_sext_i32_i16 s7, s4
602; GFX7-NEXT: s_ashr_i32 s4, s4, 16
603; GFX7-NEXT: s_and_b32 s8, s5, 0xffff
604; GFX7-NEXT: s_ashr_i32 s5, s5, 16
605; GFX7-NEXT: v_mov_b32_e32 v0, s4
606; GFX7-NEXT: v_mov_b32_e32 v1, s6
607; GFX7-NEXT: v_mad_i32_i24 v0, s5, v0, v1
608; GFX7-NEXT: v_mov_b32_e32 v1, s7
609; GFX7-NEXT: v_mad_i32_i24 v0, s8, v1, v0
610; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
611; GFX7-NEXT: s_endpgm
612;
613; GFX8-LABEL: idot2_MixedExt:
614; GFX8: ; %bb.0: ; %entry
615; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
616; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
617; GFX8-NEXT: s_waitcnt lgkmcnt(0)
618; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
619; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
620; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0
621; GFX8-NEXT: v_mov_b32_e32 v0, s0
622; GFX8-NEXT: v_mov_b32_e32 v1, s1
623; GFX8-NEXT: s_waitcnt lgkmcnt(0)
624; GFX8-NEXT: s_sext_i32_i16 s0, s2
625; GFX8-NEXT: s_ashr_i32 s2, s2, 16
626; GFX8-NEXT: s_and_b32 s1, s3, 0xffff
627; GFX8-NEXT: s_ashr_i32 s3, s3, 16
628; GFX8-NEXT: v_mov_b32_e32 v2, s4
629; GFX8-NEXT: v_mov_b32_e32 v3, s2
630; GFX8-NEXT: v_mad_i32_i24 v2, s3, v3, v2
631; GFX8-NEXT: v_mov_b32_e32 v3, s0
632; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2
633; GFX8-NEXT: flat_store_dword v[0:1], v2
634; GFX8-NEXT: s_endpgm
635;
636; GFX9-NODL-LABEL: idot2_MixedExt:
637; GFX9-NODL: ; %bb.0: ; %entry
638; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
639; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
640; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
641; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
642; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
643; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0
644; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
645; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
646; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
647; GFX9-NODL-NEXT: s_sext_i32_i16 s0, s2
648; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16
649; GFX9-NODL-NEXT: s_and_b32 s1, s3, 0xffff
650; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16
651; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4
652; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
653; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v3, v2
654; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0
655; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
656; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
657; GFX9-NODL-NEXT: s_endpgm
658;
659; GFX9-DL-LABEL: idot2_MixedExt:
660; GFX9-DL: ; %bb.0: ; %entry
661; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
662; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
663; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
664; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
665; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
666; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0
667; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
668; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
669; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
670; GFX9-DL-NEXT: s_sext_i32_i16 s0, s2
671; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16
672; GFX9-DL-NEXT: s_and_b32 s1, s3, 0xffff
673; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16
674; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4
675; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
676; GFX9-DL-NEXT: v_mad_i32_i24 v2, s3, v3, v2
677; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0
678; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
679; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
680; GFX9-DL-NEXT: s_endpgm
Farhana Aleen3528c802018-08-21 16:21:15 +0000681 <2 x i16> addrspace(1)* %src2,
682 i32 addrspace(1)* nocapture %dst) {
683entry:
684 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
685 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
686
  ; Lane 0: mixed extension -- %vec1 element is sext, %vec2 element is zext.
687 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
688 %conv = sext i16 %s1.elt1 to i32
689 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
690 %conv2 = zext i16 %s2.elt1 to i32
691 %mul1 = mul nuw i32 %conv2, %conv
692
  ; Lane 1: both elements sign-extended.
693 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
694 %conv3 = sext i16 %s1.elt2 to i32
695 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
696 %conv4 = sext i16 %s2.elt2 to i32
697 %mul2 = mul nuw i32 %conv4, %conv3
698
  ; Accumulate: (mul2 + *dst) + mul1, written back to *dst.
699 %s3 = load i32, i32 addrspace(1)* %dst, align 4
700 %add = add i32 %mul2, %s3
701 %add6 = add i32 %add, %mul1
702 store i32 %add6, i32 addrspace(1)* %dst, align 4
703 ret void
704}
705
; notudot2_SameVec: negative test where each product squares a single element.
; Lane 0 multiplies element 0 of %vec1 by itself and lane 1 multiplies
; element 1 of %vec2 by itself (each product takes both operands from the same
; vector). All extensions are zext and the multiplies carry no nuw flag. The
; products are accumulated into the i32 loaded from %dst as (mul2 + s3) + mul1.
; Expected codegen per the checks below: no dot2 instruction on any run line,
; including GFX9-DL; every target uses two v_mad_u32_u24 whose two multiplicand
; operands are the same SGPR.
706define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1,
Matt Arsenault6f35f0c2018-08-31 15:05:06 +0000707; GFX7-LABEL: notudot2_SameVec:
708; GFX7: ; %bb.0: ; %entry
709; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
710; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
711; GFX7-NEXT: s_mov_b32 s3, 0xf000
712; GFX7-NEXT: s_mov_b32 s2, -1
713; GFX7-NEXT: s_waitcnt lgkmcnt(0)
714; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
715; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
716; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
717; GFX7-NEXT: s_waitcnt lgkmcnt(0)
718; GFX7-NEXT: s_and_b32 s4, s4, 0xffff
719; GFX7-NEXT: s_lshr_b32 s5, s5, 16
720; GFX7-NEXT: v_mov_b32_e32 v0, s6
721; GFX7-NEXT: v_mad_u32_u24 v0, s5, s5, v0
722; GFX7-NEXT: v_mad_u32_u24 v0, s4, s4, v0
723; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
724; GFX7-NEXT: s_endpgm
725;
726; GFX8-LABEL: notudot2_SameVec:
727; GFX8: ; %bb.0: ; %entry
728; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
729; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
730; GFX8-NEXT: s_waitcnt lgkmcnt(0)
731; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
732; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
733; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0
734; GFX8-NEXT: v_mov_b32_e32 v0, s0
735; GFX8-NEXT: v_mov_b32_e32 v1, s1
736; GFX8-NEXT: s_waitcnt lgkmcnt(0)
737; GFX8-NEXT: s_and_b32 s0, s2, 0xffff
738; GFX8-NEXT: s_lshr_b32 s1, s3, 16
739; GFX8-NEXT: v_mov_b32_e32 v2, s4
740; GFX8-NEXT: v_mad_u32_u24 v2, s1, s1, v2
741; GFX8-NEXT: v_mad_u32_u24 v2, s0, s0, v2
742; GFX8-NEXT: flat_store_dword v[0:1], v2
743; GFX8-NEXT: s_endpgm
744;
745; GFX9-NODL-LABEL: notudot2_SameVec:
746; GFX9-NODL: ; %bb.0: ; %entry
747; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
748; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
749; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
750; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
751; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
752; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0
753; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
754; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
755; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
756; GFX9-NODL-NEXT: s_and_b32 s0, s2, 0xffff
757; GFX9-NODL-NEXT: s_lshr_b32 s1, s3, 16
758; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4
759; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, s1, v2
760; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, s0, v2
761; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
762; GFX9-NODL-NEXT: s_endpgm
763;
764; GFX9-DL-LABEL: notudot2_SameVec:
765; GFX9-DL: ; %bb.0: ; %entry
766; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
767; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
768; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
769; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
770; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
771; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0
772; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
773; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
774; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
775; GFX9-DL-NEXT: s_and_b32 s0, s2, 0xffff
776; GFX9-DL-NEXT: s_lshr_b32 s1, s3, 16
777; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4
778; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, s1, v2
779; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, s0, v2
780; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
781; GFX9-DL-NEXT: s_endpgm
Farhana Aleen3528c802018-08-21 16:21:15 +0000782 <2 x i16> addrspace(1)* %src2,
783 i32 addrspace(1)* nocapture %dst) {
784entry:
785 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
786 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
787
  ; Lane 0: both operands are element 0 of %vec1 (deliberately the same vector).
788 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
789 %conv = zext i16 %s1.elt1 to i32
790 %s2.elt1 = extractelement <2 x i16> %vec1, i64 0
791 %conv2 = zext i16 %s2.elt1 to i32
792 %mul1 = mul i32 %conv2, %conv
793
  ; Lane 1: both operands are element 1 of %vec2 (deliberately the same vector).
794 %s1.elt2 = extractelement <2 x i16> %vec2, i64 1
795 %conv3 = zext i16 %s1.elt2 to i32
796 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
797 %conv4 = zext i16 %s2.elt2 to i32
798 %mul2 = mul i32 %conv4, %conv3
799
  ; Accumulate: (mul2 + *dst) + mul1, written back to *dst.
800 %s3 = load i32, i32 addrspace(1)* %dst, align 4
801 %add = add i32 %mul2, %s3
802 %add6 = add i32 %add, %mul1
803 store i32 %add6, i32 addrspace(1)* %dst, align 4
804 ret void
805}
806
; Positive dot2 test on <4 x i16> inputs. Lanes 0 and 1 of %src1 and %src2 are
; zero-extended to i32, multiplied lane-wise, and both products are added to
; the i32 loaded from %dst, which is then overwritten with the sum.
; The checks for the gfx906 run (GFX9-DL prefix) expect the whole chain to be
; selected as a single v_dot2_u32_u16; the checks for the other three runs
; expect two chained v_mad_u32_u24 instructions instead.
define amdgpu_kernel void @udot2_v4i16(<4 x i16> addrspace(1)* %src1,
; GFX7-LABEL: udot2_v4i16:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s8, 0xffff
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_and_b32 s7, s4, s8
; GFX7-NEXT: s_lshr_b32 s4, s4, 16
; GFX7-NEXT: s_and_b32 s8, s5, s8
; GFX7-NEXT: s_lshr_b32 s5, s5, 16
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s6
; GFX7-NEXT: v_mad_u32_u24 v0, s5, v0, v1
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: udot2_v4i16:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_and_b32 s6, s3, s2
; GFX8-NEXT: s_lshr_b32 s3, s3, 16
; GFX8-NEXT: s_and_b32 s2, s4, s2
; GFX8-NEXT: s_lshr_b32 s4, s4, 16
; GFX8-NEXT: v_mov_b32_e32 v0, s5
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v1, s6
; GFX8-NEXT: v_mad_u32_u24 v2, s2, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: udot2_v4i16:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2
; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16
; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot2_v4i16:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s3, v2, v3
; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
                                       <4 x i16> addrspace(1)* %src2,
                                       i32 addrspace(1)* nocapture %dst) {
entry:
  %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %src1
  %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %src2

  ; Product of lane 0 from each vector.
  %s1.elt1 = extractelement <4 x i16> %vec1, i64 0
  %conv = zext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <4 x i16> %vec2, i64 0
  %conv2 = zext i16 %s2.elt1 to i32
  %mul1 = mul i32 %conv2, %conv

  ; Product of lane 1 from each vector.
  %s1.elt2 = extractelement <4 x i16> %vec1, i64 1
  %conv3 = zext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <4 x i16> %vec2, i64 1
  %conv4 = zext i16 %s2.elt2 to i32
  %mul2 = mul i32 %conv4, %conv3

  ; Accumulate both products onto the value currently stored at %dst.
  %s3 = load i32, i32 addrspace(1)* %dst, align 4
  %add = add i32 %mul2, %s3
  %add6 = add i32 %add, %mul1
  store i32 %add6, i32 addrspace(1)* %dst, align 4
  ret void
}
920
; Positive dot2 test using the upper half of <4 x i16> inputs. Lanes 2 and 3
; of %src1 and %src2 are zero-extended to i32, multiplied lane-wise, and both
; products are added to the i32 loaded from %dst.
; In every run the expected vector loads use a non-zero offset (0x1 for the
; gfx700 run, 0x4 for the others). The checks for the gfx906 run (GFX9-DL
; prefix) expect a single v_dot2_u32_u16; the other runs expect two chained
; v_mad_u32_u24 instructions.
define amdgpu_kernel void @udot2_v4i16_Hi(<4 x i16> addrspace(1)* %src1,
; GFX7-LABEL: udot2_v4i16_Hi:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s8, 0xffff
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x1
; GFX7-NEXT: s_load_dword s5, s[6:7], 0x1
; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_and_b32 s7, s4, s8
; GFX7-NEXT: s_lshr_b32 s4, s4, 16
; GFX7-NEXT: s_and_b32 s8, s5, s8
; GFX7-NEXT: s_lshr_b32 s5, s5, 16
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s6
; GFX7-NEXT: v_mad_u32_u24 v0, s5, v0, v1
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: udot2_v4i16_Hi:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s3, s[4:5], 0x4
; GFX8-NEXT: s_load_dword s4, s[6:7], 0x4
; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_and_b32 s6, s3, s2
; GFX8-NEXT: s_lshr_b32 s3, s3, 16
; GFX8-NEXT: s_and_b32 s2, s4, s2
; GFX8-NEXT: s_lshr_b32 s4, s4, 16
; GFX8-NEXT: v_mov_b32_e32 v0, s5
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v1, s6
; GFX8-NEXT: v_mad_u32_u24 v2, s2, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: udot2_v4i16_Hi:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x4
; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x4
; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2
; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16
; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot2_v4i16_Hi:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x4
; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x4
; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s3, v2, v3
; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
                                          <4 x i16> addrspace(1)* %src2,
                                          i32 addrspace(1)* nocapture %dst) {
entry:
  %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %src1
  %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %src2

  ; Product of lane 2 from each vector.
  %s1.elt1 = extractelement <4 x i16> %vec1, i64 2
  %conv = zext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <4 x i16> %vec2, i64 2
  %conv2 = zext i16 %s2.elt1 to i32
  %mul1 = mul i32 %conv2, %conv

  ; Product of lane 3 from each vector.
  %s1.elt2 = extractelement <4 x i16> %vec1, i64 3
  %conv3 = zext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <4 x i16> %vec2, i64 3
  %conv4 = zext i16 %s2.elt2 to i32
  %mul2 = mul i32 %conv4, %conv3

  ; Accumulate both products onto the value currently stored at %dst.
  %s3 = load i32, i32 addrspace(1)* %dst, align 4
  %add = add i32 %mul2, %s3
  %add6 = add i32 %add, %mul1
  store i32 %add6, i32 addrspace(1)* %dst, align 4
  ret void
}
1034
; Negative dot2 test: the multiplied lanes are indices 0 and 2 of the
; <4 x i16> inputs, i.e. they are not adjacent. The zero-extended products are
; added to the i32 loaded from %dst as in the positive tests.
; The checks for all four runs, including the gfx906 run (GFX9-DL prefix),
; expect two chained v_mad_u32_u24 instructions and no v_dot2_u32_u16.
define amdgpu_kernel void @notudot2_v4i16_Even(<4 x i16> addrspace(1)* %src1,
; GFX7-LABEL: notudot2_v4i16_Even:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s8, 0xffff
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[0:1], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_and_b32 s5, s5, s8
; GFX7-NEXT: s_and_b32 s4, s4, s8
; GFX7-NEXT: s_and_b32 s7, s7, s8
; GFX7-NEXT: v_mov_b32_e32 v0, s5
; GFX7-NEXT: v_mov_b32_e32 v1, s9
; GFX7-NEXT: v_mad_u32_u24 v0, s7, v0, v1
; GFX7-NEXT: s_and_b32 s6, s6, s8
; GFX7-NEXT: v_mov_b32_e32 v1, s4
; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: notudot2_v4i16_Even:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_mov_b32 s8, 0xffff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX8-NEXT: s_load_dword s6, s[0:1], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_and_b32 s3, s3, s8
; GFX8-NEXT: s_and_b32 s2, s2, s8
; GFX8-NEXT: s_and_b32 s5, s5, s8
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mad_u32_u24 v0, s5, v1, v0
; GFX8-NEXT: s_and_b32 s4, s4, s8
; GFX8-NEXT: v_mov_b32_e32 v1, s2
; GFX8-NEXT: v_mad_u32_u24 v2, s4, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: notudot2_v4i16_Even:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX9-NODL-NEXT: s_load_dword s6, s[0:1], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_and_b32 s3, s3, s8
; GFX9-NODL-NEXT: s_and_b32 s2, s2, s8
; GFX9-NODL-NEXT: s_and_b32 s5, s5, s8
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s5, v1, v0
; GFX9-NODL-NEXT: s_and_b32 s4, s4, s8
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v1, v0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: notudot2_v4i16_Even:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_and_b32 s3, s3, s8
; GFX9-DL-NEXT: s_and_b32 s2, s2, s8
; GFX9-DL-NEXT: s_and_b32 s5, s5, s8
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
; GFX9-DL-NEXT: v_mad_u32_u24 v0, s5, v1, v0
; GFX9-DL-NEXT: s_and_b32 s4, s4, s8
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v1, v0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
                                               <4 x i16> addrspace(1)* %src2,
                                               i32 addrspace(1)* nocapture %dst) {
entry:
  %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %src1
  %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %src2

  ; Product of lane 0 from each vector.
  %s1.elt1 = extractelement <4 x i16> %vec1, i64 0
  %conv = zext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <4 x i16> %vec2, i64 0
  %conv2 = zext i16 %s2.elt1 to i32
  %mul1 = mul i32 %conv2, %conv

  ; Product of lane 2 from each vector (skips lane 1).
  %s1.elt2 = extractelement <4 x i16> %vec1, i64 2
  %conv3 = zext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <4 x i16> %vec2, i64 2
  %conv4 = zext i16 %s2.elt2 to i32
  %mul2 = mul i32 %conv4, %conv3

  ; Accumulate both products onto the value currently stored at %dst.
  %s3 = load i32, i32 addrspace(1)* %dst, align 4
  %add = add i32 %mul2, %s3
  %add6 = add i32 %add, %mul1
  store i32 %add6, i32 addrspace(1)* %dst, align 4
  ret void
}
1155
; Negative dot2 test: the multiplied lanes are indices 1 and 2 of the
; <4 x i16> inputs. The zero-extended products are added to the i32 loaded
; from %dst as in the positive tests.
; The checks for all four runs, including the gfx906 run (GFX9-DL prefix),
; expect two chained v_mad_u32_u24 instructions and no v_dot2_u32_u16.
define amdgpu_kernel void @notudot2_v4i16_Middle(<4 x i16> addrspace(1)* %src1,
; GFX7-LABEL: notudot2_v4i16_Middle:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s8, 0xffff
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[0:1], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_and_b32 s5, s5, s8
; GFX7-NEXT: s_lshr_b32 s4, s4, 16
; GFX7-NEXT: s_and_b32 s7, s7, s8
; GFX7-NEXT: v_mov_b32_e32 v0, s5
; GFX7-NEXT: v_mov_b32_e32 v1, s9
; GFX7-NEXT: v_mad_u32_u24 v0, s7, v0, v1
; GFX7-NEXT: s_lshr_b32 s6, s6, 16
; GFX7-NEXT: v_mov_b32_e32 v1, s4
; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: notudot2_v4i16_Middle:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_mov_b32 s8, 0xffff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX8-NEXT: s_load_dword s6, s[0:1], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_and_b32 s3, s3, s8
; GFX8-NEXT: s_lshr_b32 s2, s2, 16
; GFX8-NEXT: s_and_b32 s5, s5, s8
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mad_u32_u24 v0, s5, v1, v0
; GFX8-NEXT: s_lshr_b32 s4, s4, 16
; GFX8-NEXT: v_mov_b32_e32 v1, s2
; GFX8-NEXT: v_mad_u32_u24 v2, s4, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: notudot2_v4i16_Middle:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_mov_b32 s8, 0xffff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX9-NODL-NEXT: s_load_dword s6, s[0:1], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_and_b32 s3, s3, s8
; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16
; GFX9-NODL-NEXT: s_and_b32 s5, s5, s8
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s5, v1, v0
; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v1, v0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: notudot2_v4i16_Middle:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_mov_b32 s8, 0xffff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_and_b32 s3, s3, s8
; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 16
; GFX9-DL-NEXT: s_and_b32 s5, s5, s8
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
; GFX9-DL-NEXT: v_mad_u32_u24 v0, s5, v1, v0
; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v1, v0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
                                                 <4 x i16> addrspace(1)* %src2,
                                                 i32 addrspace(1)* nocapture %dst) {
entry:
  %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %src1
  %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %src2

  ; Product of lane 1 from each vector.
  %s1.elt1 = extractelement <4 x i16> %vec1, i64 1
  %conv = zext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <4 x i16> %vec2, i64 1
  %conv2 = zext i16 %s2.elt1 to i32
  %mul1 = mul i32 %conv2, %conv

  ; Product of lane 2 from each vector.
  %s1.elt2 = extractelement <4 x i16> %vec1, i64 2
  %conv3 = zext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <4 x i16> %vec2, i64 2
  %conv4 = zext i16 %s2.elt2 to i32
  %mul2 = mul i32 %conv4, %conv3

  ; Accumulate both products onto the value currently stored at %dst.
  %s3 = load i32, i32 addrspace(1)* %dst, align 4
  %add = add i32 %mul2, %s3
  %add6 = add i32 %add, %mul1
  store i32 %add6, i32 addrspace(1)* %dst, align 4
  ret void
}
1276
; Negative dot2 test: each product pairs different lane indices of the two
; <2 x i16> inputs (lane 0 of %src1 with lane 1 of %src2, and lane 1 of %src1
; with lane 0 of %src2). The zero-extended products are added to the i32
; loaded from %dst.
; The checks for all four runs, including the gfx906 run (GFX9-DL prefix),
; expect two chained v_mad_u32_u24 instructions and no v_dot2_u32_u16.
define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1,
; GFX7-LABEL: notudot2_DiffIndex:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s8, 0xffff
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshr_b32 s7, s4, 16
; GFX7-NEXT: s_lshr_b32 s9, s5, 16
; GFX7-NEXT: s_and_b32 s4, s4, s8
; GFX7-NEXT: s_and_b32 s5, s5, s8
; GFX7-NEXT: v_mov_b32_e32 v0, s7
; GFX7-NEXT: v_mov_b32_e32 v1, s6
; GFX7-NEXT: v_mad_u32_u24 v0, s5, v0, v1
; GFX7-NEXT: v_mov_b32_e32 v1, s4
; GFX7-NEXT: v_mad_u32_u24 v0, s9, v1, v0
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: notudot2_DiffIndex:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_and_b32 s6, s3, s2
; GFX8-NEXT: s_lshr_b32 s3, s3, 16
; GFX8-NEXT: s_and_b32 s2, s4, s2
; GFX8-NEXT: v_mov_b32_e32 v0, s5
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mad_u32_u24 v0, s2, v1, v0
; GFX8-NEXT: s_lshr_b32 s7, s4, 16
; GFX8-NEXT: v_mov_b32_e32 v1, s6
; GFX8-NEXT: v_mad_u32_u24 v2, s7, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: notudot2_DiffIndex:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2
; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16
; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s2, v1, v0
; GFX9-NODL-NEXT: s_lshr_b32 s7, s4, 16
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v1, v0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: notudot2_DiffIndex:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_and_b32 s6, s3, s2
; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16
; GFX9-DL-NEXT: s_and_b32 s2, s4, s2
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
; GFX9-DL-NEXT: v_mad_u32_u24 v0, s2, v1, v0
; GFX9-DL-NEXT: s_lshr_b32 s7, s4, 16
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v1, v0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
                                              <2 x i16> addrspace(1)* %src2,
                                              i32 addrspace(1)* nocapture %dst) {
entry:
  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2

  ; Lane 0 of %vec1 times lane 1 of %vec2 (indices deliberately differ).
  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
  %conv = zext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <2 x i16> %vec2, i64 1
  %conv2 = zext i16 %s2.elt1 to i32
  %mul1 = mul i32 %conv2, %conv

  ; Lane 1 of %vec1 times lane 0 of %vec2.
  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
  %conv3 = zext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <2 x i16> %vec2, i64 0
  %conv4 = zext i16 %s2.elt2 to i32
  %mul2 = mul i32 %conv4, %conv3

  ; Accumulate both products onto the value currently stored at %dst.
  %s3 = load i32, i32 addrspace(1)* %dst, align 4
  %add = add i32 %mul2, %s3
  %add6 = add i32 %add, %mul1
  store i32 %add6, i32 addrspace(1)* %dst, align 4
  ret void
}
1397
; Multiple-use test for the inner add of the dot2 pattern. Matching lanes 0
; and 1 of the two <2 x i16> inputs are zero-extended and multiplied as in the
; basic positive test, but the intermediate sum %add1 (%mul2 + value at %dst)
; feeds both %add2 and the final %res = %add2 + %add1.
; The checks for all four runs, including the gfx906 run (GFX9-DL prefix),
; expect two chained v_mad_u32_u24 instructions followed by a vector add, and
; no v_dot2_u32_u16.
define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1,
; GFX7-LABEL: udot2_MultipleUses_add1:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s8, 0xffff
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshr_b32 s7, s4, 16
; GFX7-NEXT: s_lshr_b32 s9, s5, 16
; GFX7-NEXT: s_and_b32 s4, s4, s8
; GFX7-NEXT: v_mov_b32_e32 v0, s7
; GFX7-NEXT: v_mov_b32_e32 v1, s6
; GFX7-NEXT: v_mad_u32_u24 v0, s9, v0, v1
; GFX7-NEXT: s_and_b32 s5, s5, s8
; GFX7-NEXT: v_mov_b32_e32 v1, s4
; GFX7-NEXT: v_mad_u32_u24 v1, s5, v1, v0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: udot2_MultipleUses_add1:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_and_b32 s6, s3, s2
; GFX8-NEXT: s_lshr_b32 s3, s3, 16
; GFX8-NEXT: s_and_b32 s2, s4, s2
; GFX8-NEXT: s_lshr_b32 s4, s4, 16
; GFX8-NEXT: v_mov_b32_e32 v0, s5
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v1, s6
; GFX8-NEXT: v_mad_u32_u24 v1, s2, v1, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: udot2_MultipleUses_add1:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2
; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16
; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s2, v1, v0
; GFX9-NODL-NEXT: v_add_u32_e32 v2, v1, v0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot2_MultipleUses_add1:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_and_b32 s6, s3, s2
; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16
; GFX9-DL-NEXT: s_and_b32 s2, s4, s2
; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6
; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v1, v0
; GFX9-DL-NEXT: v_add_u32_e32 v2, v1, v0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
                                                   <2 x i16> addrspace(1)* %src2,
                                                   i32 addrspace(1)* nocapture %dst) {
entry:
  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2

  ; Product of lane 0 from each vector.
  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
  %conv = zext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
  %conv2 = zext i16 %s2.elt1 to i32
  %mul1 = mul i32 %conv2, %conv

  ; Product of lane 1 from each vector.
  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
  %conv3 = zext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
  %conv4 = zext i16 %s2.elt2 to i32
  %mul2 = mul i32 %conv4, %conv3

  ; %add1 is the inner add of the dot2 pattern; it is used twice below.
  %s3 = load i32, i32 addrspace(1)* %dst, align 4
  %add1 = add i32 %mul2, %s3
  %add2 = add i32 %add1, %mul1

  ; Second use of %add1, outside the multiply-accumulate chain.
  %res = add i32 %add2, %add1
  store i32 %res, i32 addrspace(1)* %dst, align 4
  ret void
}
1524
; Signed 2 x i16 multiply-accumulate in which the first partial sum (%add1) is
; used twice: once to build %add2 and once more in the final %res.
; The expected output for every run line in this function (gfx700, gfx803,
; gfx900 and gfx906) is two v_mad_i32_i24 followed by a vector add; none of the
; expected sequences contains a v_dot2 instruction.
1525define amdgpu_kernel void @idot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1,
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001526; GFX7-LABEL: idot2_MultipleUses_add1:
1527; GFX7: ; %bb.0: ; %entry
1528; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1529; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1530; GFX7-NEXT: s_mov_b32 s3, 0xf000
1531; GFX7-NEXT: s_mov_b32 s2, -1
1532; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1533; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
1534; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
1535; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
1536; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1537; GFX7-NEXT: s_sext_i32_i16 s7, s4
1538; GFX7-NEXT: s_ashr_i32 s4, s4, 16
1539; GFX7-NEXT: s_sext_i32_i16 s8, s5
1540; GFX7-NEXT: s_ashr_i32 s5, s5, 16
1541; GFX7-NEXT: v_mov_b32_e32 v0, s4
1542; GFX7-NEXT: v_mov_b32_e32 v1, s6
1543; GFX7-NEXT: v_mad_i32_i24 v0, s5, v0, v1
1544; GFX7-NEXT: v_mov_b32_e32 v1, s7
1545; GFX7-NEXT: v_mad_i32_i24 v1, s8, v1, v0
1546; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
1547; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1548; GFX7-NEXT: s_endpgm
1549;
1550; GFX8-LABEL: idot2_MultipleUses_add1:
1551; GFX8: ; %bb.0: ; %entry
1552; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1553; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1554; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1555; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
1556; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
1557; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0
1558; GFX8-NEXT: v_mov_b32_e32 v0, s0
1559; GFX8-NEXT: v_mov_b32_e32 v1, s1
1560; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1561; GFX8-NEXT: s_sext_i32_i16 s0, s2
1562; GFX8-NEXT: s_ashr_i32 s2, s2, 16
1563; GFX8-NEXT: s_sext_i32_i16 s1, s3
1564; GFX8-NEXT: s_ashr_i32 s3, s3, 16
1565; GFX8-NEXT: v_mov_b32_e32 v2, s4
1566; GFX8-NEXT: v_mov_b32_e32 v3, s2
1567; GFX8-NEXT: v_mad_i32_i24 v2, s3, v3, v2
1568; GFX8-NEXT: v_mov_b32_e32 v3, s0
1569; GFX8-NEXT: v_mad_i32_i24 v3, s1, v3, v2
1570; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
1571; GFX8-NEXT: flat_store_dword v[0:1], v2
1572; GFX8-NEXT: s_endpgm
1573;
1574; GFX9-NODL-LABEL: idot2_MultipleUses_add1:
1575; GFX9-NODL: ; %bb.0: ; %entry
1576; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1577; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1578; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1579; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
1580; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
1581; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0
1582; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
1583; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
1584; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1585; GFX9-NODL-NEXT: s_sext_i32_i16 s0, s2
1586; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16
1587; GFX9-NODL-NEXT: s_sext_i32_i16 s1, s3
1588; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16
1589; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4
1590; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
1591; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v3, v2
1592; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0
1593; GFX9-NODL-NEXT: v_mad_i32_i24 v3, s1, v3, v2
1594; GFX9-NODL-NEXT: v_add_u32_e32 v2, v3, v2
1595; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
1596; GFX9-NODL-NEXT: s_endpgm
1597;
1598; GFX9-DL-LABEL: idot2_MultipleUses_add1:
1599; GFX9-DL: ; %bb.0: ; %entry
1600; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1601; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1602; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1603; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
1604; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
1605; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0
1606; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
1607; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
1608; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1609; GFX9-DL-NEXT: s_sext_i32_i16 s0, s2
1610; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16
1611; GFX9-DL-NEXT: s_sext_i32_i16 s1, s3
1612; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16
1613; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4
1614; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
1615; GFX9-DL-NEXT: v_mad_i32_i24 v2, s3, v3, v2
1616; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0
1617; GFX9-DL-NEXT: v_mad_i32_i24 v3, s1, v3, v2
1618; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2
1619; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
1620; GFX9-DL-NEXT: s_endpgm
Farhana Aleen3528c802018-08-21 16:21:15 +00001621 <2 x i16> addrspace(1)* %src2,
1622 i32 addrspace(1)* nocapture %dst) {
1623entry:
1624 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
1625 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
1626
; Element 0 of each vector, sign-extended to i32 and multiplied.
1627 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
1628 %conv = sext i16 %s1.elt1 to i32
1629 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
1630 %conv2 = sext i16 %s2.elt1 to i32
1631 %mul1 = mul i32 %conv2, %conv
1632
; Element 1 of each vector, sign-extended to i32 and multiplied.
1633 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
1634 %conv3 = sext i16 %s1.elt2 to i32
1635 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
1636 %conv4 = sext i16 %s2.elt2 to i32
1637 %mul2 = mul i32 %conv4, %conv3
1638
; The accumulator is the current value stored at %dst.
1639 %s3 = load i32, i32 addrspace(1)* %dst, align 4
1640 %add1 = add i32 %mul2, %s3
1641 %add2 = add i32 %add1, %mul1
1642
; Second use of %add1 (the first is in %add2 above).
1643 %res = add i32 %add2, %add1
1644 store i32 %res, i32 addrspace(1)* %dst, align 4
1645 ret void
1646}
1647
; Unsigned 2 x i16 multiply-accumulate in which the element-0 product (%mul1)
; is used twice: it is added to the accumulator in %add0 and added again in
; %add2. The expected output for every run line in this function is a chain of
; three v_mad_u32_u24; none of the expected sequences contains a v_dot2
; instruction.
1648define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1,
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001649; GFX7-LABEL: udot2_MultipleUses_mul1:
1650; GFX7: ; %bb.0: ; %entry
1651; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1652; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1653; GFX7-NEXT: s_mov_b32 s8, 0xffff
1654; GFX7-NEXT: s_mov_b32 s3, 0xf000
1655; GFX7-NEXT: s_mov_b32 s2, -1
1656; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1657; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
1658; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
1659; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
1660; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1661; GFX7-NEXT: s_lshr_b32 s7, s4, 16
1662; GFX7-NEXT: s_and_b32 s4, s4, s8
1663; GFX7-NEXT: s_lshr_b32 s9, s5, 16
1664; GFX7-NEXT: s_and_b32 s5, s5, s8
1665; GFX7-NEXT: v_mov_b32_e32 v0, s4
1666; GFX7-NEXT: v_mov_b32_e32 v1, s6
1667; GFX7-NEXT: v_mad_u32_u24 v1, s5, v0, v1
1668; GFX7-NEXT: v_mov_b32_e32 v2, s7
1669; GFX7-NEXT: v_mad_u32_u24 v1, s9, v2, v1
1670; GFX7-NEXT: v_mad_u32_u24 v0, s5, v0, v1
1671; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1672; GFX7-NEXT: s_endpgm
1673;
1674; GFX8-LABEL: udot2_MultipleUses_mul1:
1675; GFX8: ; %bb.0: ; %entry
1676; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1677; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1678; GFX8-NEXT: s_mov_b32 s2, 0xffff
1679; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1680; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0
1681; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0
1682; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0
1683; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1684; GFX8-NEXT: s_and_b32 s6, s3, s2
1685; GFX8-NEXT: s_and_b32 s2, s4, s2
1686; GFX8-NEXT: s_lshr_b32 s3, s3, 16
1687; GFX8-NEXT: v_mov_b32_e32 v0, s5
1688; GFX8-NEXT: v_mov_b32_e32 v1, s6
1689; GFX8-NEXT: s_lshr_b32 s4, s4, 16
1690; GFX8-NEXT: v_mad_u32_u24 v0, s2, v1, v0
1691; GFX8-NEXT: v_mov_b32_e32 v2, s3
1692; GFX8-NEXT: v_mad_u32_u24 v0, s4, v2, v0
1693; GFX8-NEXT: v_mad_u32_u24 v2, s2, v1, v0
1694; GFX8-NEXT: v_mov_b32_e32 v0, s0
1695; GFX8-NEXT: v_mov_b32_e32 v1, s1
1696; GFX8-NEXT: flat_store_dword v[0:1], v2
1697; GFX8-NEXT: s_endpgm
1698;
1699; GFX9-NODL-LABEL: udot2_MultipleUses_mul1:
1700; GFX9-NODL: ; %bb.0: ; %entry
1701; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1702; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1703; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
1704; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1705; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
1706; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
1707; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0
1708; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1709; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2
1710; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
1711; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16
1712; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5
1713; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6
1714; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16
1715; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s2, v1, v0
1716; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3
1717; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v2, v0
1718; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0
1719; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
1720; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
1721; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
1722; GFX9-NODL-NEXT: s_endpgm
1723;
1724; GFX9-DL-LABEL: udot2_MultipleUses_mul1:
1725; GFX9-DL: ; %bb.0: ; %entry
1726; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1727; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1728; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff
1729; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1730; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0
1731; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
1732; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0
1733; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1734; GFX9-DL-NEXT: s_and_b32 s6, s3, s2
1735; GFX9-DL-NEXT: s_and_b32 s2, s4, s2
1736; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16
1737; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5
1738; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6
1739; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16
1740; GFX9-DL-NEXT: v_mad_u32_u24 v0, s2, v1, v0
1741; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
1742; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v2, v0
1743; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v1, v0
1744; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
1745; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
1746; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
1747; GFX9-DL-NEXT: s_endpgm
Farhana Aleen3528c802018-08-21 16:21:15 +00001748 <2 x i16> addrspace(1)* %src2,
1749 i32 addrspace(1)* nocapture %dst) {
1750entry:
1751 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
1752 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
1753
; Element 0 of each vector, zero-extended to i32 and multiplied.
1754 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
1755 %conv = zext i16 %s1.elt1 to i32
1756 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
1757 %conv2 = zext i16 %s2.elt1 to i32
1758 %mul1 = mul i32 %conv2, %conv
1759
; Element 1 of each vector, zero-extended to i32 and multiplied.
1760 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
1761 %conv3 = zext i16 %s1.elt2 to i32
1762 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
1763 %conv4 = zext i16 %s2.elt2 to i32
1764 %mul2 = mul i32 %conv4, %conv3
1765
; First use of %mul1: added to the value currently stored at %dst.
1766 %s3 = load i32, i32 addrspace(1)* %dst, align 4
1767 %add0 = add i32 %mul1, %s3
1768
; Second use of %mul1 is in %add2.
1769 %add1 = add i32 %mul2, %add0
1770 %add2 = add i32 %add1, %mul1
1771
1772 store i32 %add2, i32 addrspace(1)* %dst, align 4
1773 ret void
1774}
1775
; Signed counterpart of the multiple-uses-of-%mul1 case: the element-0 product
; is added to the accumulator in %add0 and added again in %add2. The expected
; output for every run line in this function is a chain of three
; v_mad_i32_i24; none of the expected sequences contains a v_dot2 instruction.
1776define amdgpu_kernel void @idot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1,
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001777; GFX7-LABEL: idot2_MultipleUses_mul1:
1778; GFX7: ; %bb.0: ; %entry
1779; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1780; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1781; GFX7-NEXT: s_mov_b32 s3, 0xf000
1782; GFX7-NEXT: s_mov_b32 s2, -1
1783; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1784; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
1785; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
1786; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
1787; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1788; GFX7-NEXT: s_sext_i32_i16 s7, s4
1789; GFX7-NEXT: s_sext_i32_i16 s8, s5
1790; GFX7-NEXT: s_ashr_i32 s4, s4, 16
1791; GFX7-NEXT: v_mov_b32_e32 v0, s7
1792; GFX7-NEXT: v_mov_b32_e32 v1, s6
1793; GFX7-NEXT: s_ashr_i32 s5, s5, 16
1794; GFX7-NEXT: v_mad_i32_i24 v1, s8, v0, v1
1795; GFX7-NEXT: v_mov_b32_e32 v2, s4
1796; GFX7-NEXT: v_mad_i32_i24 v1, s5, v2, v1
1797; GFX7-NEXT: v_mad_i32_i24 v0, s8, v0, v1
1798; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1799; GFX7-NEXT: s_endpgm
1800;
1801; GFX8-LABEL: idot2_MultipleUses_mul1:
1802; GFX8: ; %bb.0: ; %entry
1803; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1804; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1805; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1806; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
1807; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
1808; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0
1809; GFX8-NEXT: v_mov_b32_e32 v0, s0
1810; GFX8-NEXT: v_mov_b32_e32 v1, s1
1811; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1812; GFX8-NEXT: s_sext_i32_i16 s0, s2
1813; GFX8-NEXT: s_sext_i32_i16 s1, s3
1814; GFX8-NEXT: s_ashr_i32 s2, s2, 16
1815; GFX8-NEXT: v_mov_b32_e32 v2, s4
1816; GFX8-NEXT: v_mov_b32_e32 v3, s0
1817; GFX8-NEXT: s_ashr_i32 s3, s3, 16
1818; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2
1819; GFX8-NEXT: v_mov_b32_e32 v4, s2
1820; GFX8-NEXT: v_mad_i32_i24 v2, s3, v4, v2
1821; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2
1822; GFX8-NEXT: flat_store_dword v[0:1], v2
1823; GFX8-NEXT: s_endpgm
1824;
1825; GFX9-NODL-LABEL: idot2_MultipleUses_mul1:
1826; GFX9-NODL: ; %bb.0: ; %entry
1827; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1828; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1829; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1830; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
1831; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
1832; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0
1833; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
1834; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
1835; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1836; GFX9-NODL-NEXT: s_sext_i32_i16 s0, s2
1837; GFX9-NODL-NEXT: s_sext_i32_i16 s1, s3
1838; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16
1839; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4
1840; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0
1841; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16
1842; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
1843; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s2
1844; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v4, v2
1845; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
1846; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
1847; GFX9-NODL-NEXT: s_endpgm
1848;
1849; GFX9-DL-LABEL: idot2_MultipleUses_mul1:
1850; GFX9-DL: ; %bb.0: ; %entry
1851; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1852; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1853; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1854; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
1855; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
1856; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0
1857; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
1858; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
1859; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1860; GFX9-DL-NEXT: s_sext_i32_i16 s0, s2
1861; GFX9-DL-NEXT: s_sext_i32_i16 s1, s3
1862; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16
1863; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4
1864; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0
1865; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16
1866; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
1867; GFX9-DL-NEXT: v_mov_b32_e32 v4, s2
1868; GFX9-DL-NEXT: v_mad_i32_i24 v2, s3, v4, v2
1869; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
1870; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
1871; GFX9-DL-NEXT: s_endpgm
Farhana Aleen3528c802018-08-21 16:21:15 +00001872 <2 x i16> addrspace(1)* %src2,
1873 i32 addrspace(1)* nocapture %dst) {
1874entry:
1875 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
1876 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
1877
; Element 0 of each vector, sign-extended to i32 and multiplied.
1878 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
1879 %conv = sext i16 %s1.elt1 to i32
1880 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
1881 %conv2 = sext i16 %s2.elt1 to i32
1882 %mul1 = mul i32 %conv2, %conv
1883
; Element 1 of each vector, sign-extended to i32 and multiplied.
1884 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
1885 %conv3 = sext i16 %s1.elt2 to i32
1886 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
1887 %conv4 = sext i16 %s2.elt2 to i32
1888 %mul2 = mul i32 %conv4, %conv3
1889
; First use of %mul1: added to the value currently stored at %dst.
1890 %s3 = load i32, i32 addrspace(1)* %dst, align 4
1891 %add0 = add i32 %mul1, %s3
1892
; Second use of %mul1 is in %add2.
1893 %add1 = add i32 %mul2, %add0
1894 %add2 = add i32 %add1, %mul1
1895
1896 store i32 %add2, i32 addrspace(1)* %dst, align 4
1897 ret void
1898}
1899
; Unsigned 2 x i16 multiply-accumulate in which the element-1 product (%mul2)
; is used twice: it is added to the accumulator in %add0 and added again in
; %add1. The expected output for every run line in this function is a chain of
; three v_mad_u32_u24; none of the expected sequences contains a v_dot2
; instruction.
1900define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1,
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00001901; GFX7-LABEL: udot2_MultipleUses_mul2:
1902; GFX7: ; %bb.0: ; %entry
1903; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1904; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1905; GFX7-NEXT: s_mov_b32 s8, 0xffff
1906; GFX7-NEXT: s_mov_b32 s3, 0xf000
1907; GFX7-NEXT: s_mov_b32 s2, -1
1908; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1909; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
1910; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
1911; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
1912; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1913; GFX7-NEXT: s_lshr_b32 s7, s4, 16
1914; GFX7-NEXT: s_lshr_b32 s9, s5, 16
1915; GFX7-NEXT: v_mov_b32_e32 v0, s7
1916; GFX7-NEXT: v_mov_b32_e32 v1, s6
1917; GFX7-NEXT: v_mad_u32_u24 v1, s9, v0, v1
1918; GFX7-NEXT: s_and_b32 s4, s4, s8
1919; GFX7-NEXT: v_mad_u32_u24 v0, s9, v0, v1
1920; GFX7-NEXT: s_and_b32 s5, s5, s8
1921; GFX7-NEXT: v_mov_b32_e32 v1, s4
1922; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0
1923; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1924; GFX7-NEXT: s_endpgm
1925;
1926; GFX8-LABEL: udot2_MultipleUses_mul2:
1927; GFX8: ; %bb.0: ; %entry
1928; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1929; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1930; GFX8-NEXT: s_mov_b32 s2, 0xffff
1931; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1932; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0
1933; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0
1934; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0
1935; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1936; GFX8-NEXT: s_and_b32 s6, s3, s2
1937; GFX8-NEXT: s_lshr_b32 s3, s3, 16
1938; GFX8-NEXT: s_and_b32 s2, s4, s2
1939; GFX8-NEXT: s_lshr_b32 s4, s4, 16
1940; GFX8-NEXT: v_mov_b32_e32 v0, s5
1941; GFX8-NEXT: v_mov_b32_e32 v1, s3
1942; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0
1943; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0
1944; GFX8-NEXT: v_mov_b32_e32 v1, s6
1945; GFX8-NEXT: v_mad_u32_u24 v2, s2, v1, v0
1946; GFX8-NEXT: v_mov_b32_e32 v0, s0
1947; GFX8-NEXT: v_mov_b32_e32 v1, s1
1948; GFX8-NEXT: flat_store_dword v[0:1], v2
1949; GFX8-NEXT: s_endpgm
1950;
1951; GFX9-NODL-LABEL: udot2_MultipleUses_mul2:
1952; GFX9-NODL: ; %bb.0: ; %entry
1953; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1954; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1955; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
1956; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1957; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
1958; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
1959; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0
1960; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1961; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2
1962; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 16
1963; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
1964; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 16
1965; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s5
1966; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
1967; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
1968; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
1969; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6
1970; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v1, v0
1971; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
1972; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
1973; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
1974; GFX9-NODL-NEXT: s_endpgm
1975;
1976; GFX9-DL-LABEL: udot2_MultipleUses_mul2:
1977; GFX9-DL: ; %bb.0: ; %entry
1978; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1979; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1980; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff
1981; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1982; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0
1983; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
1984; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0
1985; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1986; GFX9-DL-NEXT: s_and_b32 s6, s3, s2
1987; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 16
1988; GFX9-DL-NEXT: s_and_b32 s2, s4, s2
1989; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 16
1990; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5
1991; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
1992; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
1993; GFX9-DL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
1994; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6
1995; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v1, v0
1996; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
1997; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
1998; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
1999; GFX9-DL-NEXT: s_endpgm
Farhana Aleen3528c802018-08-21 16:21:15 +00002000 <2 x i16> addrspace(1)* %src2,
2001 i32 addrspace(1)* nocapture %dst) {
2002entry:
2003 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
2004 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
2005
; Element 0 of each vector, zero-extended to i32 and multiplied.
2006 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
2007 %conv = zext i16 %s1.elt1 to i32
2008 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
2009 %conv2 = zext i16 %s2.elt1 to i32
2010 %mul1 = mul i32 %conv2, %conv
2011
; Element 1 of each vector, zero-extended to i32 and multiplied.
2012 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
2013 %conv3 = zext i16 %s1.elt2 to i32
2014 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
2015 %conv4 = zext i16 %s2.elt2 to i32
2016 %mul2 = mul i32 %conv4, %conv3
2017
; First use of %mul2: added to the value currently stored at %dst.
2018 %s3 = load i32, i32 addrspace(1)* %dst, align 4
2019 %add0 = add i32 %mul2, %s3
2020
; Second use of %mul2 is in %add1; %mul1 is added only once, in %add2.
2021 %add1 = add i32 %mul2, %add0
2022 %add2 = add i32 %add1, %mul1
2023
2024 store i32 %add2, i32 addrspace(1)* %dst, align 4
2025 ret void
2026}
2027
; Signed counterpart of the multiple-uses-of-%mul2 case: the element-1 product
; is added to the accumulator in %add0 and added again in %add1. The expected
; output for every run line in this function is a chain of three
; v_mad_i32_i24; none of the expected sequences contains a v_dot2 instruction.
2028define amdgpu_kernel void @idot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1,
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002029; GFX7-LABEL: idot2_MultipleUses_mul2:
2030; GFX7: ; %bb.0: ; %entry
2031; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2032; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
2033; GFX7-NEXT: s_mov_b32 s3, 0xf000
2034; GFX7-NEXT: s_mov_b32 s2, -1
2035; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2036; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
2037; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
2038; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
2039; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2040; GFX7-NEXT: s_sext_i32_i16 s7, s4
2041; GFX7-NEXT: s_ashr_i32 s4, s4, 16
2042; GFX7-NEXT: s_sext_i32_i16 s8, s5
2043; GFX7-NEXT: s_ashr_i32 s5, s5, 16
2044; GFX7-NEXT: v_mov_b32_e32 v0, s4
2045; GFX7-NEXT: v_mov_b32_e32 v1, s6
2046; GFX7-NEXT: v_mad_i32_i24 v1, s5, v0, v1
2047; GFX7-NEXT: v_mad_i32_i24 v0, s5, v0, v1
2048; GFX7-NEXT: v_mov_b32_e32 v1, s7
2049; GFX7-NEXT: v_mad_i32_i24 v0, s8, v1, v0
2050; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
2051; GFX7-NEXT: s_endpgm
2052;
2053; GFX8-LABEL: idot2_MultipleUses_mul2:
2054; GFX8: ; %bb.0: ; %entry
2055; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2056; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2057; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2058; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
2059; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
2060; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0
2061; GFX8-NEXT: v_mov_b32_e32 v0, s0
2062; GFX8-NEXT: v_mov_b32_e32 v1, s1
2063; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2064; GFX8-NEXT: s_sext_i32_i16 s0, s2
2065; GFX8-NEXT: s_ashr_i32 s2, s2, 16
2066; GFX8-NEXT: s_sext_i32_i16 s1, s3
2067; GFX8-NEXT: s_ashr_i32 s3, s3, 16
2068; GFX8-NEXT: v_mov_b32_e32 v2, s4
2069; GFX8-NEXT: v_mov_b32_e32 v3, s2
2070; GFX8-NEXT: v_mad_i32_i24 v2, s3, v3, v2
2071; GFX8-NEXT: v_mad_i32_i24 v2, s3, v3, v2
2072; GFX8-NEXT: v_mov_b32_e32 v3, s0
2073; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2
2074; GFX8-NEXT: flat_store_dword v[0:1], v2
2075; GFX8-NEXT: s_endpgm
2076;
2077; GFX9-NODL-LABEL: idot2_MultipleUses_mul2:
2078; GFX9-NODL: ; %bb.0: ; %entry
2079; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2080; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2081; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2082; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
2083; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
2084; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0
2085; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
2086; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
2087; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2088; GFX9-NODL-NEXT: s_sext_i32_i16 s0, s2
2089; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 16
2090; GFX9-NODL-NEXT: s_sext_i32_i16 s1, s3
2091; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 16
2092; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4
2093; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
2094; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v3, v2
2095; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v3, v2
2096; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0
2097; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
2098; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
2099; GFX9-NODL-NEXT: s_endpgm
2100;
2101; GFX9-DL-LABEL: idot2_MultipleUses_mul2:
2102; GFX9-DL: ; %bb.0: ; %entry
2103; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2104; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2105; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2106; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
2107; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
2108; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0
2109; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
2110; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
2111; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2112; GFX9-DL-NEXT: s_sext_i32_i16 s0, s2
2113; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 16
2114; GFX9-DL-NEXT: s_sext_i32_i16 s1, s3
2115; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 16
2116; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4
2117; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
2118; GFX9-DL-NEXT: v_mad_i32_i24 v2, s3, v3, v2
2119; GFX9-DL-NEXT: v_mad_i32_i24 v2, s3, v3, v2
2120; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0
2121; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
2122; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
2123; GFX9-DL-NEXT: s_endpgm
Farhana Aleen3528c802018-08-21 16:21:15 +00002124 <2 x i16> addrspace(1)* %src2,
2125 i32 addrspace(1)* nocapture %dst) {
2126entry:
2127 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
2128 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
2129
; Element 0 of each vector, sign-extended to i32 and multiplied.
2130 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
2131 %conv = sext i16 %s1.elt1 to i32
2132 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
2133 %conv2 = sext i16 %s2.elt1 to i32
2134 %mul1 = mul i32 %conv2, %conv
2135
; Element 1 of each vector, sign-extended to i32 and multiplied.
2136 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
2137 %conv3 = sext i16 %s1.elt2 to i32
2138 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
2139 %conv4 = sext i16 %s2.elt2 to i32
2140 %mul2 = mul i32 %conv4, %conv3
2141
; First use of %mul2: added to the value currently stored at %dst.
2142 %s3 = load i32, i32 addrspace(1)* %dst, align 4
2143 %add0 = add i32 %mul2, %s3
2144
; Second use of %mul2 is in %add1; %mul1 is added only once, in %add2.
2145 %add1 = add i32 %mul2, %add0
2146 %add2 = add i32 %add1, %mul1
2147
2148 store i32 %add2, i32 addrspace(1)* %dst, align 4
2149 ret void
2150}
2151
; 2 x i16 multiply-accumulate carried out entirely in i16: the products are
; i16 multiplies with no extension, and the accumulator is an i16 loaded from
; and stored back to %dst.
; The expected gfx906 output is a single v_dot2_u32_u16 whose result is written
; with global_store_short. The expected gfx700, gfx803 and gfx900 output is two
; v_mad_u32_u24 followed by a 16-bit store.
2152define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1,
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002153; GFX7-LABEL: udot2_acc16:
2154; GFX7: ; %bb.0: ; %entry
2155; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2156; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
2157; GFX7-NEXT: s_mov_b32 s3, 0xf000
2158; GFX7-NEXT: s_mov_b32 s2, -1
2159; GFX7-NEXT: s_mov_b32 s8, 0xffff
2160; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2161; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
2162; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0
2163; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
2164; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2165; GFX7-NEXT: s_lshr_b32 s6, s4, 16
2166; GFX7-NEXT: s_and_b32 s4, s4, s8
2167; GFX7-NEXT: s_lshr_b32 s7, s5, 16
2168; GFX7-NEXT: v_mov_b32_e32 v1, s7
2169; GFX7-NEXT: s_and_b32 s5, s5, s8
2170; GFX7-NEXT: s_waitcnt vmcnt(0)
2171; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0
2172; GFX7-NEXT: v_mov_b32_e32 v1, s5
2173; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0
2174; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
2175; GFX7-NEXT: s_endpgm
2176;
2177; GFX8-LABEL: udot2_acc16:
2178; GFX8: ; %bb.0: ; %entry
2179; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2180; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2181; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2182; GFX8-NEXT: v_mov_b32_e32 v0, s0
2183; GFX8-NEXT: v_mov_b32_e32 v1, s1
2184; GFX8-NEXT: flat_load_ushort v2, v[0:1]
2185; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
2186; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
2187; GFX8-NEXT: s_mov_b32 s0, 0xffff
2188; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2189; GFX8-NEXT: s_and_b32 s3, s1, s0
2190; GFX8-NEXT: s_and_b32 s0, s2, s0
2191; GFX8-NEXT: s_lshr_b32 s2, s2, 16
2192; GFX8-NEXT: s_lshr_b32 s1, s1, 16
2193; GFX8-NEXT: v_mov_b32_e32 v3, s2
2194; GFX8-NEXT: s_waitcnt vmcnt(0)
2195; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2
2196; GFX8-NEXT: v_mov_b32_e32 v3, s0
2197; GFX8-NEXT: v_mad_u32_u24 v2, s3, v3, v2
2198; GFX8-NEXT: flat_store_short v[0:1], v2
2199; GFX8-NEXT: s_endpgm
2200;
2201; GFX9-NODL-LABEL: udot2_acc16:
2202; GFX9-NODL: ; %bb.0: ; %entry
2203; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2204; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2205; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2206; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
2207; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
2208; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off
2209; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0
2210; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
2211; GFX9-NODL-NEXT: s_mov_b32 s0, 0xffff
2212; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2213; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0
2214; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0
2215; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16
2216; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16
2217; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
2218; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
2219; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
2220; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0
2221; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v3, v2
2222; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off
2223; GFX9-NODL-NEXT: s_endpgm
2224;
2225; GFX9-DL-LABEL: udot2_acc16:
2226; GFX9-DL: ; %bb.0: ; %entry
2227; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2228; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2229; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2230; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
2231; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
2232; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
2233; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
2234; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
2235; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2236; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
2237; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
2238; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s2, v3, v2
2239; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
2240; GFX9-DL-NEXT: s_endpgm
Farhana Aleen3528c802018-08-21 16:21:15 +00002241 <2 x i16> addrspace(1)* %src2,
2242 i16 addrspace(1)* nocapture %dst) {
2243entry:
2244 %v1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
2245 %v2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
2246
; Element-0 product, computed directly in i16.
2247 %v1e1 = extractelement <2 x i16> %v1, i64 0
2248 %v2e1 = extractelement <2 x i16> %v2, i64 0
2249 %mul1 = mul i16 %v1e1, %v2e1
2250
; Element-1 product, computed directly in i16.
2251 %v1e2 = extractelement <2 x i16> %v1, i64 1
2252 %v2e2 = extractelement <2 x i16> %v2, i64 1
2253 %mul2 = mul i16 %v1e2, %v2e2
2254
; 16-bit accumulator: loaded from %dst, both products added, stored back.
2255 %s2 = load i16, i16 addrspace(1)* %dst, align 2
2256 %add1 = add i16 %mul2, %s2
2257 %add2 = add i16 %add1, %mul1
2258 store i16 %add2, i16 addrspace(1)* %dst, align 2
2259 ret void
2260}
2261
Farhana Aleen3528c802018-08-21 16:21:15 +00002262define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1,
Matt Arsenault6f35f0c2018-08-31 15:05:06 +00002263; GFX7-LABEL: notsdot2_sext8:
2264; GFX7: ; %bb.0: ; %entry
2265; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2266; GFX7-NEXT: s_mov_b32 s3, 0xf000
2267; GFX7-NEXT: s_mov_b32 s2, -1
2268; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
2269; GFX7-NEXT: s_mov_b32 s10, s2
2270; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2271; GFX7-NEXT: s_mov_b32 s8, s6
2272; GFX7-NEXT: s_mov_b32 s9, s7
2273; GFX7-NEXT: s_mov_b32 s11, s3
2274; GFX7-NEXT: s_mov_b32 s6, s2
2275; GFX7-NEXT: s_mov_b32 s7, s3
2276; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0
2277; GFX7-NEXT: buffer_load_ushort v1, off, s[8:11], 0
2278; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
2279; GFX7-NEXT: s_waitcnt vmcnt(1)
2280; GFX7-NEXT: v_bfe_i32 v2, v0, 0, 8
2281; GFX7-NEXT: s_waitcnt vmcnt(0)
2282; GFX7-NEXT: v_bfe_i32 v3, v1, 0, 8
2283; GFX7-NEXT: v_bfe_i32 v0, v0, 8, 8
2284; GFX7-NEXT: v_bfe_i32 v1, v1, 8, 8
2285; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2286; GFX7-NEXT: v_mad_i32_i24 v0, v1, v0, s4
2287; GFX7-NEXT: v_mad_i32_i24 v0, v3, v2, v0
2288; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
2289; GFX7-NEXT: s_endpgm
2290;
2291; GFX8-LABEL: notsdot2_sext8:
2292; GFX8: ; %bb.0: ; %entry
2293; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2294; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2295; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2296; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
2297; GFX8-NEXT: v_mov_b32_e32 v0, s6
2298; GFX8-NEXT: v_mov_b32_e32 v1, s7
2299; GFX8-NEXT: v_mov_b32_e32 v2, s4
2300; GFX8-NEXT: v_mov_b32_e32 v3, s5
2301; GFX8-NEXT: flat_load_ushort v2, v[2:3]
2302; GFX8-NEXT: flat_load_ushort v3, v[0:1]
2303; GFX8-NEXT: v_mov_b32_e32 v0, s0
2304; GFX8-NEXT: v_mov_b32_e32 v1, s1
2305; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(0)
2306; GFX8-NEXT: v_lshrrev_b16_e32 v4, 8, v2
2307; GFX8-NEXT: s_waitcnt vmcnt(0)
2308; GFX8-NEXT: v_bfe_i32 v5, v3, 0, 8
2309; GFX8-NEXT: v_lshrrev_b16_e32 v3, 8, v3
2310; GFX8-NEXT: v_bfe_i32 v4, v4, 0, 8
2311; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8
2312; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8
2313; GFX8-NEXT: v_mad_i32_i24 v3, v3, v4, s2
2314; GFX8-NEXT: v_mad_i32_i24 v2, v5, v2, v3
2315; GFX8-NEXT: flat_store_dword v[0:1], v2
2316; GFX8-NEXT: s_endpgm
2317;
2318; GFX9-NODL-LABEL: notsdot2_sext8:
2319; GFX9-NODL: ; %bb.0: ; %entry
2320; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2321; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2322; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2323; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0
2324; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6
2325; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7
2326; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4
2327; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5
2328; GFX9-NODL-NEXT: global_load_ushort v2, v[2:3], off
2329; GFX9-NODL-NEXT: global_load_ushort v3, v[0:1], off
2330; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
2331; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
2332; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
2333; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v4, 8, v2
2334; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
2335; GFX9-NODL-NEXT: v_bfe_i32 v5, v3, 0, 8
2336; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v3, 8, v3
2337; GFX9-NODL-NEXT: v_bfe_i32 v4, v4, 0, 8
2338; GFX9-NODL-NEXT: v_bfe_i32 v3, v3, 0, 8
2339; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8
2340; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2341; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v3, v4, s2
2342; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v5, v2, v3
2343; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
2344; GFX9-NODL-NEXT: s_endpgm
2345;
2346; GFX9-DL-LABEL: notsdot2_sext8:
2347; GFX9-DL: ; %bb.0: ; %entry
2348; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2349; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2350; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2351; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0
2352; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6
2353; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7
2354; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4
2355; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5
2356; GFX9-DL-NEXT: global_load_ushort v2, v[2:3], off
2357; GFX9-DL-NEXT: global_load_ushort v3, v[0:1], off
2358; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
2359; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
2360; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
2361; GFX9-DL-NEXT: v_lshrrev_b16_e32 v4, 8, v2
2362; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
2363; GFX9-DL-NEXT: v_bfe_i32 v5, v3, 0, 8
2364; GFX9-DL-NEXT: v_lshrrev_b16_e32 v3, 8, v3
2365; GFX9-DL-NEXT: v_bfe_i32 v4, v4, 0, 8
2366; GFX9-DL-NEXT: v_bfe_i32 v3, v3, 0, 8
2367; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8
2368; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2369; GFX9-DL-NEXT: v_mad_i32_i24 v3, v3, v4, s2
2370; GFX9-DL-NEXT: v_mad_i32_i24 v2, v5, v2, v3
2371; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
2372; GFX9-DL-NEXT: s_endpgm
Farhana Aleen3528c802018-08-21 16:21:15 +00002373 <2 x i8> addrspace(1)* %src2,
2374 i32 addrspace(1)* nocapture %dst) {
2375entry:
2376 %vec1 = load <2 x i8>, <2 x i8> addrspace(1)* %src1
2377 %vec2 = load <2 x i8>, <2 x i8> addrspace(1)* %src2
2378
2379 %s1.elt1 = extractelement <2 x i8> %vec1, i64 0
2380 %conv = sext i8 %s1.elt1 to i32
2381 %s2.elt1 = extractelement <2 x i8> %vec2, i64 0
2382 %conv2 = sext i8 %s2.elt1 to i32
2383 %mul1 = mul nuw i32 %conv2, %conv
2384
2385 %s1.elt2 = extractelement <2 x i8> %vec1, i64 1
2386 %conv3 = sext i8 %s1.elt2 to i32
2387 %s2.elt2 = extractelement <2 x i8> %vec2, i64 1
2388 %conv4 = sext i8 %s2.elt2 to i32
2389 %mul2 = mul nuw i32 %conv4, %conv3
2390
2391 %s3 = load i32, i32 addrspace(1)* %dst, align 4
2392 %add = add i32 %mul2, %s3
2393 %add6 = add i32 %add, %mul1
2394 store i32 %add6, i32 addrspace(1)* %dst, align 4
2395 ret void
2396}