; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN-DL %s
6
; add(mul(S0.x, S1.x),
;     add(mul(S0.y, S1.y), S3)) -> v_dot2_{I|U}32_{I|U}16(S0, S1, S3)
9
; Unsigned 2-element dot product with accumulate:
;   dst = (zext(src1[1]) * zext(src2[1]) + dst) + zext(src1[0]) * zext(src2[0])
; The GCN-DL run (gfx906) expects a single v_dot2_u32_u16; the GFX7 and GFX89
; runs expect a pair of v_mad_u32_u24 instead.
define amdgpu_kernel void @udot2(<2 x i16> addrspace(1)* %src1,
; GFX7-LABEL: udot2:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s8, 0xffff
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_lshr_b32 s7, s4, 16
; GFX7-NEXT:    s_lshr_b32 s9, s5, 16
; GFX7-NEXT:    s_and_b32 s4, s4, s8
; GFX7-NEXT:    v_mov_b32_e32 v0, s7
; GFX7-NEXT:    v_mov_b32_e32 v1, s6
; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v0, v1
; GFX7-NEXT:    s_and_b32 s5, s5, s8
; GFX7-NEXT:    v_mov_b32_e32 v1, s4
; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX89-LABEL: udot2:
; GFX89:       ; %bb.0: ; %entry
; GFX89-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX89-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX89-NEXT:    s_mov_b32 s2, 0xffff
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    s_load_dword s3, s[4:5], 0x0
; GFX89-NEXT:    s_load_dword s4, s[6:7], 0x0
; GFX89-NEXT:    s_load_dword s5, s[0:1], 0x0
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    s_and_b32 s6, s3, s2
; GFX89-NEXT:    s_lshr_b32 s3, s3, 16
; GFX89-NEXT:    s_and_b32 s2, s4, s2
; GFX89-NEXT:    s_lshr_b32 s4, s4, 16
; GFX89-NEXT:    v_mov_b32_e32 v0, s5
; GFX89-NEXT:    v_mov_b32_e32 v1, s3
; GFX89-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
; GFX89-NEXT:    v_mov_b32_e32 v1, s6
; GFX89-NEXT:    v_mad_u32_u24 v2, s2, v1, v0
; GFX89-NEXT:    v_mov_b32_e32 v0, s0
; GFX89-NEXT:    v_mov_b32_e32 v1, s1
; GFX89-NEXT:    {{flat|global}}_store_dword v[0:1], v2
; GFX89-NEXT:    s_endpgm
;
; GCN-DL-LABEL: udot2:
; GCN-DL:       ; %bb.0: ; %entry
; GCN-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GCN-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
; GCN-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
; GCN-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
; GCN-DL-NEXT:    v_mov_b32_e32 v0, s0
; GCN-DL-NEXT:    v_mov_b32_e32 v1, s1
; GCN-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DL-NEXT:    v_mov_b32_e32 v2, s2
; GCN-DL-NEXT:    v_mov_b32_e32 v3, s4
; GCN-DL-NEXT:    v_dot2_u32_u16 v2, s3, v2, v3
; GCN-DL-NEXT:    global_store_dword v[0:1], v2, off
; GCN-DL-NEXT:    s_endpgm
                                 <2 x i16> addrspace(1)* %src2,
                                 i32 addrspace(1)* nocapture %dst) {
entry:
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %src1
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %src2

  ; Element 0 of each vector, zero-extended and multiplied.
  %a0 = extractelement <2 x i16> %a, i64 0
  %a0.ext = zext i16 %a0 to i32
  %b0 = extractelement <2 x i16> %b, i64 0
  %b0.ext = zext i16 %b0 to i32
  %prod0 = mul nuw i32 %b0.ext, %a0.ext

  ; Element 1 of each vector, zero-extended and multiplied.
  %a1 = extractelement <2 x i16> %a, i64 1
  %a1.ext = zext i16 %a1 to i32
  %b1 = extractelement <2 x i16> %b, i64 1
  %b1.ext = zext i16 %b1 to i32
  %prod1 = mul nuw i32 %b1.ext, %a1.ext

  ; Accumulate into the value already stored at %dst.
  %acc = load i32, i32 addrspace(1)* %dst, align 4
  %partial = add i32 %prod1, %acc
  %sum = add i32 %partial, %prod0
  store i32 %sum, i32 addrspace(1)* %dst, align 4
  ret void
}
99
; TODO: Support this pattern
; add(S3,
;     add(mul(S0.x, S1.x), mul(S0.y, S1.y))) -> v_dot2_{I|U}32_{I|U}16(S0, S1, S3)
; Unsigned dot product where the two products are summed first and the
; accumulator is added last:
;   dst = (zext(src1[1]) * zext(src2[1]) + zext(src1[0]) * zext(src2[0])) + dst
; No run line expects a dot2 instruction here: every target, including GCN-DL,
; checks for v_mul_u32_u24 + v_mad_u32_u24 followed by a separate add.
; The GFX89 add is matched with an optional "vcc, " operand so one pattern can
; cover both -mcpu values that share the GFX89 prefix.
define amdgpu_kernel void @udot2_MulMul(<2 x i16> addrspace(1)* %src1,
; GFX7-LABEL: udot2_MulMul:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s8, 0xffff
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_lshr_b32 s7, s4, 16
; GFX7-NEXT:    s_and_b32 s4, s4, s8
; GFX7-NEXT:    s_lshr_b32 s9, s5, 16
; GFX7-NEXT:    s_and_b32 s5, s5, s8
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mul_u32_u24_e32 v0, s5, v0
; GFX7-NEXT:    v_mov_b32_e32 v1, s7
; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v1, v0
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s6, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX89-LABEL: udot2_MulMul:
; GFX89:       ; %bb.0: ; %entry
; GFX89-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX89-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX89-NEXT:    s_mov_b32 s2, 0xffff
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    s_load_dword s3, s[4:5], 0x0
; GFX89-NEXT:    s_load_dword s4, s[6:7], 0x0
; GFX89-NEXT:    s_load_dword s5, s[0:1], 0x0
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    s_and_b32 s6, s3, s2
; GFX89-NEXT:    s_and_b32 s2, s4, s2
; GFX89-NEXT:    v_mov_b32_e32 v0, s6
; GFX89-NEXT:    s_lshr_b32 s3, s3, 16
; GFX89-NEXT:    s_lshr_b32 s4, s4, 16
; GFX89-NEXT:    v_mov_b32_e32 v1, s3
; GFX89-NEXT:    v_mul_u32_u24_e32 v0, s2, v0
; GFX89-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
; GFX89-NEXT:    v_add_u32_e32 v2, {{(vcc, )?}}s5, v0
; GFX89-NEXT:    v_mov_b32_e32 v0, s0
; GFX89-NEXT:    v_mov_b32_e32 v1, s1
; GFX89-NEXT:    {{flat|global}}_store_dword v[0:1], v2
; GFX89-NEXT:    s_endpgm
;
; GCN-DL-LABEL: udot2_MulMul:
; GCN-DL:       ; %bb.0: ; %entry
; GCN-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GCN-DL-NEXT:    s_mov_b32 s2, 0xffff
; GCN-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DL-NEXT:    s_load_dword s3, s[4:5], 0x0
; GCN-DL-NEXT:    s_load_dword s4, s[6:7], 0x0
; GCN-DL-NEXT:    s_load_dword s5, s[0:1], 0x0
; GCN-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DL-NEXT:    s_and_b32 s6, s3, s2
; GCN-DL-NEXT:    s_and_b32 s2, s4, s2
; GCN-DL-NEXT:    v_mov_b32_e32 v0, s6
; GCN-DL-NEXT:    s_lshr_b32 s3, s3, 16
; GCN-DL-NEXT:    s_lshr_b32 s4, s4, 16
; GCN-DL-NEXT:    v_mov_b32_e32 v1, s3
; GCN-DL-NEXT:    v_mul_u32_u24_e32 v0, s2, v0
; GCN-DL-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
; GCN-DL-NEXT:    v_add_u32_e32 v2, s5, v0
; GCN-DL-NEXT:    v_mov_b32_e32 v0, s0
; GCN-DL-NEXT:    v_mov_b32_e32 v1, s1
; GCN-DL-NEXT:    global_store_dword v[0:1], v2, off
; GCN-DL-NEXT:    s_endpgm
                                        <2 x i16> addrspace(1)* %src2,
                                        i32 addrspace(1)* nocapture %dst) {
entry:
  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2

  ; Element 0 product (zero-extended operands).
  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
  %conv = zext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
  %conv2 = zext i16 %s2.elt1 to i32
  %mul1 = mul nuw i32 %conv2, %conv

  ; Element 1 product (zero-extended operands).
  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
  %conv3 = zext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
  %conv4 = zext i16 %s2.elt2 to i32
  %mul2 = mul nuw i32 %conv4, %conv3

  ; Sum the two products first, then add the accumulator loaded from %dst.
  %s3 = load i32, i32 addrspace(1)* %dst, align 4
  %add = add i32 %mul2, %mul1
  %add6 = add i32 %add, %s3
  store i32 %add6, i32 addrspace(1)* %dst, align 4
  ret void
}
198
; Signed 2-element dot product with accumulate:
;   dst = (sext(src1[1]) * sext(src2[1]) + dst) + sext(src1[0]) * sext(src2[0])
; The GCN-DL run expects a single v_dot2_i32_i16; the GFX7 and GFX89 runs
; expect a pair of v_mad_i32_i24.
define amdgpu_kernel void @idot2(<2 x i16> addrspace(1)* %src1,
; GFX7-LABEL: idot2:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_sext_i32_i16 s7, s4
; GFX7-NEXT:    s_ashr_i32 s4, s4, 16
; GFX7-NEXT:    s_sext_i32_i16 s8, s5
; GFX7-NEXT:    s_ashr_i32 s5, s5, 16
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v1, s6
; GFX7-NEXT:    v_mad_i32_i24 v0, s5, v0, v1
; GFX7-NEXT:    v_mov_b32_e32 v1, s7
; GFX7-NEXT:    v_mad_i32_i24 v0, s8, v1, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX89-LABEL: idot2:
; GFX89:       ; %bb.0: ; %entry
; GFX89-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX89-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX89-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX89-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX89-NEXT:    v_mov_b32_e32 v0, s0
; GFX89-NEXT:    v_mov_b32_e32 v1, s1
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    s_sext_i32_i16 s0, s2
; GFX89-NEXT:    s_ashr_i32 s2, s2, 16
; GFX89-NEXT:    s_sext_i32_i16 s1, s3
; GFX89-NEXT:    s_ashr_i32 s3, s3, 16
; GFX89-NEXT:    v_mov_b32_e32 v2, s4
; GFX89-NEXT:    v_mov_b32_e32 v3, s2
; GFX89-NEXT:    v_mad_i32_i24 v2, s3, v3, v2
; GFX89-NEXT:    v_mov_b32_e32 v3, s0
; GFX89-NEXT:    v_mad_i32_i24 v2, s1, v3, v2
; GFX89-NEXT:    {{flat|global}}_store_dword v[0:1], v2
; GFX89-NEXT:    s_endpgm
;
; GCN-DL-LABEL: idot2:
; GCN-DL:       ; %bb.0: ; %entry
; GCN-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GCN-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
; GCN-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
; GCN-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
; GCN-DL-NEXT:    v_mov_b32_e32 v0, s0
; GCN-DL-NEXT:    v_mov_b32_e32 v1, s1
; GCN-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DL-NEXT:    v_mov_b32_e32 v2, s2
; GCN-DL-NEXT:    v_mov_b32_e32 v3, s4
; GCN-DL-NEXT:    v_dot2_i32_i16 v2, s3, v2, v3
; GCN-DL-NEXT:    global_store_dword v[0:1], v2, off
; GCN-DL-NEXT:    s_endpgm
                                 <2 x i16> addrspace(1)* %src2,
                                 i32 addrspace(1)* nocapture %dst) {
entry:
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %src1
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %src2

  ; Element 0 of each vector, sign-extended and multiplied.
  %a0 = extractelement <2 x i16> %a, i64 0
  %a0.ext = sext i16 %a0 to i32
  %b0 = extractelement <2 x i16> %b, i64 0
  %b0.ext = sext i16 %b0 to i32
  %prod0 = mul nuw i32 %b0.ext, %a0.ext

  ; Element 1 of each vector, sign-extended and multiplied.
  %a1 = extractelement <2 x i16> %a, i64 1
  %a1.ext = sext i16 %a1 to i32
  %b1 = extractelement <2 x i16> %b, i64 1
  %b1.ext = sext i16 %b1 to i32
  %prod1 = mul nuw i32 %b1.ext, %a1.ext

  ; Accumulate into the value already stored at %dst.
  %acc = load i32, i32 addrspace(1)* %dst, align 4
  %partial = add i32 %prod1, %acc
  %sum = add i32 %partial, %prod0
  store i32 %sum, i32 addrspace(1)* %dst, align 4
  ret void
}
286
; Dot-product shape where the two multiplies disagree on signedness: the
; element-0 product uses sign-extended operands, the element-1 product uses
; zero-extended operands. Every run line, including GCN-DL, expects one
; v_mad_u32_u24 and one v_mad_i32_i24 rather than a dot2 instruction.
define amdgpu_kernel void @idot2_MixedTypedMul(<2 x i16> addrspace(1)* %src1,
; GFX7-LABEL: idot2_MixedTypedMul:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_lshr_b32 s7, s4, 16
; GFX7-NEXT:    s_lshr_b32 s8, s5, 16
; GFX7-NEXT:    s_sext_i32_i16 s4, s4
; GFX7-NEXT:    v_mov_b32_e32 v0, s7
; GFX7-NEXT:    v_mov_b32_e32 v1, s6
; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v0, v1
; GFX7-NEXT:    s_sext_i32_i16 s5, s5
; GFX7-NEXT:    v_mov_b32_e32 v1, s4
; GFX7-NEXT:    v_mad_i32_i24 v0, s5, v1, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX89-LABEL: idot2_MixedTypedMul:
; GFX89:       ; %bb.0: ; %entry
; GFX89-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX89-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX89-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX89-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX89-NEXT:    v_mov_b32_e32 v0, s0
; GFX89-NEXT:    v_mov_b32_e32 v1, s1
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    s_sext_i32_i16 s0, s2
; GFX89-NEXT:    s_lshr_b32 s2, s2, 16
; GFX89-NEXT:    s_sext_i32_i16 s1, s3
; GFX89-NEXT:    s_lshr_b32 s3, s3, 16
; GFX89-NEXT:    v_mov_b32_e32 v2, s4
; GFX89-NEXT:    v_mov_b32_e32 v3, s2
; GFX89-NEXT:    v_mad_u32_u24 v2, s3, v3, v2
; GFX89-NEXT:    v_mov_b32_e32 v3, s0
; GFX89-NEXT:    v_mad_i32_i24 v2, s1, v3, v2
; GFX89-NEXT:    {{flat|global}}_store_dword v[0:1], v2
; GFX89-NEXT:    s_endpgm
;
; GCN-DL-LABEL: idot2_MixedTypedMul:
; GCN-DL:       ; %bb.0: ; %entry
; GCN-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GCN-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
; GCN-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
; GCN-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
; GCN-DL-NEXT:    v_mov_b32_e32 v0, s0
; GCN-DL-NEXT:    v_mov_b32_e32 v1, s1
; GCN-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DL-NEXT:    s_sext_i32_i16 s0, s2
; GCN-DL-NEXT:    s_lshr_b32 s2, s2, 16
; GCN-DL-NEXT:    s_sext_i32_i16 s1, s3
; GCN-DL-NEXT:    s_lshr_b32 s3, s3, 16
; GCN-DL-NEXT:    v_mov_b32_e32 v2, s4
; GCN-DL-NEXT:    v_mov_b32_e32 v3, s2
; GCN-DL-NEXT:    v_mad_u32_u24 v2, s3, v3, v2
; GCN-DL-NEXT:    v_mov_b32_e32 v3, s0
; GCN-DL-NEXT:    v_mad_i32_i24 v2, s1, v3, v2
; GCN-DL-NEXT:    global_store_dword v[0:1], v2, off
; GCN-DL-NEXT:    s_endpgm
                                               <2 x i16> addrspace(1)* %src2,
                                               i32 addrspace(1)* nocapture %dst) {
entry:
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %src1
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %src2

  ; Element 0: both operands sign-extended.
  %a0 = extractelement <2 x i16> %a, i64 0
  %a0.sext = sext i16 %a0 to i32
  %b0 = extractelement <2 x i16> %b, i64 0
  %b0.sext = sext i16 %b0 to i32
  %prod0 = mul nuw i32 %b0.sext, %a0.sext

  ; Element 1: both operands zero-extended.
  %a1 = extractelement <2 x i16> %a, i64 1
  %a1.zext = zext i16 %a1 to i32
  %b1 = extractelement <2 x i16> %b, i64 1
  %b1.zext = zext i16 %b1 to i32
  %prod1 = mul nuw i32 %b1.zext, %a1.zext

  ; Accumulate into the value already stored at %dst.
  %acc = load i32, i32 addrspace(1)* %dst, align 4
  %partial = add i32 %prod1, %acc
  %sum = add i32 %partial, %prod0
  store i32 %sum, i32 addrspace(1)* %dst, align 4
  ret void
}
380
; Same computation as an unsigned 2-element dot product with accumulate, but
; with the operands of both adds swapped:
;   dst = zext(src1[0]) * zext(src2[0]) + (dst + zext(src1[1]) * zext(src2[1]))
; The GCN-DL run expects a single v_dot2_u32_u16; the GFX7 and GFX89 runs
; expect a pair of v_mad_u32_u24.
define amdgpu_kernel void @udot2_alt_AddOperands(<2 x i16> addrspace(1)* %src1,
; GFX7-LABEL: udot2_alt_AddOperands:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s8, 0xffff
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_lshr_b32 s7, s4, 16
; GFX7-NEXT:    s_lshr_b32 s9, s5, 16
; GFX7-NEXT:    s_and_b32 s4, s4, s8
; GFX7-NEXT:    v_mov_b32_e32 v0, s7
; GFX7-NEXT:    v_mov_b32_e32 v1, s6
; GFX7-NEXT:    v_mad_u32_u24 v0, s9, v0, v1
; GFX7-NEXT:    s_and_b32 s5, s5, s8
; GFX7-NEXT:    v_mov_b32_e32 v1, s4
; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX89-LABEL: udot2_alt_AddOperands:
; GFX89:       ; %bb.0: ; %entry
; GFX89-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX89-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX89-NEXT:    s_mov_b32 s2, 0xffff
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    s_load_dword s3, s[4:5], 0x0
; GFX89-NEXT:    s_load_dword s4, s[6:7], 0x0
; GFX89-NEXT:    s_load_dword s5, s[0:1], 0x0
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    s_and_b32 s6, s3, s2
; GFX89-NEXT:    s_lshr_b32 s3, s3, 16
; GFX89-NEXT:    s_and_b32 s2, s4, s2
; GFX89-NEXT:    s_lshr_b32 s4, s4, 16
; GFX89-NEXT:    v_mov_b32_e32 v0, s5
; GFX89-NEXT:    v_mov_b32_e32 v1, s3
; GFX89-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
; GFX89-NEXT:    v_mov_b32_e32 v1, s6
; GFX89-NEXT:    v_mad_u32_u24 v2, s2, v1, v0
; GFX89-NEXT:    v_mov_b32_e32 v0, s0
; GFX89-NEXT:    v_mov_b32_e32 v1, s1
; GFX89-NEXT:    {{flat|global}}_store_dword v[0:1], v2
; GFX89-NEXT:    s_endpgm
;
; GCN-DL-LABEL: udot2_alt_AddOperands:
; GCN-DL:       ; %bb.0: ; %entry
; GCN-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GCN-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
; GCN-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
; GCN-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
; GCN-DL-NEXT:    v_mov_b32_e32 v0, s0
; GCN-DL-NEXT:    v_mov_b32_e32 v1, s1
; GCN-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DL-NEXT:    v_mov_b32_e32 v2, s2
; GCN-DL-NEXT:    v_mov_b32_e32 v3, s4
; GCN-DL-NEXT:    v_dot2_u32_u16 v2, s3, v2, v3
; GCN-DL-NEXT:    global_store_dword v[0:1], v2, off
; GCN-DL-NEXT:    s_endpgm
                                                 <2 x i16> addrspace(1)* %src2,
                                                 i32 addrspace(1)* nocapture %dst) {
entry:
  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2

  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
  %conv = zext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
  %conv2 = zext i16 %s2.elt1 to i32
  %mul1 = mul nuw i32 %conv2, %conv

  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
  %conv3 = zext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
  %conv4 = zext i16 %s2.elt2 to i32
  %mul2 = mul nuw i32 %conv4, %conv3

  ; Accumulator is the first add operand here, and the element-0 product is
  ; the first operand of the final add.
  %s3 = load i32, i32 addrspace(1)* %dst, align 4
  %add = add i32 %s3, %mul2
  %add6 = add i32 %mul1, %add
  store i32 %add6, i32 addrspace(1)* %dst, align 4
  ret void
}

; Dot-product shape where the operands of one multiply use different
; extensions: element 0 multiplies sext(src1[0]) by zext(src2[0]), while
; element 1 sign-extends both operands.
; NOTE(review): this kernel previously had no assertions of its own. Only the
; label checks and a negative dot2 check are given here; regenerate the full
; per-target assertions with utils/update_llc_test_checks.py.
define amdgpu_kernel void @idot2_MixedExt(<2 x i16> addrspace(1)* %src1,
; GFX7-LABEL: idot2_MixedExt:
;
; GFX89-LABEL: idot2_MixedExt:
;
; GCN-DL-LABEL: idot2_MixedExt:
; GCN-DL-NOT:     v_dot2
                                          <2 x i16> addrspace(1)* %src2,
                                          i32 addrspace(1)* nocapture %dst) {
entry:
  %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
  %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2

  ; Element 0: sign-extended times zero-extended.
  %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
  %conv = sext i16 %s1.elt1 to i32
  %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
  %conv2 = zext i16 %s2.elt1 to i32
  %mul1 = mul nuw i32 %conv2, %conv

  ; Element 1: both operands sign-extended.
  %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
  %conv3 = sext i16 %s1.elt2 to i32
  %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
  %conv4 = sext i16 %s2.elt2 to i32
  %mul2 = mul nuw i32 %conv4, %conv3

  %s3 = load i32, i32 addrspace(1)* %dst, align 4
  %add = add i32 %mul2, %s3
  %add6 = add i32 %add, %mul1
  store i32 %add6, i32 addrspace(1)* %dst, align 4
  ret void
}
496
; Dot-product shape where each multiply takes both operands from the same
; vector: src1[0] * src1[0] and src2[1] * src2[1] (all zero-extended), plus the
; value loaded from %dst. Every run line, including GCN-DL, expects two
; v_mad_u32_u24 with a repeated source register rather than a dot2 instruction.
define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1,
; GFX7-LABEL: notudot2_SameVec:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_and_b32 s4, s4, 0xffff
; GFX7-NEXT:    s_lshr_b32 s5, s5, 16
; GFX7-NEXT:    v_mov_b32_e32 v0, s6
; GFX7-NEXT:    v_mad_u32_u24 v0, s5, s5, v0
; GFX7-NEXT:    v_mad_u32_u24 v0, s4, s4, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX89-LABEL: notudot2_SameVec:
; GFX89:       ; %bb.0: ; %entry
; GFX89-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX89-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    s_load_dword s2, s[4:5], 0x0
; GFX89-NEXT:    s_load_dword s3, s[6:7], 0x0
; GFX89-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX89-NEXT:    v_mov_b32_e32 v0, s0
; GFX89-NEXT:    v_mov_b32_e32 v1, s1
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    s_and_b32 s0, s2, 0xffff
; GFX89-NEXT:    s_lshr_b32 s1, s3, 16
; GFX89-NEXT:    v_mov_b32_e32 v2, s4
; GFX89-NEXT:    v_mad_u32_u24 v2, s1, s1, v2
; GFX89-NEXT:    v_mad_u32_u24 v2, s0, s0, v2
; GFX89-NEXT:    {{flat|global}}_store_dword v[0:1], v2
; GFX89-NEXT:    s_endpgm
;
; GCN-DL-LABEL: notudot2_SameVec:
; GCN-DL:       ; %bb.0: ; %entry
; GCN-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GCN-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
; GCN-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
; GCN-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
; GCN-DL-NEXT:    v_mov_b32_e32 v0, s0
; GCN-DL-NEXT:    v_mov_b32_e32 v1, s1
; GCN-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DL-NEXT:    s_and_b32 s0, s2, 0xffff
; GCN-DL-NEXT:    s_lshr_b32 s1, s3, 16
; GCN-DL-NEXT:    v_mov_b32_e32 v2, s4
; GCN-DL-NEXT:    v_mad_u32_u24 v2, s1, s1, v2
; GCN-DL-NEXT:    v_mad_u32_u24 v2, s0, s0, v2
; GCN-DL-NEXT:    global_store_dword v[0:1], v2, off
; GCN-DL-NEXT:    s_endpgm
                                            <2 x i16> addrspace(1)* %src2,
                                            i32 addrspace(1)* nocapture %dst) {
entry:
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %src1
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %src2

  ; Both operands come from element 0 of the first vector.
  %a0.x = extractelement <2 x i16> %a, i64 0
  %a0.x.ext = zext i16 %a0.x to i32
  %a0.y = extractelement <2 x i16> %a, i64 0
  %a0.y.ext = zext i16 %a0.y to i32
  %sq0 = mul i32 %a0.y.ext, %a0.x.ext

  ; Both operands come from element 1 of the second vector.
  %b1.x = extractelement <2 x i16> %b, i64 1
  %b1.x.ext = zext i16 %b1.x to i32
  %b1.y = extractelement <2 x i16> %b, i64 1
  %b1.y.ext = zext i16 %b1.y to i32
  %sq1 = mul i32 %b1.y.ext, %b1.x.ext

  ; Accumulate into the value already stored at %dst.
  %acc = load i32, i32 addrspace(1)* %dst, align 4
  %partial = add i32 %sq1, %acc
  %sum = add i32 %partial, %sq0
  store i32 %sum, i32 addrspace(1)* %dst, align 4
  ret void
}
578
; Unsigned dot product over elements 0 and 1 of two <4 x i16> vectors, plus the
; value loaded from %dst. The GCN-DL run expects a single v_dot2_u32_u16; the
; GFX7 and GFX89 runs expect a pair of v_mad_u32_u24. All run lines check for a
; single-dword load from each source at offset 0x0.
define amdgpu_kernel void @udot2_v4i16(<4 x i16> addrspace(1)* %src1,
; GFX7-LABEL: udot2_v4i16:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s8, 0xffff
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_and_b32 s7, s4, s8
; GFX7-NEXT:    s_lshr_b32 s4, s4, 16
; GFX7-NEXT:    s_and_b32 s8, s5, s8
; GFX7-NEXT:    s_lshr_b32 s5, s5, 16
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v1, s6
; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v0, v1
; GFX7-NEXT:    v_mov_b32_e32 v1, s7
; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX89-LABEL: udot2_v4i16:
; GFX89:       ; %bb.0: ; %entry
; GFX89-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX89-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX89-NEXT:    s_mov_b32 s2, 0xffff
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    s_load_dword s3, s[4:5], 0x0
; GFX89-NEXT:    s_load_dword s4, s[6:7], 0x0
; GFX89-NEXT:    s_load_dword s5, s[0:1], 0x0
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    s_and_b32 s6, s3, s2
; GFX89-NEXT:    s_lshr_b32 s3, s3, 16
; GFX89-NEXT:    s_and_b32 s2, s4, s2
; GFX89-NEXT:    s_lshr_b32 s4, s4, 16
; GFX89-NEXT:    v_mov_b32_e32 v0, s5
; GFX89-NEXT:    v_mov_b32_e32 v1, s3
; GFX89-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
; GFX89-NEXT:    v_mov_b32_e32 v1, s6
; GFX89-NEXT:    v_mad_u32_u24 v2, s2, v1, v0
; GFX89-NEXT:    v_mov_b32_e32 v0, s0
; GFX89-NEXT:    v_mov_b32_e32 v1, s1
; GFX89-NEXT:    {{flat|global}}_store_dword v[0:1], v2
; GFX89-NEXT:    s_endpgm
;
; GCN-DL-LABEL: udot2_v4i16:
; GCN-DL:       ; %bb.0: ; %entry
; GCN-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GCN-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DL-NEXT:    s_load_dword s2, s[4:5], 0x0
; GCN-DL-NEXT:    s_load_dword s3, s[6:7], 0x0
; GCN-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
; GCN-DL-NEXT:    v_mov_b32_e32 v0, s0
; GCN-DL-NEXT:    v_mov_b32_e32 v1, s1
; GCN-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DL-NEXT:    v_mov_b32_e32 v2, s2
; GCN-DL-NEXT:    v_mov_b32_e32 v3, s4
; GCN-DL-NEXT:    v_dot2_u32_u16 v2, s3, v2, v3
; GCN-DL-NEXT:    global_store_dword v[0:1], v2, off
; GCN-DL-NEXT:    s_endpgm
                                       <4 x i16> addrspace(1)* %src2,
                                       i32 addrspace(1)* nocapture %dst) {
entry:
  %a = load <4 x i16>, <4 x i16> addrspace(1)* %src1
  %b = load <4 x i16>, <4 x i16> addrspace(1)* %src2

  ; Element 0 of each vector, zero-extended and multiplied.
  %a0 = extractelement <4 x i16> %a, i64 0
  %a0.ext = zext i16 %a0 to i32
  %b0 = extractelement <4 x i16> %b, i64 0
  %b0.ext = zext i16 %b0 to i32
  %prod0 = mul i32 %b0.ext, %a0.ext

  ; Element 1 of each vector, zero-extended and multiplied.
  %a1 = extractelement <4 x i16> %a, i64 1
  %a1.ext = zext i16 %a1 to i32
  %b1 = extractelement <4 x i16> %b, i64 1
  %b1.ext = zext i16 %b1 to i32
  %prod1 = mul i32 %b1.ext, %a1.ext

  ; Accumulate into the value already stored at %dst.
  %acc = load i32, i32 addrspace(1)* %dst, align 4
  %partial = add i32 %prod1, %acc
  %sum = add i32 %partial, %prod0
  store i32 %sum, i32 addrspace(1)* %dst, align 4
  ret void
}
668
; Unsigned dot product over elements 2 and 3 of two <4 x i16> vectors, plus the
; value loaded from %dst. The GCN-DL run expects a single v_dot2_u32_u16; the
; GFX7 and GFX89 runs expect a pair of v_mad_u32_u24. The source loads are
; checked at a non-zero offset (0x1 for GFX7, 0x4 for GFX89 and GCN-DL).
define amdgpu_kernel void @udot2_v4i16_Hi(<4 x i16> addrspace(1)* %src1,
; GFX7-LABEL: udot2_v4i16_Hi:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s8, 0xffff
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x1
; GFX7-NEXT:    s_load_dword s5, s[6:7], 0x1
; GFX7-NEXT:    s_load_dword s6, s[0:1], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_and_b32 s7, s4, s8
; GFX7-NEXT:    s_lshr_b32 s4, s4, 16
; GFX7-NEXT:    s_and_b32 s8, s5, s8
; GFX7-NEXT:    s_lshr_b32 s5, s5, 16
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    v_mov_b32_e32 v1, s6
; GFX7-NEXT:    v_mad_u32_u24 v0, s5, v0, v1
; GFX7-NEXT:    v_mov_b32_e32 v1, s7
; GFX7-NEXT:    v_mad_u32_u24 v0, s8, v1, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX89-LABEL: udot2_v4i16_Hi:
; GFX89:       ; %bb.0: ; %entry
; GFX89-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX89-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX89-NEXT:    s_mov_b32 s2, 0xffff
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    s_load_dword s3, s[4:5], 0x4
; GFX89-NEXT:    s_load_dword s4, s[6:7], 0x4
; GFX89-NEXT:    s_load_dword s5, s[0:1], 0x0
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    s_and_b32 s6, s3, s2
; GFX89-NEXT:    s_lshr_b32 s3, s3, 16
; GFX89-NEXT:    s_and_b32 s2, s4, s2
; GFX89-NEXT:    s_lshr_b32 s4, s4, 16
; GFX89-NEXT:    v_mov_b32_e32 v0, s5
; GFX89-NEXT:    v_mov_b32_e32 v1, s3
; GFX89-NEXT:    v_mad_u32_u24 v0, s4, v1, v0
; GFX89-NEXT:    v_mov_b32_e32 v1, s6
; GFX89-NEXT:    v_mad_u32_u24 v2, s2, v1, v0
; GFX89-NEXT:    v_mov_b32_e32 v0, s0
; GFX89-NEXT:    v_mov_b32_e32 v1, s1
; GFX89-NEXT:    {{flat|global}}_store_dword v[0:1], v2
; GFX89-NEXT:    s_endpgm
;
; GCN-DL-LABEL: udot2_v4i16_Hi:
; GCN-DL:       ; %bb.0: ; %entry
; GCN-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GCN-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DL-NEXT:    s_load_dword s2, s[4:5], 0x4
; GCN-DL-NEXT:    s_load_dword s3, s[6:7], 0x4
; GCN-DL-NEXT:    s_load_dword s4, s[0:1], 0x0
; GCN-DL-NEXT:    v_mov_b32_e32 v0, s0
; GCN-DL-NEXT:    v_mov_b32_e32 v1, s1
; GCN-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DL-NEXT:    v_mov_b32_e32 v2, s2
; GCN-DL-NEXT:    v_mov_b32_e32 v3, s4
; GCN-DL-NEXT:    v_dot2_u32_u16 v2, s3, v2, v3
; GCN-DL-NEXT:    global_store_dword v[0:1], v2, off
; GCN-DL-NEXT:    s_endpgm
                                          <4 x i16> addrspace(1)* %src2,
                                          i32 addrspace(1)* nocapture %dst) {
entry:
  %a = load <4 x i16>, <4 x i16> addrspace(1)* %src1
  %b = load <4 x i16>, <4 x i16> addrspace(1)* %src2

  ; Element 2 of each vector, zero-extended and multiplied.
  %a2 = extractelement <4 x i16> %a, i64 2
  %a2.ext = zext i16 %a2 to i32
  %b2 = extractelement <4 x i16> %b, i64 2
  %b2.ext = zext i16 %b2 to i32
  %prod2 = mul i32 %b2.ext, %a2.ext

  ; Element 3 of each vector, zero-extended and multiplied.
  %a3 = extractelement <4 x i16> %a, i64 3
  %a3.ext = zext i16 %a3 to i32
  %b3 = extractelement <4 x i16> %b, i64 3
  %b3.ext = zext i16 %b3 to i32
  %prod3 = mul i32 %b3.ext, %a3.ext

  ; Accumulate into the value already stored at %dst.
  %acc = load i32, i32 addrspace(1)* %dst, align 4
  %partial = add i32 %prod3, %acc
  %sum = add i32 %partial, %prod2
  store i32 %sum, i32 addrspace(1)* %dst, align 4
  ret void
}
758
; Dot-product shape over the even elements (0 and 2) of two <4 x i16> vectors,
; plus the value loaded from %dst. Every run line, including GCN-DL, expects a
; pair of v_mad_u32_u24 rather than a dot2 instruction, and checks for
; two-dword loads of both sources.
define amdgpu_kernel void @notudot2_v4i16_Even(<4 x i16> addrspace(1)* %src1,
; GFX7-LABEL: notudot2_v4i16_Even:
; GFX7:       ; %bb.0: ; %entry
; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT:    s_mov_b32 s8, 0xffff
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
; GFX7-NEXT:    s_load_dword s9, s[0:1], 0x0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_and_b32 s5, s5, s8
; GFX7-NEXT:    s_and_b32 s4, s4, s8
; GFX7-NEXT:    s_and_b32 s7, s7, s8
; GFX7-NEXT:    v_mov_b32_e32 v0, s5
; GFX7-NEXT:    v_mov_b32_e32 v1, s9
; GFX7-NEXT:    v_mad_u32_u24 v0, s7, v0, v1
; GFX7-NEXT:    s_and_b32 s6, s6, s8
; GFX7-NEXT:    v_mov_b32_e32 v1, s4
; GFX7-NEXT:    v_mad_u32_u24 v0, s6, v1, v0
; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT:    s_endpgm
;
; GFX89-LABEL: notudot2_v4i16_Even:
; GFX89:       ; %bb.0: ; %entry
; GFX89-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX89-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX89-NEXT:    s_mov_b32 s8, 0xffff
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX89-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX89-NEXT:    s_load_dword s6, s[0:1], 0x0
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    s_and_b32 s3, s3, s8
; GFX89-NEXT:    s_and_b32 s2, s2, s8
; GFX89-NEXT:    s_and_b32 s5, s5, s8
; GFX89-NEXT:    v_mov_b32_e32 v0, s6
; GFX89-NEXT:    v_mov_b32_e32 v1, s3
; GFX89-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
; GFX89-NEXT:    s_and_b32 s4, s4, s8
; GFX89-NEXT:    v_mov_b32_e32 v1, s2
; GFX89-NEXT:    v_mad_u32_u24 v2, s4, v1, v0
; GFX89-NEXT:    v_mov_b32_e32 v0, s0
; GFX89-NEXT:    v_mov_b32_e32 v1, s1
; GFX89-NEXT:    {{flat|global}}_store_dword v[0:1], v2
; GFX89-NEXT:    s_endpgm
;
; GCN-DL-LABEL: notudot2_v4i16_Even:
; GCN-DL:       ; %bb.0: ; %entry
; GCN-DL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-DL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
; GCN-DL-NEXT:    s_mov_b32 s8, 0xffff
; GCN-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
; GCN-DL-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
; GCN-DL-NEXT:    s_load_dword s6, s[0:1], 0x0
; GCN-DL-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-DL-NEXT:    s_and_b32 s3, s3, s8
; GCN-DL-NEXT:    s_and_b32 s2, s2, s8
; GCN-DL-NEXT:    s_and_b32 s5, s5, s8
; GCN-DL-NEXT:    v_mov_b32_e32 v0, s6
; GCN-DL-NEXT:    v_mov_b32_e32 v1, s3
; GCN-DL-NEXT:    v_mad_u32_u24 v0, s5, v1, v0
; GCN-DL-NEXT:    s_and_b32 s4, s4, s8
; GCN-DL-NEXT:    v_mov_b32_e32 v1, s2
; GCN-DL-NEXT:    v_mad_u32_u24 v2, s4, v1, v0
; GCN-DL-NEXT:    v_mov_b32_e32 v0, s0
; GCN-DL-NEXT:    v_mov_b32_e32 v1, s1
; GCN-DL-NEXT:    global_store_dword v[0:1], v2, off
; GCN-DL-NEXT:    s_endpgm
                                               <4 x i16> addrspace(1)* %src2,
                                               i32 addrspace(1)* nocapture %dst) {
entry:
  %a = load <4 x i16>, <4 x i16> addrspace(1)* %src1
  %b = load <4 x i16>, <4 x i16> addrspace(1)* %src2

  ; Element 0 of each vector, zero-extended and multiplied.
  %a0 = extractelement <4 x i16> %a, i64 0
  %a0.ext = zext i16 %a0 to i32
  %b0 = extractelement <4 x i16> %b, i64 0
  %b0.ext = zext i16 %b0 to i32
  %prod0 = mul i32 %b0.ext, %a0.ext

  ; Element 2 of each vector, zero-extended and multiplied.
  %a2 = extractelement <4 x i16> %a, i64 2
  %a2.ext = zext i16 %a2 to i32
  %b2 = extractelement <4 x i16> %b, i64 2
  %b2.ext = zext i16 %b2 to i32
  %prod2 = mul i32 %b2.ext, %a2.ext

  ; Accumulate into the value already stored at %dst.
  %acc = load i32, i32 addrspace(1)* %dst, align 4
  %partial = add i32 %prod2, %acc
  %sum = add i32 %partial, %prod0
  store i32 %sum, i32 addrspace(1)* %dst, align 4
  ret void
}
855
; Negative dot2 test on <4 x i16> inputs: the multiplied lanes are elements 1
; and 2 of each vector. The expected code takes the high half of the first
; loaded dword (s_lshr_b32 ..., 16) and the low half of the second dword
; (s_and_b32 with 0xffff), i.e. the two lanes do not share one 32-bit register.
; The checks for every target, gfx906 included, expect two v_mad_u32_u24 and
; contain no dot2 instruction.
;   %src1, %src2 - global pointers to the two <4 x i16> operands.
;   %dst         - global i32; read as the accumulator, then overwritten with
;                  the result.
856define amdgpu_kernel void @notudot2_v4i16_Middle(<4 x i16> addrspace(1)* %src1,
857; GFX7-LABEL: notudot2_v4i16_Middle:
858; GFX7: ; %bb.0: ; %entry
859; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
860; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
861; GFX7-NEXT: s_mov_b32 s8, 0xffff
862; GFX7-NEXT: s_mov_b32 s3, 0xf000
863; GFX7-NEXT: s_mov_b32 s2, -1
864; GFX7-NEXT: s_waitcnt lgkmcnt(0)
865; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
866; GFX7-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
867; GFX7-NEXT: s_load_dword s9, s[0:1], 0x0
868; GFX7-NEXT: s_waitcnt lgkmcnt(0)
869; GFX7-NEXT: s_and_b32 s5, s5, s8
870; GFX7-NEXT: s_lshr_b32 s4, s4, 16
871; GFX7-NEXT: s_and_b32 s7, s7, s8
872; GFX7-NEXT: v_mov_b32_e32 v0, s5
873; GFX7-NEXT: v_mov_b32_e32 v1, s9
874; GFX7-NEXT: v_mad_u32_u24 v0, s7, v0, v1
875; GFX7-NEXT: s_lshr_b32 s6, s6, 16
876; GFX7-NEXT: v_mov_b32_e32 v1, s4
877; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0
878; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
879; GFX7-NEXT: s_endpgm
880;
881; GFX89-LABEL: notudot2_v4i16_Middle:
882; GFX89: ; %bb.0: ; %entry
883; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
884; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
885; GFX89-NEXT: s_mov_b32 s8, 0xffff
886; GFX89-NEXT: s_waitcnt lgkmcnt(0)
887; GFX89-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
888; GFX89-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
889; GFX89-NEXT: s_load_dword s6, s[0:1], 0x0
890; GFX89-NEXT: s_waitcnt lgkmcnt(0)
891; GFX89-NEXT: s_and_b32 s3, s3, s8
892; GFX89-NEXT: s_lshr_b32 s2, s2, 16
893; GFX89-NEXT: s_and_b32 s5, s5, s8
894; GFX89-NEXT: v_mov_b32_e32 v0, s6
895; GFX89-NEXT: v_mov_b32_e32 v1, s3
896; GFX89-NEXT: v_mad_u32_u24 v0, s5, v1, v0
897; GFX89-NEXT: s_lshr_b32 s4, s4, 16
898; GFX89-NEXT: v_mov_b32_e32 v1, s2
899; GFX89-NEXT: v_mad_u32_u24 v2, s4, v1, v0
900; GFX89-NEXT: v_mov_b32_e32 v0, s0
901; GFX89-NEXT: v_mov_b32_e32 v1, s1
902; GFX89-NEXT: {{flat|global}}_store_dword v[0:1], v2
903; GFX89-NEXT: s_endpgm
904;
905; GCN-DL-LABEL: notudot2_v4i16_Middle:
906; GCN-DL: ; %bb.0: ; %entry
907; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
908; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
909; GCN-DL-NEXT: s_mov_b32 s8, 0xffff
910; GCN-DL-NEXT: s_waitcnt lgkmcnt(0)
911; GCN-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
912; GCN-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
913; GCN-DL-NEXT: s_load_dword s6, s[0:1], 0x0
914; GCN-DL-NEXT: s_waitcnt lgkmcnt(0)
915; GCN-DL-NEXT: s_and_b32 s3, s3, s8
916; GCN-DL-NEXT: s_lshr_b32 s2, s2, 16
917; GCN-DL-NEXT: s_and_b32 s5, s5, s8
918; GCN-DL-NEXT: v_mov_b32_e32 v0, s6
919; GCN-DL-NEXT: v_mov_b32_e32 v1, s3
920; GCN-DL-NEXT: v_mad_u32_u24 v0, s5, v1, v0
921; GCN-DL-NEXT: s_lshr_b32 s4, s4, 16
922; GCN-DL-NEXT: v_mov_b32_e32 v1, s2
923; GCN-DL-NEXT: v_mad_u32_u24 v2, s4, v1, v0
924; GCN-DL-NEXT: v_mov_b32_e32 v0, s0
925; GCN-DL-NEXT: v_mov_b32_e32 v1, s1
926; GCN-DL-NEXT: global_store_dword v[0:1], v2, off
927; GCN-DL-NEXT: s_endpgm
928 <4 x i16> addrspace(1)* %src2,
929 i32 addrspace(1)* nocapture %dst) {
930entry:
931 %vec1 = load <4 x i16>, <4 x i16> addrspace(1)* %src1
932 %vec2 = load <4 x i16>, <4 x i16> addrspace(1)* %src2
933
; First product: element 1 of each vector, zero-extended to i32.
934 %s1.elt1 = extractelement <4 x i16> %vec1, i64 1
935 %conv = zext i16 %s1.elt1 to i32
936 %s2.elt1 = extractelement <4 x i16> %vec2, i64 1
937 %conv2 = zext i16 %s2.elt1 to i32
938 %mul1 = mul i32 %conv2, %conv
939
; Second product: element 2 of each vector, zero-extended to i32.
940 %s1.elt2 = extractelement <4 x i16> %vec1, i64 2
941 %conv3 = zext i16 %s1.elt2 to i32
942 %s2.elt2 = extractelement <4 x i16> %vec2, i64 2
943 %conv4 = zext i16 %s2.elt2 to i32
944 %mul2 = mul i32 %conv4, %conv3
945
; Add both products to the value currently stored at %dst and write it back.
946 %s3 = load i32, i32 addrspace(1)* %dst, align 4
947 %add = add i32 %mul2, %s3
948 %add6 = add i32 %add, %mul1
949 store i32 %add6, i32 addrspace(1)* %dst, align 4
950 ret void
951}
952
; Negative dot2 test on <2 x i16> inputs with crossed lane indices: the first
; product is vec1[0] * vec2[1] and the second is vec1[1] * vec2[0], so neither
; product multiplies matching lanes of the two vectors.
; The checks for every target, gfx906 included, expect two v_mad_u32_u24 and
; contain no dot2 instruction.
;   %src1, %src2 - global pointers to the two <2 x i16> operands.
;   %dst         - global i32; read as the accumulator, then overwritten with
;                  the result.
953define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1,
954; GFX7-LABEL: notudot2_DiffIndex:
955; GFX7: ; %bb.0: ; %entry
956; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
957; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
958; GFX7-NEXT: s_mov_b32 s8, 0xffff
959; GFX7-NEXT: s_mov_b32 s3, 0xf000
960; GFX7-NEXT: s_mov_b32 s2, -1
961; GFX7-NEXT: s_waitcnt lgkmcnt(0)
962; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
963; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
964; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
965; GFX7-NEXT: s_waitcnt lgkmcnt(0)
966; GFX7-NEXT: s_lshr_b32 s7, s4, 16
967; GFX7-NEXT: s_lshr_b32 s9, s5, 16
968; GFX7-NEXT: s_and_b32 s4, s4, s8
969; GFX7-NEXT: s_and_b32 s5, s5, s8
970; GFX7-NEXT: v_mov_b32_e32 v0, s7
971; GFX7-NEXT: v_mov_b32_e32 v1, s6
972; GFX7-NEXT: v_mad_u32_u24 v0, s5, v0, v1
973; GFX7-NEXT: v_mov_b32_e32 v1, s4
974; GFX7-NEXT: v_mad_u32_u24 v0, s9, v1, v0
975; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
976; GFX7-NEXT: s_endpgm
977;
978; GFX89-LABEL: notudot2_DiffIndex:
979; GFX89: ; %bb.0: ; %entry
980; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
981; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
982; GFX89-NEXT: s_mov_b32 s2, 0xffff
983; GFX89-NEXT: s_waitcnt lgkmcnt(0)
984; GFX89-NEXT: s_load_dword s3, s[4:5], 0x0
985; GFX89-NEXT: s_load_dword s4, s[6:7], 0x0
986; GFX89-NEXT: s_load_dword s5, s[0:1], 0x0
987; GFX89-NEXT: s_waitcnt lgkmcnt(0)
988; GFX89-NEXT: s_and_b32 s6, s3, s2
989; GFX89-NEXT: s_lshr_b32 s3, s3, 16
990; GFX89-NEXT: s_and_b32 s2, s4, s2
991; GFX89-NEXT: v_mov_b32_e32 v0, s5
992; GFX89-NEXT: v_mov_b32_e32 v1, s3
993; GFX89-NEXT: v_mad_u32_u24 v0, s2, v1, v0
994; GFX89-NEXT: s_lshr_b32 s7, s4, 16
995; GFX89-NEXT: v_mov_b32_e32 v1, s6
996; GFX89-NEXT: v_mad_u32_u24 v2, s7, v1, v0
997; GFX89-NEXT: v_mov_b32_e32 v0, s0
998; GFX89-NEXT: v_mov_b32_e32 v1, s1
999; GFX89-NEXT: {{flat|global}}_store_dword v[0:1], v2
1000; GFX89-NEXT: s_endpgm
1001;
1002; GCN-DL-LABEL: notudot2_DiffIndex:
1003; GCN-DL: ; %bb.0: ; %entry
1004; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1005; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1006; GCN-DL-NEXT: s_mov_b32 s2, 0xffff
1007; GCN-DL-NEXT: s_waitcnt lgkmcnt(0)
1008; GCN-DL-NEXT: s_load_dword s3, s[4:5], 0x0
1009; GCN-DL-NEXT: s_load_dword s4, s[6:7], 0x0
1010; GCN-DL-NEXT: s_load_dword s5, s[0:1], 0x0
1011; GCN-DL-NEXT: s_waitcnt lgkmcnt(0)
1012; GCN-DL-NEXT: s_and_b32 s6, s3, s2
1013; GCN-DL-NEXT: s_lshr_b32 s3, s3, 16
1014; GCN-DL-NEXT: s_and_b32 s2, s4, s2
1015; GCN-DL-NEXT: v_mov_b32_e32 v0, s5
1016; GCN-DL-NEXT: v_mov_b32_e32 v1, s3
1017; GCN-DL-NEXT: v_mad_u32_u24 v0, s2, v1, v0
1018; GCN-DL-NEXT: s_lshr_b32 s7, s4, 16
1019; GCN-DL-NEXT: v_mov_b32_e32 v1, s6
1020; GCN-DL-NEXT: v_mad_u32_u24 v2, s7, v1, v0
1021; GCN-DL-NEXT: v_mov_b32_e32 v0, s0
1022; GCN-DL-NEXT: v_mov_b32_e32 v1, s1
1023; GCN-DL-NEXT: global_store_dword v[0:1], v2, off
1024; GCN-DL-NEXT: s_endpgm
1025 <2 x i16> addrspace(1)* %src2,
1026 i32 addrspace(1)* nocapture %dst) {
1027entry:
1028 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
1029 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
1030
; First product: lane 0 of %vec1 times lane 1 of %vec2 (indices differ).
1031 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
1032 %conv = zext i16 %s1.elt1 to i32
1033 %s2.elt1 = extractelement <2 x i16> %vec2, i64 1
1034 %conv2 = zext i16 %s2.elt1 to i32
1035 %mul1 = mul i32 %conv2, %conv
1036
; Second product: lane 1 of %vec1 times lane 0 of %vec2 (indices differ).
1037 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
1038 %conv3 = zext i16 %s1.elt2 to i32
1039 %s2.elt2 = extractelement <2 x i16> %vec2, i64 0
1040 %conv4 = zext i16 %s2.elt2 to i32
1041 %mul2 = mul i32 %conv4, %conv3
1042
; Add both products to the value currently stored at %dst and write it back.
1043 %s3 = load i32, i32 addrspace(1)* %dst, align 4
1044 %add = add i32 %mul2, %s3
1045 %add6 = add i32 %add, %mul1
1046 store i32 %add6, i32 addrspace(1)* %dst, align 4
1047 ret void
1048}
1049
; Unsigned (zext) two-lane multiply-accumulate where the inner sum %add1
; (%mul2 + accumulator) has a second user: it feeds %add2 and is also added
; again in %res. The checks for every target, gfx906 included, expect two
; v_mad_u32_u24 followed by a separate add, and contain no dot2 instruction.
; The add check shared by gfx803 and gfx900 matches only "v_add_u32_e32 v2"
; with no source operands; presumably the two targets print different operand
; lists for this instruction, so regenerating the assertions may need that
; line re-trimmed by hand (TODO confirm).
;   %src1, %src2 - global pointers to the two <2 x i16> operands.
;   %dst         - global i32; read as the accumulator, then overwritten with
;                  the result.
1050define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1,
1051; GFX7-LABEL: udot2_MultipleUses_add1:
1052; GFX7: ; %bb.0: ; %entry
1053; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1054; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1055; GFX7-NEXT: s_mov_b32 s8, 0xffff
1056; GFX7-NEXT: s_mov_b32 s3, 0xf000
1057; GFX7-NEXT: s_mov_b32 s2, -1
1058; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1059; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
1060; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
1061; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
1062; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1063; GFX7-NEXT: s_lshr_b32 s7, s4, 16
1064; GFX7-NEXT: s_lshr_b32 s9, s5, 16
1065; GFX7-NEXT: s_and_b32 s4, s4, s8
1066; GFX7-NEXT: v_mov_b32_e32 v0, s7
1067; GFX7-NEXT: v_mov_b32_e32 v1, s6
1068; GFX7-NEXT: v_mad_u32_u24 v0, s9, v0, v1
1069; GFX7-NEXT: s_and_b32 s5, s5, s8
1070; GFX7-NEXT: v_mov_b32_e32 v1, s4
1071; GFX7-NEXT: v_mad_u32_u24 v1, s5, v1, v0
1072; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
1073; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1074; GFX7-NEXT: s_endpgm
1075;
1076; GFX89-LABEL: udot2_MultipleUses_add1:
1077; GFX89: ; %bb.0: ; %entry
1078; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1079; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1080; GFX89-NEXT: s_mov_b32 s2, 0xffff
1081; GFX89-NEXT: s_waitcnt lgkmcnt(0)
1082; GFX89-NEXT: s_load_dword s3, s[4:5], 0x0
1083; GFX89-NEXT: s_load_dword s4, s[6:7], 0x0
1084; GFX89-NEXT: s_load_dword s5, s[0:1], 0x0
1085; GFX89-NEXT: s_waitcnt lgkmcnt(0)
1086; GFX89-NEXT: s_and_b32 s6, s3, s2
1087; GFX89-NEXT: s_lshr_b32 s3, s3, 16
1088; GFX89-NEXT: s_and_b32 s2, s4, s2
1089; GFX89-NEXT: s_lshr_b32 s4, s4, 16
1090; GFX89-NEXT: v_mov_b32_e32 v0, s5
1091; GFX89-NEXT: v_mov_b32_e32 v1, s3
1092; GFX89-NEXT: v_mad_u32_u24 v0, s4, v1, v0
1093; GFX89-NEXT: v_mov_b32_e32 v1, s6
1094; GFX89-NEXT: v_mad_u32_u24 v1, s2, v1, v0
1095; GFX89-NEXT: v_add_u32_e32 v2
1096; GFX89-NEXT: v_mov_b32_e32 v0, s0
1097; GFX89-NEXT: v_mov_b32_e32 v1, s1
1098; GFX89-NEXT: {{flat|global}}_store_dword v[0:1], v2
1099; GFX89-NEXT: s_endpgm
1100;
1101; GCN-DL-LABEL: udot2_MultipleUses_add1:
1102; GCN-DL: ; %bb.0: ; %entry
1103; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1104; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1105; GCN-DL-NEXT: s_mov_b32 s2, 0xffff
1106; GCN-DL-NEXT: s_waitcnt lgkmcnt(0)
1107; GCN-DL-NEXT: s_load_dword s3, s[4:5], 0x0
1108; GCN-DL-NEXT: s_load_dword s4, s[6:7], 0x0
1109; GCN-DL-NEXT: s_load_dword s5, s[0:1], 0x0
1110; GCN-DL-NEXT: s_waitcnt lgkmcnt(0)
1111; GCN-DL-NEXT: s_and_b32 s6, s3, s2
1112; GCN-DL-NEXT: s_lshr_b32 s3, s3, 16
1113; GCN-DL-NEXT: s_and_b32 s2, s4, s2
1114; GCN-DL-NEXT: s_lshr_b32 s4, s4, 16
1115; GCN-DL-NEXT: v_mov_b32_e32 v0, s5
1116; GCN-DL-NEXT: v_mov_b32_e32 v1, s3
1117; GCN-DL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
1118; GCN-DL-NEXT: v_mov_b32_e32 v1, s6
1119; GCN-DL-NEXT: v_mad_u32_u24 v1, s2, v1, v0
1120; GCN-DL-NEXT: v_add_u32_e32 v2, v1, v0
1121; GCN-DL-NEXT: v_mov_b32_e32 v0, s0
1122; GCN-DL-NEXT: v_mov_b32_e32 v1, s1
1123; GCN-DL-NEXT: global_store_dword v[0:1], v2, off
1124; GCN-DL-NEXT: s_endpgm
1125 <2 x i16> addrspace(1)* %src2,
1126 i32 addrspace(1)* nocapture %dst) {
1127entry:
1128 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
1129 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
1130
; Lane 0 of both vectors, zero-extended and multiplied.
1131 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
1132 %conv = zext i16 %s1.elt1 to i32
1133 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
1134 %conv2 = zext i16 %s2.elt1 to i32
1135 %mul1 = mul i32 %conv2, %conv
1136
; Lane 1 of both vectors, zero-extended and multiplied.
1137 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
1138 %conv3 = zext i16 %s1.elt2 to i32
1139 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
1140 %conv4 = zext i16 %s2.elt2 to i32
1141 %mul2 = mul i32 %conv4, %conv3
1142
1143 %s3 = load i32, i32 addrspace(1)* %dst, align 4
1144 %add1 = add i32 %mul2, %s3
1145 %add2 = add i32 %add1, %mul1
1146
; Second use of %add1: this extra user is what the test is about.
1147 %res = add i32 %add2, %add1
1148 store i32 %res, i32 addrspace(1)* %dst, align 4
1149 ret void
1150}
1151
; Signed (sext) counterpart of the multiple-use-add case: the inner sum %add1
; (%mul2 + accumulator) feeds %add2 and is added again in %res. The checks for
; every target, gfx906 included, expect two v_mad_i32_i24 followed by a
; separate add, and contain no dot2 instruction.
; The add check shared by gfx803 and gfx900 matches only "v_add_u32_e32 v2"
; with no source operands; presumably the two targets print different operand
; lists for this instruction, so regenerating the assertions may need that
; line re-trimmed by hand (TODO confirm).
;   %src1, %src2 - global pointers to the two <2 x i16> operands.
;   %dst         - global i32; read as the accumulator, then overwritten with
;                  the result.
1152define amdgpu_kernel void @idot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1,
1153; GFX7-LABEL: idot2_MultipleUses_add1:
1154; GFX7: ; %bb.0: ; %entry
1155; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1156; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1157; GFX7-NEXT: s_mov_b32 s3, 0xf000
1158; GFX7-NEXT: s_mov_b32 s2, -1
1159; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1160; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
1161; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
1162; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
1163; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1164; GFX7-NEXT: s_sext_i32_i16 s7, s4
1165; GFX7-NEXT: s_ashr_i32 s4, s4, 16
1166; GFX7-NEXT: s_sext_i32_i16 s8, s5
1167; GFX7-NEXT: s_ashr_i32 s5, s5, 16
1168; GFX7-NEXT: v_mov_b32_e32 v0, s4
1169; GFX7-NEXT: v_mov_b32_e32 v1, s6
1170; GFX7-NEXT: v_mad_i32_i24 v0, s5, v0, v1
1171; GFX7-NEXT: v_mov_b32_e32 v1, s7
1172; GFX7-NEXT: v_mad_i32_i24 v1, s8, v1, v0
1173; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
1174; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1175; GFX7-NEXT: s_endpgm
1176;
1177; GFX89-LABEL: idot2_MultipleUses_add1:
1178; GFX89: ; %bb.0: ; %entry
1179; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1180; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1181; GFX89-NEXT: s_waitcnt lgkmcnt(0)
1182; GFX89-NEXT: s_load_dword s2, s[4:5], 0x0
1183; GFX89-NEXT: s_load_dword s3, s[6:7], 0x0
1184; GFX89-NEXT: s_load_dword s4, s[0:1], 0x0
1185; GFX89-NEXT: v_mov_b32_e32 v0, s0
1186; GFX89-NEXT: v_mov_b32_e32 v1, s1
1187; GFX89-NEXT: s_waitcnt lgkmcnt(0)
1188; GFX89-NEXT: s_sext_i32_i16 s0, s2
1189; GFX89-NEXT: s_ashr_i32 s2, s2, 16
1190; GFX89-NEXT: s_sext_i32_i16 s1, s3
1191; GFX89-NEXT: s_ashr_i32 s3, s3, 16
1192; GFX89-NEXT: v_mov_b32_e32 v2, s4
1193; GFX89-NEXT: v_mov_b32_e32 v3, s2
1194; GFX89-NEXT: v_mad_i32_i24 v2, s3, v3, v2
1195; GFX89-NEXT: v_mov_b32_e32 v3, s0
1196; GFX89-NEXT: v_mad_i32_i24 v3, s1, v3, v2
1197; GFX89-NEXT: v_add_u32_e32 v2
1198; GFX89-NEXT: {{flat|global}}_store_dword v[0:1], v2
1199; GFX89-NEXT: s_endpgm
1200;
1201; GCN-DL-LABEL: idot2_MultipleUses_add1:
1202; GCN-DL: ; %bb.0: ; %entry
1203; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1204; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1205; GCN-DL-NEXT: s_waitcnt lgkmcnt(0)
1206; GCN-DL-NEXT: s_load_dword s2, s[4:5], 0x0
1207; GCN-DL-NEXT: s_load_dword s3, s[6:7], 0x0
1208; GCN-DL-NEXT: s_load_dword s4, s[0:1], 0x0
1209; GCN-DL-NEXT: v_mov_b32_e32 v0, s0
1210; GCN-DL-NEXT: v_mov_b32_e32 v1, s1
1211; GCN-DL-NEXT: s_waitcnt lgkmcnt(0)
1212; GCN-DL-NEXT: s_sext_i32_i16 s0, s2
1213; GCN-DL-NEXT: s_ashr_i32 s2, s2, 16
1214; GCN-DL-NEXT: s_sext_i32_i16 s1, s3
1215; GCN-DL-NEXT: s_ashr_i32 s3, s3, 16
1216; GCN-DL-NEXT: v_mov_b32_e32 v2, s4
1217; GCN-DL-NEXT: v_mov_b32_e32 v3, s2
1218; GCN-DL-NEXT: v_mad_i32_i24 v2, s3, v3, v2
1219; GCN-DL-NEXT: v_mov_b32_e32 v3, s0
1220; GCN-DL-NEXT: v_mad_i32_i24 v3, s1, v3, v2
1221; GCN-DL-NEXT: v_add_u32_e32 v2, v3, v2
1222; GCN-DL-NEXT: global_store_dword v[0:1], v2, off
1223; GCN-DL-NEXT: s_endpgm
1224 <2 x i16> addrspace(1)* %src2,
1225 i32 addrspace(1)* nocapture %dst) {
1226entry:
1227 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
1228 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
1229
; Lane 0 of both vectors, sign-extended and multiplied.
1230 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
1231 %conv = sext i16 %s1.elt1 to i32
1232 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
1233 %conv2 = sext i16 %s2.elt1 to i32
1234 %mul1 = mul i32 %conv2, %conv
1235
; Lane 1 of both vectors, sign-extended and multiplied.
1236 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
1237 %conv3 = sext i16 %s1.elt2 to i32
1238 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
1239 %conv4 = sext i16 %s2.elt2 to i32
1240 %mul2 = mul i32 %conv4, %conv3
1241
1242 %s3 = load i32, i32 addrspace(1)* %dst, align 4
1243 %add1 = add i32 %mul2, %s3
1244 %add2 = add i32 %add1, %mul1
1245
; Second use of %add1: this extra user is what the test is about.
1246 %res = add i32 %add2, %add1
1247 store i32 %res, i32 addrspace(1)* %dst, align 4
1248 ret void
1249}
1250
; Unsigned (zext) two-lane multiply-accumulate where the lane-0 product %mul1
; is consumed twice: once in %add0 and again in %add2. The checks for every
; target, gfx906 included, expect three v_mad_u32_u24 (the first and third
; reuse the same multiplicand registers) and contain no dot2 instruction.
;   %src1, %src2 - global pointers to the two <2 x i16> operands.
;   %dst         - global i32; read as the accumulator, then overwritten with
;                  the result.
1251define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1,
1252; GFX7-LABEL: udot2_MultipleUses_mul1:
1253; GFX7: ; %bb.0: ; %entry
1254; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1255; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1256; GFX7-NEXT: s_mov_b32 s8, 0xffff
1257; GFX7-NEXT: s_mov_b32 s3, 0xf000
1258; GFX7-NEXT: s_mov_b32 s2, -1
1259; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1260; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
1261; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
1262; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
1263; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1264; GFX7-NEXT: s_lshr_b32 s7, s4, 16
1265; GFX7-NEXT: s_and_b32 s4, s4, s8
1266; GFX7-NEXT: s_lshr_b32 s9, s5, 16
1267; GFX7-NEXT: s_and_b32 s5, s5, s8
1268; GFX7-NEXT: v_mov_b32_e32 v0, s4
1269; GFX7-NEXT: v_mov_b32_e32 v1, s6
1270; GFX7-NEXT: v_mad_u32_u24 v1, s5, v0, v1
1271; GFX7-NEXT: v_mov_b32_e32 v2, s7
1272; GFX7-NEXT: v_mad_u32_u24 v1, s9, v2, v1
1273; GFX7-NEXT: v_mad_u32_u24 v0, s5, v0, v1
1274; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1275; GFX7-NEXT: s_endpgm
1276;
1277; GFX89-LABEL: udot2_MultipleUses_mul1:
1278; GFX89: ; %bb.0: ; %entry
1279; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1280; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1281; GFX89-NEXT: s_mov_b32 s2, 0xffff
1282; GFX89-NEXT: s_waitcnt lgkmcnt(0)
1283; GFX89-NEXT: s_load_dword s3, s[4:5], 0x0
1284; GFX89-NEXT: s_load_dword s4, s[6:7], 0x0
1285; GFX89-NEXT: s_load_dword s5, s[0:1], 0x0
1286; GFX89-NEXT: s_waitcnt lgkmcnt(0)
1287; GFX89-NEXT: s_and_b32 s6, s3, s2
1288; GFX89-NEXT: s_and_b32 s2, s4, s2
1289; GFX89-NEXT: s_lshr_b32 s3, s3, 16
1290; GFX89-NEXT: v_mov_b32_e32 v0, s5
1291; GFX89-NEXT: v_mov_b32_e32 v1, s6
1292; GFX89-NEXT: s_lshr_b32 s4, s4, 16
1293; GFX89-NEXT: v_mad_u32_u24 v0, s2, v1, v0
1294; GFX89-NEXT: v_mov_b32_e32 v2, s3
1295; GFX89-NEXT: v_mad_u32_u24 v0, s4, v2, v0
1296; GFX89-NEXT: v_mad_u32_u24 v2, s2, v1, v0
1297; GFX89-NEXT: v_mov_b32_e32 v0, s0
1298; GFX89-NEXT: v_mov_b32_e32 v1, s1
1299; GFX89-NEXT: {{flat|global}}_store_dword v[0:1], v2
1300; GFX89-NEXT: s_endpgm
1301;
1302; GCN-DL-LABEL: udot2_MultipleUses_mul1:
1303; GCN-DL: ; %bb.0: ; %entry
1304; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1305; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1306; GCN-DL-NEXT: s_mov_b32 s2, 0xffff
1307; GCN-DL-NEXT: s_waitcnt lgkmcnt(0)
1308; GCN-DL-NEXT: s_load_dword s3, s[4:5], 0x0
1309; GCN-DL-NEXT: s_load_dword s4, s[6:7], 0x0
1310; GCN-DL-NEXT: s_load_dword s5, s[0:1], 0x0
1311; GCN-DL-NEXT: s_waitcnt lgkmcnt(0)
1312; GCN-DL-NEXT: s_and_b32 s6, s3, s2
1313; GCN-DL-NEXT: s_and_b32 s2, s4, s2
1314; GCN-DL-NEXT: s_lshr_b32 s3, s3, 16
1315; GCN-DL-NEXT: v_mov_b32_e32 v0, s5
1316; GCN-DL-NEXT: v_mov_b32_e32 v1, s6
1317; GCN-DL-NEXT: s_lshr_b32 s4, s4, 16
1318; GCN-DL-NEXT: v_mad_u32_u24 v0, s2, v1, v0
1319; GCN-DL-NEXT: v_mov_b32_e32 v2, s3
1320; GCN-DL-NEXT: v_mad_u32_u24 v0, s4, v2, v0
1321; GCN-DL-NEXT: v_mad_u32_u24 v2, s2, v1, v0
1322; GCN-DL-NEXT: v_mov_b32_e32 v0, s0
1323; GCN-DL-NEXT: v_mov_b32_e32 v1, s1
1324; GCN-DL-NEXT: global_store_dword v[0:1], v2, off
1325; GCN-DL-NEXT: s_endpgm
1326 <2 x i16> addrspace(1)* %src2,
1327 i32 addrspace(1)* nocapture %dst) {
1328entry:
1329 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
1330 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
1331
; Lane 0 of both vectors, zero-extended and multiplied; %mul1 is used twice below.
1332 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
1333 %conv = zext i16 %s1.elt1 to i32
1334 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
1335 %conv2 = zext i16 %s2.elt1 to i32
1336 %mul1 = mul i32 %conv2, %conv
1337
; Lane 1 of both vectors, zero-extended and multiplied.
1338 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
1339 %conv3 = zext i16 %s1.elt2 to i32
1340 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
1341 %conv4 = zext i16 %s2.elt2 to i32
1342 %mul2 = mul i32 %conv4, %conv3
1343
; First use of %mul1: folded into the loaded accumulator.
1344 %s3 = load i32, i32 addrspace(1)* %dst, align 4
1345 %add0 = add i32 %mul1, %s3
1346
; Second use of %mul1 in %add2: this extra user is what the test is about.
1347 %add1 = add i32 %mul2, %add0
1348 %add2 = add i32 %add1, %mul1
1349
1350 store i32 %add2, i32 addrspace(1)* %dst, align 4
1351 ret void
1352}
1353
; Signed (sext) counterpart of the multiple-use-mul1 case: the lane-0 product
; %mul1 is consumed twice, once in %add0 and again in %add2. The checks for
; every target, gfx906 included, expect three v_mad_i32_i24 (the first and
; third reuse the same multiplicand registers) and contain no dot2 instruction.
;   %src1, %src2 - global pointers to the two <2 x i16> operands.
;   %dst         - global i32; read as the accumulator, then overwritten with
;                  the result.
1354define amdgpu_kernel void @idot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1,
1355; GFX7-LABEL: idot2_MultipleUses_mul1:
1356; GFX7: ; %bb.0: ; %entry
1357; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1358; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1359; GFX7-NEXT: s_mov_b32 s3, 0xf000
1360; GFX7-NEXT: s_mov_b32 s2, -1
1361; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1362; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
1363; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
1364; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
1365; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1366; GFX7-NEXT: s_sext_i32_i16 s7, s4
1367; GFX7-NEXT: s_sext_i32_i16 s8, s5
1368; GFX7-NEXT: s_ashr_i32 s4, s4, 16
1369; GFX7-NEXT: v_mov_b32_e32 v0, s7
1370; GFX7-NEXT: v_mov_b32_e32 v1, s6
1371; GFX7-NEXT: s_ashr_i32 s5, s5, 16
1372; GFX7-NEXT: v_mad_i32_i24 v1, s8, v0, v1
1373; GFX7-NEXT: v_mov_b32_e32 v2, s4
1374; GFX7-NEXT: v_mad_i32_i24 v1, s5, v2, v1
1375; GFX7-NEXT: v_mad_i32_i24 v0, s8, v0, v1
1376; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1377; GFX7-NEXT: s_endpgm
1378;
1379; GFX89-LABEL: idot2_MultipleUses_mul1:
1380; GFX89: ; %bb.0: ; %entry
1381; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1382; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1383; GFX89-NEXT: s_waitcnt lgkmcnt(0)
1384; GFX89-NEXT: s_load_dword s2, s[4:5], 0x0
1385; GFX89-NEXT: s_load_dword s3, s[6:7], 0x0
1386; GFX89-NEXT: s_load_dword s4, s[0:1], 0x0
1387; GFX89-NEXT: v_mov_b32_e32 v0, s0
1388; GFX89-NEXT: v_mov_b32_e32 v1, s1
1389; GFX89-NEXT: s_waitcnt lgkmcnt(0)
1390; GFX89-NEXT: s_sext_i32_i16 s0, s2
1391; GFX89-NEXT: s_sext_i32_i16 s1, s3
1392; GFX89-NEXT: s_ashr_i32 s2, s2, 16
1393; GFX89-NEXT: v_mov_b32_e32 v2, s4
1394; GFX89-NEXT: v_mov_b32_e32 v3, s0
1395; GFX89-NEXT: s_ashr_i32 s3, s3, 16
1396; GFX89-NEXT: v_mad_i32_i24 v2, s1, v3, v2
1397; GFX89-NEXT: v_mov_b32_e32 v4, s2
1398; GFX89-NEXT: v_mad_i32_i24 v2, s3, v4, v2
1399; GFX89-NEXT: v_mad_i32_i24 v2, s1, v3, v2
1400; GFX89-NEXT: {{flat|global}}_store_dword v[0:1], v2
1401; GFX89-NEXT: s_endpgm
1402;
1403; GCN-DL-LABEL: idot2_MultipleUses_mul1:
1404; GCN-DL: ; %bb.0: ; %entry
1405; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1406; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1407; GCN-DL-NEXT: s_waitcnt lgkmcnt(0)
1408; GCN-DL-NEXT: s_load_dword s2, s[4:5], 0x0
1409; GCN-DL-NEXT: s_load_dword s3, s[6:7], 0x0
1410; GCN-DL-NEXT: s_load_dword s4, s[0:1], 0x0
1411; GCN-DL-NEXT: v_mov_b32_e32 v0, s0
1412; GCN-DL-NEXT: v_mov_b32_e32 v1, s1
1413; GCN-DL-NEXT: s_waitcnt lgkmcnt(0)
1414; GCN-DL-NEXT: s_sext_i32_i16 s0, s2
1415; GCN-DL-NEXT: s_sext_i32_i16 s1, s3
1416; GCN-DL-NEXT: s_ashr_i32 s2, s2, 16
1417; GCN-DL-NEXT: v_mov_b32_e32 v2, s4
1418; GCN-DL-NEXT: v_mov_b32_e32 v3, s0
1419; GCN-DL-NEXT: s_ashr_i32 s3, s3, 16
1420; GCN-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
1421; GCN-DL-NEXT: v_mov_b32_e32 v4, s2
1422; GCN-DL-NEXT: v_mad_i32_i24 v2, s3, v4, v2
1423; GCN-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
1424; GCN-DL-NEXT: global_store_dword v[0:1], v2, off
1425; GCN-DL-NEXT: s_endpgm
1426 <2 x i16> addrspace(1)* %src2,
1427 i32 addrspace(1)* nocapture %dst) {
1428entry:
1429 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
1430 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
1431
; Lane 0 of both vectors, sign-extended and multiplied; %mul1 is used twice below.
1432 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
1433 %conv = sext i16 %s1.elt1 to i32
1434 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
1435 %conv2 = sext i16 %s2.elt1 to i32
1436 %mul1 = mul i32 %conv2, %conv
1437
; Lane 1 of both vectors, sign-extended and multiplied.
1438 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
1439 %conv3 = sext i16 %s1.elt2 to i32
1440 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
1441 %conv4 = sext i16 %s2.elt2 to i32
1442 %mul2 = mul i32 %conv4, %conv3
1443
; First use of %mul1: folded into the loaded accumulator.
1444 %s3 = load i32, i32 addrspace(1)* %dst, align 4
1445 %add0 = add i32 %mul1, %s3
1446
; Second use of %mul1 in %add2: this extra user is what the test is about.
1447 %add1 = add i32 %mul2, %add0
1448 %add2 = add i32 %add1, %mul1
1449
1450 store i32 %add2, i32 addrspace(1)* %dst, align 4
1451 ret void
1452}
1453
; Unsigned (zext) two-lane multiply-accumulate where the lane-1 product %mul2
; is consumed twice: once in %add0 and again in %add1. The checks for every
; target, gfx906 included, expect three v_mad_u32_u24 (the first two reuse the
; same multiplicand registers) and contain no dot2 instruction.
;   %src1, %src2 - global pointers to the two <2 x i16> operands.
;   %dst         - global i32; read as the accumulator, then overwritten with
;                  the result.
1454define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1,
1455; GFX7-LABEL: udot2_MultipleUses_mul2:
1456; GFX7: ; %bb.0: ; %entry
1457; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1458; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1459; GFX7-NEXT: s_mov_b32 s8, 0xffff
1460; GFX7-NEXT: s_mov_b32 s3, 0xf000
1461; GFX7-NEXT: s_mov_b32 s2, -1
1462; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1463; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
1464; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
1465; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
1466; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1467; GFX7-NEXT: s_lshr_b32 s7, s4, 16
1468; GFX7-NEXT: s_lshr_b32 s9, s5, 16
1469; GFX7-NEXT: v_mov_b32_e32 v0, s7
1470; GFX7-NEXT: v_mov_b32_e32 v1, s6
1471; GFX7-NEXT: v_mad_u32_u24 v1, s9, v0, v1
1472; GFX7-NEXT: s_and_b32 s4, s4, s8
1473; GFX7-NEXT: v_mad_u32_u24 v0, s9, v0, v1
1474; GFX7-NEXT: s_and_b32 s5, s5, s8
1475; GFX7-NEXT: v_mov_b32_e32 v1, s4
1476; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0
1477; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1478; GFX7-NEXT: s_endpgm
1479;
1480; GFX89-LABEL: udot2_MultipleUses_mul2:
1481; GFX89: ; %bb.0: ; %entry
1482; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1483; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1484; GFX89-NEXT: s_mov_b32 s2, 0xffff
1485; GFX89-NEXT: s_waitcnt lgkmcnt(0)
1486; GFX89-NEXT: s_load_dword s3, s[4:5], 0x0
1487; GFX89-NEXT: s_load_dword s4, s[6:7], 0x0
1488; GFX89-NEXT: s_load_dword s5, s[0:1], 0x0
1489; GFX89-NEXT: s_waitcnt lgkmcnt(0)
1490; GFX89-NEXT: s_and_b32 s6, s3, s2
1491; GFX89-NEXT: s_lshr_b32 s3, s3, 16
1492; GFX89-NEXT: s_and_b32 s2, s4, s2
1493; GFX89-NEXT: s_lshr_b32 s4, s4, 16
1494; GFX89-NEXT: v_mov_b32_e32 v0, s5
1495; GFX89-NEXT: v_mov_b32_e32 v1, s3
1496; GFX89-NEXT: v_mad_u32_u24 v0, s4, v1, v0
1497; GFX89-NEXT: v_mad_u32_u24 v0, s4, v1, v0
1498; GFX89-NEXT: v_mov_b32_e32 v1, s6
1499; GFX89-NEXT: v_mad_u32_u24 v2, s2, v1, v0
1500; GFX89-NEXT: v_mov_b32_e32 v0, s0
1501; GFX89-NEXT: v_mov_b32_e32 v1, s1
1502; GFX89-NEXT: {{flat|global}}_store_dword v[0:1], v2
1503; GFX89-NEXT: s_endpgm
1504;
1505; GCN-DL-LABEL: udot2_MultipleUses_mul2:
1506; GCN-DL: ; %bb.0: ; %entry
1507; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1508; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1509; GCN-DL-NEXT: s_mov_b32 s2, 0xffff
1510; GCN-DL-NEXT: s_waitcnt lgkmcnt(0)
1511; GCN-DL-NEXT: s_load_dword s3, s[4:5], 0x0
1512; GCN-DL-NEXT: s_load_dword s4, s[6:7], 0x0
1513; GCN-DL-NEXT: s_load_dword s5, s[0:1], 0x0
1514; GCN-DL-NEXT: s_waitcnt lgkmcnt(0)
1515; GCN-DL-NEXT: s_and_b32 s6, s3, s2
1516; GCN-DL-NEXT: s_lshr_b32 s3, s3, 16
1517; GCN-DL-NEXT: s_and_b32 s2, s4, s2
1518; GCN-DL-NEXT: s_lshr_b32 s4, s4, 16
1519; GCN-DL-NEXT: v_mov_b32_e32 v0, s5
1520; GCN-DL-NEXT: v_mov_b32_e32 v1, s3
1521; GCN-DL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
1522; GCN-DL-NEXT: v_mad_u32_u24 v0, s4, v1, v0
1523; GCN-DL-NEXT: v_mov_b32_e32 v1, s6
1524; GCN-DL-NEXT: v_mad_u32_u24 v2, s2, v1, v0
1525; GCN-DL-NEXT: v_mov_b32_e32 v0, s0
1526; GCN-DL-NEXT: v_mov_b32_e32 v1, s1
1527; GCN-DL-NEXT: global_store_dword v[0:1], v2, off
1528; GCN-DL-NEXT: s_endpgm
1529 <2 x i16> addrspace(1)* %src2,
1530 i32 addrspace(1)* nocapture %dst) {
1531entry:
1532 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
1533 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
1534
; Lane 0 of both vectors, zero-extended and multiplied.
1535 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
1536 %conv = zext i16 %s1.elt1 to i32
1537 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
1538 %conv2 = zext i16 %s2.elt1 to i32
1539 %mul1 = mul i32 %conv2, %conv
1540
; Lane 1 of both vectors, zero-extended and multiplied; %mul2 is used twice below.
1541 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
1542 %conv3 = zext i16 %s1.elt2 to i32
1543 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
1544 %conv4 = zext i16 %s2.elt2 to i32
1545 %mul2 = mul i32 %conv4, %conv3
1546
; First use of %mul2: folded into the loaded accumulator.
1547 %s3 = load i32, i32 addrspace(1)* %dst, align 4
1548 %add0 = add i32 %mul2, %s3
1549
; Second use of %mul2 in %add1: this extra user is what the test is about.
1550 %add1 = add i32 %mul2, %add0
1551 %add2 = add i32 %add1, %mul1
1552
1553 store i32 %add2, i32 addrspace(1)* %dst, align 4
1554 ret void
1555}
1556
; Signed (sext) counterpart of the multiple-use-mul2 case: the lane-1 product
; %mul2 is consumed twice, once in %add0 and again in %add1. The checks for
; every target, gfx906 included, expect three v_mad_i32_i24 (the first two
; reuse the same multiplicand registers) and contain no dot2 instruction.
;   %src1, %src2 - global pointers to the two <2 x i16> operands.
;   %dst         - global i32; read as the accumulator, then overwritten with
;                  the result.
1557define amdgpu_kernel void @idot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1,
1558; GFX7-LABEL: idot2_MultipleUses_mul2:
1559; GFX7: ; %bb.0: ; %entry
1560; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1561; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1562; GFX7-NEXT: s_mov_b32 s3, 0xf000
1563; GFX7-NEXT: s_mov_b32 s2, -1
1564; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1565; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
1566; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
1567; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
1568; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1569; GFX7-NEXT: s_sext_i32_i16 s7, s4
1570; GFX7-NEXT: s_ashr_i32 s4, s4, 16
1571; GFX7-NEXT: s_sext_i32_i16 s8, s5
1572; GFX7-NEXT: s_ashr_i32 s5, s5, 16
1573; GFX7-NEXT: v_mov_b32_e32 v0, s4
1574; GFX7-NEXT: v_mov_b32_e32 v1, s6
1575; GFX7-NEXT: v_mad_i32_i24 v1, s5, v0, v1
1576; GFX7-NEXT: v_mad_i32_i24 v0, s5, v0, v1
1577; GFX7-NEXT: v_mov_b32_e32 v1, s7
1578; GFX7-NEXT: v_mad_i32_i24 v0, s8, v1, v0
1579; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1580; GFX7-NEXT: s_endpgm
1581;
1582; GFX89-LABEL: idot2_MultipleUses_mul2:
1583; GFX89: ; %bb.0: ; %entry
1584; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1585; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1586; GFX89-NEXT: s_waitcnt lgkmcnt(0)
1587; GFX89-NEXT: s_load_dword s2, s[4:5], 0x0
1588; GFX89-NEXT: s_load_dword s3, s[6:7], 0x0
1589; GFX89-NEXT: s_load_dword s4, s[0:1], 0x0
1590; GFX89-NEXT: v_mov_b32_e32 v0, s0
1591; GFX89-NEXT: v_mov_b32_e32 v1, s1
1592; GFX89-NEXT: s_waitcnt lgkmcnt(0)
1593; GFX89-NEXT: s_sext_i32_i16 s0, s2
1594; GFX89-NEXT: s_ashr_i32 s2, s2, 16
1595; GFX89-NEXT: s_sext_i32_i16 s1, s3
1596; GFX89-NEXT: s_ashr_i32 s3, s3, 16
1597; GFX89-NEXT: v_mov_b32_e32 v2, s4
1598; GFX89-NEXT: v_mov_b32_e32 v3, s2
1599; GFX89-NEXT: v_mad_i32_i24 v2, s3, v3, v2
1600; GFX89-NEXT: v_mad_i32_i24 v2, s3, v3, v2
1601; GFX89-NEXT: v_mov_b32_e32 v3, s0
1602; GFX89-NEXT: v_mad_i32_i24 v2, s1, v3, v2
1603; GFX89-NEXT: {{flat|global}}_store_dword v[0:1], v2
1604; GFX89-NEXT: s_endpgm
1605;
1606; GCN-DL-LABEL: idot2_MultipleUses_mul2:
1607; GCN-DL: ; %bb.0: ; %entry
1608; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1609; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1610; GCN-DL-NEXT: s_waitcnt lgkmcnt(0)
1611; GCN-DL-NEXT: s_load_dword s2, s[4:5], 0x0
1612; GCN-DL-NEXT: s_load_dword s3, s[6:7], 0x0
1613; GCN-DL-NEXT: s_load_dword s4, s[0:1], 0x0
1614; GCN-DL-NEXT: v_mov_b32_e32 v0, s0
1615; GCN-DL-NEXT: v_mov_b32_e32 v1, s1
1616; GCN-DL-NEXT: s_waitcnt lgkmcnt(0)
1617; GCN-DL-NEXT: s_sext_i32_i16 s0, s2
1618; GCN-DL-NEXT: s_ashr_i32 s2, s2, 16
1619; GCN-DL-NEXT: s_sext_i32_i16 s1, s3
1620; GCN-DL-NEXT: s_ashr_i32 s3, s3, 16
1621; GCN-DL-NEXT: v_mov_b32_e32 v2, s4
1622; GCN-DL-NEXT: v_mov_b32_e32 v3, s2
1623; GCN-DL-NEXT: v_mad_i32_i24 v2, s3, v3, v2
1624; GCN-DL-NEXT: v_mad_i32_i24 v2, s3, v3, v2
1625; GCN-DL-NEXT: v_mov_b32_e32 v3, s0
1626; GCN-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
1627; GCN-DL-NEXT: global_store_dword v[0:1], v2, off
1628; GCN-DL-NEXT: s_endpgm
1629 <2 x i16> addrspace(1)* %src2,
1630 i32 addrspace(1)* nocapture %dst) {
1631entry:
1632 %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
1633 %vec2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
1634
; Lane 0 of both vectors, sign-extended and multiplied.
1635 %s1.elt1 = extractelement <2 x i16> %vec1, i64 0
1636 %conv = sext i16 %s1.elt1 to i32
1637 %s2.elt1 = extractelement <2 x i16> %vec2, i64 0
1638 %conv2 = sext i16 %s2.elt1 to i32
1639 %mul1 = mul i32 %conv2, %conv
1640
; Lane 1 of both vectors, sign-extended and multiplied; %mul2 is used twice below.
1641 %s1.elt2 = extractelement <2 x i16> %vec1, i64 1
1642 %conv3 = sext i16 %s1.elt2 to i32
1643 %s2.elt2 = extractelement <2 x i16> %vec2, i64 1
1644 %conv4 = sext i16 %s2.elt2 to i32
1645 %mul2 = mul i32 %conv4, %conv3
1646
; First use of %mul2: folded into the loaded accumulator.
1647 %s3 = load i32, i32 addrspace(1)* %dst, align 4
1648 %add0 = add i32 %mul2, %s3
1649
; Second use of %mul2 in %add1: this extra user is what the test is about.
1650 %add1 = add i32 %mul2, %add0
1651 %add2 = add i32 %add1, %mul1
1652
1653 store i32 %add2, i32 addrspace(1)* %dst, align 4
1654 ret void
1655}
1656
; Test kernel udot2_acc16: two-element i16 multiply-accumulate where the
; accumulator is an i16 loaded from, and stored back to, %dst.
;   %src1, %src2 - pointers (addrspace 1) to the <2 x i16> input vectors
;   %dst         - pointer (addrspace 1) to the i16 accumulator / result
; Expected code, per the check lines below: the gfx700 checks and the shared
; gfx803/gfx900 checks expect two v_mad_u32_u24 instructions, while the
; gfx906 checks expect a single v_dot2_u32_u16 followed by a 16-bit store.
1657define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1,
1658; GFX7-LABEL: udot2_acc16:
1659; GFX7: ; %bb.0: ; %entry
1660; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1661; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1662; GFX7-NEXT: s_mov_b32 s3, 0xf000
1663; GFX7-NEXT: s_mov_b32 s2, -1
1664; GFX7-NEXT: s_mov_b32 s8, 0xffff
1665; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1666; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
1667; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0
1668; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
1669; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1670; GFX7-NEXT: s_lshr_b32 s6, s4, 16
1671; GFX7-NEXT: s_and_b32 s4, s4, s8
1672; GFX7-NEXT: s_lshr_b32 s7, s5, 16
1673; GFX7-NEXT: v_mov_b32_e32 v1, s7
1674; GFX7-NEXT: s_and_b32 s5, s5, s8
1675; GFX7-NEXT: s_waitcnt vmcnt(0)
1676; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0
1677; GFX7-NEXT: v_mov_b32_e32 v1, s5
1678; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0
1679; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
1680; GFX7-NEXT: s_endpgm
1681;
1682; GFX89-LABEL: udot2_acc16:
1683; GFX89: ; %bb.0: ; %entry
1684; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1685; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1686; GFX89-NEXT: s_waitcnt lgkmcnt(0)
1687; GFX89-NEXT: v_mov_b32_e32 v0, s0
1688; GFX89-NEXT: v_mov_b32_e32 v1, s1
1689; GFX89-NEXT: {{flat|global}}_load_ushort v2, v[0:1]
1690; GFX89-NEXT: s_load_dword s1, s[4:5], 0x0
1691; GFX89-NEXT: s_load_dword s2, s[6:7], 0x0
1692; GFX89-NEXT: s_mov_b32 s0, 0xffff
1693; GFX89-NEXT: s_waitcnt lgkmcnt(0)
1694; GFX89-NEXT: s_and_b32 s3, s1, s0
1695; GFX89-NEXT: s_and_b32 s0, s2, s0
1696; GFX89-NEXT: s_lshr_b32 s2, s2, 16
1697; GFX89-NEXT: s_lshr_b32 s1, s1, 16
1698; GFX89-NEXT: v_mov_b32_e32 v3, s2
1699; GFX89-NEXT: s_waitcnt vmcnt(0)
1700; GFX89-NEXT: v_mad_u32_u24 v2, s1, v3, v2
1701; GFX89-NEXT: v_mov_b32_e32 v3, s0
1702; GFX89-NEXT: v_mad_u32_u24 v2, s3, v3, v2
1703; GFX89-NEXT: {{flat|global}}_store_short v[0:1], v2
1704; GFX89-NEXT: s_endpgm
1705;
1706; GCN-DL-LABEL: udot2_acc16:
1707; GCN-DL: ; %bb.0: ; %entry
1708; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1709; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1710; GCN-DL-NEXT: s_waitcnt lgkmcnt(0)
1711; GCN-DL-NEXT: s_load_dword s2, s[4:5], 0x0
1712; GCN-DL-NEXT: s_load_dword s3, s[6:7], 0x0
1713; GCN-DL-NEXT: v_mov_b32_e32 v0, s0
1714; GCN-DL-NEXT: v_mov_b32_e32 v1, s1
1715; GCN-DL-NEXT: global_load_ushort v2, v[0:1], off
1716; GCN-DL-NEXT: s_waitcnt lgkmcnt(0)
1717; GCN-DL-NEXT: v_mov_b32_e32 v3, s3
1718; GCN-DL-NEXT: s_waitcnt vmcnt(0)
1719; GCN-DL-NEXT: v_dot2_u32_u16 v2, s2, v3, v2
1720; GCN-DL-NEXT: global_store_short v[0:1], v2, off
1721; GCN-DL-NEXT: s_endpgm
1722 <2 x i16> addrspace(1)* %src2,
1723 i16 addrspace(1)* nocapture %dst) {
1724entry:
; Load both packed 2 x i16 operands.
1725 %v1 = load <2 x i16>, <2 x i16> addrspace(1)* %src1
1726 %v2 = load <2 x i16>, <2 x i16> addrspace(1)* %src2
1727
; Product of the element-0 lanes, computed directly in i16 (no extension).
1728 %v1e1 = extractelement <2 x i16> %v1, i64 0
1729 %v2e1 = extractelement <2 x i16> %v2, i64 0
1730 %mul1 = mul i16 %v1e1, %v2e1
1731
; Product of the element-1 lanes, also in i16.
1732 %v1e2 = extractelement <2 x i16> %v1, i64 1
1733 %v2e2 = extractelement <2 x i16> %v2, i64 1
1734 %mul2 = mul i16 %v1e2, %v2e2
1735
; Accumulate: result = (mul2 + old value at %dst) + mul1, written back as i16.
1736 %s2 = load i16, i16 addrspace(1)* %dst, align 2
1737 %add1 = add i16 %mul2, %s2
1738 %add2 = add i16 %add1, %mul1
1739 store i16 %add2, i16 addrspace(1)* %dst, align 2
1740 ret void
1741}
1742
1743
; Test kernel notsdot2_sext8: two-element multiply-accumulate whose inputs are
; <2 x i8> vectors, each lane sign-extended to i32 before multiplying.
;   %src1, %src2 - pointers (addrspace 1) to the <2 x i8> input vectors
;   %dst         - pointer (addrspace 1) to the i32 accumulator / result
; Expected code, per the check lines below: every checked target, including
; gfx906, ends with two v_mad_i32_i24 instructions; no v_dot2 instruction
; appears in any check block. Presumably (going by the kernel name) this is a
; negative test for the signed dot2 combine -- confirm against the combine.
; NOTE(review): the two v_mad_i32_i24 checks under the GFX89 prefix use the
; plain (non -NEXT) directive and that block has no check for the
; "s_waitcnt lgkmcnt(0)" that the gfx906 block checks just before them;
; presumably the wait is placed differently on gfx803 and gfx900 -- confirm.
1744define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1,
1745; GFX7-LABEL: notsdot2_sext8:
1746; GFX7: ; %bb.0: ; %entry
1747; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1748; GFX7-NEXT: s_mov_b32 s3, 0xf000
1749; GFX7-NEXT: s_mov_b32 s2, -1
1750; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1751; GFX7-NEXT: s_mov_b32 s10, s2
1752; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1753; GFX7-NEXT: s_mov_b32 s8, s6
1754; GFX7-NEXT: s_mov_b32 s9, s7
1755; GFX7-NEXT: s_mov_b32 s11, s3
1756; GFX7-NEXT: s_mov_b32 s6, s2
1757; GFX7-NEXT: s_mov_b32 s7, s3
1758; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0
1759; GFX7-NEXT: buffer_load_ushort v1, off, s[8:11], 0
1760; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0
1761; GFX7-NEXT: s_waitcnt vmcnt(1)
1762; GFX7-NEXT: v_bfe_i32 v2, v0, 0, 8
1763; GFX7-NEXT: s_waitcnt vmcnt(0)
1764; GFX7-NEXT: v_bfe_i32 v3, v1, 0, 8
1765; GFX7-NEXT: v_bfe_i32 v0, v0, 8, 8
1766; GFX7-NEXT: v_bfe_i32 v1, v1, 8, 8
1767; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1768; GFX7-NEXT: v_mad_i32_i24 v0, v1, v0, s4
1769; GFX7-NEXT: v_mad_i32_i24 v0, v3, v2, v0
1770; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1771; GFX7-NEXT: s_endpgm
1772;
1773; GFX89-LABEL: notsdot2_sext8:
1774; GFX89: ; %bb.0: ; %entry
1775; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1776; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1777; GFX89-NEXT: s_waitcnt lgkmcnt(0)
1778; GFX89-NEXT: s_load_dword s2, s[0:1], 0x0
1779; GFX89-NEXT: v_mov_b32_e32 v0, s6
1780; GFX89-NEXT: v_mov_b32_e32 v1, s7
1781; GFX89-NEXT: v_mov_b32_e32 v2, s4
1782; GFX89-NEXT: v_mov_b32_e32 v3, s5
1783; GFX89-NEXT: {{flat|global}}_load_ushort v2, v[2:3]
1784; GFX89-NEXT: {{flat|global}}_load_ushort v3, v[0:1]
1785; GFX89-NEXT: v_mov_b32_e32 v0, s0
1786; GFX89-NEXT: v_mov_b32_e32 v1, s1
1787; GFX89-NEXT: s_waitcnt vmcnt(1)
1788; GFX89-NEXT: v_lshrrev_b16_e32 v4, 8, v2
1789; GFX89-NEXT: s_waitcnt vmcnt(0)
1790; GFX89-NEXT: v_bfe_i32 v5, v3, 0, 8
1791; GFX89-NEXT: v_lshrrev_b16_e32 v3, 8, v3
1792; GFX89-NEXT: v_bfe_i32 v4, v4, 0, 8
1793; GFX89-NEXT: v_bfe_i32 v3, v3, 0, 8
1794; GFX89-NEXT: v_bfe_i32 v2, v2, 0, 8
1795; GFX89: v_mad_i32_i24 v3, v3, v4, s2
1796; GFX89: v_mad_i32_i24 v2, v5, v2, v3
1797; GFX89-NEXT: {{flat|global}}_store_dword v[0:1], v2
1798; GFX89-NEXT: s_endpgm
1799;
1800; GCN-DL-LABEL: notsdot2_sext8:
1801; GCN-DL: ; %bb.0: ; %entry
1802; GCN-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1803; GCN-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1804; GCN-DL-NEXT: s_waitcnt lgkmcnt(0)
1805; GCN-DL-NEXT: s_load_dword s2, s[0:1], 0x0
1806; GCN-DL-NEXT: v_mov_b32_e32 v0, s6
1807; GCN-DL-NEXT: v_mov_b32_e32 v1, s7
1808; GCN-DL-NEXT: v_mov_b32_e32 v2, s4
1809; GCN-DL-NEXT: v_mov_b32_e32 v3, s5
1810; GCN-DL-NEXT: global_load_ushort v2, v[2:3], off
1811; GCN-DL-NEXT: global_load_ushort v3, v[0:1], off
1812; GCN-DL-NEXT: v_mov_b32_e32 v0, s0
1813; GCN-DL-NEXT: v_mov_b32_e32 v1, s1
1814; GCN-DL-NEXT: s_waitcnt vmcnt(1)
1815; GCN-DL-NEXT: v_lshrrev_b16_e32 v4, 8, v2
1816; GCN-DL-NEXT: s_waitcnt vmcnt(0)
1817; GCN-DL-NEXT: v_bfe_i32 v5, v3, 0, 8
1818; GCN-DL-NEXT: v_lshrrev_b16_e32 v3, 8, v3
1819; GCN-DL-NEXT: v_bfe_i32 v4, v4, 0, 8
1820; GCN-DL-NEXT: v_bfe_i32 v3, v3, 0, 8
1821; GCN-DL-NEXT: v_bfe_i32 v2, v2, 0, 8
1822; GCN-DL-NEXT: s_waitcnt lgkmcnt(0)
1823; GCN-DL-NEXT: v_mad_i32_i24 v3, v3, v4, s2
1824; GCN-DL-NEXT: v_mad_i32_i24 v2, v5, v2, v3
1825; GCN-DL-NEXT: global_store_dword v[0:1], v2, off
1826; GCN-DL-NEXT: s_endpgm
1827 <2 x i8> addrspace(1)* %src2,
1828 i32 addrspace(1)* nocapture %dst) {
1829entry:
; Load both packed 2 x i8 operands.
1830 %vec1 = load <2 x i8>, <2 x i8> addrspace(1)* %src1
1831 %vec2 = load <2 x i8>, <2 x i8> addrspace(1)* %src2
1832
; Element-0 lanes: sign-extend i8 -> i32, then multiply (flagged nuw).
1833 %s1.elt1 = extractelement <2 x i8> %vec1, i64 0
1834 %conv = sext i8 %s1.elt1 to i32
1835 %s2.elt1 = extractelement <2 x i8> %vec2, i64 0
1836 %conv2 = sext i8 %s2.elt1 to i32
1837 %mul1 = mul nuw i32 %conv2, %conv
1838
; Element-1 lanes: same sign-extend and multiply pattern.
1839 %s1.elt2 = extractelement <2 x i8> %vec1, i64 1
1840 %conv3 = sext i8 %s1.elt2 to i32
1841 %s2.elt2 = extractelement <2 x i8> %vec2, i64 1
1842 %conv4 = sext i8 %s2.elt2 to i32
1843 %mul2 = mul nuw i32 %conv4, %conv3
1844
; Accumulate: result = (mul2 + old value at %dst) + mul1, written back as i32.
1845 %s3 = load i32, i32 addrspace(1)* %dst, align 4
1846 %add = add i32 %mul2, %s3
1847 %add6 = add i32 %add, %mul1
1848 store i32 %add6, i32 addrspace(1)* %dst, align 4
1849 ret void
1850}