blob: 103245553a084815de580a06a6fced7b2d2eff34 [file] [log] [blame]
Farhana Aleen4bc597b2018-10-04 16:57:37 +00001; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-NODL %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
Farhana Aleen9250c922018-08-29 16:31:18 +00006
; udot4_acc32: unsigned 4 x i8 dot product accumulated into an i32.
; Every byte lane of %src1 and %src2 is zero-extended to i32 and multiplied
; pairwise; the four products are added to the i32 loaded from %dst and the
; sum is stored back to %dst.
; Expected code: the gfx906 run folds the whole chain into one v_dot4_u32_u8,
; while the gfx700/gfx803/gfx900 runs keep four chained v_mad_u32_u24.
7define amdgpu_kernel void @udot4_acc32(<4 x i8> addrspace(1)* %src1,
Farhana Aleen4bc597b2018-10-04 16:57:37 +00008; GFX7-LABEL: udot4_acc32:
9; GFX7: ; %bb.0: ; %entry
10; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
11; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
12; GFX7-NEXT: s_movk_i32 s8, 0xff
13; GFX7-NEXT: s_mov_b32 s3, 0xf000
14; GFX7-NEXT: s_mov_b32 s2, -1
15; GFX7-NEXT: s_waitcnt lgkmcnt(0)
16; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
17; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
18; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
19; GFX7-NEXT: s_waitcnt lgkmcnt(0)
20; GFX7-NEXT: s_and_b32 s7, s4, s8
21; GFX7-NEXT: s_and_b32 s8, s5, s8
22; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80008
23; GFX7-NEXT: v_mov_b32_e32 v0, s8
24; GFX7-NEXT: v_mov_b32_e32 v1, s6
25; GFX7-NEXT: s_bfe_u32 s12, s5, 0x80010
26; GFX7-NEXT: v_mad_u32_u24 v0, s7, v0, v1
27; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008
28; GFX7-NEXT: v_mov_b32_e32 v1, s10
29; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010
30; GFX7-NEXT: v_mad_u32_u24 v0, s9, v1, v0
31; GFX7-NEXT: v_mov_b32_e32 v1, s12
32; GFX7-NEXT: s_lshr_b32 s5, s5, 24
33; GFX7-NEXT: v_mad_u32_u24 v0, s11, v1, v0
34; GFX7-NEXT: s_lshr_b32 s4, s4, 24
35; GFX7-NEXT: v_mov_b32_e32 v1, s5
36; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0
37; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
38; GFX7-NEXT: s_endpgm
39;
40; GFX8-LABEL: udot4_acc32:
41; GFX8: ; %bb.0: ; %entry
42; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
43; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
44; GFX8-NEXT: s_movk_i32 s2, 0xff
45; GFX8-NEXT: s_waitcnt lgkmcnt(0)
46; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0
47; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0
48; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0
49; GFX8-NEXT: s_waitcnt lgkmcnt(0)
50; GFX8-NEXT: s_and_b32 s6, s3, s2
51; GFX8-NEXT: s_and_b32 s2, s4, s2
52; GFX8-NEXT: s_bfe_u32 s8, s4, 0x80008
53; GFX8-NEXT: v_mov_b32_e32 v0, s2
54; GFX8-NEXT: v_mov_b32_e32 v1, s5
55; GFX8-NEXT: s_bfe_u32 s10, s4, 0x80010
56; GFX8-NEXT: v_mad_u32_u24 v0, s6, v0, v1
57; GFX8-NEXT: s_bfe_u32 s7, s3, 0x80008
58; GFX8-NEXT: v_mov_b32_e32 v1, s8
59; GFX8-NEXT: s_bfe_u32 s9, s3, 0x80010
60; GFX8-NEXT: v_mad_u32_u24 v0, s7, v1, v0
61; GFX8-NEXT: v_mov_b32_e32 v1, s10
62; GFX8-NEXT: s_lshr_b32 s4, s4, 24
63; GFX8-NEXT: v_mad_u32_u24 v0, s9, v1, v0
64; GFX8-NEXT: s_lshr_b32 s3, s3, 24
65; GFX8-NEXT: v_mov_b32_e32 v1, s4
66; GFX8-NEXT: v_mad_u32_u24 v2, s3, v1, v0
67; GFX8-NEXT: v_mov_b32_e32 v0, s0
68; GFX8-NEXT: v_mov_b32_e32 v1, s1
69; GFX8-NEXT: flat_store_dword v[0:1], v2
70; GFX8-NEXT: s_endpgm
71;
72; GFX9-NODL-LABEL: udot4_acc32:
73; GFX9-NODL: ; %bb.0: ; %entry
74; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
75; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
76; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff
77; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
78; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
79; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
80; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0
81; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
82; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2
83; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
84; GFX9-NODL-NEXT: s_bfe_u32 s8, s4, 0x80008
85; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2
86; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5
87; GFX9-NODL-NEXT: s_bfe_u32 s10, s4, 0x80010
88; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s6, v0, v1
89; GFX9-NODL-NEXT: s_bfe_u32 s7, s3, 0x80008
90; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s8
91; GFX9-NODL-NEXT: s_bfe_u32 s9, s3, 0x80010
92; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s7, v1, v0
93; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10
94; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24
95; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s9, v1, v0
96; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24
97; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4
98; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v1, v0
99; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
100; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
101; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
102; GFX9-NODL-NEXT: s_endpgm
103;
104; GFX9-DL-LABEL: udot4_acc32:
105; GFX9-DL: ; %bb.0: ; %entry
106; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
107; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
108; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
109; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
110; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
111; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0
112; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
113; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
114; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
115; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
116; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
117; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v2, v3
118; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
119; GFX9-DL-NEXT: s_endpgm
Farhana Aleen9250c922018-08-29 16:31:18 +0000120 <4 x i8> addrspace(1)* %src2,
121 i32 addrspace(1)* nocapture %dst) {
122entry:
123 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
124 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
125
; Lanes 0..3: extract the byte from each vector, zero-extend to i32, multiply.
126 %v1e0 = extractelement <4 x i8> %vec1, i64 0
127 %cv1e0 = zext i8 %v1e0 to i32
128 %v2e0 = extractelement <4 x i8> %vec2, i64 0
129 %cv2e0 = zext i8 %v2e0 to i32
130 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
131
132 %v1e1 = extractelement <4 x i8> %vec1, i64 1
133 %cv1e1 = zext i8 %v1e1 to i32
134 %v2e1 = extractelement <4 x i8> %vec2, i64 1
135 %cv2e1 = zext i8 %v2e1 to i32
136 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
137
138 %v1e2 = extractelement <4 x i8> %vec1, i64 2
139 %cv1e2 = zext i8 %v1e2 to i32
140 %v2e2 = extractelement <4 x i8> %vec2, i64 2
141 %cv2e2 = zext i8 %v2e2 to i32
142 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
143
144 %v1e3 = extractelement <4 x i8> %vec1, i64 3
145 %cv1e3 = zext i8 %v1e3 to i32
146 %v2e3 = extractelement <4 x i8> %vec2, i64 3
147 %cv2e3 = zext i8 %v2e3 to i32
148 %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
149
; Accumulate: start from the i32 currently in %dst, add products in lane order.
150 %acc = load i32, i32 addrspace(1)* %dst, align 4
151 %mad1 = add i32 %mul1, %acc
152 %mad2 = add i32 %mad1, %mul2
153 %mad3 = add i32 %mad2, %mul3
154 %mad4 = add i32 %mad3, %mul4
155
156 store i32 %mad4, i32 addrspace(1)* %dst, align 4
157 ret void
158}
159
; udot4_acc16: unsigned 4 x i8 dot product accumulated into an i16.
; Same shape as the i32 variant, but the bytes are zero-extended to i16 and
; the accumulator loaded from / stored to %dst is an i16.
; Expected code: the gfx906 run still selects v_dot4_u32_u8, with the
; accumulator coming from a ushort load and the result written by a short
; store; the other runs keep four chained v_mad_u32_u24.
; NOTE(review): nsw on an i16 multiply of two zero-extended bytes is narrower
; than the full 0..255 input range (large products would be poison) --
; presumably intentional for the pattern being matched; confirm.
160define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
Farhana Aleen4bc597b2018-10-04 16:57:37 +0000161; GFX7-LABEL: udot4_acc16:
162; GFX7: ; %bb.0: ; %entry
163; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
164; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
165; GFX7-NEXT: s_mov_b32 s3, 0xf000
166; GFX7-NEXT: s_mov_b32 s2, -1
167; GFX7-NEXT: s_movk_i32 s8, 0xff
168; GFX7-NEXT: s_waitcnt lgkmcnt(0)
169; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
170; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0
171; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
172; GFX7-NEXT: s_waitcnt lgkmcnt(0)
173; GFX7-NEXT: s_and_b32 s7, s4, s8
174; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008
175; GFX7-NEXT: s_and_b32 s6, s5, s8
176; GFX7-NEXT: s_bfe_u32 s8, s5, 0x80008
177; GFX7-NEXT: v_mov_b32_e32 v1, s6
178; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80010
179; GFX7-NEXT: v_mov_b32_e32 v2, s8
180; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010
181; GFX7-NEXT: s_lshr_b32 s5, s5, 24
182; GFX7-NEXT: v_mov_b32_e32 v3, s10
183; GFX7-NEXT: s_lshr_b32 s4, s4, 24
184; GFX7-NEXT: s_waitcnt vmcnt(0)
185; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0
186; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0
187; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0
188; GFX7-NEXT: v_mov_b32_e32 v1, s5
189; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0
190; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
191; GFX7-NEXT: s_endpgm
192;
193; GFX8-LABEL: udot4_acc16:
194; GFX8: ; %bb.0: ; %entry
195; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
196; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
197; GFX8-NEXT: s_waitcnt lgkmcnt(0)
198; GFX8-NEXT: v_mov_b32_e32 v0, s0
199; GFX8-NEXT: v_mov_b32_e32 v1, s1
200; GFX8-NEXT: flat_load_ushort v2, v[0:1]
201; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
202; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
203; GFX8-NEXT: s_movk_i32 s0, 0xff
204; GFX8-NEXT: s_waitcnt lgkmcnt(0)
205; GFX8-NEXT: s_and_b32 s3, s1, s0
206; GFX8-NEXT: s_and_b32 s0, s2, s0
207; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008
208; GFX8-NEXT: v_mov_b32_e32 v3, s0
209; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010
210; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008
211; GFX8-NEXT: v_mov_b32_e32 v4, s5
212; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010
213; GFX8-NEXT: s_lshr_b32 s2, s2, 24
214; GFX8-NEXT: v_mov_b32_e32 v5, s7
215; GFX8-NEXT: s_lshr_b32 s1, s1, 24
216; GFX8-NEXT: s_waitcnt vmcnt(0)
217; GFX8-NEXT: v_mad_u32_u24 v2, s3, v3, v2
218; GFX8-NEXT: v_mad_u32_u24 v2, s4, v4, v2
219; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2
220; GFX8-NEXT: v_mov_b32_e32 v3, s2
221; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2
222; GFX8-NEXT: flat_store_short v[0:1], v2
223; GFX8-NEXT: s_endpgm
224;
225; GFX9-NODL-LABEL: udot4_acc16:
226; GFX9-NODL: ; %bb.0: ; %entry
227; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
228; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
229; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
230; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
231; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
232; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off
233; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0
234; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
235; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
236; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
237; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0
238; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0
239; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008
240; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0
241; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010
242; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008
243; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5
244; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010
245; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24
246; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s7
247; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
248; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
249; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v3, v2
250; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v4, v2
251; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s6, v5, v2
252; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
253; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
254; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off
255; GFX9-NODL-NEXT: s_endpgm
256;
257; GFX9-DL-LABEL: udot4_acc16:
258; GFX9-DL: ; %bb.0: ; %entry
259; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
260; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
261; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
262; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
263; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
264; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
265; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
266; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
267; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
268; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
269; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
270; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v3, v2
271; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
272; GFX9-DL-NEXT: s_endpgm
Farhana Aleen9250c922018-08-29 16:31:18 +0000273 <4 x i8> addrspace(1)* %src2,
274 i16 addrspace(1)* nocapture %dst) {
275entry:
276 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
277 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
278
; Lanes 0..3: extract the byte from each vector, zero-extend to i16, multiply.
279 %v1e0 = extractelement <4 x i8> %vec1, i64 0
280 %cv1e0 = zext i8 %v1e0 to i16
281 %v2e0 = extractelement <4 x i8> %vec2, i64 0
282 %cv2e0 = zext i8 %v2e0 to i16
283 %mul1 = mul nuw nsw i16 %cv1e0, %cv2e0
284
285 %v1e1 = extractelement <4 x i8> %vec1, i64 1
286 %cv1e1 = zext i8 %v1e1 to i16
287 %v2e1 = extractelement <4 x i8> %vec2, i64 1
288 %cv2e1 = zext i8 %v2e1 to i16
289 %mul2 = mul nuw nsw i16 %cv1e1, %cv2e1
290
291 %v1e2 = extractelement <4 x i8> %vec1, i64 2
292 %cv1e2 = zext i8 %v1e2 to i16
293 %v2e2 = extractelement <4 x i8> %vec2, i64 2
294 %cv2e2 = zext i8 %v2e2 to i16
295 %mul3 = mul nuw nsw i16 %cv1e2, %cv2e2
296
297 %v1e3 = extractelement <4 x i8> %vec1, i64 3
298 %cv1e3 = zext i8 %v1e3 to i16
299 %v2e3 = extractelement <4 x i8> %vec2, i64 3
300 %cv2e3 = zext i8 %v2e3 to i16
301 %mul4 = mul nuw nsw i16 %cv1e3, %cv2e3
302
; Accumulate: start from the i16 currently in %dst, add products in lane order.
303 %acc = load i16, i16 addrspace(1)* %dst, align 2
304 %mad1 = add i16 %mul1, %acc
305 %mad2 = add i16 %mad1, %mul2
306 %mad3 = add i16 %mad2, %mul3
307 %mad4 = add i16 %mad3, %mul4
308
309 store i16 %mad4, i16 addrspace(1)* %dst, align 2
310 ret void
311}
312
; udot4_acc8: unsigned 4 x i8 dot product accumulated into an i8.
; Unlike the i32/i16 variants there is no zext in the IR: the byte lanes are
; multiplied directly as i8 and added to the i8 loaded from %dst, which is
; then overwritten with the sum.
; Expected code: the gfx906 run selects v_dot4_u32_u8 with a ubyte load and a
; byte store; the other runs keep four chained v_mad_u32_u24.
; NOTE(review): nuw+nsw on an i8 multiply makes any product that does not fit
; in i8 poison -- presumably what lets the backend treat the lanes as
; unsigned bytes; confirm against the matching code.
313define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1,
Farhana Aleen4bc597b2018-10-04 16:57:37 +0000314; GFX7-LABEL: udot4_acc8:
315; GFX7: ; %bb.0: ; %entry
316; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
317; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
318; GFX7-NEXT: s_mov_b32 s3, 0xf000
319; GFX7-NEXT: s_mov_b32 s2, -1
320; GFX7-NEXT: s_movk_i32 s8, 0xff
321; GFX7-NEXT: s_waitcnt lgkmcnt(0)
322; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
323; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
324; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
325; GFX7-NEXT: s_waitcnt lgkmcnt(0)
326; GFX7-NEXT: s_and_b32 s7, s4, s8
327; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008
328; GFX7-NEXT: s_and_b32 s6, s5, s8
329; GFX7-NEXT: s_bfe_u32 s8, s5, 0x80008
330; GFX7-NEXT: v_mov_b32_e32 v1, s6
331; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80010
332; GFX7-NEXT: v_mov_b32_e32 v2, s8
333; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010
334; GFX7-NEXT: s_lshr_b32 s5, s5, 24
335; GFX7-NEXT: v_mov_b32_e32 v3, s10
336; GFX7-NEXT: s_lshr_b32 s4, s4, 24
337; GFX7-NEXT: s_waitcnt vmcnt(0)
338; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0
339; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0
340; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0
341; GFX7-NEXT: v_mov_b32_e32 v1, s5
342; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0
343; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
344; GFX7-NEXT: s_endpgm
345;
346; GFX8-LABEL: udot4_acc8:
347; GFX8: ; %bb.0: ; %entry
348; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
349; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
350; GFX8-NEXT: s_movk_i32 s2, 0xff
351; GFX8-NEXT: s_waitcnt lgkmcnt(0)
352; GFX8-NEXT: v_mov_b32_e32 v0, s0
353; GFX8-NEXT: v_mov_b32_e32 v1, s1
354; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
355; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
356; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
357; GFX8-NEXT: s_waitcnt lgkmcnt(0)
358; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80008
359; GFX8-NEXT: s_and_b32 s3, s1, s2
360; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008
361; GFX8-NEXT: s_and_b32 s2, s0, s2
362; GFX8-NEXT: v_mov_b32_e32 v3, s3
363; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010
364; GFX8-NEXT: v_mov_b32_e32 v4, s5
365; GFX8-NEXT: s_bfe_u32 s7, s0, 0x80010
366; GFX8-NEXT: s_lshr_b32 s1, s1, 24
367; GFX8-NEXT: v_mov_b32_e32 v5, s6
368; GFX8-NEXT: s_lshr_b32 s0, s0, 24
369; GFX8-NEXT: s_waitcnt vmcnt(0)
370; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
371; GFX8-NEXT: v_mad_u32_u24 v2, s4, v4, v2
372; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2
373; GFX8-NEXT: v_mov_b32_e32 v3, s1
374; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
375; GFX8-NEXT: flat_store_byte v[0:1], v2
376; GFX8-NEXT: s_endpgm
377;
378; GFX9-NODL-LABEL: udot4_acc8:
379; GFX9-NODL: ; %bb.0: ; %entry
380; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
381; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
382; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff
383; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
384; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
385; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
386; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off
387; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0
388; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0
389; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
390; GFX9-NODL-NEXT: s_bfe_u32 s4, s0, 0x80008
391; GFX9-NODL-NEXT: s_and_b32 s3, s1, s2
392; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008
393; GFX9-NODL-NEXT: s_and_b32 s2, s0, s2
394; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
395; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010
396; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5
397; GFX9-NODL-NEXT: s_bfe_u32 s7, s0, 0x80010
398; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
399; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6
400; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24
401; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
402; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
403; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v4, v2
404; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2
405; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
406; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
407; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off
408; GFX9-NODL-NEXT: s_endpgm
409;
410; GFX9-DL-LABEL: udot4_acc8:
411; GFX9-DL: ; %bb.0: ; %entry
412; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
413; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
414; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
415; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
416; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
417; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
418; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
419; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
420; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
421; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
422; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
423; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v3, v2
424; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
425; GFX9-DL-NEXT: s_endpgm
Farhana Aleen9250c922018-08-29 16:31:18 +0000426 <4 x i8> addrspace(1)* %src2,
427 i8 addrspace(1)* nocapture %dst) {
428entry:
429 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
430 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
431
; Lanes 0..3: multiply the matching bytes directly as i8 (no extension).
432 %v1e0 = extractelement <4 x i8> %vec1, i64 0
433 %v2e0 = extractelement <4 x i8> %vec2, i64 0
434 %mul1 = mul nuw nsw i8 %v1e0, %v2e0
435
436 %v1e1 = extractelement <4 x i8> %vec1, i64 1
437 %v2e1 = extractelement <4 x i8> %vec2, i64 1
438 %mul2 = mul nuw nsw i8 %v1e1, %v2e1
439
440 %v1e2 = extractelement <4 x i8> %vec1, i64 2
441 %v2e2 = extractelement <4 x i8> %vec2, i64 2
442 %mul3 = mul nuw nsw i8 %v1e2, %v2e2
443
444 %v1e3 = extractelement <4 x i8> %vec1, i64 3
445 %v2e3 = extractelement <4 x i8> %vec2, i64 3
446 %mul4 = mul nuw nsw i8 %v1e3, %v2e3
447
; Accumulate: start from the i8 currently in %dst, add products in lane order.
448 %acc = load i8, i8 addrspace(1)* %dst, align 2
449 %mad1 = add i8 %mul1, %acc
450 %mad2 = add i8 %mad1, %mul2
451 %mad3 = add i8 %mad2, %mul3
452 %mad4 = add i8 %mad3, %mul4
453
454 store i8 %mad4, i8 addrspace(1)* %dst, align 2
455 ret void
456}
457
458; TODO: Generate udot4?
; udot2_8: two-lane i8 multiply-accumulate taken from <4 x i8> inputs.
; Only lanes 0 and 1 of %src1 and %src2 are multiplied (as i8); the two
; products are added to the i8 loaded from %dst and the sum is stored back.
; Expected code: every run, including gfx906, currently emits two chained
; v_mad_u32_u24 -- no dot instruction is selected for this pattern.
459define amdgpu_kernel void @udot2_8(<4 x i8> addrspace(1)* %src1,
Farhana Aleen4bc597b2018-10-04 16:57:37 +0000460; GFX7-LABEL: udot2_8:
461; GFX7: ; %bb.0: ; %entry
462; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
463; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
464; GFX7-NEXT: s_mov_b32 s3, 0xf000
465; GFX7-NEXT: s_mov_b32 s2, -1
466; GFX7-NEXT: s_movk_i32 s8, 0xff
467; GFX7-NEXT: s_waitcnt lgkmcnt(0)
468; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
469; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
470; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
471; GFX7-NEXT: s_waitcnt lgkmcnt(0)
472; GFX7-NEXT: s_and_b32 s7, s4, s8
473; GFX7-NEXT: s_bfe_u32 s4, s4, 0x80008
474; GFX7-NEXT: s_and_b32 s6, s5, s8
475; GFX7-NEXT: v_mov_b32_e32 v1, s6
476; GFX7-NEXT: s_bfe_u32 s5, s5, 0x80008
477; GFX7-NEXT: s_waitcnt vmcnt(0)
478; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0
479; GFX7-NEXT: v_mov_b32_e32 v1, s5
480; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0
481; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
482; GFX7-NEXT: s_endpgm
483;
484; GFX8-LABEL: udot2_8:
485; GFX8: ; %bb.0: ; %entry
486; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
487; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
488; GFX8-NEXT: s_movk_i32 s2, 0xff
489; GFX8-NEXT: s_waitcnt lgkmcnt(0)
490; GFX8-NEXT: v_mov_b32_e32 v0, s0
491; GFX8-NEXT: v_mov_b32_e32 v1, s1
492; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
493; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
494; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
495; GFX8-NEXT: s_waitcnt lgkmcnt(0)
496; GFX8-NEXT: s_and_b32 s3, s1, s2
497; GFX8-NEXT: s_and_b32 s2, s0, s2
498; GFX8-NEXT: v_mov_b32_e32 v3, s3
499; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80008
500; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80008
501; GFX8-NEXT: s_waitcnt vmcnt(0)
502; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
503; GFX8-NEXT: v_mov_b32_e32 v3, s1
504; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
505; GFX8-NEXT: flat_store_byte v[0:1], v2
506; GFX8-NEXT: s_endpgm
507;
508; GFX9-NODL-LABEL: udot2_8:
509; GFX9-NODL: ; %bb.0: ; %entry
510; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
511; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
512; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff
513; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
514; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
515; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
516; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off
517; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0
518; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0
519; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
520; GFX9-NODL-NEXT: s_and_b32 s3, s1, s2
521; GFX9-NODL-NEXT: s_and_b32 s2, s0, s2
522; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
523; GFX9-NODL-NEXT: s_bfe_u32 s1, s1, 0x80008
524; GFX9-NODL-NEXT: s_bfe_u32 s0, s0, 0x80008
525; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
526; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
527; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
528; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
529; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off
530; GFX9-NODL-NEXT: s_endpgm
531;
532; GFX9-DL-LABEL: udot2_8:
533; GFX9-DL: ; %bb.0: ; %entry
534; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
535; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
536; GFX9-DL-NEXT: s_movk_i32 s2, 0xff
537; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
538; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
539; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
540; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
541; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
542; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
543; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
544; GFX9-DL-NEXT: s_and_b32 s3, s1, s2
545; GFX9-DL-NEXT: s_and_b32 s2, s0, s2
546; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
547; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x80008
548; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x80008
549; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
550; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
551; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
552; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
553; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
554; GFX9-DL-NEXT: s_endpgm
555 <4 x i8> addrspace(1)* %src2,
556 i8 addrspace(1)* nocapture %dst) {
Farhana Aleen9250c922018-08-29 16:31:18 +0000557entry:
558 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
559 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
560
; Only lanes 0 and 1 take part; lanes 2 and 3 of both vectors are unused.
561 %v1e0 = extractelement <4 x i8> %vec1, i64 0
562 %v2e0 = extractelement <4 x i8> %vec2, i64 0
563 %mul1 = mul nuw nsw i8 %v1e0, %v2e0
564
565 %v1e1 = extractelement <4 x i8> %vec1, i64 1
566 %v2e1 = extractelement <4 x i8> %vec2, i64 1
567 %mul2 = mul nuw nsw i8 %v1e1, %v2e1
568
; Accumulate both products onto the i8 currently in %dst.
569 %acc = load i8, i8 addrspace(1)* %dst, align 2
570 %mad1 = add i8 %mul1, %acc
571 %mad2 = add i8 %mad1, %mul2
572 store i8 %mad2, i8 addrspace(1)* %dst, align 2
573 ret void
574}
575
; udot4_CommutationInsideMAD: the i8 four-lane dot product with the operands
; of each multiply and each add written in swapped order.
; Every mul takes the %vec2 lane first, the first add takes %acc first, and
; the later adds take the new product first. The lane order of the chain
; itself is unchanged.
; Expected code: the gfx906 run still selects v_dot4_u32_u8 (with the two
; source words swapped); the other runs keep four chained v_mad_u32_u24.
576define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %src1,
Farhana Aleen4bc597b2018-10-04 16:57:37 +0000577; GFX7-LABEL: udot4_CommutationInsideMAD:
578; GFX7: ; %bb.0: ; %entry
579; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
580; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
581; GFX7-NEXT: s_mov_b32 s3, 0xf000
582; GFX7-NEXT: s_mov_b32 s2, -1
583; GFX7-NEXT: s_movk_i32 s8, 0xff
584; GFX7-NEXT: s_waitcnt lgkmcnt(0)
585; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
586; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
587; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
588; GFX7-NEXT: s_waitcnt lgkmcnt(0)
589; GFX7-NEXT: s_and_b32 s6, s4, s8
590; GFX7-NEXT: v_mov_b32_e32 v1, s6
591; GFX7-NEXT: s_and_b32 s7, s5, s8
592; GFX7-NEXT: s_bfe_u32 s8, s4, 0x80008
593; GFX7-NEXT: s_bfe_u32 s10, s4, 0x80010
594; GFX7-NEXT: s_bfe_u32 s9, s5, 0x80008
595; GFX7-NEXT: v_mov_b32_e32 v2, s8
596; GFX7-NEXT: s_bfe_u32 s11, s5, 0x80010
597; GFX7-NEXT: s_lshr_b32 s4, s4, 24
598; GFX7-NEXT: v_mov_b32_e32 v3, s10
599; GFX7-NEXT: s_lshr_b32 s5, s5, 24
600; GFX7-NEXT: s_waitcnt vmcnt(0)
601; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0
602; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0
603; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0
604; GFX7-NEXT: v_mov_b32_e32 v1, s4
605; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0
606; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
607; GFX7-NEXT: s_endpgm
608;
609; GFX8-LABEL: udot4_CommutationInsideMAD:
610; GFX8: ; %bb.0: ; %entry
611; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
612; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
613; GFX8-NEXT: s_waitcnt lgkmcnt(0)
614; GFX8-NEXT: v_mov_b32_e32 v0, s0
615; GFX8-NEXT: v_mov_b32_e32 v1, s1
616; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
617; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
618; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
619; GFX8-NEXT: s_movk_i32 s0, 0xff
620; GFX8-NEXT: s_waitcnt lgkmcnt(0)
621; GFX8-NEXT: s_and_b32 s3, s1, s0
622; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008
623; GFX8-NEXT: s_and_b32 s0, s2, s0
624; GFX8-NEXT: v_mov_b32_e32 v3, s3
625; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010
626; GFX8-NEXT: s_bfe_u32 s4, s2, 0x80008
627; GFX8-NEXT: v_mov_b32_e32 v4, s5
628; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010
629; GFX8-NEXT: s_lshr_b32 s1, s1, 24
630; GFX8-NEXT: v_mov_b32_e32 v5, s6
631; GFX8-NEXT: s_lshr_b32 s2, s2, 24
632; GFX8-NEXT: s_waitcnt vmcnt(0)
633; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
634; GFX8-NEXT: v_mad_u32_u24 v2, s4, v4, v2
635; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2
636; GFX8-NEXT: v_mov_b32_e32 v3, s1
637; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
638; GFX8-NEXT: flat_store_byte v[0:1], v2
639; GFX8-NEXT: s_endpgm
640;
641; GFX9-NODL-LABEL: udot4_CommutationInsideMAD:
642; GFX9-NODL: ; %bb.0: ; %entry
643; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
644; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
645; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
646; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
647; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
648; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off
649; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0
650; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
651; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
652; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
653; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0
654; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008
655; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0
656; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
657; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010
658; GFX9-NODL-NEXT: s_bfe_u32 s4, s2, 0x80008
659; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5
660; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010
661; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
662; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6
663; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24
664; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
665; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
666; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v4, v2
667; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2
668; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
669; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
670; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off
671; GFX9-NODL-NEXT: s_endpgm
672;
673; GFX9-DL-LABEL: udot4_CommutationInsideMAD:
674; GFX9-DL: ; %bb.0: ; %entry
675; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
676; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
677; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
678; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
679; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
680; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
681; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
682; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
683; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
684; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
685; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
686; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s3, v3, v2
687; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
688; GFX9-DL-NEXT: s_endpgm
689 <4 x i8> addrspace(1)* %src2,
690 i8 addrspace(1)* nocapture %dst) {
Farhana Aleen9250c922018-08-29 16:31:18 +0000691entry:
692 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
693 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
694
; Lanes 0..3: same products as the plain i8 variant, but each mul lists the
; %vec2 lane before the %vec1 lane.
695 %v1e0 = extractelement <4 x i8> %vec1, i64 0
696 %v2e0 = extractelement <4 x i8> %vec2, i64 0
697 %mul1 = mul nuw nsw i8 %v2e0, %v1e0
698
699 %v1e1 = extractelement <4 x i8> %vec1, i64 1
700 %v2e1 = extractelement <4 x i8> %vec2, i64 1
701 %mul2 = mul nuw nsw i8 %v2e1, %v1e1
702
703 %v1e2 = extractelement <4 x i8> %vec1, i64 2
704 %v2e2 = extractelement <4 x i8> %vec2, i64 2
705 %mul3 = mul nuw nsw i8 %v2e2, %v1e2
706
707 %v1e3 = extractelement <4 x i8> %vec1, i64 3
708 %v2e3 = extractelement <4 x i8> %vec2, i64 3
709 %mul4 = mul nuw nsw i8 %v2e3, %v1e3
710
; Accumulate with commuted add operands: %acc comes first in the first add,
; and the new product comes first in the following ones.
711 %acc = load i8, i8 addrspace(1)* %dst, align 2
712 %mad1 = add i8 %acc, %mul1
713 %mad2 = add i8 %mul2, %mad1
714 %mad3 = add i8 %mul3, %mad2
715 %mad4 = add i8 %mul4, %mad3
716
717 store i8 %mad4, i8 addrspace(1)* %dst, align 2
718 ret void
719}
720
721; TODO: Support commutation across the adds.
722define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* %src1,
Farhana Aleen4bc597b2018-10-04 16:57:37 +0000723; GFX7-LABEL: udot4_CommutationAccrossMADs:
724; GFX7: ; %bb.0: ; %entry
725; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
726; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
727; GFX7-NEXT: s_mov_b32 s3, 0xf000
728; GFX7-NEXT: s_mov_b32 s2, -1
729; GFX7-NEXT: s_movk_i32 s8, 0xff
730; GFX7-NEXT: s_waitcnt lgkmcnt(0)
731; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
732; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
733; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
734; GFX7-NEXT: s_waitcnt lgkmcnt(0)
735; GFX7-NEXT: s_and_b32 s6, s4, s8
736; GFX7-NEXT: s_bfe_u32 s10, s4, 0x80010
737; GFX7-NEXT: s_and_b32 s7, s5, s8
738; GFX7-NEXT: s_bfe_u32 s8, s4, 0x80008
739; GFX7-NEXT: s_bfe_u32 s9, s5, 0x80008
740; GFX7-NEXT: v_mov_b32_e32 v1, s8
741; GFX7-NEXT: v_mov_b32_e32 v2, s6
742; GFX7-NEXT: s_bfe_u32 s11, s5, 0x80010
743; GFX7-NEXT: s_lshr_b32 s4, s4, 24
744; GFX7-NEXT: v_mov_b32_e32 v3, s10
745; GFX7-NEXT: s_lshr_b32 s5, s5, 24
746; GFX7-NEXT: s_waitcnt vmcnt(0)
747; GFX7-NEXT: v_mad_u32_u24 v0, s9, v1, v0
748; GFX7-NEXT: v_mad_u32_u24 v0, s7, v2, v0
749; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0
750; GFX7-NEXT: v_mov_b32_e32 v1, s4
751; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0
752; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
753; GFX7-NEXT: s_endpgm
754;
755; GFX8-LABEL: udot4_CommutationAccrossMADs:
756; GFX8: ; %bb.0: ; %entry
757; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
758; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
759; GFX8-NEXT: s_waitcnt lgkmcnt(0)
760; GFX8-NEXT: v_mov_b32_e32 v0, s0
761; GFX8-NEXT: v_mov_b32_e32 v1, s1
762; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
763; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
764; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
765; GFX8-NEXT: s_movk_i32 s0, 0xff
766; GFX8-NEXT: s_waitcnt lgkmcnt(0)
767; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008
768; GFX8-NEXT: s_and_b32 s3, s2, s0
769; GFX8-NEXT: s_and_b32 s0, s1, s0
770; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008
771; GFX8-NEXT: v_mov_b32_e32 v3, s4
772; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010
773; GFX8-NEXT: v_mov_b32_e32 v4, s0
774; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010
775; GFX8-NEXT: s_lshr_b32 s1, s1, 24
776; GFX8-NEXT: v_mov_b32_e32 v5, s6
777; GFX8-NEXT: s_lshr_b32 s2, s2, 24
778; GFX8-NEXT: s_waitcnt vmcnt(0)
779; GFX8-NEXT: v_mad_u32_u24 v2, s5, v3, v2
780; GFX8-NEXT: v_mad_u32_u24 v2, s3, v4, v2
781; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2
782; GFX8-NEXT: v_mov_b32_e32 v3, s1
783; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
784; GFX8-NEXT: flat_store_byte v[0:1], v2
785; GFX8-NEXT: s_endpgm
786;
787; GFX9-NODL-LABEL: udot4_CommutationAccrossMADs:
788; GFX9-NODL: ; %bb.0: ; %entry
789; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
790; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
791; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
792; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
793; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
794; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off
795; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0
796; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
797; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
798; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
799; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008
800; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0
801; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0
802; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008
803; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4
804; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010
805; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s0
806; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010
807; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
808; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6
809; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24
810; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
811; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v3, v2
812; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v4, v2
813; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2
814; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
815; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
816; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off
817; GFX9-NODL-NEXT: s_endpgm
818;
819; GFX9-DL-LABEL: udot4_CommutationAccrossMADs:
820; GFX9-DL: ; %bb.0: ; %entry
821; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
822; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
823; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
824; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
825; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
826; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
827; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0
828; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
829; GFX9-DL-NEXT: s_movk_i32 s0, 0xff
830; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
831; GFX9-DL-NEXT: s_bfe_u32 s4, s1, 0x80008
832; GFX9-DL-NEXT: s_and_b32 s3, s2, s0
833; GFX9-DL-NEXT: s_and_b32 s0, s1, s0
834; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x80008
835; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
836; GFX9-DL-NEXT: s_bfe_u32 s6, s1, 0x80010
837; GFX9-DL-NEXT: v_mov_b32_e32 v4, s0
838; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x80010
839; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24
840; GFX9-DL-NEXT: v_mov_b32_e32 v5, s6
841; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24
842; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
843; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v3, v2
844; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v4, v2
845; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v5, v2
846; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
847; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
848; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
849; GFX9-DL-NEXT: s_endpgm
Farhana Aleen9250c922018-08-29 16:31:18 +0000850 <4 x i8> addrspace(1)* %src2,
851 i8 addrspace(1)* nocapture %dst) {
852entry:
853 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
854 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
855
856 %v1e0 = extractelement <4 x i8> %vec1, i64 0
857 %v2e0 = extractelement <4 x i8> %vec2, i64 0
858 %mul1 = mul nuw nsw i8 %v2e0, %v1e0
859
860 %v1e1 = extractelement <4 x i8> %vec1, i64 1
861 %v2e1 = extractelement <4 x i8> %vec2, i64 1
862 %mul2 = mul nuw nsw i8 %v2e1, %v1e1
863
864 %v1e2 = extractelement <4 x i8> %vec1, i64 2
865 %v2e2 = extractelement <4 x i8> %vec2, i64 2
866 %mul3 = mul nuw nsw i8 %v2e2, %v1e2
867
868 %v1e3 = extractelement <4 x i8> %vec1, i64 3
869 %v2e3 = extractelement <4 x i8> %vec2, i64 3
870 %mul4 = mul nuw nsw i8 %v2e3, %v1e3
871
872 %acc = load i8, i8 addrspace(1)* %dst, align 2
873 %mad1 = add i8 %acc, %mul2
874 %mad2 = add i8 %mad1, %mul1
875 %mad3 = add i8 %mad2, %mul3
876 %mad4 = add i8 %mad3, %mul4
877
878 store i8 %mad4, i8 addrspace(1)* %dst, align 2
879 ret void
880}
881
; udot4_multiuse_mul1: zero-extends the four i8 lanes of %src1 and %src2 to i32,
; multiplies them lane-wise and accumulates into the i32 loaded from %dst.
; %mul1 has two users (%add and %add2). The expected output for every run line
; below is a chain of v_mad_u32_u24 instructions; none of them contains a dot4
; instruction.
882define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
Farhana Aleen4bc597b2018-10-04 16:57:37 +0000883; GFX7-LABEL: udot4_multiuse_mul1:
884; GFX7: ; %bb.0: ; %entry
885; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
886; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
887; GFX7-NEXT: s_movk_i32 s8, 0xff
888; GFX7-NEXT: s_mov_b32 s3, 0xf000
889; GFX7-NEXT: s_mov_b32 s2, -1
890; GFX7-NEXT: s_waitcnt lgkmcnt(0)
891; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
892; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
893; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
894; GFX7-NEXT: s_waitcnt lgkmcnt(0)
895; GFX7-NEXT: s_and_b32 s7, s4, s8
896; GFX7-NEXT: s_and_b32 s8, s5, s8
897; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80008
898; GFX7-NEXT: v_mov_b32_e32 v0, s8
899; GFX7-NEXT: v_mov_b32_e32 v1, s6
900; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008
901; GFX7-NEXT: v_mad_u32_u24 v1, s7, v0, v1
902; GFX7-NEXT: v_mov_b32_e32 v2, s10
903; GFX7-NEXT: s_bfe_u32 s12, s5, 0x80010
904; GFX7-NEXT: v_mad_u32_u24 v1, s9, v2, v1
905; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010
906; GFX7-NEXT: v_mad_u32_u24 v0, s7, v0, v1
907; GFX7-NEXT: v_mov_b32_e32 v1, s12
908; GFX7-NEXT: s_lshr_b32 s5, s5, 24
909; GFX7-NEXT: v_mad_u32_u24 v0, s11, v1, v0
910; GFX7-NEXT: s_lshr_b32 s4, s4, 24
911; GFX7-NEXT: v_mov_b32_e32 v1, s5
912; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0
913; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
914; GFX7-NEXT: s_endpgm
915;
916; GFX8-LABEL: udot4_multiuse_mul1:
917; GFX8: ; %bb.0: ; %entry
918; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
919; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
920; GFX8-NEXT: s_movk_i32 s2, 0xff
921; GFX8-NEXT: s_waitcnt lgkmcnt(0)
922; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0
923; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0
924; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0
925; GFX8-NEXT: s_waitcnt lgkmcnt(0)
926; GFX8-NEXT: s_and_b32 s6, s3, s2
927; GFX8-NEXT: s_and_b32 s2, s4, s2
928; GFX8-NEXT: s_bfe_u32 s8, s4, 0x80008
929; GFX8-NEXT: v_mov_b32_e32 v0, s2
930; GFX8-NEXT: v_mov_b32_e32 v1, s5
931; GFX8-NEXT: s_bfe_u32 s7, s3, 0x80008
932; GFX8-NEXT: v_mad_u32_u24 v1, s6, v0, v1
933; GFX8-NEXT: v_mov_b32_e32 v2, s8
934; GFX8-NEXT: s_bfe_u32 s10, s4, 0x80010
935; GFX8-NEXT: v_mad_u32_u24 v1, s7, v2, v1
936; GFX8-NEXT: s_bfe_u32 s9, s3, 0x80010
937; GFX8-NEXT: v_mad_u32_u24 v0, s6, v0, v1
938; GFX8-NEXT: v_mov_b32_e32 v1, s10
939; GFX8-NEXT: s_lshr_b32 s4, s4, 24
940; GFX8-NEXT: v_mad_u32_u24 v0, s9, v1, v0
941; GFX8-NEXT: s_lshr_b32 s3, s3, 24
942; GFX8-NEXT: v_mov_b32_e32 v1, s4
943; GFX8-NEXT: v_mad_u32_u24 v2, s3, v1, v0
944; GFX8-NEXT: v_mov_b32_e32 v0, s0
945; GFX8-NEXT: v_mov_b32_e32 v1, s1
946; GFX8-NEXT: flat_store_dword v[0:1], v2
947; GFX8-NEXT: s_endpgm
948;
949; GFX9-NODL-LABEL: udot4_multiuse_mul1:
950; GFX9-NODL: ; %bb.0: ; %entry
951; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
952; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
953; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff
954; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
955; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
956; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
957; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0
958; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
959; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2
960; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
961; GFX9-NODL-NEXT: s_bfe_u32 s8, s4, 0x80008
962; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s2
963; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5
964; GFX9-NODL-NEXT: s_bfe_u32 s7, s3, 0x80008
965; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s6, v0, v1
966; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s8
967; GFX9-NODL-NEXT: s_bfe_u32 s10, s4, 0x80010
968; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s7, v2, v1
969; GFX9-NODL-NEXT: s_bfe_u32 s9, s3, 0x80010
970; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s6, v0, v1
971; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s10
972; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24
973; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s9, v1, v0
974; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24
975; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s4
976; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v1, v0
977; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
978; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
979; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
980; GFX9-NODL-NEXT: s_endpgm
981;
982; GFX9-DL-LABEL: udot4_multiuse_mul1:
983; GFX9-DL: ; %bb.0: ; %entry
984; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
985; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
986; GFX9-DL-NEXT: s_movk_i32 s2, 0xff
987; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
988; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0
989; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
990; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0
991; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
992; GFX9-DL-NEXT: s_and_b32 s6, s3, s2
993; GFX9-DL-NEXT: s_and_b32 s2, s4, s2
994; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x80008
995; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2
996; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5
997; GFX9-DL-NEXT: s_bfe_u32 s7, s3, 0x80008
998; GFX9-DL-NEXT: v_mad_u32_u24 v1, s6, v0, v1
999; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8
1000; GFX9-DL-NEXT: s_bfe_u32 s10, s4, 0x80010
1001; GFX9-DL-NEXT: v_mad_u32_u24 v1, s7, v2, v1
1002; GFX9-DL-NEXT: s_bfe_u32 s9, s3, 0x80010
1003; GFX9-DL-NEXT: v_mad_u32_u24 v0, s6, v0, v1
1004; GFX9-DL-NEXT: v_mov_b32_e32 v1, s10
1005; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 24
1006; GFX9-DL-NEXT: v_mad_u32_u24 v0, s9, v1, v0
1007; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24
1008; GFX9-DL-NEXT: v_mov_b32_e32 v1, s4
1009; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v1, v0
1010; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
1011; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
1012; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
1013; GFX9-DL-NEXT: s_endpgm
Farhana Aleen9250c922018-08-29 16:31:18 +00001014 <4 x i8> addrspace(1)* %src2,
1015 i32 addrspace(1)* nocapture %dst) {
1016entry:
1017 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
1018 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
1019
1020 %v1e0 = extractelement <4 x i8> %vec1, i64 0
1021 %cv1e0 = zext i8 %v1e0 to i32
1022 %v2e0 = extractelement <4 x i8> %vec2, i64 0
1023 %cv2e0 = zext i8 %v2e0 to i32
1024 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
1025
1026 %v1e1 = extractelement <4 x i8> %vec1, i64 1
1027 %cv1e1 = zext i8 %v1e1 to i32
1028 %v2e1 = extractelement <4 x i8> %vec2, i64 1
1029 %cv2e1 = zext i8 %v2e1 to i32
1030 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
1031
1032 %v1e2 = extractelement <4 x i8> %vec1, i64 2
1033 %cv1e2 = zext i8 %v1e2 to i32
1034 %v2e2 = extractelement <4 x i8> %vec2, i64 2
1035 %cv2e2 = zext i8 %v2e2 to i32
1036 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
1037
1038 %v1e3 = extractelement <4 x i8> %vec1, i64 3
1039 %cv1e3 = zext i8 %v1e3 to i32
1040 %v2e3 = extractelement <4 x i8> %vec2, i64 3
1041 %cv2e3 = zext i8 %v2e3 to i32
1042 %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
1043
; Accumulation chain: %mul1 is added twice, first into %add and again into %add2.
1044 %acc = load i32, i32 addrspace(1)* %dst, align 4
1045 %add = add i32 %mul1, %acc
1046 %add1 = add i32 %mul2, %add
1047 %add2 = add i32 %add1, %mul1
1048 %add3 = add i32 %add2, %mul3
1049 %add4 = add i32 %add3, %mul4
1050
1051 store i32 %add4, i32 addrspace(1)* %dst, align 4
1052 ret void
1053}
1054
; udot4_multiuse_add1: zero-extends the four i8 lanes of %src1 and %src2 to i32,
; multiplies them lane-wise and accumulates into the i32 loaded from %dst.
; The intermediate sum %add1 has two users (%add and %add2), and %add is folded
; back into the final result %res. The expected output for every run line below
; is a v_mad_u32_u24 chain plus separate vector adds; none of them contains a
; dot4 instruction.
1055define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1,
Farhana Aleen4bc597b2018-10-04 16:57:37 +00001056; GFX7-LABEL: udot4_multiuse_add1:
1057; GFX7: ; %bb.0: ; %entry
1058; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1059; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1060; GFX7-NEXT: s_movk_i32 s8, 0xff
1061; GFX7-NEXT: s_mov_b32 s3, 0xf000
1062; GFX7-NEXT: s_mov_b32 s2, -1
1063; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1064; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
1065; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
1066; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
1067; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1068; GFX7-NEXT: s_and_b32 s7, s4, s8
1069; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80008
1070; GFX7-NEXT: s_and_b32 s8, s5, s8
1071; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008
1072; GFX7-NEXT: v_mov_b32_e32 v0, s10
1073; GFX7-NEXT: v_mov_b32_e32 v1, s6
1074; GFX7-NEXT: v_mad_u32_u24 v0, s9, v0, v1
1075; GFX7-NEXT: s_bfe_u32 s12, s5, 0x80010
1076; GFX7-NEXT: v_mov_b32_e32 v2, s8
1077; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010
1078; GFX7-NEXT: v_add_i32_e32 v1, vcc, s6, v0
1079; GFX7-NEXT: v_mad_u32_u24 v0, s7, v2, v0
1080; GFX7-NEXT: v_mov_b32_e32 v2, s12
1081; GFX7-NEXT: s_lshr_b32 s5, s5, 24
1082; GFX7-NEXT: v_mad_u32_u24 v0, s11, v2, v0
1083; GFX7-NEXT: s_lshr_b32 s4, s4, 24
1084; GFX7-NEXT: v_mov_b32_e32 v2, s5
1085; GFX7-NEXT: v_mad_u32_u24 v0, s4, v2, v0
1086; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
1087; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1088; GFX7-NEXT: s_endpgm
1089;
1090; GFX8-LABEL: udot4_multiuse_add1:
1091; GFX8: ; %bb.0: ; %entry
1092; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1093; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1094; GFX8-NEXT: s_movk_i32 s2, 0xff
1095; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1096; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0
1097; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0
1098; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0
1099; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1100; GFX8-NEXT: s_and_b32 s6, s3, s2
1101; GFX8-NEXT: s_bfe_u32 s8, s4, 0x80008
1102; GFX8-NEXT: s_and_b32 s2, s4, s2
1103; GFX8-NEXT: s_bfe_u32 s7, s3, 0x80008
1104; GFX8-NEXT: v_mov_b32_e32 v0, s8
1105; GFX8-NEXT: v_mov_b32_e32 v1, s5
1106; GFX8-NEXT: v_mad_u32_u24 v0, s7, v0, v1
1107; GFX8-NEXT: s_bfe_u32 s10, s4, 0x80010
1108; GFX8-NEXT: v_mov_b32_e32 v2, s2
1109; GFX8-NEXT: s_bfe_u32 s9, s3, 0x80010
1110; GFX8-NEXT: v_add_u32_e32 v1, vcc, s5, v0
1111; GFX8-NEXT: v_mad_u32_u24 v0, s6, v2, v0
1112; GFX8-NEXT: v_mov_b32_e32 v2, s10
1113; GFX8-NEXT: s_lshr_b32 s4, s4, 24
1114; GFX8-NEXT: v_mad_u32_u24 v0, s9, v2, v0
1115; GFX8-NEXT: s_lshr_b32 s3, s3, 24
1116; GFX8-NEXT: v_mov_b32_e32 v2, s4
1117; GFX8-NEXT: v_mad_u32_u24 v0, s3, v2, v0
1118; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1
1119; GFX8-NEXT: v_mov_b32_e32 v0, s0
1120; GFX8-NEXT: v_mov_b32_e32 v1, s1
1121; GFX8-NEXT: flat_store_dword v[0:1], v2
1122; GFX8-NEXT: s_endpgm
1123;
1124; GFX9-NODL-LABEL: udot4_multiuse_add1:
1125; GFX9-NODL: ; %bb.0: ; %entry
1126; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1127; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1128; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff
1129; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1130; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
1131; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
1132; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0
1133; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1134; GFX9-NODL-NEXT: s_and_b32 s6, s3, s2
1135; GFX9-NODL-NEXT: s_bfe_u32 s8, s4, 0x80008
1136; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
1137; GFX9-NODL-NEXT: s_bfe_u32 s7, s3, 0x80008
1138; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s8
1139; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5
1140; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s7, v0, v1
1141; GFX9-NODL-NEXT: s_bfe_u32 s10, s4, 0x80010
1142; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2
1143; GFX9-NODL-NEXT: s_bfe_u32 s9, s3, 0x80010
1144; GFX9-NODL-NEXT: v_add_u32_e32 v1, s5, v0
1145; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s6, v2, v0
1146; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s10
1147; GFX9-NODL-NEXT: s_lshr_b32 s4, s4, 24
1148; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s9, v2, v0
1149; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24
1150; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4
1151; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s3, v2, v0
1152; GFX9-NODL-NEXT: v_add_u32_e32 v2, v0, v1
1153; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
1154; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
1155; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
1156; GFX9-NODL-NEXT: s_endpgm
1157;
1158; GFX9-DL-LABEL: udot4_multiuse_add1:
1159; GFX9-DL: ; %bb.0: ; %entry
1160; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1161; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1162; GFX9-DL-NEXT: s_movk_i32 s2, 0xff
1163; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1164; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0
1165; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
1166; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0
1167; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1168; GFX9-DL-NEXT: s_and_b32 s6, s3, s2
1169; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x80008
1170; GFX9-DL-NEXT: s_and_b32 s2, s4, s2
1171; GFX9-DL-NEXT: s_bfe_u32 s7, s3, 0x80008
1172; GFX9-DL-NEXT: v_mov_b32_e32 v0, s8
1173; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5
1174; GFX9-DL-NEXT: v_mad_u32_u24 v0, s7, v0, v1
1175; GFX9-DL-NEXT: s_bfe_u32 s10, s4, 0x80010
1176; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2
1177; GFX9-DL-NEXT: s_bfe_u32 s9, s3, 0x80010
1178; GFX9-DL-NEXT: v_add_u32_e32 v1, s5, v0
1179; GFX9-DL-NEXT: v_mad_u32_u24 v0, s6, v2, v0
1180; GFX9-DL-NEXT: v_mov_b32_e32 v2, s10
1181; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 24
1182; GFX9-DL-NEXT: v_mad_u32_u24 v0, s9, v2, v0
1183; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24
1184; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4
1185; GFX9-DL-NEXT: v_mad_u32_u24 v0, s3, v2, v0
1186; GFX9-DL-NEXT: v_add_u32_e32 v2, v0, v1
1187; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
1188; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
1189; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
1190; GFX9-DL-NEXT: s_endpgm
1191 <4 x i8> addrspace(1)* %src2,
1192 i32 addrspace(1)* nocapture %dst) {
Farhana Aleen9250c922018-08-29 16:31:18 +00001193entry:
1194 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
1195 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
1196
1197 %v1e0 = extractelement <4 x i8> %vec1, i64 0
1198 %cv1e0 = zext i8 %v1e0 to i32
1199 %v2e0 = extractelement <4 x i8> %vec2, i64 0
1200 %cv2e0 = zext i8 %v2e0 to i32
1201 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
1202
1203 %v1e1 = extractelement <4 x i8> %vec1, i64 1
1204 %cv1e1 = zext i8 %v1e1 to i32
1205 %v2e1 = extractelement <4 x i8> %vec2, i64 1
1206 %cv2e1 = zext i8 %v2e1 to i32
1207 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
1208
1209 %v1e2 = extractelement <4 x i8> %vec1, i64 2
1210 %cv1e2 = zext i8 %v1e2 to i32
1211 %v2e2 = extractelement <4 x i8> %vec2, i64 2
1212 %cv2e2 = zext i8 %v2e2 to i32
1213 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
1214
1215 %v1e3 = extractelement <4 x i8> %vec1, i64 3
1216 %cv1e3 = zext i8 %v1e3 to i32
1217 %v2e3 = extractelement <4 x i8> %vec2, i64 3
1218 %cv2e3 = zext i8 %v2e3 to i32
1219 %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
1220
; Accumulation chain: %add1 is used by both %add and %add2, and %add is added
; to the end of the chain to form %res.
1221 %acc = load i32, i32 addrspace(1)* %dst, align 4
1222 %add1 = add i32 %mul2, %acc
1223 %add = add i32 %add1, %acc
1224 %add2 = add i32 %add1, %mul1
1225 %add3 = add i32 %add2, %mul3
1226 %add4 = add i32 %add3, %mul4
1227 %res = add i32 %add4, %add
1228 store i32 %res, i32 addrspace(1)* %dst, align 4
1229 ret void
1230}
1231
; notdot4_mixedtypes: lane-wise i8 multiply-accumulate into the i16 loaded from
; %dst, with mixed extensions: lane 0 of each vector is sign-extended (sext) to
; i16 while lanes 1-3 are zero-extended (zext). The expected output for every
; run line below is a v_mad_u32_u24 chain; none of them contains a dot4
; instruction.
1232define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
Farhana Aleen4bc597b2018-10-04 16:57:37 +00001233; GFX7-LABEL: notdot4_mixedtypes:
1234; GFX7: ; %bb.0: ; %entry
1235; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1236; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1237; GFX7-NEXT: s_mov_b32 s3, 0xf000
1238; GFX7-NEXT: s_mov_b32 s2, -1
1239; GFX7-NEXT: s_mov_b32 s8, 0xffff
1240; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1241; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
1242; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0
1243; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
1244; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1245; GFX7-NEXT: s_sext_i32_i8 s6, s4
1246; GFX7-NEXT: s_bfe_u32 s10, s4, 0x80008
1247; GFX7-NEXT: s_sext_i32_i8 s7, s5
1248; GFX7-NEXT: s_bfe_u32 s9, s5, 0x80008
1249; GFX7-NEXT: s_and_b32 s7, s7, s8
1250; GFX7-NEXT: v_mov_b32_e32 v1, s9
1251; GFX7-NEXT: s_bfe_u32 s11, s5, 0x80010
1252; GFX7-NEXT: s_and_b32 s6, s6, s8
1253; GFX7-NEXT: v_mov_b32_e32 v3, s7
1254; GFX7-NEXT: s_bfe_u32 s12, s4, 0x80010
1255; GFX7-NEXT: s_lshr_b32 s5, s5, 24
1256; GFX7-NEXT: v_mov_b32_e32 v2, s11
1257; GFX7-NEXT: s_lshr_b32 s4, s4, 24
1258; GFX7-NEXT: s_waitcnt vmcnt(0)
1259; GFX7-NEXT: v_mad_u32_u24 v0, s10, v1, v0
1260; GFX7-NEXT: v_mad_u32_u24 v0, s6, v3, v0
1261; GFX7-NEXT: v_mad_u32_u24 v0, s12, v2, v0
1262; GFX7-NEXT: v_mov_b32_e32 v1, s5
1263; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0
1264; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
1265; GFX7-NEXT: s_endpgm
1266;
1267; GFX8-LABEL: notdot4_mixedtypes:
1268; GFX8: ; %bb.0: ; %entry
1269; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1270; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1271; GFX8-NEXT: s_mov_b32 s2, 0xffff
1272; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1273; GFX8-NEXT: v_mov_b32_e32 v0, s0
1274; GFX8-NEXT: v_mov_b32_e32 v1, s1
1275; GFX8-NEXT: flat_load_ushort v2, v[0:1]
1276; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
1277; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
1278; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1279; GFX8-NEXT: s_bfe_i32 s3, s0, 0x80000
1280; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80008
1281; GFX8-NEXT: s_bfe_i32 s4, s1, 0x80000
1282; GFX8-NEXT: s_and_b32 s3, s2, s3
1283; GFX8-NEXT: s_and_b32 s2, s2, s4
1284; GFX8-NEXT: s_bfe_u32 s5, s0, 0x80008
1285; GFX8-NEXT: v_mov_b32_e32 v3, s6
1286; GFX8-NEXT: s_bfe_u32 s8, s1, 0x80010
1287; GFX8-NEXT: v_mov_b32_e32 v5, s2
1288; GFX8-NEXT: s_bfe_u32 s7, s0, 0x80010
1289; GFX8-NEXT: s_lshr_b32 s1, s1, 24
1290; GFX8-NEXT: v_mov_b32_e32 v4, s8
1291; GFX8-NEXT: s_lshr_b32 s0, s0, 24
1292; GFX8-NEXT: s_waitcnt vmcnt(0)
1293; GFX8-NEXT: v_mad_u32_u24 v2, s5, v3, v2
1294; GFX8-NEXT: v_mad_u32_u24 v2, s3, v5, v2
1295; GFX8-NEXT: v_mad_u32_u24 v2, s7, v4, v2
1296; GFX8-NEXT: v_mov_b32_e32 v3, s1
1297; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
1298; GFX8-NEXT: flat_store_short v[0:1], v2
1299; GFX8-NEXT: s_endpgm
1300;
1301; GFX9-NODL-LABEL: notdot4_mixedtypes:
1302; GFX9-NODL: ; %bb.0: ; %entry
1303; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1304; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1305; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
1306; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1307; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
1308; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
1309; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off
1310; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0
1311; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0
1312; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1313; GFX9-NODL-NEXT: s_bfe_i32 s3, s0, 0x80000
1314; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80008
1315; GFX9-NODL-NEXT: s_bfe_i32 s4, s1, 0x80000
1316; GFX9-NODL-NEXT: s_and_b32 s3, s2, s3
1317; GFX9-NODL-NEXT: s_and_b32 s2, s2, s4
1318; GFX9-NODL-NEXT: s_bfe_u32 s5, s0, 0x80008
1319; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s6
1320; GFX9-NODL-NEXT: s_bfe_u32 s8, s1, 0x80010
1321; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s2
1322; GFX9-NODL-NEXT: s_bfe_u32 s7, s0, 0x80010
1323; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
1324; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s8
1325; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24
1326; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
1327; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v3, v2
1328; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v5, v2
1329; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v4, v2
1330; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
1331; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
1332; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off
1333; GFX9-NODL-NEXT: s_endpgm
1334;
1335; GFX9-DL-LABEL: notdot4_mixedtypes:
1336; GFX9-DL: ; %bb.0: ; %entry
1337; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1338; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1339; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff
1340; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1341; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
1342; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
1343; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
1344; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
1345; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
1346; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1347; GFX9-DL-NEXT: s_bfe_i32 s3, s0, 0x80000
1348; GFX9-DL-NEXT: s_bfe_u32 s6, s1, 0x80008
1349; GFX9-DL-NEXT: s_bfe_i32 s4, s1, 0x80000
1350; GFX9-DL-NEXT: s_and_b32 s3, s2, s3
1351; GFX9-DL-NEXT: s_and_b32 s2, s2, s4
1352; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x80008
1353; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6
1354; GFX9-DL-NEXT: s_bfe_u32 s8, s1, 0x80010
1355; GFX9-DL-NEXT: v_mov_b32_e32 v5, s2
1356; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x80010
1357; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24
1358; GFX9-DL-NEXT: v_mov_b32_e32 v4, s8
1359; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 24
1360; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1361; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v3, v2
1362; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v5, v2
1363; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v4, v2
1364; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
1365; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
1366; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
1367; GFX9-DL-NEXT: s_endpgm
Farhana Aleen9250c922018-08-29 16:31:18 +00001368 <4 x i8> addrspace(1)* %src2,
1369 i16 addrspace(1)* nocapture %dst) {
1370entry:
1371 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
1372 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
1373
; Lane 0 is the only lane that is sign-extended; lanes 1-3 below use zext.
1374 %v1e0 = extractelement <4 x i8> %vec1, i64 0
1375 %cv1e0 = sext i8 %v1e0 to i16
1376 %v2e0 = extractelement <4 x i8> %vec2, i64 0
1377 %cv2e0 = sext i8 %v2e0 to i16
1378 %mul1 = mul nuw nsw i16 %cv1e0, %cv2e0
1379
1380 %v1e1 = extractelement <4 x i8> %vec1, i64 1
1381 %cv1e1 = zext i8 %v1e1 to i16
1382 %v2e1 = extractelement <4 x i8> %vec2, i64 1
1383 %cv2e1 = zext i8 %v2e1 to i16
1384 %mul2 = mul nuw nsw i16 %cv1e1, %cv2e1
1385
1386 %v1e2 = extractelement <4 x i8> %vec1, i64 2
1387 %cv1e2 = zext i8 %v1e2 to i16
1388 %v2e2 = extractelement <4 x i8> %vec2, i64 2
1389 %cv2e2 = zext i8 %v2e2 to i16
1390 %mul3 = mul nuw nsw i16 %cv1e2, %cv2e2
1391
1392 %v1e3 = extractelement <4 x i8> %vec1, i64 3
1393 %cv1e3 = zext i8 %v1e3 to i16
1394 %v2e3 = extractelement <4 x i8> %vec2, i64 3
1395 %cv2e3 = zext i8 %v2e3 to i16
1396 %mul4 = mul nuw nsw i16 %cv1e3, %cv2e3
1397
; Accumulation starts with %mul2 (lane 1), then adds lanes 0, 2 and 3.
1398 %acc = load i16, i16 addrspace(1)* %dst, align 2
1399 %add1 = add i16 %mul2, %acc
1400 %add2 = add i16 %add1, %mul1
1401 %add3 = add i16 %add2, %mul3
1402 %add4 = add i16 %add3, %mul4
1403
1404 store i16 %add4, i16 addrspace(1)* %dst, align 2
1405 ret void
1406}
Farhana Aleen4bc597b2018-10-04 16:57:37 +00001407
; idot4_acc32: sign-extends the four i8 lanes of %src1 and %src2 to i32,
; multiplies them lane-wise and accumulates, in lane order, into the i32 loaded
; from %dst. The expected output for the gfx906 run line is a single
; v_dot4_i32_i8; the other run lines expect a chain of four v_mad_i32_i24.
1408define amdgpu_kernel void @idot4_acc32(<4 x i8> addrspace(1)* %src1,
1409; GFX7-LABEL: idot4_acc32:
1410; GFX7: ; %bb.0: ; %entry
1411; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1412; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1413; GFX7-NEXT: s_mov_b32 s3, 0xf000
1414; GFX7-NEXT: s_mov_b32 s2, -1
1415; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1416; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
1417; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
1418; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
1419; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1420; GFX7-NEXT: s_sext_i32_i8 s7, s4
1421; GFX7-NEXT: s_sext_i32_i8 s8, s5
1422; GFX7-NEXT: s_bfe_i32 s10, s5, 0x80008
1423; GFX7-NEXT: v_mov_b32_e32 v0, s8
1424; GFX7-NEXT: v_mov_b32_e32 v1, s6
1425; GFX7-NEXT: s_bfe_i32 s12, s5, 0x80010
1426; GFX7-NEXT: v_mad_i32_i24 v0, s7, v0, v1
1427; GFX7-NEXT: s_bfe_i32 s9, s4, 0x80008
1428; GFX7-NEXT: v_mov_b32_e32 v1, s10
1429; GFX7-NEXT: s_bfe_i32 s11, s4, 0x80010
1430; GFX7-NEXT: v_mad_i32_i24 v0, s9, v1, v0
1431; GFX7-NEXT: v_mov_b32_e32 v1, s12
1432; GFX7-NEXT: s_ashr_i32 s5, s5, 24
1433; GFX7-NEXT: v_mad_i32_i24 v0, s11, v1, v0
1434; GFX7-NEXT: s_ashr_i32 s4, s4, 24
1435; GFX7-NEXT: v_mov_b32_e32 v1, s5
1436; GFX7-NEXT: v_mad_i32_i24 v0, s4, v1, v0
1437; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1438; GFX7-NEXT: s_endpgm
1439;
1440; GFX8-LABEL: idot4_acc32:
1441; GFX8: ; %bb.0: ; %entry
1442; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1443; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1444; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1445; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
1446; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
1447; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0
1448; GFX8-NEXT: v_mov_b32_e32 v0, s0
1449; GFX8-NEXT: v_mov_b32_e32 v1, s1
1450; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1451; GFX8-NEXT: s_sext_i32_i8 s0, s2
1452; GFX8-NEXT: s_sext_i32_i8 s1, s3
1453; GFX8-NEXT: s_bfe_i32 s6, s3, 0x80008
1454; GFX8-NEXT: v_mov_b32_e32 v2, s1
1455; GFX8-NEXT: v_mov_b32_e32 v3, s4
1456; GFX8-NEXT: s_bfe_i32 s8, s3, 0x80010
1457; GFX8-NEXT: v_mad_i32_i24 v2, s0, v2, v3
1458; GFX8-NEXT: s_bfe_i32 s5, s2, 0x80008
1459; GFX8-NEXT: v_mov_b32_e32 v3, s6
1460; GFX8-NEXT: s_bfe_i32 s7, s2, 0x80010
1461; GFX8-NEXT: v_mad_i32_i24 v2, s5, v3, v2
1462; GFX8-NEXT: v_mov_b32_e32 v3, s8
1463; GFX8-NEXT: s_ashr_i32 s3, s3, 24
1464; GFX8-NEXT: v_mad_i32_i24 v2, s7, v3, v2
1465; GFX8-NEXT: s_ashr_i32 s2, s2, 24
1466; GFX8-NEXT: v_mov_b32_e32 v3, s3
1467; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2
1468; GFX8-NEXT: flat_store_dword v[0:1], v2
1469; GFX8-NEXT: s_endpgm
1470;
1471; GFX9-NODL-LABEL: idot4_acc32:
1472; GFX9-NODL: ; %bb.0: ; %entry
1473; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1474; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1475; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1476; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
1477; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
1478; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0
1479; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
1480; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
1481; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1482; GFX9-NODL-NEXT: s_sext_i32_i8 s0, s2
1483; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s3
1484; GFX9-NODL-NEXT: s_bfe_i32 s6, s3, 0x80008
1485; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1
1486; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4
1487; GFX9-NODL-NEXT: s_bfe_i32 s8, s3, 0x80010
1488; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v2, v3
1489; GFX9-NODL-NEXT: s_bfe_i32 s5, s2, 0x80008
1490; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s6
1491; GFX9-NODL-NEXT: s_bfe_i32 s7, s2, 0x80010
1492; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s5, v3, v2
1493; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s8
1494; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 24
1495; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s7, v3, v2
1496; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24
1497; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
1498; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v3, v2
1499; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
1500; GFX9-NODL-NEXT: s_endpgm
1501;
1502; GFX9-DL-LABEL: idot4_acc32:
1503; GFX9-DL: ; %bb.0: ; %entry
1504; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1505; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1506; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1507; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
1508; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
1509; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0
1510; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
1511; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
1512; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1513; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
1514; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
1515; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s2, v2, v3
1516; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
1517; GFX9-DL-NEXT: s_endpgm
1518 <4 x i8> addrspace(1)* %src2,
1519 i32 addrspace(1)* nocapture %dst) {
1520entry:
1521 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
1522 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
1523
1524 %v1e0 = extractelement <4 x i8> %vec1, i64 0
1525 %cv1e0 = sext i8 %v1e0 to i32
1526 %v2e0 = extractelement <4 x i8> %vec2, i64 0
1527 %cv2e0 = sext i8 %v2e0 to i32
1528 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
1529
1530 %v1e1 = extractelement <4 x i8> %vec1, i64 1
1531 %cv1e1 = sext i8 %v1e1 to i32
1532 %v2e1 = extractelement <4 x i8> %vec2, i64 1
1533 %cv2e1 = sext i8 %v2e1 to i32
1534 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
1535
1536 %v1e2 = extractelement <4 x i8> %vec1, i64 2
1537 %cv1e2 = sext i8 %v1e2 to i32
1538 %v2e2 = extractelement <4 x i8> %vec2, i64 2
1539 %cv2e2 = sext i8 %v2e2 to i32
1540 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
1541
1542 %v1e3 = extractelement <4 x i8> %vec1, i64 3
1543 %cv1e3 = sext i8 %v1e3 to i32
1544 %v2e3 = extractelement <4 x i8> %vec2, i64 3
1545 %cv2e3 = sext i8 %v2e3 to i32
1546 %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
1547
; Straight accumulation chain: every product and every partial sum has exactly
; one user.
1548 %acc = load i32, i32 addrspace(1)* %dst, align 4
1549 %add1 = add i32 %mul1, %acc
1550 %add2 = add i32 %add1, %mul2
1551 %add3 = add i32 %add2, %mul3
1552 %add4 = add i32 %add3, %mul4
1553 store i32 %add4, i32 addrspace(1)* %dst, align 4
1554 ret void
1555}
1556
1557; TODO: Currently, vector elements {0 and 3} get zero_extended from i16 to i32 when they should
1558; be sign_extended directly to i32; this prevents the pattern recognizer from recognizing this pattern.
; idot4_acc16: signed 4 x i8 dot product accumulated into an i16.
; Every element of both <4 x i8> inputs is sign-extended to i16, the pairs are
; multiplied, and the four products are added to the i16 loaded from %dst.
; None of the checked targets (GFX7, GFX8, GFX9-NODL, GFX9-DL) expects a dot4
; instruction here; each one checks for a chain of 24-bit mads instead.
1559define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1,
1560; GFX7-LABEL: idot4_acc16:
1561; GFX7: ; %bb.0: ; %entry
1562; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1563; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1564; GFX7-NEXT: s_mov_b32 s3, 0xf000
1565; GFX7-NEXT: s_mov_b32 s2, -1
1566; GFX7-NEXT: s_mov_b32 s8, 0xffff
1567; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1568; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
1569; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0
1570; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
1571; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1572; GFX7-NEXT: s_sext_i32_i8 s6, s4
1573; GFX7-NEXT: s_bfe_i32 s9, s4, 0x80008
1574; GFX7-NEXT: s_sext_i32_i8 s7, s5
1575; GFX7-NEXT: s_bfe_i32 s10, s5, 0x80008
1576; GFX7-NEXT: s_and_b32 s7, s7, s8
1577; GFX7-NEXT: s_bfe_i32 s12, s5, 0x80010
1578; GFX7-NEXT: s_and_b32 s10, s10, s8
1579; GFX7-NEXT: s_and_b32 s6, s6, s8
1580; GFX7-NEXT: v_mov_b32_e32 v1, s7
1581; GFX7-NEXT: s_bfe_i32 s11, s4, 0x80010
1582; GFX7-NEXT: s_ashr_i32 s5, s5, 24
1583; GFX7-NEXT: s_and_b32 s12, s12, s8
1584; GFX7-NEXT: s_and_b32 s9, s9, s8
1585; GFX7-NEXT: v_mov_b32_e32 v2, s10
1586; GFX7-NEXT: s_ashr_i32 s4, s4, 24
1587; GFX7-NEXT: s_and_b32 s11, s11, s8
1588; GFX7-NEXT: s_and_b32 s5, s5, s8
1589; GFX7-NEXT: v_mov_b32_e32 v3, s12
1590; GFX7-NEXT: s_and_b32 s4, s4, s8
1591; GFX7-NEXT: s_waitcnt vmcnt(0)
1592; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0
1593; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0
1594; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0
1595; GFX7-NEXT: v_mov_b32_e32 v1, s5
1596; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0
1597; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
1598; GFX7-NEXT: s_endpgm
1599;
1600; GFX8-LABEL: idot4_acc16:
1601; GFX8: ; %bb.0: ; %entry
1602; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1603; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1604; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1605; GFX8-NEXT: v_mov_b32_e32 v0, s0
1606; GFX8-NEXT: v_mov_b32_e32 v1, s1
1607; GFX8-NEXT: flat_load_ushort v2, v[0:1]
1608; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
1609; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
1610; GFX8-NEXT: s_mov_b32 s0, 0xffff
1611; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1612; GFX8-NEXT: s_lshr_b32 s5, s1, 8
1613; GFX8-NEXT: s_lshr_b32 s6, s2, 8
1614; GFX8-NEXT: s_sext_i32_i8 s4, s2
1615; GFX8-NEXT: s_bfe_i32 s5, s5, 0x80000
1616; GFX8-NEXT: s_bfe_i32 s6, s6, 0x80000
1617; GFX8-NEXT: s_bfe_i32 s8, s2, 0x80010
1618; GFX8-NEXT: s_lshr_b32 s2, s2, 24
1619; GFX8-NEXT: v_mov_b32_e32 v3, s4
1620; GFX8-NEXT: s_sext_i32_i8 s3, s1
1621; GFX8-NEXT: s_bfe_i32 s7, s1, 0x80010
1622; GFX8-NEXT: s_lshr_b32 s1, s1, 24
1623; GFX8-NEXT: s_and_b32 s4, s0, s5
1624; GFX8-NEXT: s_and_b32 s5, s0, s6
1625; GFX8-NEXT: s_bfe_i32 s1, s1, 0x80000
1626; GFX8-NEXT: s_bfe_i32 s2, s2, 0x80000
1627; GFX8-NEXT: v_mov_b32_e32 v5, s5
1628; GFX8-NEXT: s_and_b32 s1, s0, s1
1629; GFX8-NEXT: v_mov_b32_e32 v4, s8
1630; GFX8-NEXT: s_and_b32 s0, s0, s2
1631; GFX8-NEXT: s_waitcnt vmcnt(0)
1632; GFX8-NEXT: v_mad_i32_i24 v2, s3, v3, v2
1633; GFX8-NEXT: v_mad_u32_u24 v2, s4, v5, v2
1634; GFX8-NEXT: v_mad_i32_i24 v2, s7, v4, v2
1635; GFX8-NEXT: v_mov_b32_e32 v3, s0
1636; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2
1637; GFX8-NEXT: flat_store_short v[0:1], v2
1638; GFX8-NEXT: s_endpgm
1639;
1640; GFX9-NODL-LABEL: idot4_acc16:
1641; GFX9-NODL: ; %bb.0: ; %entry
1642; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1643; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1644; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1645; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
1646; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
1647; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off
1648; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0
1649; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
1650; GFX9-NODL-NEXT: s_mov_b32 s0, 0xffff
1651; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1652; GFX9-NODL-NEXT: s_lshr_b32 s5, s1, 8
1653; GFX9-NODL-NEXT: s_lshr_b32 s6, s2, 8
1654; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s2
1655; GFX9-NODL-NEXT: s_bfe_i32 s5, s5, 0x80000
1656; GFX9-NODL-NEXT: s_bfe_i32 s6, s6, 0x80000
1657; GFX9-NODL-NEXT: s_bfe_i32 s8, s2, 0x80010
1658; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24
1659; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4
1660; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s1
1661; GFX9-NODL-NEXT: s_bfe_i32 s7, s1, 0x80010
1662; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
1663; GFX9-NODL-NEXT: s_and_b32 s4, s0, s5
1664; GFX9-NODL-NEXT: s_and_b32 s5, s0, s6
1665; GFX9-NODL-NEXT: s_bfe_i32 s1, s1, 0x80000
1666; GFX9-NODL-NEXT: s_bfe_i32 s2, s2, 0x80000
1667; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s5
1668; GFX9-NODL-NEXT: s_and_b32 s1, s0, s1
1669; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s8
1670; GFX9-NODL-NEXT: s_and_b32 s0, s0, s2
1671; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
1672; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v3, v2
1673; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v5, v2
1674; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s7, v4, v2
1675; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0
1676; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
1677; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off
1678; GFX9-NODL-NEXT: s_endpgm
1679;
1680; GFX9-DL-LABEL: idot4_acc16:
1681; GFX9-DL: ; %bb.0: ; %entry
1682; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1683; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1684; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1685; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
1686; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
1687; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
1688; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0
1689; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
1690; GFX9-DL-NEXT: s_mov_b32 s0, 0xffff
1691; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1692; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 8
1693; GFX9-DL-NEXT: s_lshr_b32 s6, s2, 8
1694; GFX9-DL-NEXT: s_sext_i32_i8 s4, s2
1695; GFX9-DL-NEXT: s_bfe_i32 s5, s5, 0x80000
1696; GFX9-DL-NEXT: s_bfe_i32 s6, s6, 0x80000
1697; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x80010
1698; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24
1699; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
1700; GFX9-DL-NEXT: s_sext_i32_i8 s3, s1
1701; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x80010
1702; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24
1703; GFX9-DL-NEXT: s_and_b32 s4, s0, s5
1704; GFX9-DL-NEXT: s_and_b32 s5, s0, s6
1705; GFX9-DL-NEXT: s_bfe_i32 s1, s1, 0x80000
1706; GFX9-DL-NEXT: s_bfe_i32 s2, s2, 0x80000
1707; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5
1708; GFX9-DL-NEXT: s_and_b32 s1, s0, s1
1709; GFX9-DL-NEXT: v_mov_b32_e32 v4, s8
1710; GFX9-DL-NEXT: s_and_b32 s0, s0, s2
1711; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1712; GFX9-DL-NEXT: v_mad_i32_i24 v2, s3, v3, v2
1713; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v5, v2
1714; GFX9-DL-NEXT: v_mad_i32_i24 v2, s7, v4, v2
1715; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0
1716; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
1717; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
1718; GFX9-DL-NEXT: s_endpgm
1719 <4 x i8> addrspace(1)* %src2,
1720 i16 addrspace(1)* nocapture %dst) {
1721entry:
1722 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
1723 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
1724
1725 %v1e0 = extractelement <4 x i8> %vec1, i64 0
1726 %cv1e0 = sext i8 %v1e0 to i16
1727 %v2e0 = extractelement <4 x i8> %vec2, i64 0
1728 %cv2e0 = sext i8 %v2e0 to i16
1729 %mul1 = mul nsw i16 %cv1e0, %cv2e0
1730
1731 %v1e1 = extractelement <4 x i8> %vec1, i64 1
1732 %cv1e1 = sext i8 %v1e1 to i16
1733 %v2e1 = extractelement <4 x i8> %vec2, i64 1
1734 %cv2e1 = sext i8 %v2e1 to i16
1735 %mul2 = mul nsw i16 %cv1e1, %cv2e1
1736
1737 %v1e2 = extractelement <4 x i8> %vec1, i64 2
1738 %cv1e2 = sext i8 %v1e2 to i16
1739 %v2e2 = extractelement <4 x i8> %vec2, i64 2
1740 %cv2e2 = sext i8 %v2e2 to i16
1741 %mul3 = mul nsw i16 %cv1e2, %cv2e2
1742
1743 %v1e3 = extractelement <4 x i8> %vec1, i64 3
1744 %cv1e3 = sext i8 %v1e3 to i16
1745 %v2e3 = extractelement <4 x i8> %vec2, i64 3
1746 %cv2e3 = sext i8 %v2e3 to i16
1747 %mul4 = mul nsw i16 %cv1e3, %cv2e3
1748
; Fold the four i16 products into the accumulator read from %dst and write it back.
1749 %acc = load i16, i16 addrspace(1)* %dst, align 2
1750 %add1 = add i16 %mul1, %acc
1751 %add2 = add i16 %add1, %mul2
1752 %add3 = add i16 %add2, %mul3
1753 %add4 = add i16 %add3, %mul4
1754 store i16 %add4, i16 addrspace(1)* %dst, align 2
1755 ret void
1756}
1757
; idot4_acc8: 4 x i8 dot product accumulated into an i8.
; The i8 elements are multiplied directly, with no extension, and the four
; products are added to the i8 loaded from %dst. GFX9-DL is expected to select
; v_dot4_u32_u8; GFX7, GFX8 and GFX9-NODL check for a v_mad_u32_u24 chain.
1758define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1,
1759; GFX7-LABEL: idot4_acc8:
1760; GFX7: ; %bb.0: ; %entry
1761; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1762; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1763; GFX7-NEXT: s_mov_b32 s3, 0xf000
1764; GFX7-NEXT: s_mov_b32 s2, -1
1765; GFX7-NEXT: s_movk_i32 s8, 0xff
1766; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1767; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
1768; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
1769; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
1770; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1771; GFX7-NEXT: s_and_b32 s7, s4, s8
1772; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008
1773; GFX7-NEXT: s_and_b32 s6, s5, s8
1774; GFX7-NEXT: s_bfe_u32 s8, s5, 0x80008
1775; GFX7-NEXT: v_mov_b32_e32 v1, s6
1776; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80010
1777; GFX7-NEXT: v_mov_b32_e32 v2, s8
1778; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010
1779; GFX7-NEXT: s_lshr_b32 s5, s5, 24
1780; GFX7-NEXT: v_mov_b32_e32 v3, s10
1781; GFX7-NEXT: s_lshr_b32 s4, s4, 24
1782; GFX7-NEXT: s_waitcnt vmcnt(0)
1783; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0
1784; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0
1785; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0
1786; GFX7-NEXT: v_mov_b32_e32 v1, s5
1787; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0
1788; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
1789; GFX7-NEXT: s_endpgm
1790;
1791; GFX8-LABEL: idot4_acc8:
1792; GFX8: ; %bb.0: ; %entry
1793; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1794; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1795; GFX8-NEXT: s_movk_i32 s2, 0xff
1796; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1797; GFX8-NEXT: v_mov_b32_e32 v0, s0
1798; GFX8-NEXT: v_mov_b32_e32 v1, s1
1799; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
1800; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
1801; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
1802; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1803; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80008
1804; GFX8-NEXT: s_and_b32 s3, s1, s2
1805; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008
1806; GFX8-NEXT: s_and_b32 s2, s0, s2
1807; GFX8-NEXT: v_mov_b32_e32 v3, s3
1808; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010
1809; GFX8-NEXT: v_mov_b32_e32 v4, s5
1810; GFX8-NEXT: s_bfe_u32 s7, s0, 0x80010
1811; GFX8-NEXT: s_lshr_b32 s1, s1, 24
1812; GFX8-NEXT: v_mov_b32_e32 v5, s6
1813; GFX8-NEXT: s_lshr_b32 s0, s0, 24
1814; GFX8-NEXT: s_waitcnt vmcnt(0)
1815; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
1816; GFX8-NEXT: v_mad_u32_u24 v2, s4, v4, v2
1817; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2
1818; GFX8-NEXT: v_mov_b32_e32 v3, s1
1819; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
1820; GFX8-NEXT: flat_store_byte v[0:1], v2
1821; GFX8-NEXT: s_endpgm
1822;
1823; GFX9-NODL-LABEL: idot4_acc8:
1824; GFX9-NODL: ; %bb.0: ; %entry
1825; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1826; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1827; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff
1828; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1829; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
1830; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
1831; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off
1832; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0
1833; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0
1834; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1835; GFX9-NODL-NEXT: s_bfe_u32 s4, s0, 0x80008
1836; GFX9-NODL-NEXT: s_and_b32 s3, s1, s2
1837; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008
1838; GFX9-NODL-NEXT: s_and_b32 s2, s0, s2
1839; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
1840; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010
1841; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5
1842; GFX9-NODL-NEXT: s_bfe_u32 s7, s0, 0x80010
1843; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
1844; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6
1845; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24
1846; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
1847; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
1848; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v4, v2
1849; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2
1850; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
1851; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
1852; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off
1853; GFX9-NODL-NEXT: s_endpgm
1854;
1855; GFX9-DL-LABEL: idot4_acc8:
1856; GFX9-DL: ; %bb.0: ; %entry
1857; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1858; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1859; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1860; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
1861; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
1862; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
1863; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
1864; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
1865; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
1866; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
1867; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
1868; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v3, v2
1869; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
1870; GFX9-DL-NEXT: s_endpgm
1871 <4 x i8> addrspace(1)* %src2,
1872 i8 addrspace(1)* nocapture %dst) {
1873entry:
1874 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
1875 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
1876
; Elements are multiplied as i8 directly; no sext/zext precedes the multiplies.
1877 %v1e0 = extractelement <4 x i8> %vec1, i64 0
1878 %v2e0 = extractelement <4 x i8> %vec2, i64 0
1879 %mul1 = mul i8 %v1e0, %v2e0
1880
1881 %v1e1 = extractelement <4 x i8> %vec1, i64 1
1882 %v2e1 = extractelement <4 x i8> %vec2, i64 1
1883 %mul2 = mul i8 %v1e1, %v2e1
1884
1885 %v1e2 = extractelement <4 x i8> %vec1, i64 2
1886 %v2e2 = extractelement <4 x i8> %vec2, i64 2
1887 %mul3 = mul i8 %v1e2, %v2e2
1888
1889 %v1e3 = extractelement <4 x i8> %vec1, i64 3
1890 %v2e3 = extractelement <4 x i8> %vec2, i64 3
1891 %mul4 = mul i8 %v1e3, %v2e3
1892
; Accumulate into the i8 read from %dst; only the final add carries nsw.
1893 %acc = load i8, i8 addrspace(1)* %dst, align 2
1894 %add1 = add i8 %mul1, %acc
1895 %add2 = add i8 %add1, %mul2
1896 %add3 = add i8 %add2, %mul3
1897 %add4 = add nsw i8 %add3, %mul4
1898 store i8 %add4, i8 addrspace(1)* %dst, align 2
1899 ret void
1900}
1901
; idot4_multiuse_mul1: signed 4 x i8 dot product whose first product has two uses.
; %mul1 feeds both %add and %add2, so it is added to the result twice.
; All checked targets, GFX9-DL included, expect five v_mad_i32_i24 instructions
; and no dot4 instruction.
1902define amdgpu_kernel void @idot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
1903; GFX7-LABEL: idot4_multiuse_mul1:
1904; GFX7: ; %bb.0: ; %entry
1905; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1906; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1907; GFX7-NEXT: s_mov_b32 s3, 0xf000
1908; GFX7-NEXT: s_mov_b32 s2, -1
1909; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1910; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
1911; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
1912; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
1913; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1914; GFX7-NEXT: s_sext_i32_i8 s7, s4
1915; GFX7-NEXT: s_sext_i32_i8 s8, s5
1916; GFX7-NEXT: s_bfe_i32 s10, s5, 0x80008
1917; GFX7-NEXT: v_mov_b32_e32 v0, s8
1918; GFX7-NEXT: v_mov_b32_e32 v1, s6
1919; GFX7-NEXT: s_bfe_i32 s9, s4, 0x80008
1920; GFX7-NEXT: v_mad_i32_i24 v1, s7, v0, v1
1921; GFX7-NEXT: v_mov_b32_e32 v2, s10
1922; GFX7-NEXT: s_bfe_i32 s12, s5, 0x80010
1923; GFX7-NEXT: v_mad_i32_i24 v1, s9, v2, v1
1924; GFX7-NEXT: s_bfe_i32 s11, s4, 0x80010
1925; GFX7-NEXT: v_mad_i32_i24 v0, s7, v0, v1
1926; GFX7-NEXT: v_mov_b32_e32 v1, s12
1927; GFX7-NEXT: s_ashr_i32 s5, s5, 24
1928; GFX7-NEXT: v_mad_i32_i24 v0, s11, v1, v0
1929; GFX7-NEXT: s_ashr_i32 s4, s4, 24
1930; GFX7-NEXT: v_mov_b32_e32 v1, s5
1931; GFX7-NEXT: v_mad_i32_i24 v0, s4, v1, v0
1932; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
1933; GFX7-NEXT: s_endpgm
1934;
1935; GFX8-LABEL: idot4_multiuse_mul1:
1936; GFX8: ; %bb.0: ; %entry
1937; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1938; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1939; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1940; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
1941; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
1942; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0
1943; GFX8-NEXT: v_mov_b32_e32 v0, s0
1944; GFX8-NEXT: v_mov_b32_e32 v1, s1
1945; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1946; GFX8-NEXT: s_sext_i32_i8 s0, s2
1947; GFX8-NEXT: s_sext_i32_i8 s1, s3
1948; GFX8-NEXT: s_bfe_i32 s6, s3, 0x80008
1949; GFX8-NEXT: v_mov_b32_e32 v2, s1
1950; GFX8-NEXT: v_mov_b32_e32 v3, s4
1951; GFX8-NEXT: s_bfe_i32 s5, s2, 0x80008
1952; GFX8-NEXT: v_mad_i32_i24 v3, s0, v2, v3
1953; GFX8-NEXT: v_mov_b32_e32 v4, s6
1954; GFX8-NEXT: s_bfe_i32 s8, s3, 0x80010
1955; GFX8-NEXT: v_mad_i32_i24 v3, s5, v4, v3
1956; GFX8-NEXT: s_bfe_i32 s7, s2, 0x80010
1957; GFX8-NEXT: v_mad_i32_i24 v2, s0, v2, v3
1958; GFX8-NEXT: v_mov_b32_e32 v3, s8
1959; GFX8-NEXT: s_ashr_i32 s3, s3, 24
1960; GFX8-NEXT: v_mad_i32_i24 v2, s7, v3, v2
1961; GFX8-NEXT: s_ashr_i32 s2, s2, 24
1962; GFX8-NEXT: v_mov_b32_e32 v3, s3
1963; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2
1964; GFX8-NEXT: flat_store_dword v[0:1], v2
1965; GFX8-NEXT: s_endpgm
1966;
1967; GFX9-NODL-LABEL: idot4_multiuse_mul1:
1968; GFX9-NODL: ; %bb.0: ; %entry
1969; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1970; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1971; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1972; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
1973; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
1974; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0
1975; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
1976; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
1977; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
1978; GFX9-NODL-NEXT: s_sext_i32_i8 s0, s2
1979; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s3
1980; GFX9-NODL-NEXT: s_bfe_i32 s6, s3, 0x80008
1981; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s1
1982; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4
1983; GFX9-NODL-NEXT: s_bfe_i32 s5, s2, 0x80008
1984; GFX9-NODL-NEXT: v_mad_i32_i24 v3, s0, v2, v3
1985; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s6
1986; GFX9-NODL-NEXT: s_bfe_i32 s8, s3, 0x80010
1987; GFX9-NODL-NEXT: v_mad_i32_i24 v3, s5, v4, v3
1988; GFX9-NODL-NEXT: s_bfe_i32 s7, s2, 0x80010
1989; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v2, v3
1990; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s8
1991; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 24
1992; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s7, v3, v2
1993; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24
1994; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
1995; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v3, v2
1996; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
1997; GFX9-NODL-NEXT: s_endpgm
1998;
1999; GFX9-DL-LABEL: idot4_multiuse_mul1:
2000; GFX9-DL: ; %bb.0: ; %entry
2001; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2002; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2003; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2004; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
2005; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
2006; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0
2007; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
2008; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
2009; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2010; GFX9-DL-NEXT: s_sext_i32_i8 s0, s2
2011; GFX9-DL-NEXT: s_sext_i32_i8 s1, s3
2012; GFX9-DL-NEXT: s_bfe_i32 s6, s3, 0x80008
2013; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1
2014; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
2015; GFX9-DL-NEXT: s_bfe_i32 s5, s2, 0x80008
2016; GFX9-DL-NEXT: v_mad_i32_i24 v3, s0, v2, v3
2017; GFX9-DL-NEXT: v_mov_b32_e32 v4, s6
2018; GFX9-DL-NEXT: s_bfe_i32 s8, s3, 0x80010
2019; GFX9-DL-NEXT: v_mad_i32_i24 v3, s5, v4, v3
2020; GFX9-DL-NEXT: s_bfe_i32 s7, s2, 0x80010
2021; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v2, v3
2022; GFX9-DL-NEXT: v_mov_b32_e32 v3, s8
2023; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 24
2024; GFX9-DL-NEXT: v_mad_i32_i24 v2, s7, v3, v2
2025; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 24
2026; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
2027; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v3, v2
2028; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
2029; GFX9-DL-NEXT: s_endpgm
2030 <4 x i8> addrspace(1)* %src2,
2031 i32 addrspace(1)* nocapture %dst) {
2032entry:
2033 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
2034 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
2035
2036 %v1e0 = extractelement <4 x i8> %vec1, i64 0
2037 %cv1e0 = sext i8 %v1e0 to i32
2038 %v2e0 = extractelement <4 x i8> %vec2, i64 0
2039 %cv2e0 = sext i8 %v2e0 to i32
2040 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
2041
2042 %v1e1 = extractelement <4 x i8> %vec1, i64 1
2043 %cv1e1 = sext i8 %v1e1 to i32
2044 %v2e1 = extractelement <4 x i8> %vec2, i64 1
2045 %cv2e1 = sext i8 %v2e1 to i32
2046 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
2047
2048 %v1e2 = extractelement <4 x i8> %vec1, i64 2
2049 %cv1e2 = sext i8 %v1e2 to i32
2050 %v2e2 = extractelement <4 x i8> %vec2, i64 2
2051 %cv2e2 = sext i8 %v2e2 to i32
2052 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
2053
2054 %v1e3 = extractelement <4 x i8> %vec1, i64 3
2055 %cv1e3 = sext i8 %v1e3 to i32
2056 %v2e3 = extractelement <4 x i8> %vec2, i64 3
2057 %cv2e3 = sext i8 %v2e3 to i32
2058 %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
2059
; %mul1 is added in twice below (%add and %add2); this is the multi-use under test.
2060 %acc = load i32, i32 addrspace(1)* %dst, align 4
2061 %add = add i32 %mul1, %acc
2062 %add1 = add i32 %mul2, %add
2063 %add2 = add i32 %add1, %mul1
2064 %add3 = add i32 %add2, %mul3
2065 %add4 = add i32 %add3, %mul4
2066
2067 store i32 %add4, i32 addrspace(1)* %dst, align 4
2068 ret void
2069}
2070
2071; TODO: cleanup s_lshr_b32 and support this pattern.
; udot4_acc32_vecMul: unsigned 4 x i8 dot product written with a vector multiply.
; Both <4 x i8> inputs are zero-extended to <4 x i32> and multiplied as whole
; vectors; the product's four elements are then extracted and added to the i32
; loaded from %dst. No checked target forms a dot4 instruction here; all of
; them, GFX9-DL included, check for a v_mad_u32_u24 chain.
2072define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
2073; GFX7-LABEL: udot4_acc32_vecMul:
2074; GFX7: ; %bb.0: ; %entry
2075; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2076; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
2077; GFX7-NEXT: s_movk_i32 s12, 0xff
2078; GFX7-NEXT: s_mov_b32 s3, 0xf000
2079; GFX7-NEXT: s_mov_b32 s2, -1
2080; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2081; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
2082; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
2083; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
2084; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2085; GFX7-NEXT: s_lshr_b32 s7, s4, 24
2086; GFX7-NEXT: s_lshr_b32 s9, s5, 24
2087; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80008
2088; GFX7-NEXT: s_bfe_u32 s13, s5, 0x80010
2089; GFX7-NEXT: s_and_b32 s5, s5, s12
2090; GFX7-NEXT: s_bfe_u32 s8, s4, 0x80008
2091; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010
2092; GFX7-NEXT: s_and_b32 s4, s4, s12
2093; GFX7-NEXT: v_mov_b32_e32 v0, s5
2094; GFX7-NEXT: v_mov_b32_e32 v1, s6
2095; GFX7-NEXT: v_mad_u32_u24 v0, s4, v0, v1
2096; GFX7-NEXT: v_mov_b32_e32 v1, s10
2097; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0
2098; GFX7-NEXT: v_mov_b32_e32 v1, s13
2099; GFX7-NEXT: v_mad_u32_u24 v0, s11, v1, v0
2100; GFX7-NEXT: v_mov_b32_e32 v1, s9
2101; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0
2102; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
2103; GFX7-NEXT: s_endpgm
2104;
2105; GFX8-LABEL: udot4_acc32_vecMul:
2106; GFX8: ; %bb.0: ; %entry
2107; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2108; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2109; GFX8-NEXT: s_movk_i32 s2, 0xff
2110; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2111; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0
2112; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0
2113; GFX8-NEXT: s_load_dword s5, s[0:1], 0x0
2114; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2115; GFX8-NEXT: s_lshr_b32 s6, s3, 24
2116; GFX8-NEXT: s_bfe_u32 s8, s3, 0x80010
2117; GFX8-NEXT: v_lshrrev_b16_e64 v0, 8, s3
2118; GFX8-NEXT: s_and_b32 s3, s3, s2
2119; GFX8-NEXT: s_and_b32 s2, s4, s2
2120; GFX8-NEXT: v_mov_b32_e32 v2, s2
2121; GFX8-NEXT: v_mov_b32_e32 v3, s5
2122; GFX8-NEXT: s_bfe_u32 s9, s4, 0x80010
2123; GFX8-NEXT: v_lshrrev_b16_e64 v1, 8, s4
2124; GFX8-NEXT: v_mad_u32_u24 v2, s3, v2, v3
2125; GFX8-NEXT: v_mad_u32_u24 v0, v0, v1, v2
2126; GFX8-NEXT: v_mov_b32_e32 v1, s9
2127; GFX8-NEXT: s_lshr_b32 s7, s4, 24
2128; GFX8-NEXT: v_mad_u32_u24 v0, s8, v1, v0
2129; GFX8-NEXT: v_mov_b32_e32 v1, s7
2130; GFX8-NEXT: v_mad_u32_u24 v2, s6, v1, v0
2131; GFX8-NEXT: v_mov_b32_e32 v0, s0
2132; GFX8-NEXT: v_mov_b32_e32 v1, s1
2133; GFX8-NEXT: flat_store_dword v[0:1], v2
2134; GFX8-NEXT: s_endpgm
2135;
2136; GFX9-NODL-LABEL: udot4_acc32_vecMul:
2137; GFX9-NODL: ; %bb.0: ; %entry
2138; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2139; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2140; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff
2141; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2142; GFX9-NODL-NEXT: s_load_dword s3, s[4:5], 0x0
2143; GFX9-NODL-NEXT: s_load_dword s4, s[6:7], 0x0
2144; GFX9-NODL-NEXT: s_load_dword s5, s[0:1], 0x0
2145; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2146; GFX9-NODL-NEXT: s_lshr_b32 s6, s3, 24
2147; GFX9-NODL-NEXT: s_bfe_u32 s8, s3, 0x80010
2148; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v0, 8, s3
2149; GFX9-NODL-NEXT: s_and_b32 s3, s3, s2
2150; GFX9-NODL-NEXT: s_and_b32 s2, s4, s2
2151; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s2
2152; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5
2153; GFX9-NODL-NEXT: s_bfe_u32 s9, s4, 0x80010
2154; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s4
2155; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v2, v3
2156; GFX9-NODL-NEXT: v_mad_u32_u24 v0, v0, v1, v2
2157; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s9
2158; GFX9-NODL-NEXT: s_lshr_b32 s7, s4, 24
2159; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s8, v1, v0
2160; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7
2161; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s6, v1, v0
2162; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
2163; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
2164; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
2165; GFX9-NODL-NEXT: s_endpgm
2166;
2167; GFX9-DL-LABEL: udot4_acc32_vecMul:
2168; GFX9-DL: ; %bb.0: ; %entry
2169; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2170; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2171; GFX9-DL-NEXT: s_movk_i32 s2, 0xff
2172; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2173; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0
2174; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0
2175; GFX9-DL-NEXT: s_load_dword s5, s[0:1], 0x0
2176; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2177; GFX9-DL-NEXT: s_lshr_b32 s6, s3, 24
2178; GFX9-DL-NEXT: s_bfe_u32 s8, s3, 0x80010
2179; GFX9-DL-NEXT: v_lshrrev_b16_e64 v0, 8, s3
2180; GFX9-DL-NEXT: s_and_b32 s3, s3, s2
2181; GFX9-DL-NEXT: s_and_b32 s2, s4, s2
2182; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2
2183; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5
2184; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x80010
2185; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s4
2186; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v2, v3
2187; GFX9-DL-NEXT: v_mad_u32_u24 v0, v0, v1, v2
2188; GFX9-DL-NEXT: v_mov_b32_e32 v1, s9
2189; GFX9-DL-NEXT: s_lshr_b32 s7, s4, 24
2190; GFX9-DL-NEXT: v_mad_u32_u24 v0, s8, v1, v0
2191; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7
2192; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v1, v0
2193; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
2194; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
2195; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
2196; GFX9-DL-NEXT: s_endpgm
2197 <4 x i8> addrspace(1)* %src2,
2198 i32 addrspace(1)* nocapture %dst) {
2199entry:
2200 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
2201 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
2202
; Extension and multiplication happen on the whole vector, not per element.
2203 %cvec1 = zext <4 x i8> %vec1 to <4 x i32>
2204 %cvec2 = zext <4 x i8> %vec2 to <4 x i32>
2205
2206 %mul = mul <4 x i32> %cvec1, %cvec2
2207 %mul0 = extractelement <4 x i32> %mul, i64 0
2208 %mul1 = extractelement <4 x i32> %mul, i64 1
2209 %mul2 = extractelement <4 x i32> %mul, i64 2
2210 %mul3 = extractelement <4 x i32> %mul, i64 3
2211
2212 %acc = load i32, i32 addrspace(1)* %dst, align 4
2213 %add1 = add i32 %mul0, %acc
2214 %add2 = add i32 %add1, %mul1
2215 %add3 = add i32 %add2, %mul2
2216 %add4 = add i32 %add3, %mul3
2217
2218 store i32 %add4, i32 addrspace(1)* %dst, align 4
2219 ret void
2220}
2221
2222; TODO: Support this pattern.
; idot4_acc32_vecMul: signed 4 x i8 dot product written with a vector multiply.
; Both <4 x i8> inputs are sign-extended to <4 x i32> and multiplied as whole
; vectors; the product's four elements are then extracted and added to the i32
; loaded from %dst. No checked target forms a dot4 instruction here; all of
; them, GFX9-DL included, check for a v_mad_i32_i24 chain.
2223define amdgpu_kernel void @idot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
2224; GFX7-LABEL: idot4_acc32_vecMul:
2225; GFX7: ; %bb.0: ; %entry
2226; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2227; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
2228; GFX7-NEXT: s_mov_b32 s3, 0xf000
2229; GFX7-NEXT: s_mov_b32 s2, -1
2230; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2231; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
2232; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
2233; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
2234; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2235; GFX7-NEXT: s_ashr_i32 s7, s4, 24
2236; GFX7-NEXT: s_ashr_i32 s10, s5, 24
2237; GFX7-NEXT: s_bfe_i32 s11, s5, 0x80010
2238; GFX7-NEXT: s_bfe_i32 s12, s5, 0x80008
2239; GFX7-NEXT: s_sext_i32_i8 s5, s5
2240; GFX7-NEXT: s_bfe_i32 s8, s4, 0x80010
2241; GFX7-NEXT: s_bfe_i32 s9, s4, 0x80008
2242; GFX7-NEXT: s_sext_i32_i8 s4, s4
2243; GFX7-NEXT: v_mov_b32_e32 v0, s5
2244; GFX7-NEXT: v_mov_b32_e32 v1, s6
2245; GFX7-NEXT: v_mad_i32_i24 v0, s4, v0, v1
2246; GFX7-NEXT: v_mov_b32_e32 v1, s12
2247; GFX7-NEXT: v_mad_i32_i24 v0, s9, v1, v0
2248; GFX7-NEXT: v_mov_b32_e32 v1, s11
2249; GFX7-NEXT: v_mad_i32_i24 v0, s8, v1, v0
2250; GFX7-NEXT: v_mov_b32_e32 v1, s10
2251; GFX7-NEXT: v_mad_i32_i24 v0, s7, v1, v0
2252; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
2253; GFX7-NEXT: s_endpgm
2254;
2255; GFX8-LABEL: idot4_acc32_vecMul:
2256; GFX8: ; %bb.0: ; %entry
2257; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2258; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2259; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2260; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
2261; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
2262; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0
2263; GFX8-NEXT: v_mov_b32_e32 v0, s0
2264; GFX8-NEXT: v_mov_b32_e32 v1, s1
2265; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2266; GFX8-NEXT: v_lshrrev_b16_e64 v2, 8, s2
2267; GFX8-NEXT: v_lshrrev_b16_e64 v3, 8, s3
2268; GFX8-NEXT: s_ashr_i32 s5, s3, 24
2269; GFX8-NEXT: s_bfe_i32 s6, s3, 0x80010
2270; GFX8-NEXT: s_sext_i32_i8 s3, s3
2271; GFX8-NEXT: s_ashr_i32 s0, s2, 24
2272; GFX8-NEXT: s_bfe_i32 s1, s2, 0x80010
2273; GFX8-NEXT: s_sext_i32_i8 s2, s2
2274; GFX8-NEXT: v_mov_b32_e32 v4, s3
2275; GFX8-NEXT: v_mov_b32_e32 v5, s4
2276; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8
2277; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8
2278; GFX8-NEXT: v_mad_i32_i24 v4, s2, v4, v5
2279; GFX8-NEXT: v_mad_i32_i24 v2, v2, v3, v4
2280; GFX8-NEXT: v_mov_b32_e32 v3, s6
2281; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2
2282; GFX8-NEXT: v_mov_b32_e32 v3, s5
2283; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2
2284; GFX8-NEXT: flat_store_dword v[0:1], v2
2285; GFX8-NEXT: s_endpgm
2286;
2287; GFX9-NODL-LABEL: idot4_acc32_vecMul:
2288; GFX9-NODL: ; %bb.0: ; %entry
2289; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2290; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2291; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2292; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
2293; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
2294; GFX9-NODL-NEXT: s_load_dword s4, s[0:1], 0x0
2295; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
2296; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
2297; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2298; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v2, 8, s2
2299; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v3, 8, s3
2300; GFX9-NODL-NEXT: s_ashr_i32 s5, s3, 24
2301; GFX9-NODL-NEXT: s_bfe_i32 s6, s3, 0x80010
2302; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s3
2303; GFX9-NODL-NEXT: s_ashr_i32 s0, s2, 24
2304; GFX9-NODL-NEXT: s_bfe_i32 s1, s2, 0x80010
2305; GFX9-NODL-NEXT: s_sext_i32_i8 s2, s2
2306; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3
2307; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s4
2308; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8
2309; GFX9-NODL-NEXT: v_bfe_i32 v3, v3, 0, 8
2310; GFX9-NODL-NEXT: v_mad_i32_i24 v4, s2, v4, v5
2311; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v2, v3, v4
2312; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s6
2313; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
2314; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5
2315; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v3, v2
2316; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
2317; GFX9-NODL-NEXT: s_endpgm
2318;
2319; GFX9-DL-LABEL: idot4_acc32_vecMul:
2320; GFX9-DL: ; %bb.0: ; %entry
2321; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2322; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2323; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2324; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
2325; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
2326; GFX9-DL-NEXT: s_load_dword s4, s[0:1], 0x0
2327; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
2328; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
2329; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2330; GFX9-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s2
2331; GFX9-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s3
2332; GFX9-DL-NEXT: s_ashr_i32 s5, s3, 24
2333; GFX9-DL-NEXT: s_bfe_i32 s6, s3, 0x80010
2334; GFX9-DL-NEXT: s_sext_i32_i8 s3, s3
2335; GFX9-DL-NEXT: s_ashr_i32 s0, s2, 24
2336; GFX9-DL-NEXT: s_bfe_i32 s1, s2, 0x80010
2337; GFX9-DL-NEXT: s_sext_i32_i8 s2, s2
2338; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3
2339; GFX9-DL-NEXT: v_mov_b32_e32 v5, s4
2340; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8
2341; GFX9-DL-NEXT: v_bfe_i32 v3, v3, 0, 8
2342; GFX9-DL-NEXT: v_mad_i32_i24 v4, s2, v4, v5
2343; GFX9-DL-NEXT: v_mad_i32_i24 v2, v2, v3, v4
2344; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6
2345; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
2346; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5
2347; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2
2348; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
2349; GFX9-DL-NEXT: s_endpgm
2350 <4 x i8> addrspace(1)* %src2,
2351 i32 addrspace(1)* nocapture %dst) {
2352entry:
2353 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
2354 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
2355
; Extension and multiplication happen on the whole vector, not per element.
2356 %cvec1 = sext <4 x i8> %vec1 to <4 x i32>
2357 %cvec2 = sext <4 x i8> %vec2 to <4 x i32>
2358
2359 %mul = mul <4 x i32> %cvec1, %cvec2
2360 %mul0 = extractelement <4 x i32> %mul, i64 0
2361 %mul1 = extractelement <4 x i32> %mul, i64 1
2362 %mul2 = extractelement <4 x i32> %mul, i64 2
2363 %mul3 = extractelement <4 x i32> %mul, i64 3
2364
2365 %acc = load i32, i32 addrspace(1)* %dst, align 4
2366 %add1 = add i32 %mul0, %acc
2367 %add2 = add i32 %add1, %mul1
2368 %add3 = add i32 %add2, %mul2
2369 %add4 = add i32 %add3, %mul3
2370
2371 store i32 %add4, i32 addrspace(1)* %dst, align 4
2372 ret void
2373}
2374
2375; TODO: This pattern should be recognized.
; Unsigned 4 x i8 dot product with an i16 accumulator, written as a vector mul.
; Loads two <4 x i8> vectors from %src1 and %src2, zero-extends every lane to
; i16, multiplies the vectors lane-wise, and adds the four products to the i16
; value loaded from %dst; the sum is stored back to %dst.
; No v_dot4 instruction appears in any of the expected-output blocks below; the
; products are formed with mul / mad / packed-mul instructions.
2376define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
2377; GFX7-LABEL: udot4_acc16_vecMul:
2378; GFX7: ; %bb.0: ; %entry
2379; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2380; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
2381; GFX7-NEXT: s_mov_b32 s3, 0xf000
2382; GFX7-NEXT: s_mov_b32 s2, -1
2383; GFX7-NEXT: s_movk_i32 s8, 0xff
2384; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2385; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
2386; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0
2387; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
2388; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2389; GFX7-NEXT: s_and_b32 s11, s4, s8
2390; GFX7-NEXT: s_bfe_u32 s6, s4, 0x80008
2391; GFX7-NEXT: s_bfe_u32 s9, s5, 0x80008
2392; GFX7-NEXT: s_lshr_b32 s10, s5, 24
2393; GFX7-NEXT: s_and_b32 s8, s5, s8
2394; GFX7-NEXT: v_mov_b32_e32 v4, s9
2395; GFX7-NEXT: s_lshr_b32 s7, s4, 24
2396; GFX7-NEXT: v_mov_b32_e32 v2, s10
2397; GFX7-NEXT: s_bfe_u32 s5, s5, 0x80010
2398; GFX7-NEXT: v_mov_b32_e32 v3, s8
2399; GFX7-NEXT: v_mul_u32_u24_e32 v2, s7, v2
2400; GFX7-NEXT: v_mul_u32_u24_e32 v4, s6, v4
2401; GFX7-NEXT: s_bfe_u32 s4, s4, 0x80010
2402; GFX7-NEXT: v_mov_b32_e32 v1, s5
2403; GFX7-NEXT: v_mul_u32_u24_e32 v1, s4, v1
2404; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
2405; GFX7-NEXT: v_mul_u32_u24_e32 v3, s11, v3
2406; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
2407; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
2408; GFX7-NEXT: v_or_b32_e32 v2, v3, v4
2409; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
2410; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1
2411; GFX7-NEXT: s_waitcnt vmcnt(0)
2412; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
2413; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0
2414; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
2415; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0
2416; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
2417; GFX7-NEXT: s_endpgm
2418;
2419; GFX8-LABEL: udot4_acc16_vecMul:
2420; GFX8: ; %bb.0: ; %entry
2421; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2422; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2423; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2424; GFX8-NEXT: v_mov_b32_e32 v0, s0
2425; GFX8-NEXT: v_mov_b32_e32 v1, s1
2426; GFX8-NEXT: flat_load_ushort v2, v[0:1]
2427; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
2428; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
2429; GFX8-NEXT: s_movk_i32 s0, 0xff
2430; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2431; GFX8-NEXT: s_and_b32 s6, s1, s0
2432; GFX8-NEXT: s_and_b32 s0, s2, s0
2433; GFX8-NEXT: v_mov_b32_e32 v5, s0
2434; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010
2435; GFX8-NEXT: v_lshrrev_b16_e64 v3, 8, s2
2436; GFX8-NEXT: v_lshrrev_b16_e64 v4, 8, s1
2437; GFX8-NEXT: s_lshr_b32 s4, s2, 24
2438; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80010
2439; GFX8-NEXT: v_mov_b32_e32 v6, s7
2440; GFX8-NEXT: s_lshr_b32 s3, s1, 24
2441; GFX8-NEXT: s_waitcnt vmcnt(0)
2442; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2
2443; GFX8-NEXT: v_mad_u32_u24 v2, v4, v3, v2
2444; GFX8-NEXT: v_mad_u32_u24 v2, s5, v6, v2
2445; GFX8-NEXT: v_mov_b32_e32 v3, s4
2446; GFX8-NEXT: v_mad_u32_u24 v2, s3, v3, v2
2447; GFX8-NEXT: flat_store_short v[0:1], v2
2448; GFX8-NEXT: s_endpgm
2449;
2450; GFX9-NODL-LABEL: udot4_acc16_vecMul:
2451; GFX9-NODL: ; %bb.0: ; %entry
2452; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2453; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2454; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0xffff
2455; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2456; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
2457; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
2458; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2459; GFX9-NODL-NEXT: s_lshr_b32 s5, s2, 16
2460; GFX9-NODL-NEXT: s_lshr_b32 s7, s3, 16
2461; GFX9-NODL-NEXT: s_lshr_b32 s4, s2, 24
2462; GFX9-NODL-NEXT: v_and_b32_sdwa v4, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
2463; GFX9-NODL-NEXT: s_lshr_b32 s6, s3, 24
2464; GFX9-NODL-NEXT: v_and_b32_sdwa v3, v0, s7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
2465; GFX9-NODL-NEXT: v_lshl_or_b32 v3, s6, 16, v3
2466; GFX9-NODL-NEXT: v_lshl_or_b32 v4, s4, 16, v4
2467; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v3, v4, v3
2468; GFX9-NODL-NEXT: v_and_b32_sdwa v4, v0, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
2469; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v2, 8, s3
2470; GFX9-NODL-NEXT: v_lshrrev_b16_e64 v1, 8, s2
2471; GFX9-NODL-NEXT: v_and_b32_sdwa v0, v0, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
2472; GFX9-NODL-NEXT: v_lshl_or_b32 v2, v2, 16, v4
2473; GFX9-NODL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
2474; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v0, v2
2475; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
2476; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
2477; GFX9-NODL-NEXT: global_load_ushort v4, v[0:1], off
2478; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
2479; GFX9-NODL-NEXT: v_add_u32_e32 v4, v2, v4
2480; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2481; GFX9-NODL-NEXT: v_add_u32_e32 v2, v2, v3
2482; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2483; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off
2484; GFX9-NODL-NEXT: s_endpgm
2485;
2486; GFX9-DL-LABEL: udot4_acc16_vecMul:
2487; GFX9-DL: ; %bb.0: ; %entry
2488; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2489; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2490; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0xffff
2491; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2492; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
2493; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
2494; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2495; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 16
2496; GFX9-DL-NEXT: s_lshr_b32 s7, s3, 16
2497; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 24
2498; GFX9-DL-NEXT: v_and_b32_sdwa v4, v0, s5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
2499; GFX9-DL-NEXT: s_lshr_b32 s6, s3, 24
2500; GFX9-DL-NEXT: v_and_b32_sdwa v3, v0, s7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
2501; GFX9-DL-NEXT: v_lshl_or_b32 v3, s6, 16, v3
2502; GFX9-DL-NEXT: v_lshl_or_b32 v4, s4, 16, v4
2503; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v4, v3
2504; GFX9-DL-NEXT: v_and_b32_sdwa v4, v0, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
2505; GFX9-DL-NEXT: v_lshrrev_b16_e64 v2, 8, s3
2506; GFX9-DL-NEXT: v_lshrrev_b16_e64 v1, 8, s2
2507; GFX9-DL-NEXT: v_and_b32_sdwa v0, v0, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
2508; GFX9-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v4
2509; GFX9-DL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
2510; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v0, v2
2511; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
2512; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
2513; GFX9-DL-NEXT: global_load_ushort v4, v[0:1], off
2514; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
2515; GFX9-DL-NEXT: v_add_u32_e32 v4, v2, v4
2516; GFX9-DL-NEXT: v_add_u32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2517; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3
2518; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2519; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
2520; GFX9-DL-NEXT: s_endpgm
2521 <4 x i8> addrspace(1)* %src2,
2522 i16 addrspace(1)* nocapture %dst) {
2523entry:
 ; Load both packed 4 x i8 operands.
2524 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
2525 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
2526
 ; Widen every lane to i16 with zero extension (unsigned interpretation).
2527 %cvec1 = zext <4 x i8> %vec1 to <4 x i16>
2528 %cvec2 = zext <4 x i8> %vec2 to <4 x i16>
2529
 ; One vector multiply in i16, then split the result into its four lanes.
2530 %mul = mul <4 x i16> %cvec1, %cvec2
2531 %mul0 = extractelement <4 x i16> %mul, i64 0
2532 %mul1 = extractelement <4 x i16> %mul, i64 1
2533 %mul2 = extractelement <4 x i16> %mul, i64 2
2534 %mul3 = extractelement <4 x i16> %mul, i64 3
2535
 ; Accumulate: start from the i16 already stored at %dst, add lanes 0..3 in order.
2536 %acc = load i16, i16 addrspace(1)* %dst, align 4
2537 %add1 = add i16 %mul0, %acc
2538 %add2 = add i16 %add1, %mul1
2539 %add3 = add i16 %add2, %mul2
2540 %add4 = add i16 %add3, %mul3
2541
 ; Write the accumulated i16 back to %dst.
2542 store i16 %add4, i16 addrspace(1)* %dst, align 4
2543 ret void
2544}
2545
; Signed 4 x i8 dot product with an i16 accumulator, written as a vector mul.
; Loads two <4 x i8> vectors from %src1 and %src2, sign-extends every lane to
; i16, multiplies the vectors lane-wise, and adds the four products to the i16
; value loaded from %dst; the sum is stored back to %dst.
; No v_dot4 instruction appears in any of the expected-output blocks below.
2546define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
2547; GFX7-LABEL: idot4_acc16_vecMul:
2548; GFX7: ; %bb.0: ; %entry
2549; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2550; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
2551; GFX7-NEXT: s_mov_b32 s3, 0xf000
2552; GFX7-NEXT: s_mov_b32 s2, -1
2553; GFX7-NEXT: s_mov_b32 s8, 0xffff
2554; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2555; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
2556; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0
2557; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
2558; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2559; GFX7-NEXT: s_sext_i32_i8 s6, s4
2560; GFX7-NEXT: s_bfe_i32 s7, s4, 0x80008
2561; GFX7-NEXT: s_sext_i32_i8 s10, s5
2562; GFX7-NEXT: s_bfe_i32 s11, s5, 0x80008
2563; GFX7-NEXT: s_bfe_i32 s12, s5, 0x80010
2564; GFX7-NEXT: s_ashr_i32 s5, s5, 24
2565; GFX7-NEXT: v_mov_b32_e32 v3, s11
2566; GFX7-NEXT: v_mov_b32_e32 v4, s10
2567; GFX7-NEXT: s_bfe_i32 s9, s4, 0x80010
2568; GFX7-NEXT: v_mov_b32_e32 v2, s12
2569; GFX7-NEXT: s_ashr_i32 s4, s4, 24
2570; GFX7-NEXT: v_mov_b32_e32 v1, s5
2571; GFX7-NEXT: v_mul_i32_i24_e32 v1, s4, v1
2572; GFX7-NEXT: v_mul_i32_i24_e32 v2, s9, v2
2573; GFX7-NEXT: v_mul_i32_i24_e32 v3, s7, v3
2574; GFX7-NEXT: v_mul_i32_i24_e32 v4, s6, v4
2575; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2576; GFX7-NEXT: v_and_b32_e32 v2, s8, v2
2577; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
2578; GFX7-NEXT: v_and_b32_e32 v4, s8, v4
2579; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
2580; GFX7-NEXT: v_or_b32_e32 v2, v4, v3
2581; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
2582; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1
2583; GFX7-NEXT: s_waitcnt vmcnt(0)
2584; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
2585; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0
2586; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
2587; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0
2588; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
2589; GFX7-NEXT: s_endpgm
2590;
2591; GFX8-LABEL: idot4_acc16_vecMul:
2592; GFX8: ; %bb.0: ; %entry
2593; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2594; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2595; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2596; GFX8-NEXT: v_mov_b32_e32 v0, s0
2597; GFX8-NEXT: v_mov_b32_e32 v1, s1
2598; GFX8-NEXT: flat_load_ushort v2, v[0:1]
2599; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
2600; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
2601; GFX8-NEXT: s_mov_b32 s0, 0xffff
2602; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2603; GFX8-NEXT: s_lshr_b32 s3, s1, 16
2604; GFX8-NEXT: s_bfe_i32 s6, s2, 0x80000
2605; GFX8-NEXT: s_lshr_b32 s4, s2, 16
2606; GFX8-NEXT: s_bfe_i32 s5, s1, 0x80000
2607; GFX8-NEXT: v_ashrrev_i16_e64 v4, 8, s1
2608; GFX8-NEXT: s_bfe_i32 s1, s3, 0x80000
2609; GFX8-NEXT: v_ashrrev_i16_e64 v6, 8, s3
2610; GFX8-NEXT: s_and_b32 s3, s0, s6
2611; GFX8-NEXT: v_ashrrev_i16_e64 v3, 8, s2
2612; GFX8-NEXT: s_bfe_i32 s2, s4, 0x80000
2613; GFX8-NEXT: v_ashrrev_i16_e64 v5, 8, s4
2614; GFX8-NEXT: s_and_b32 s4, s0, s5
2615; GFX8-NEXT: v_mov_b32_e32 v7, s3
2616; GFX8-NEXT: s_and_b32 s2, s0, s2
2617; GFX8-NEXT: s_and_b32 s0, s0, s1
2618; GFX8-NEXT: s_waitcnt vmcnt(0)
2619; GFX8-NEXT: v_mad_u32_u24 v2, s4, v7, v2
2620; GFX8-NEXT: v_mad_u32_u24 v2, v4, v3, v2
2621; GFX8-NEXT: v_mov_b32_e32 v3, s2
2622; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
2623; GFX8-NEXT: v_mad_u32_u24 v2, v6, v5, v2
2624; GFX8-NEXT: flat_store_short v[0:1], v2
2625; GFX8-NEXT: s_endpgm
2626;
2627; GFX9-NODL-LABEL: idot4_acc16_vecMul:
2628; GFX9-NODL: ; %bb.0: ; %entry
2629; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2630; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2631; GFX9-NODL-NEXT: v_mov_b32_e32 v4, 0xffff
2632; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2633; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
2634; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
2635; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2636; GFX9-NODL-NEXT: s_lshr_b32 s4, s2, 16
2637; GFX9-NODL-NEXT: s_lshr_b32 s5, s3, 16
2638; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v3, 8, s5
2639; GFX9-NODL-NEXT: s_bfe_i32 s5, s5, 0x80000
2640; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v2, 8, s4
2641; GFX9-NODL-NEXT: v_and_b32_e32 v5, s5, v4
2642; GFX9-NODL-NEXT: s_bfe_i32 s4, s4, 0x80000
2643; GFX9-NODL-NEXT: v_lshl_or_b32 v3, v3, 16, v5
2644; GFX9-NODL-NEXT: v_and_b32_e32 v5, s4, v4
2645; GFX9-NODL-NEXT: v_lshl_or_b32 v2, v2, 16, v5
2646; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v1, 8, s3
2647; GFX9-NODL-NEXT: s_bfe_i32 s3, s3, 0x80000
2648; GFX9-NODL-NEXT: v_ashrrev_i16_e64 v0, 8, s2
2649; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v2, v2, v3
2650; GFX9-NODL-NEXT: v_and_b32_e32 v3, s3, v4
2651; GFX9-NODL-NEXT: s_bfe_i32 s2, s2, 0x80000
2652; GFX9-NODL-NEXT: v_lshl_or_b32 v1, v1, 16, v3
2653; GFX9-NODL-NEXT: v_and_b32_e32 v3, s2, v4
2654; GFX9-NODL-NEXT: v_lshl_or_b32 v0, v0, 16, v3
2655; GFX9-NODL-NEXT: v_pk_mul_lo_u16 v3, v0, v1
2656; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
2657; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
2658; GFX9-NODL-NEXT: global_load_ushort v4, v[0:1], off
2659; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
2660; GFX9-NODL-NEXT: v_add_u32_e32 v4, v3, v4
2661; GFX9-NODL-NEXT: v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2662; GFX9-NODL-NEXT: v_add_u32_e32 v3, v3, v2
2663; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2664; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off
2665; GFX9-NODL-NEXT: s_endpgm
2666;
2667; GFX9-DL-LABEL: idot4_acc16_vecMul:
2668; GFX9-DL: ; %bb.0: ; %entry
2669; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2670; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2671; GFX9-DL-NEXT: v_mov_b32_e32 v4, 0xffff
2672; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2673; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
2674; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
2675; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2676; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 16
2677; GFX9-DL-NEXT: s_lshr_b32 s5, s3, 16
2678; GFX9-DL-NEXT: v_ashrrev_i16_e64 v3, 8, s5
2679; GFX9-DL-NEXT: s_bfe_i32 s5, s5, 0x80000
2680; GFX9-DL-NEXT: v_ashrrev_i16_e64 v2, 8, s4
2681; GFX9-DL-NEXT: v_and_b32_e32 v5, s5, v4
2682; GFX9-DL-NEXT: s_bfe_i32 s4, s4, 0x80000
2683; GFX9-DL-NEXT: v_lshl_or_b32 v3, v3, 16, v5
2684; GFX9-DL-NEXT: v_and_b32_e32 v5, s4, v4
2685; GFX9-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v5
2686; GFX9-DL-NEXT: v_ashrrev_i16_e64 v1, 8, s3
2687; GFX9-DL-NEXT: s_bfe_i32 s3, s3, 0x80000
2688; GFX9-DL-NEXT: v_ashrrev_i16_e64 v0, 8, s2
2689; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v3
2690; GFX9-DL-NEXT: v_and_b32_e32 v3, s3, v4
2691; GFX9-DL-NEXT: s_bfe_i32 s2, s2, 0x80000
2692; GFX9-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v3
2693; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v4
2694; GFX9-DL-NEXT: v_lshl_or_b32 v0, v0, 16, v3
2695; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, v0, v1
2696; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
2697; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
2698; GFX9-DL-NEXT: global_load_ushort v4, v[0:1], off
2699; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
2700; GFX9-DL-NEXT: v_add_u32_e32 v4, v3, v4
2701; GFX9-DL-NEXT: v_add_u32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2702; GFX9-DL-NEXT: v_add_u32_e32 v3, v3, v2
2703; GFX9-DL-NEXT: v_add_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2704; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
2705; GFX9-DL-NEXT: s_endpgm
2706 <4 x i8> addrspace(1)* %src2,
2707 i16 addrspace(1)* nocapture %dst) {
2708entry:
 ; Load both packed 4 x i8 operands.
2709 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
2710 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
2711
 ; Widen every lane to i16 with sign extension (signed interpretation).
2712 %cvec1 = sext <4 x i8> %vec1 to <4 x i16>
2713 %cvec2 = sext <4 x i8> %vec2 to <4 x i16>
2714
 ; One vector multiply in i16, then split the result into its four lanes.
2715 %mul = mul <4 x i16> %cvec1, %cvec2
2716 %mul0 = extractelement <4 x i16> %mul, i64 0
2717 %mul1 = extractelement <4 x i16> %mul, i64 1
2718 %mul2 = extractelement <4 x i16> %mul, i64 2
2719 %mul3 = extractelement <4 x i16> %mul, i64 3
2720
 ; Accumulate: start from the i16 already stored at %dst, add lanes 0..3 in order.
2721 %acc = load i16, i16 addrspace(1)* %dst, align 4
2722 %add1 = add i16 %mul0, %acc
2723 %add2 = add i16 %add1, %mul1
2724 %add3 = add i16 %add2, %mul2
2725 %add4 = add i16 %add3, %mul3
2726
 ; Write the accumulated i16 back to %dst.
2727 store i16 %add4, i16 addrspace(1)* %dst, align 4
2728 ret void
2729}
2730
2731; TODO: Support this pattern.
; 4 x i8 dot product with an i8 accumulator, written as a vector mul.
; Loads two <4 x i8> vectors from %src1 and %src2 and multiplies them lane-wise
; without any widening, then adds the four i8 products to the i8 value loaded
; from %dst; the sum is stored back to %dst.
; No v_dot4 instruction appears in any of the expected-output blocks below.
2732define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1,
2733; GFX7-LABEL: udot4_acc8_vecMul:
2734; GFX7: ; %bb.0: ; %entry
2735; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2736; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
2737; GFX7-NEXT: s_mov_b32 s3, 0xf000
2738; GFX7-NEXT: s_mov_b32 s2, -1
2739; GFX7-NEXT: s_movk_i32 s8, 0xff
2740; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2741; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
2742; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
2743; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
2744; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2745; GFX7-NEXT: s_bfe_u32 s6, s4, 0x80008
2746; GFX7-NEXT: s_lshr_b32 s7, s4, 16
2747; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80008
2748; GFX7-NEXT: s_lshr_b32 s11, s5, 16
2749; GFX7-NEXT: s_lshr_b32 s12, s5, 24
2750; GFX7-NEXT: v_mov_b32_e32 v2, s11
2751; GFX7-NEXT: v_mov_b32_e32 v3, s10
2752; GFX7-NEXT: s_lshr_b32 s9, s4, 24
2753; GFX7-NEXT: v_mov_b32_e32 v1, s12
2754; GFX7-NEXT: s_mul_i32 s4, s4, s5
2755; GFX7-NEXT: v_mul_u32_u24_e32 v1, s9, v1
2756; GFX7-NEXT: v_mul_u32_u24_e32 v2, s7, v2
2757; GFX7-NEXT: v_mul_u32_u24_e32 v3, s6, v3
2758; GFX7-NEXT: s_and_b32 s4, s4, s8
2759; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1
2760; GFX7-NEXT: v_and_b32_e32 v2, s8, v2
2761; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3
2762; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
2763; GFX7-NEXT: v_or_b32_e32 v2, s4, v3
2764; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
2765; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
2766; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
2767; GFX7-NEXT: v_lshrrev_b32_e32 v2, 8, v1
2768; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v1
2769; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1
2770; GFX7-NEXT: s_waitcnt vmcnt(0)
2771; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
2772; GFX7-NEXT: v_add_i32_e32 v0, vcc, v2, v0
2773; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0
2774; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0
2775; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
2776; GFX7-NEXT: s_endpgm
2777;
2778; GFX8-LABEL: udot4_acc8_vecMul:
2779; GFX8: ; %bb.0: ; %entry
2780; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2781; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2782; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2783; GFX8-NEXT: v_mov_b32_e32 v0, s0
2784; GFX8-NEXT: v_mov_b32_e32 v1, s1
2785; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
2786; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
2787; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
2788; GFX8-NEXT: s_movk_i32 s0, 0xff
2789; GFX8-NEXT: s_waitcnt lgkmcnt(0)
2790; GFX8-NEXT: s_lshr_b32 s3, s1, 24
2791; GFX8-NEXT: s_lshr_b32 s4, s2, 24
2792; GFX8-NEXT: s_and_b32 s6, s1, s0
2793; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010
2794; GFX8-NEXT: s_and_b32 s0, s2, s0
2795; GFX8-NEXT: v_mov_b32_e32 v3, s1
2796; GFX8-NEXT: v_mov_b32_e32 v4, s2
2797; GFX8-NEXT: v_mul_u32_u24_sdwa v3, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
2798; GFX8-NEXT: v_mov_b32_e32 v4, s0
2799; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80010
2800; GFX8-NEXT: v_mov_b32_e32 v5, s7
2801; GFX8-NEXT: v_mov_b32_e32 v6, s4
2802; GFX8-NEXT: v_mov_b32_e32 v7, s3
2803; GFX8-NEXT: v_mul_u32_u24_e32 v4, s6, v4
2804; GFX8-NEXT: v_mul_u32_u24_e32 v5, s5, v5
2805; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v7, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2806; GFX8-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2807; GFX8-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2808; GFX8-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2809; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v3
2810; GFX8-NEXT: s_waitcnt vmcnt(0)
2811; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
2812; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2
2813; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
2814; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
2815; GFX8-NEXT: flat_store_byte v[0:1], v2
2816; GFX8-NEXT: s_endpgm
2817;
2818; GFX9-NODL-LABEL: udot4_acc8_vecMul:
2819; GFX9-NODL: ; %bb.0: ; %entry
2820; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2821; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2822; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2823; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
2824; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
2825; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
2826; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
2827; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off
2828; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
2829; GFX9-NODL-NEXT: s_lshr_b32 s0, s2, 16
2830; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
2831; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3
2832; GFX9-NODL-NEXT: s_lshr_b32 s1, s3, 16
2833; GFX9-NODL-NEXT: s_lshr_b32 s4, s3, 24
2834; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v3, s2, v3
2835; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v4, s2, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
2836; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s1
2837; GFX9-NODL-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2838; GFX9-NODL-NEXT: s_lshr_b32 s5, s2, 24
2839; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4
2840; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v4, s5, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2841; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v5, s0, v5
2842; GFX9-NODL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2843; GFX9-NODL-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2844; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 8, v3
2845; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
2846; GFX9-NODL-NEXT: v_add_u32_e32 v2, v3, v2
2847; GFX9-NODL-NEXT: v_add_u32_e32 v2, v2, v4
2848; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2849; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2850; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off
2851; GFX9-NODL-NEXT: s_endpgm
2852;
2853; GFX9-DL-LABEL: udot4_acc8_vecMul:
2854; GFX9-DL: ; %bb.0: ; %entry
2855; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2856; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2857; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2858; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
2859; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
2860; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
2861; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
2862; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
2863; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
2864; GFX9-DL-NEXT: s_lshr_b32 s0, s2, 16
2865; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
2866; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3
2867; GFX9-DL-NEXT: s_lshr_b32 s1, s3, 16
2868; GFX9-DL-NEXT: s_lshr_b32 s4, s3, 24
2869; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, s2, v3
2870; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s2, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
2871; GFX9-DL-NEXT: v_mov_b32_e32 v5, s1
2872; GFX9-DL-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2873; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 24
2874; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4
2875; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s5, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
2876; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, s0, v5
2877; GFX9-DL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2878; GFX9-DL-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2879; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v3
2880; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
2881; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2
2882; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v4
2883; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
2884; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
2885; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
2886; GFX9-DL-NEXT: s_endpgm
2887 <4 x i8> addrspace(1)* %src2,
2888 i8 addrspace(1)* nocapture %dst) {
2889entry:
 ; Load both packed 4 x i8 operands.
2890 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
2891 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
2892
 ; Multiply directly in i8 (no zext/sext), then split the result into its four lanes.
2893 %mul = mul <4 x i8> %vec1, %vec2
2894 %mul0 = extractelement <4 x i8> %mul, i64 0
2895 %mul1 = extractelement <4 x i8> %mul, i64 1
2896 %mul2 = extractelement <4 x i8> %mul, i64 2
2897 %mul3 = extractelement <4 x i8> %mul, i64 3
2898
 ; Accumulate: start from the i8 already stored at %dst, add lanes 0..3 in order.
2899 %acc = load i8, i8 addrspace(1)* %dst, align 4
2900 %add1 = add i8 %mul0, %acc
2901 %add2 = add i8 %add1, %mul1
2902 %add3 = add i8 %add2, %mul2
2903 %add4 = add i8 %add3, %mul3
2904
 ; Write the accumulated i8 back to %dst.
2905 store i8 %add4, i8 addrspace(1)* %dst, align 4
2906 ret void
2907}