blob: d3c30607f8c6aee94d3b720f6c9d52a79b55d0e8 [file] [log] [blame]
Farhana Aleen9250c922018-08-29 16:31:18 +00001; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX789 %s
2; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX789 %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX789 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-DL %s
5
6; GCN-LABEL: {{^}}udot4_acc32:
7; GCN: ; %bb.0: ; %entry
8; GCN-NEXT: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}}
9; GCN-NEXT: s_load_dwordx2 s{{\[}}[[SRC2_LO:[0-9]+]]:[[SRC2_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}}
10
11; GFX789-NEXT: s_movk_i32 s{{[0-9]+}}, 0xff
12; GFX789: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0
13; GFX789-NEXT: s_load_dword [[S1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
14; GFX789-NEXT: s_load_dword [[S2:s[0-9]+]], s{{\[}}[[SRC2_LO]]:[[SRC2_HI]]{{\]}}, 0x0
15; GFX789-NEXT: s_waitcnt lgkmcnt(0)
16; GFX789-NEXT: s_and_b32 [[V1E1:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}}
17; GFX789-NEXT: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
18; GFX789-NEXT: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008
19; GFX789-NEXT: v_mov_b32_e32 [[V2E1:v[0-9]+]], s{{[0-9]+}}
20; GFX789-NEXT: v_mov_b32_e32 [[SRC2:v[0-9]+]], s{{[0-9]+}}
21; GFX789-NEXT: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80010
22; GFX789-NEXT: v_mad_u32_u24 [[MAD1:v[0-9]+]], [[V1E1]], [[V2E1]], [[SRC2]]
23
24; GFX789-NEXT: s_bfe_u32 [[V1E2:s[0-9]+]], s{{[0-9]+}}, 0x80008
25; GFX789-NEXT: v_mov_b32_e32 [[V2E2:v[0-9]+]], s{{[0-9]+}}
26; GFX789-NEXT: s_bfe_u32 [[V1E3:s[0-9]+]], s{{[0-9]+}}, 0x80010
27; GFX789-NEXT: v_mad_u32_u24 [[MAD2:v[0-9]+]], [[V1E2]], [[V2E2]], [[MAD1]]
28
29; GFX789-NEXT: v_mov_b32_e32 [[V2E3:v[0-9]+]], s{{[0-9]+}}
30; GFX789-NEXT: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 24
31; GFX789-NEXT: v_mad_u32_u24 [[MAD3:v[0-9]+]], [[V1E3]], [[V2E3]], [[MAD2]]
32
33; GFX789-NEXT: s_lshr_b32 [[V1E4:s[0-9]+]], s{{[0-9]+}}, 24
34; GFX789-NEXT: v_mov_b32_e32 [[V2E4:v[0-9]+]], s{{[0-9]+}}
35; GFX789-NEXT: v_mad_u32_u24 [[RES:v[0-9]+]], [[V1E4]], [[V2E4]], [[MAD3]]
36; GFX789: {{buffer|flat|global}}_store_dword
37; GFX789-NEXT: s_endpgm
38
39; GCN-DL: s_waitcnt lgkmcnt(0)
40; GCN-DL-NEXT: s_load_dword [[SRC0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
41; GCN-DL-NEXT: s_load_dword [[S1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
42; GCN-DL-NEXT: s_load_dword [[S2:s[0-9]+]], s{{\[}}[[SRC2_LO]]:[[SRC2_HI]]{{\]}}, 0x0
43; GCN-DL-NEXT: v_mov_b32_e32 v[[STLO:[0-9]+]], s[[SRC2_LO]]
44; GCN-DL-NEXT: v_mov_b32_e32 v[[STHI:[0-9]+]], s[[SRC2_HI]]
45; GCN-DL-NEXT: s_waitcnt lgkmcnt(0)
46; GCN-DL-NEXT: v_mov_b32_e32 [[SRC1:v[0-9]+]], [[S1]]
47; GCN-DL-NEXT: v_mov_b32_e32 [[SRC2:v[0-9]+]], [[S2]]
48; GCN-DL-NEXT: v_dot4_u32_u8 [[DOT:v[0-9]+]], [[SRC0]], [[SRC1]], [[SRC2]]
49; GCN-DL-NEXT: global_store_dword v{{\[}}[[STLO]]:[[STHI]]{{\]}}, [[DOT]], off
50; GCN-DL-NEXT: s_endpgm
51
52
; udot4_acc32: unsigned four-lane i8 dot product accumulated in i32.
; Both <4 x i8> operands are loaded through addrspace(1) pointers, each lane
; is zero-extended to i32 and multiplied pairwise, and the four products are
; added to the i32 value read from %dst before the sum is stored back to %dst.
; The check lines preceding this definition expect a single v_dot4_u32_u8 on
; gfx906 and a chain of four v_mad_u32_u24 on gfx700/gfx803/gfx900.
53define amdgpu_kernel void @udot4_acc32(<4 x i8> addrspace(1)* %src1,
54 <4 x i8> addrspace(1)* %src2,
55 i32 addrspace(1)* nocapture %dst) {
56entry:
57 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
58 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
59
; Lane 0: zero-extend both bytes to i32 and multiply.
60 %v1e0 = extractelement <4 x i8> %vec1, i64 0
61 %cv1e0 = zext i8 %v1e0 to i32
62 %v2e0 = extractelement <4 x i8> %vec2, i64 0
63 %cv2e0 = zext i8 %v2e0 to i32
64 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
65
; Lane 1.
66 %v1e1 = extractelement <4 x i8> %vec1, i64 1
67 %cv1e1 = zext i8 %v1e1 to i32
68 %v2e1 = extractelement <4 x i8> %vec2, i64 1
69 %cv2e1 = zext i8 %v2e1 to i32
70 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
71
; Lane 2.
72 %v1e2 = extractelement <4 x i8> %vec1, i64 2
73 %cv1e2 = zext i8 %v1e2 to i32
74 %v2e2 = extractelement <4 x i8> %vec2, i64 2
75 %cv2e2 = zext i8 %v2e2 to i32
76 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
77
; Lane 3.
78 %v1e3 = extractelement <4 x i8> %vec1, i64 3
79 %cv1e3 = zext i8 %v1e3 to i32
80 %v2e3 = extractelement <4 x i8> %vec2, i64 3
81 %cv2e3 = zext i8 %v2e3 to i32
82 %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
83
; Accumulate in lane order, starting from the value already stored at %dst.
84 %acc = load i32, i32 addrspace(1)* %dst, align 4
85 %mad1 = add i32 %mul1, %acc
86 %mad2 = add i32 %mad1, %mul2
87 %mad3 = add i32 %mad2, %mul3
88 %mad4 = add i32 %mad3, %mul4
89
90 store i32 %mad4, i32 addrspace(1)* %dst, align 4
91 ret void
92}
93
; udot4_acc16: unsigned four-lane i8 dot product accumulated in i16.
; Each lane of the two <4 x i8> operands is zero-extended to i16 and
; multiplied pairwise; the products are added in lane order to the i16 value
; read from %dst and the sum is stored back to %dst.
; The checks embedded in the signature expect a v_dot4_u32_u8 followed by
; global_store_short on gfx906, and four v_mad_u32_u24 followed by a short
; store on gfx700/gfx803/gfx900.
; Of the two "s_lshr_b32 ..., 24" matches in the pre-gfx906 sequence, only the
; second is bound (as V1E4, consumed by the last v_mad_u32_u24); the first is
; matched anonymously so that the variable name has a single definition.
94define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
95; GCN-LABEL: udot4_acc16:
96; GCN: ; %bb.0: ; %entry
97; GCN-NEXT: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}}
98; GCN-NEXT: s_load_dwordx2 s{{\[}}[[SRC2_LO:[0-9]+]]:[[SRC2_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}}
99
100; GFX789: {{buffer|flat|global}}_load_ushort [[SRC2:v[0-9]+]]
101; GFX789: s_load_dword
102; GFX789: s_waitcnt lgkmcnt(0)
103; GFX789: s_and_b32
104; GFX789: s_bfe_u32 [[V1E2:s[0-9]+]], s{{[0-9]+}}, 0x80008
105; GFX789: s_bfe_u32
106; GFX789: s_bfe_u32
107; GFX789-NEXT: v_mov_b32_e32 [[V2E2:v[0-9]+]], s{{[0-9]+}}
108; GFX789-NEXT: s_bfe_u32 [[V1E3:s[0-9]+]], s{{[0-9]+}}, 0x80010
109; GFX789-NEXT: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 24
110; GFX789-NEXT: v_mov_b32_e32 [[V2E3:v[0-9]+]]
111; GFX789-NEXT: s_lshr_b32 [[V1E4:s[0-9]+]], s{{[0-9]+}}, 24
112; GFX789-NEXT: s_waitcnt vmcnt(0)
113; GFX789-NEXT: v_mad_u32_u24 [[MAD1:v[0-9]+]], {{s[0-9]+}}, {{v[0-9]+}}, [[SRC2]]
114; GFX789-NEXT: v_mad_u32_u24 [[MAD2:v[0-9]+]], {{s[0-9]+}}, [[V2E2]], [[MAD1]]
115; GFX789-NEXT: v_mad_u32_u24 [[MAD3:v[0-9]+]], [[V1E3]], [[V2E3]], [[MAD2]]
116; GFX789-NEXT: v_mov_b32_e32 [[V2E4:v[0-9]+]], s{{[0-9]+}}
117; GFX789-NEXT: v_mad_u32_u24 [[MAD4:v[0-9]+]], [[V1E4]], [[V2E4]], [[MAD3]]
118; GFX789-NEXT: {{buffer|flat|global}}_store_short
119; GFX789-NEXT: s_endpgm
120
121; GCN-DL: s_waitcnt lgkmcnt(0)
122; GCN-DL-NEXT: s_load_dword [[SRC0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
123; GCN-DL-NEXT: s_load_dword [[S1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
124; GCN-DL-NEXT: v_mov_b32_e32 v[[STLO:[0-9]+]], s[[SRC2_LO]]
125; GCN-DL-NEXT: v_mov_b32_e32 v[[STHI:[0-9]+]], s[[SRC2_HI]]
126; GCN-DL-NEXT: global_load_ushort [[SRC2:v[0-9]+]], v{{\[}}[[STLO]]:[[STHI]]{{\]}}, off
127; GCN-DL-NEXT: s_waitcnt lgkmcnt(0)
128; GCN-DL-NEXT: v_mov_b32_e32 [[SRC1:v[0-9]+]], [[S1]]
129; GCN-DL-NEXT: s_waitcnt vmcnt(0)
130; GCN-DL-NEXT: v_dot4_u32_u8 [[DOT:v[0-9]+]], [[SRC0]], [[SRC1]], [[SRC2]]
131; GCN-DL-NEXT: global_store_short v{{\[}}[[STLO]]:[[STHI]]{{\]}}, [[DOT]], off
132; GCN-DL-NEXT: s_endpgm
133 <4 x i8> addrspace(1)* %src2,
134 i16 addrspace(1)* nocapture %dst) {
135entry:
136 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
137 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
138
; Lane 0: zero-extend both bytes to i16 and multiply.
139 %v1e0 = extractelement <4 x i8> %vec1, i64 0
140 %cv1e0 = zext i8 %v1e0 to i16
141 %v2e0 = extractelement <4 x i8> %vec2, i64 0
142 %cv2e0 = zext i8 %v2e0 to i16
143 %mul1 = mul nuw nsw i16 %cv1e0, %cv2e0
144
; Lane 1.
145 %v1e1 = extractelement <4 x i8> %vec1, i64 1
146 %cv1e1 = zext i8 %v1e1 to i16
147 %v2e1 = extractelement <4 x i8> %vec2, i64 1
148 %cv2e1 = zext i8 %v2e1 to i16
149 %mul2 = mul nuw nsw i16 %cv1e1, %cv2e1
150
; Lane 2.
151 %v1e2 = extractelement <4 x i8> %vec1, i64 2
152 %cv1e2 = zext i8 %v1e2 to i16
153 %v2e2 = extractelement <4 x i8> %vec2, i64 2
154 %cv2e2 = zext i8 %v2e2 to i16
155 %mul3 = mul nuw nsw i16 %cv1e2, %cv2e2
156
; Lane 3.
157 %v1e3 = extractelement <4 x i8> %vec1, i64 3
158 %cv1e3 = zext i8 %v1e3 to i16
159 %v2e3 = extractelement <4 x i8> %vec2, i64 3
160 %cv2e3 = zext i8 %v2e3 to i16
161 %mul4 = mul nuw nsw i16 %cv1e3, %cv2e3
162
; Accumulate in lane order, starting from the i16 already stored at %dst.
163 %acc = load i16, i16 addrspace(1)* %dst, align 2
164 %mad1 = add i16 %mul1, %acc
165 %mad2 = add i16 %mad1, %mul2
166 %mad3 = add i16 %mad2, %mul3
167 %mad4 = add i16 %mad3, %mul4
168
169 store i16 %mad4, i16 addrspace(1)* %dst, align 2
170 ret void
171}
172
; udot4_acc8: unsigned four-lane i8 dot product accumulated in i8.
; The lanes of the two <4 x i8> operands are multiplied directly in i8 (no
; extension), and the products are added in lane order to the i8 value read
; from %dst; the sum is stored back to %dst.
; The checks embedded in the signature expect a v_dot4_u32_u8 followed by
; global_store_byte on gfx906, and four v_mad_u32_u24 followed by a byte
; store on gfx700/gfx803/gfx900.
173define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1,
174; GCN-LABEL: udot4_acc8:
175; GCN: ; %bb.0: ; %entry
176; GCN-NEXT: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}}
177; GCN-NEXT: s_load_dwordx2 s{{\[}}[[SRC2_LO:[0-9]+]]:[[SRC2_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}}
178; GFX789: s_movk_i32 s{{[0-9]+}}, 0xff
179; GFX789: s_waitcnt lgkmcnt(0)
180; GFX789: {{buffer|flat|global}}_load_ubyte [[SRC2:v[0-9]+]]
181; GFX789: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
182; GFX789: s_waitcnt lgkmcnt(0)
183; GFX789: s_bfe_u32 [[V1E2:s[0-9]+]], s{{[0-9]+}}, 0x80008
184; GFX789: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
185; GFX789: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008
186; GFX789: v_mov_b32_e32 [[V2E1:v[0-9]+]]
187; GFX789: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80010
188; GFX789-NEXT: v_mov_b32_e32 [[V2E2:v[0-9]+]]
189; GFX789-NEXT: s_bfe_u32 [[V1E3:s[0-9]+]], s{{[0-9]+}}, 0x80010
190; GFX789-NEXT: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 24
191; GFX789-NEXT: v_mov_b32_e32 [[V2E3:v[0-9]+]]
192; GFX789-NEXT: s_lshr_b32 [[V1E4:s[0-9]+]], s{{[0-9]+}}, 24
193; GFX789-NEXT: s_waitcnt vmcnt(0)
194; GFX789-NEXT: v_mad_u32_u24 [[MAD1:v[0-9]+]], s{{[0-9]+}}, [[V2E1]], [[SRC2]]
195; GFX789-NEXT: v_mad_u32_u24 [[MAD2:v[0-9]+]], [[V1E2]], [[V2E2]], [[MAD1]]
196; GFX789-NEXT: v_mad_u32_u24 [[MAD3:v[0-9]+]], [[V1E3]], [[V2E3]], [[MAD2]]
197; GFX789-NEXT: v_mov_b32_e32 [[V2E4:v[0-9]+]]
198; GFX789-NEXT: v_mad_u32_u24 [[MAD4:v[0-9]+]], [[V1E4]], [[V2E4]], [[MAD3]]
199; GFX789-NEXT: {{buffer|flat|global}}_store_byte
200; GFX789-NEXT: s_endpgm
201
202; GCN-DL: s_waitcnt lgkmcnt(0)
203; GCN-DL-NEXT: s_load_dword [[SRC0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}
204; GCN-DL-NEXT: s_load_dword [[S1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}
205; GCN-DL-NEXT: v_mov_b32_e32 v[[STLO:[0-9]+]], s[[SRC2_LO]]
206; GCN-DL-NEXT: v_mov_b32_e32 v[[STHI:[0-9]+]], s[[SRC2_HI]]
207; GCN-DL-NEXT: global_load_ubyte [[SRC2:v[0-9]+]], v{{\[}}[[STLO]]:[[STHI]]{{\]}}, off
208; GCN-DL-NEXT: s_waitcnt lgkmcnt(0)
209; GCN-DL-NEXT: v_mov_b32_e32 [[SRC1:v[0-9]+]], [[S1]]
210; GCN-DL-NEXT: s_waitcnt vmcnt(0)
211; GCN-DL-NEXT: v_dot4_u32_u8 [[DOT:v[0-9]+]], [[SRC0]], [[SRC1]], [[SRC2]]
212; GCN-DL-NEXT: global_store_byte v{{\[}}[[STLO]]:[[STHI]]{{\]}}, [[DOT]], off
213; GCN-DL-NEXT: s_endpgm
214 <4 x i8> addrspace(1)* %src2,
215 i8 addrspace(1)* nocapture %dst) {
216entry:
217 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
218 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
219
; Lane 0 product, computed in i8.
220 %v1e0 = extractelement <4 x i8> %vec1, i64 0
221 %v2e0 = extractelement <4 x i8> %vec2, i64 0
222 %mul1 = mul nuw nsw i8 %v1e0, %v2e0
223
; Lane 1 product.
224 %v1e1 = extractelement <4 x i8> %vec1, i64 1
225 %v2e1 = extractelement <4 x i8> %vec2, i64 1
226 %mul2 = mul nuw nsw i8 %v1e1, %v2e1
227
; Lane 2 product.
228 %v1e2 = extractelement <4 x i8> %vec1, i64 2
229 %v2e2 = extractelement <4 x i8> %vec2, i64 2
230 %mul3 = mul nuw nsw i8 %v1e2, %v2e2
231
; Lane 3 product.
232 %v1e3 = extractelement <4 x i8> %vec1, i64 3
233 %v2e3 = extractelement <4 x i8> %vec2, i64 3
234 %mul4 = mul nuw nsw i8 %v1e3, %v2e3
235
; Accumulate in lane order, starting from the byte already stored at %dst.
236 %acc = load i8, i8 addrspace(1)* %dst, align 2
237 %mad1 = add i8 %mul1, %acc
238 %mad2 = add i8 %mad1, %mul2
239 %mad3 = add i8 %mad2, %mul3
240 %mad4 = add i8 %mad3, %mul4
241
242 store i8 %mad4, i8 addrspace(1)* %dst, align 2
243 ret void
244}
245
246; TODO: Generate udot4?
; udot2_8: two-lane partial dot product over <4 x i8> operands, in i8.
; Only lanes 0 and 1 of each vector are multiplied; the two products are added
; to the i8 value read from %dst and the sum is stored back to %dst.
; The checks embedded in the signature use the prefix shared by all four RUN
; lines and expect two v_mad_u32_u24 followed by a byte store on every target.
247define amdgpu_kernel void @udot2_8(<4 x i8> addrspace(1)* %src1,
248; GCN-LABEL: udot2_8:
249; GCN-NEXT: ; %bb.0: ; %entry
250; GCN-NEXT: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}}
251; GCN-NEXT: s_load_dwordx2 s{{\[}}[[SRC2_LO:[0-9]+]]:[[SRC2_HI:[0-9]+]]{{\]}}
252; GCN: s_movk_i32 [[FF:s[0-9]+]], 0xff
253; GCN-NEXT: s_waitcnt lgkmcnt(0)
254; GCN: s_load_dword [[V1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
255; GCN: s_load_dword [[V2:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
256; GCN-NEXT: s_waitcnt lgkmcnt(0)
257; GCN: s_and_b32 [[V1E1:s[0-9]+]], [[V1]], [[FF]]
258; GCN: s_bfe_u32 [[VE2:s[0-9]+]], {{s[0-9]+}}, 0x80008
259; GCN: s_bfe_u32 [[V1E2:s[0-9]+]], {{s[0-9]+}}, 0x80008
260; GCN-NEXT: s_waitcnt vmcnt(0)
261; GCN-NEXT: v_mad_u32_u24 [[MAD1:v[0-9]+]], [[V1E1]]
262; GCN-NEXT: v_mov_b32_e32 [[V2E2:v[0-9]+]]
263; GCN-NEXT: v_mad_u32_u24 [[MAD2:v[0-9]+]], {{s[0-9]+}}, [[V2E2]], [[MAD1]]
264; GCN-NEXT: {{buffer|flat|global}}_store_byte
265; GCN-NEXT: s_endpgm
266 <4 x i8> addrspace(1)* %src2,
267 i8 addrspace(1)* nocapture %dst) {
268entry:
269 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
270 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
271
; Lane 0 product, computed in i8.
272 %v1e0 = extractelement <4 x i8> %vec1, i64 0
273 %v2e0 = extractelement <4 x i8> %vec2, i64 0
274 %mul1 = mul nuw nsw i8 %v1e0, %v2e0
275
; Lane 1 product; lanes 2 and 3 are never read.
276 %v1e1 = extractelement <4 x i8> %vec1, i64 1
277 %v2e1 = extractelement <4 x i8> %vec2, i64 1
278 %mul2 = mul nuw nsw i8 %v1e1, %v2e1
279
; Accumulate both products onto the byte already stored at %dst.
280 %acc = load i8, i8 addrspace(1)* %dst, align 2
281 %mad1 = add i8 %mul1, %acc
282 %mad2 = add i8 %mad1, %mul2
283 store i8 %mad2, i8 addrspace(1)* %dst, align 2
284 ret void
285}
286
; udot4_CommutationInsideMAD: four-lane i8 dot product accumulated in i8 with
; commuted operands inside each multiply-add step.
; Every mul lists the %vec2 lane first, the first add lists %acc first, and
; the remaining adds list the new product before the running sum. The products
; are still accumulated in lane order (mul1, mul2, mul3, mul4).
; The checks embedded in the signature expect a v_dot4_u32_u8 on gfx906 and
; four v_mad_u32_u24 on gfx700/gfx803/gfx900, each followed by a byte store.
287define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %src1,
288; GCN-LABEL: udot4_CommutationInsideMAD:
289; GCN: ; %bb.0: ; %entry
290; GCN-NEXT: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}}
291; GCN-NEXT: s_load_dwordx2 s{{\[}}[[SRC2_LO:[0-9]+]]:[[SRC2_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}}
292; GFX789: s_waitcnt lgkmcnt(0)
293; GFX789: {{buffer|flat|global}}_load_ubyte [[SRC2:v[0-9]+]]
294; GFX789: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
295; GFX789: s_waitcnt lgkmcnt(0)
296; GFX789: s_bfe_u32
297; GFX789: s_bfe_u32
298; GFX789-NEXT: s_bfe_u32 [[V1E2:s[0-9]+]], s{{[0-9]+}}, 0x80008
299; GFX789-NEXT: v_mov_b32_e32 [[V2E2:v[0-9]+]]
300; GFX789-NEXT: s_bfe_u32 [[V1E3:s[0-9]+]], s{{[0-9]+}}, 0x80010
301; GFX789-NEXT: s_lshr_b32 [[VE4:s[0-9]+]], s{{[0-9]+}}, 24
302; GFX789-NEXT: v_mov_b32_e32 [[V2E3:v[0-9]+]]
303; GFX789-NEXT: s_lshr_b32 [[V1E4:s[0-9]+]], s{{[0-9]+}}, 24
304; GFX789-NEXT: s_waitcnt vmcnt(0)
305
306; GFX789-NEXT: v_mad_u32_u24 [[MAD1:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, [[SRC2]]
307; GFX789-NEXT: v_mad_u32_u24 [[MAD2:v[0-9]+]], [[V1E2]], [[V2E2]], [[MAD1]]
308; GFX789-NEXT: v_mad_u32_u24 [[MAD3:v[0-9]+]], [[V1E3]], [[V2E3]], [[MAD2]]
309; GFX789-NEXT: v_mov_b32_e32 [[V2E4:v[0-9]+]], [[VE4]]
310; GFX789-NEXT: v_mad_u32_u24 [[MAD4:v[0-9]+]], [[V1E4]], [[V2E4]], [[MAD3]]
311; GFX789-NEXT: {{buffer|flat|global}}_store_byte
312; GFX789-NEXT: s_endpgm
313
314; GCN-DL: s_waitcnt lgkmcnt(0)
315; GCN-DL-NEXT: s_load_dword [[S1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}
316; GCN-DL-NEXT: s_load_dword [[SRC0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}
317; GCN-DL-NEXT: v_mov_b32_e32 v[[STLO:[0-9]+]], s[[SRC2_LO]]
318; GCN-DL-NEXT: v_mov_b32_e32 v[[STHI:[0-9]+]], s[[SRC2_HI]]
319; GCN-DL-NEXT: global_load_ubyte [[SRC2:v[0-9]+]], v{{\[}}[[STLO]]:[[STHI]]{{\]}}, off
320; GCN-DL-NEXT: s_waitcnt lgkmcnt(0)
321; GCN-DL-NEXT: v_mov_b32_e32 [[SRC1:v[0-9]+]], [[S1]]
322; GCN-DL-NEXT: s_waitcnt vmcnt(0)
323; GCN-DL-NEXT: v_dot4_u32_u8 [[DOT:v[0-9]+]], [[SRC0]], [[SRC1]], [[SRC2]]
324; GCN-DL-NEXT: global_store_byte v{{\[}}[[STLO]]:[[STHI]]{{\]}}, [[DOT]], off
325; GCN-DL-NEXT: s_endpgm
326 <4 x i8> addrspace(1)* %src2,
327 i8 addrspace(1)* nocapture %dst) {
328entry:
329 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
330 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
331
; Lane 0 product; note the %vec2 lane is the first mul operand.
332 %v1e0 = extractelement <4 x i8> %vec1, i64 0
333 %v2e0 = extractelement <4 x i8> %vec2, i64 0
334 %mul1 = mul nuw nsw i8 %v2e0, %v1e0
335
; Lane 1 product.
336 %v1e1 = extractelement <4 x i8> %vec1, i64 1
337 %v2e1 = extractelement <4 x i8> %vec2, i64 1
338 %mul2 = mul nuw nsw i8 %v2e1, %v1e1
339
; Lane 2 product.
340 %v1e2 = extractelement <4 x i8> %vec1, i64 2
341 %v2e2 = extractelement <4 x i8> %vec2, i64 2
342 %mul3 = mul nuw nsw i8 %v2e2, %v1e2
343
; Lane 3 product.
344 %v1e3 = extractelement <4 x i8> %vec1, i64 3
345 %v2e3 = extractelement <4 x i8> %vec2, i64 3
346 %mul4 = mul nuw nsw i8 %v2e3, %v1e3
347
; Lane-order accumulation with the add operands commuted.
348 %acc = load i8, i8 addrspace(1)* %dst, align 2
349 %mad1 = add i8 %acc, %mul1
350 %mad2 = add i8 %mul2, %mad1
351 %mad3 = add i8 %mul3, %mad2
352 %mad4 = add i8 %mul4, %mad3
353
354 store i8 %mad4, i8 addrspace(1)* %dst, align 2
355 ret void
356}
357
358; TODO: Support commutation across the adds.
; udot4_CommutationAccrossMADs: four-lane i8 dot product accumulated in i8
; where the products are added out of lane order.
; The running sum takes %mul2 (lane 1) first and %mul1 (lane 0) second, then
; %mul3 and %mul4; every mul also lists the %vec2 lane first.
; The checks embedded in the signature use the prefix shared by all four RUN
; lines and expect four v_mad_u32_u24 followed by a byte store on every target.
359define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* %src1,
360; GCN-LABEL: udot4_CommutationAccrossMADs:
361; GCN: ; %bb.0: ; %entry
362; GCN-NEXT: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}}
363; GCN-NEXT: s_load_dwordx2 s{{\[}}[[SRC2_LO:[0-9]+]]:[[SRC2_HI:[0-9]+]]{{\]}}
364; GCN: s_waitcnt lgkmcnt(0)
365; GCN: {{buffer|flat|global}}_load_ubyte [[SRC2:v[0-9]+]]
366; GCN: s_load_dword [[V2:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
367; GCN: s_waitcnt lgkmcnt(0)
368; GCN: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80008
369; GCN: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80008
370; GCN: v_mov_b32_e32 [[V2E1:v[0-9]+]]
371; GCN: s_bfe_u32 [[V1E3:s[0-9]+]], {{s[0-9]+}}, 0x80010
372; GCN: s_lshr_b32 [[VE4:s[0-9]+]], {{s[0-9]+}}, 24
373; GCN-NEXT: v_mov_b32_e32 [[V2E3:v[0-9]+]], {{s[0-9]+}}
374; GCN-NEXT: s_lshr_b32 [[V1E4:s[0-9]+]], {{s[0-9]+}}, 24
375; GCN-NEXT: s_waitcnt vmcnt(0)
376; GCN-NEXT: v_mad_u32_u24 [[MAD1:v[0-9]+]], {{s[0-9]+}}, [[V2E1]], [[SRC2]]
377; GCN-NEXT: v_mad_u32_u24 [[MAD2:v[0-9]+]], {{s[0-9]+}}, {{v[0-9]+}}, [[MAD1]]
378; GCN-NEXT: v_mad_u32_u24 [[MAD3:v[0-9]+]], {{s[0-9]+}}, {{v[0-9]+}}, [[MAD2]]
379; GCN-NEXT: v_mov_b32_e32 [[V2E4:v[0-9]+]], [[VE4]]
380; GCN-NEXT: v_mad_u32_u24 [[MAD4:v[0-9]+]], [[V1E4]], [[V2E4]], [[MAD3]]
381; GCN-NEXT: {{buffer|flat|global}}_store_byte
382; GCN-NEXT: s_endpgm
383 <4 x i8> addrspace(1)* %src2,
384 i8 addrspace(1)* nocapture %dst) {
385entry:
386 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
387 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
388
; Lane 0 product, computed in i8.
389 %v1e0 = extractelement <4 x i8> %vec1, i64 0
390 %v2e0 = extractelement <4 x i8> %vec2, i64 0
391 %mul1 = mul nuw nsw i8 %v2e0, %v1e0
392
; Lane 1 product.
393 %v1e1 = extractelement <4 x i8> %vec1, i64 1
394 %v2e1 = extractelement <4 x i8> %vec2, i64 1
395 %mul2 = mul nuw nsw i8 %v2e1, %v1e1
396
; Lane 2 product.
397 %v1e2 = extractelement <4 x i8> %vec1, i64 2
398 %v2e2 = extractelement <4 x i8> %vec2, i64 2
399 %mul3 = mul nuw nsw i8 %v2e2, %v1e2
400
; Lane 3 product.
401 %v1e3 = extractelement <4 x i8> %vec1, i64 3
402 %v2e3 = extractelement <4 x i8> %vec2, i64 3
403 %mul4 = mul nuw nsw i8 %v2e3, %v1e3
404
; Accumulation order is mul2, mul1, mul3, mul4: lanes 0 and 1 are swapped.
405 %acc = load i8, i8 addrspace(1)* %dst, align 2
406 %mad1 = add i8 %acc, %mul2
407 %mad2 = add i8 %mad1, %mul1
408 %mad3 = add i8 %mad2, %mul3
409 %mad4 = add i8 %mad3, %mul4
410
411 store i8 %mad4, i8 addrspace(1)* %dst, align 2
412 ret void
413}
414
; udot4_multiuse_mul1: four-lane i8 dot product in i32 where the lane-0
; product has two uses.
; Each lane is zero-extended to i32 and multiplied pairwise; %mul1 is added
; once to %acc (as %add) and a second time further down the chain (as %add2),
; so the stored sum contains the lane-0 product twice.
; The checks embedded in the signature use the prefix shared by all four RUN
; lines and expect five v_mad_u32_u24 followed by a dword store on every
; target; the third v_mad_u32_u24 reuses the lane-0 operands V1E1 and V2E1.
; Scalar captures for the %vec2 lanes are named SV2E1..SV2E4, one name per
; extracted lane.
415define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
416; GCN-LABEL: udot4_multiuse_mul1:
417; GCN: ; %bb.0: ; %entry
418; GCN-NEXT: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}}
419; GCN-NEXT: s_load_dwordx2 s{{\[}}[[SRC2_LO:[0-9]+]]:[[SRC2_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}}
420; GCN-NEXT: s_movk_i32 [[FF:s[0-9]+]], 0xff
421; GCN: s_load_dword [[S0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
422; GCN-NEXT: s_load_dword [[S1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
423; GCN-NEXT: s_load_dword [[S2:s[0-9]+]], s{{\[}}[[SRC2_LO]]:[[SRC2_HI]]{{\]}}, 0x0
424; GCN-NEXT: s_waitcnt lgkmcnt(0)
425; GCN-NEXT: s_and_b32 [[V1E1:s[0-9]+]], [[S0]], [[FF]]
426; GCN-NEXT: s_and_b32 [[SV2E1:s[0-9]+]], [[S1]], [[FF]]
427; GCN-NEXT: s_bfe_u32 [[SV2E2:s[0-9]+]], [[S1]], 0x80008
428; GCN-NEXT: v_mov_b32_e32 [[V2E1:v[0-9]+]], [[SV2E1]]
429; GCN-NEXT: v_mov_b32_e32 [[SRC2:v[0-9]+]], [[S2]]
430; GCN-NEXT: s_bfe_u32 [[V1E2:s[0-9]+]], [[S0]], 0x80008
431; GCN-NEXT: v_mad_u32_u24 [[MAD1:v[0-9]+]], [[V1E1]], [[V2E1]], [[SRC2]]
432; GCN-NEXT: v_mov_b32_e32 [[V2E2:v[0-9]+]], [[SV2E2]]
433; GCN-NEXT: s_bfe_u32 [[SV2E3:s[0-9]+]], [[S1]], 0x80010
434; GCN-NEXT: v_mad_u32_u24 [[MAD2:v[0-9]+]], [[V1E2]], [[V2E2]], [[MAD1]]
435; GCN-NEXT: s_bfe_u32 [[V1E3:s[0-9]+]], [[S0]], 0x80010
436; GCN-NEXT: v_mad_u32_u24 [[MAD3:v[0-9]+]], [[V1E1]], [[V2E1]], [[MAD2]]
437; GCN-NEXT: v_mov_b32_e32 [[V2E3:v[0-9]+]], [[SV2E3]]
438; GCN-NEXT: s_lshr_b32 [[SV2E4:s[0-9]+]], [[S1]], 24
439; GCN-NEXT: v_mad_u32_u24 [[MAD4:v[0-9]+]], [[V1E3]], [[V2E3]], [[MAD3]]
440; GCN-NEXT: s_lshr_b32 [[V1E4:s[0-9]+]], [[S0]], 24
441; GCN-NEXT: v_mov_b32_e32 [[V2E4:v[0-9]+]]
442; GCN-NEXT: v_mad_u32_u24 [[MAD5:v[0-9]+]], [[V1E4]], [[V2E4]], [[MAD4]]
443; GCN: {{buffer|flat|global}}_store_dword
444; GCN-NEXT: s_endpgm
445
446
447 <4 x i8> addrspace(1)* %src2,
448 i32 addrspace(1)* nocapture %dst) {
449entry:
450 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
451 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
452
; Lane 0: zero-extend both bytes to i32 and multiply.
453 %v1e0 = extractelement <4 x i8> %vec1, i64 0
454 %cv1e0 = zext i8 %v1e0 to i32
455 %v2e0 = extractelement <4 x i8> %vec2, i64 0
456 %cv2e0 = zext i8 %v2e0 to i32
457 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
458
; Lane 1.
459 %v1e1 = extractelement <4 x i8> %vec1, i64 1
460 %cv1e1 = zext i8 %v1e1 to i32
461 %v2e1 = extractelement <4 x i8> %vec2, i64 1
462 %cv2e1 = zext i8 %v2e1 to i32
463 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
464
; Lane 2.
465 %v1e2 = extractelement <4 x i8> %vec1, i64 2
466 %cv1e2 = zext i8 %v1e2 to i32
467 %v2e2 = extractelement <4 x i8> %vec2, i64 2
468 %cv2e2 = zext i8 %v2e2 to i32
469 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
470
; Lane 3.
471 %v1e3 = extractelement <4 x i8> %vec1, i64 3
472 %cv1e3 = zext i8 %v1e3 to i32
473 %v2e3 = extractelement <4 x i8> %vec2, i64 3
474 %cv2e3 = zext i8 %v2e3 to i32
475 %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
476
; %mul1 is consumed twice: by %add and again by %add2.
477 %acc = load i32, i32 addrspace(1)* %dst, align 4
478 %add = add i32 %mul1, %acc
479 %add1 = add i32 %mul2, %add
480 %add2 = add i32 %add1, %mul1
481 %add3 = add i32 %add2, %mul3
482 %add4 = add i32 %add3, %mul4
483
484 store i32 %add4, i32 addrspace(1)* %dst, align 4
485 ret void
486}
487
; udot4_multiuse_add1: four-lane i8 dot product in i32 where an intermediate
; sum has two uses.
; Each lane is zero-extended to i32 and multiplied pairwise. %add1 (the lane-1
; product plus %acc) feeds both the main chain (%add2..%add4) and a side value
; %add = %add1 + %acc, which is added to the end of the chain to form %res.
; The checks embedded in the signature use the prefix shared by all four RUN
; lines and expect four v_mad_u32_u24 plus two v_add instructions, followed by
; a dword store, on every target.
488define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1,
489; GCN-LABEL: udot4_multiuse_add1:
490; GCN: ; %bb.0: ; %entry
491; GCN-NEXT: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}}
492; GCN-NEXT: s_load_dwordx2 s{{\[}}[[SRC2_LO:[0-9]+]]:[[SRC2_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}}
493; GCN-NEXT: s_movk_i32 [[FF:s[0-9]+]], 0xff
494; GCN: s_load_dword [[S0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
495; GCN-NEXT: s_load_dword [[S1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
496; GCN-NEXT: s_load_dword [[S2:s[0-9]+]], s{{\[}}[[SRC2_LO]]:[[SRC2_HI]]{{\]}}, 0x0
497; GCN-NEXT: s_waitcnt lgkmcnt(0)
498; GCN-NEXT: s_and_b32 [[V1E2:s[0-9]+]], [[S0]], [[FF]]
499; GCN-NEXT: s_bfe_u32 [[SV2E1:s[0-9]+]], [[S1]], 0x80008
500; GCN-NEXT: s_and_b32 [[SV2E2:s[0-9]+]], [[S1]], [[FF]]
501; GCN-NEXT: s_bfe_u32 [[V1E1:s[0-9]+]], [[S0]], 0x80008
502; GCN-NEXT: v_mov_b32_e32 [[V2E1:v[0-9]+]], [[SV2E1]]
503; GCN-NEXT: v_mov_b32_e32 [[SRC2:v[0-9]+]], [[S2]]
504; GCN-NEXT: v_mad_u32_u24 [[MAD1:v[0-9]+]], [[V1E1]], [[V2E1]], [[SRC2]]
505; GCN-NEXT: s_bfe_u32 [[SV2E3:s[0-9]+]], [[S1]], 0x80010
506; GCN-NEXT: v_mov_b32_e32 [[V2E2:v[0-9]+]], [[SV2E2]]
507; GCN-NEXT: s_bfe_u32 [[V1E3:s[0-9]+]], [[S0]], 0x80010
508; GCN-NEXT: v_add_{{i|u}}32_e32 [[ADD1:v[0-9]+]]
509; GCN-NEXT: v_mad_u32_u24 [[MAD2:v[0-9]+]], [[V1E2]], [[V2E2]], [[MAD1]]
510; GCN-NEXT: v_mov_b32_e32 [[V2E3:v[0-9]+]], [[SV2E3]]
511; GCN-NEXT: s_lshr_b32 [[SV2E4:s[0-9]+]], [[S1]], 24
512; GCN-NEXT: v_mad_u32_u24 [[MAD3:v[0-9]+]], [[V1E3]], [[V2E3]], [[MAD2]]
513; GCN-NEXT: s_lshr_b32 [[V1E4:s[0-9]+]], [[S0]], 24
514; GCN-NEXT: v_mov_b32_e32 [[V2E4:v[0-9]+]], [[SV2E4]]
515; GCN-NEXT: v_mad_u32_u24 [[MAD4:v[0-9]+]], [[V1E4]], [[V2E4]], [[MAD3]]
516; GCN-NEXT: v_add_{{i|u}}32_e32 [[RES:v[0-9]+]]
517; GCN: {{buffer|flat|global}}_store_dword
518; GCN-NEXT: s_endpgm
519 <4 x i8> addrspace(1)* %src2,
520 i32 addrspace(1)* nocapture %dst) {
521entry:
522 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
523 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
524
; Lane 0: zero-extend both bytes to i32 and multiply.
525 %v1e0 = extractelement <4 x i8> %vec1, i64 0
526 %cv1e0 = zext i8 %v1e0 to i32
527 %v2e0 = extractelement <4 x i8> %vec2, i64 0
528 %cv2e0 = zext i8 %v2e0 to i32
529 %mul1 = mul nuw nsw i32 %cv1e0, %cv2e0
530
; Lane 1.
531 %v1e1 = extractelement <4 x i8> %vec1, i64 1
532 %cv1e1 = zext i8 %v1e1 to i32
533 %v2e1 = extractelement <4 x i8> %vec2, i64 1
534 %cv2e1 = zext i8 %v2e1 to i32
535 %mul2 = mul nuw nsw i32 %cv1e1, %cv2e1
536
; Lane 2.
537 %v1e2 = extractelement <4 x i8> %vec1, i64 2
538 %cv1e2 = zext i8 %v1e2 to i32
539 %v2e2 = extractelement <4 x i8> %vec2, i64 2
540 %cv2e2 = zext i8 %v2e2 to i32
541 %mul3 = mul nuw nsw i32 %cv1e2, %cv2e2
542
; Lane 3.
543 %v1e3 = extractelement <4 x i8> %vec1, i64 3
544 %cv1e3 = zext i8 %v1e3 to i32
545 %v2e3 = extractelement <4 x i8> %vec2, i64 3
546 %cv2e3 = zext i8 %v2e3 to i32
547 %mul4 = mul nuw nsw i32 %cv1e3, %cv2e3
548
; %add1 is consumed twice: by the side value %add and by the chain via %add2.
549 %acc = load i32, i32 addrspace(1)* %dst, align 4
550 %add1 = add i32 %mul2, %acc
551 %add = add i32 %add1, %acc
552 %add2 = add i32 %add1, %mul1
553 %add3 = add i32 %add2, %mul3
554 %add4 = add i32 %add3, %mul4
; Fold the side value back in so both uses of %add1 reach the store.
555 %res = add i32 %add4, %add
556 store i32 %res, i32 addrspace(1)* %dst, align 4
557 ret void
558}
559
; notdot4_mixedtypes: four-lane i8 multiply-accumulate in i16 with mixed
; extension kinds.
; Lane 0 of each operand is sign-extended to i16 while lanes 1-3 are
; zero-extended; the products are added to the i16 value read from %dst,
; starting with the lane-1 product, and the sum is stored back to %dst.
; The checks embedded in the signature use the prefix shared by all four RUN
; lines and expect four v_mad_u32_u24 followed by a short store on every
; target.
; The scalar load that defines S1 matches its address pair with a register
; pattern rather than a fixed pair, so the check does not depend on which
; SGPRs the allocator happens to pick.
560define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
561; GCN-LABEL: notdot4_mixedtypes:
562; GCN: ; %bb.0: ; %entry
563; GCN-NEXT: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x9|0x24}}
564; GCN-NEXT: s_load_dwordx2 s{{\[}}[[SRC2_LO:[0-9]+]]:[[SRC2_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}}
565; GCN: s_mov_b32 [[FFFF:s[0-9]+]], 0xffff
566; GCN-NEXT: s_waitcnt lgkmcnt(0)
567; GCN: {{buffer|flat|global}}_load_ushort [[SRC2:v[0-9]+]]
568; GCN: s_load_dword [[S1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
569; GCN: s_waitcnt lgkmcnt(0)
570; GCN: s_bfe_u32 [[SV2E1:s[0-9]+]], [[S1]], 0x80008
571; GCN: v_mov_b32_e32 [[V2E1:v[0-9]+]], [[SV2E1]]
572; GCN: s_bfe_u32 [[SV2E3:s[0-9]+]], [[S1]], 0x80010
573; GCN: v_mov_b32_e32 [[V2E2:v[0-9]+]]
574; GCN: s_bfe_u32 [[V1E3:s[0-9]+]], {{s[0-9]+}}, 0x80010
575; GCN-NEXT: s_lshr_b32 [[SV2E4:s[0-9]+]], [[S1]], 24
576; GCN-NEXT: v_mov_b32_e32 [[V2E3:v[0-9]+]], [[SV2E3]]
577; GCN-NEXT: s_lshr_b32 [[V1E4:s[0-9]+]], {{s[0-9]+}}, 24
578; GCN-NEXT: s_waitcnt vmcnt(0)
579; GCN-NEXT: v_mad_u32_u24 [[MAD1:v[0-9]+]], {{s[0-9]+}}, [[V2E1]], [[SRC2]]
580; GCN-NEXT: v_mad_u32_u24 [[MAD2:v[0-9]+]], {{s[0-9]+}}, [[V2E2]], [[MAD1]]
581; GCN-NEXT: v_mad_u32_u24 [[MAD3:v[0-9]+]], [[V1E3]], [[V2E3]], [[MAD2]]
582; GCN-NEXT: v_mov_b32_e32 [[V2E4:v[0-9]+]], [[SV2E4]]
583; GCN-NEXT: v_mad_u32_u24 [[MAD4:v[0-9]+]], [[V1E4]], [[V2E4]], [[MAD3]]
584; GCN-NEXT: {{buffer|flat|global}}_store_short
585; GCN-NEXT: s_endpgm
586 <4 x i8> addrspace(1)* %src2,
587 i16 addrspace(1)* nocapture %dst) {
588entry:
589 %vec1 = load <4 x i8>, <4 x i8> addrspace(1)* %src1
590 %vec2 = load <4 x i8>, <4 x i8> addrspace(1)* %src2
591
; Lane 0 is the odd one out: both bytes are sign-extended.
592 %v1e0 = extractelement <4 x i8> %vec1, i64 0
593 %cv1e0 = sext i8 %v1e0 to i16
594 %v2e0 = extractelement <4 x i8> %vec2, i64 0
595 %cv2e0 = sext i8 %v2e0 to i16
596 %mul1 = mul nuw nsw i16 %cv1e0, %cv2e0
597
; Lane 1: zero-extended.
598 %v1e1 = extractelement <4 x i8> %vec1, i64 1
599 %cv1e1 = zext i8 %v1e1 to i16
600 %v2e1 = extractelement <4 x i8> %vec2, i64 1
601 %cv2e1 = zext i8 %v2e1 to i16
602 %mul2 = mul nuw nsw i16 %cv1e1, %cv2e1
603
; Lane 2: zero-extended.
604 %v1e2 = extractelement <4 x i8> %vec1, i64 2
605 %cv1e2 = zext i8 %v1e2 to i16
606 %v2e2 = extractelement <4 x i8> %vec2, i64 2
607 %cv2e2 = zext i8 %v2e2 to i16
608 %mul3 = mul nuw nsw i16 %cv1e2, %cv2e2
609
; Lane 3: zero-extended.
610 %v1e3 = extractelement <4 x i8> %vec1, i64 3
611 %cv1e3 = zext i8 %v1e3 to i16
612 %v2e3 = extractelement <4 x i8> %vec2, i64 3
613 %cv2e3 = zext i8 %v2e3 to i16
614 %mul4 = mul nuw nsw i16 %cv1e3, %cv2e3
615
; Accumulation order is mul2, mul1, mul3, mul4.
616 %acc = load i16, i16 addrspace(1)* %dst, align 2
617 %add1 = add i16 %mul2, %acc
618 %add2 = add i16 %add1, %mul1
619 %add3 = add i16 %add2, %mul3
620 %add4 = add i16 %add3, %mul4
621
622 store i16 %add4, i16 addrspace(1)* %dst, align 2
623 ret void
624}