blob: 83a2e584312f56895cc44a08b45edf57d9ba8d2b [file] [log] [blame]
Matt Arsenault8c4a3522018-06-26 19:10:00 +00001; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; FIXME: Manually added checks for metadata nodes at bottom
3; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -o - -amdgpu-lower-kernel-arguments %s | FileCheck -check-prefix=HSA %s
4; RUN: opt -mtriple=amdgcn-- -S -o - -amdgpu-lower-kernel-arguments %s | FileCheck -check-prefix=MESA %s
5
6define amdgpu_kernel void @kern_noargs() {
7; HSA-LABEL: @kern_noargs(
8; HSA-NEXT: ret void
9;
10; MESA-LABEL: @kern_noargs(
11; MESA-NEXT: ret void
12;
13 ret void
14}
15
16define amdgpu_kernel void @kern_i8(i8 %arg) #0 {
17; HSA-LABEL: @kern_i8(
18; HSA-NEXT: [[KERN_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
19; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_I8_KERNARG_SEGMENT]] to [[KERN_I8:%.*]] addrspace(4)*
20; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_I8_KERNARG_SEGMENT]], i64 0
21; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
22; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
23; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
24; HSA-NEXT: store i8 [[TMP4]], i8 addrspace(1)* undef, align 1
25; HSA-NEXT: ret void
26;
27; MESA-LABEL: @kern_i8(
28; MESA-NEXT: [[KERN_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
29; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_KERNARG_SEGMENT]], i64 36
30; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_I8:%.*]] addrspace(4)*
31; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
32; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
33; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
34; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
35; MESA-NEXT: store i8 [[TMP5]], i8 addrspace(1)* undef, align 1
36; MESA-NEXT: ret void
37;
38 store i8 %arg, i8 addrspace(1)* undef, align 1
39 ret void
40}
41
42define amdgpu_kernel void @kern_i16(i16 %arg) #0 {
43; HSA-LABEL: @kern_i16(
44; HSA-NEXT: [[KERN_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
45; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_I16_KERNARG_SEGMENT]] to [[KERN_I16:%.*]] addrspace(4)*
46; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_I16_KERNARG_SEGMENT]], i64 0
47; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
48; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
49; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
50; HSA-NEXT: store i16 [[TMP4]], i16 addrspace(1)* undef, align 1
51; HSA-NEXT: ret void
52;
53; MESA-LABEL: @kern_i16(
54; MESA-NEXT: [[KERN_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
55; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I16_KERNARG_SEGMENT]], i64 36
56; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_I16:%.*]] addrspace(4)*
57; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
58; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
59; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
60; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
61; MESA-NEXT: store i16 [[TMP5]], i16 addrspace(1)* undef, align 1
62; MESA-NEXT: ret void
63;
64 store i16 %arg, i16 addrspace(1)* undef, align 1
65 ret void
66}
67
68define amdgpu_kernel void @kern_f16(half %arg) #0 {
69; HSA-LABEL: @kern_f16(
70; HSA-NEXT: [[KERN_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
71; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_F16_KERNARG_SEGMENT]] to [[KERN_F16:%.*]] addrspace(4)*
72; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_F16_KERNARG_SEGMENT]], i64 0
73; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
74; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
75; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
76; HSA-NEXT: [[ARG_LOAD:%.*]] = bitcast i16 [[TMP4]] to half
77; HSA-NEXT: store half [[ARG_LOAD]], half addrspace(1)* undef, align 1
78; HSA-NEXT: ret void
79;
80; MESA-LABEL: @kern_f16(
81; MESA-NEXT: [[KERN_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
82; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_F16_KERNARG_SEGMENT]], i64 36
83; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_F16:%.*]] addrspace(4)*
84; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
85; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
86; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
87; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
88; MESA-NEXT: [[ARG_LOAD:%.*]] = bitcast i16 [[TMP5]] to half
89; MESA-NEXT: store half [[ARG_LOAD]], half addrspace(1)* undef, align 1
90; MESA-NEXT: ret void
91;
92 store half %arg, half addrspace(1)* undef, align 1
93 ret void
94}
95
96define amdgpu_kernel void @kern_zeroext_i8(i8 zeroext %arg) #0 {
97; HSA-LABEL: @kern_zeroext_i8(
98; HSA-NEXT: [[KERN_ZEROEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
99; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_ZEROEXT_I8_KERNARG_SEGMENT]] to [[KERN_ZEROEXT_I8:%.*]] addrspace(4)*
100; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_ZEROEXT_I8_KERNARG_SEGMENT]], i64 0
101; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
102; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
103; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
104; HSA-NEXT: store i8 [[TMP4]], i8 addrspace(1)* undef, align 1
105; HSA-NEXT: ret void
106;
107; MESA-LABEL: @kern_zeroext_i8(
108; MESA-NEXT: [[KERN_ZEROEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
109; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I8_KERNARG_SEGMENT]], i64 36
110; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_ZEROEXT_I8:%.*]] addrspace(4)*
111; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
112; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
113; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !range !1, !invariant.load !0
114; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
115; MESA-NEXT: store i8 [[TMP5]], i8 addrspace(1)* undef, align 1
116; MESA-NEXT: ret void
117;
118 store i8 %arg, i8 addrspace(1)* undef, align 1
119 ret void
120}
121
122define amdgpu_kernel void @kern_zeroext_i16(i16 zeroext %arg) #0 {
123; HSA-LABEL: @kern_zeroext_i16(
124; HSA-NEXT: [[KERN_ZEROEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
125; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_ZEROEXT_I16_KERNARG_SEGMENT]] to [[KERN_ZEROEXT_I16:%.*]] addrspace(4)*
126; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_ZEROEXT_I16_KERNARG_SEGMENT]], i64 0
127; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
128; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
129; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
130; HSA-NEXT: store i16 [[TMP4]], i16 addrspace(1)* undef, align 1
131; HSA-NEXT: ret void
132;
133; MESA-LABEL: @kern_zeroext_i16(
134; MESA-NEXT: [[KERN_ZEROEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
135; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I16_KERNARG_SEGMENT]], i64 36
136; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_ZEROEXT_I16:%.*]] addrspace(4)*
137; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
138; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
139; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !range !2, !invariant.load !0
140; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
141; MESA-NEXT: store i16 [[TMP5]], i16 addrspace(1)* undef, align 1
142; MESA-NEXT: ret void
143;
144 store i16 %arg, i16 addrspace(1)* undef, align 1
145 ret void
146}
147
148define amdgpu_kernel void @kern_signext_i8(i8 signext %arg) #0 {
149; HSA-LABEL: @kern_signext_i8(
150; HSA-NEXT: [[KERN_SIGNEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
151; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_SIGNEXT_I8_KERNARG_SEGMENT]] to [[KERN_SIGNEXT_I8:%.*]] addrspace(4)*
152; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_SIGNEXT_I8_KERNARG_SEGMENT]], i64 0
153; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
154; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
155; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
156; HSA-NEXT: store i8 [[TMP4]], i8 addrspace(1)* undef, align 1
157; HSA-NEXT: ret void
158;
159; MESA-LABEL: @kern_signext_i8(
160; MESA-NEXT: [[KERN_SIGNEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
161; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I8_KERNARG_SEGMENT]], i64 36
162; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_SIGNEXT_I8:%.*]] addrspace(4)*
163; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
164; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
165; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !range !3, !invariant.load !0
166; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
167; MESA-NEXT: store i8 [[TMP5]], i8 addrspace(1)* undef, align 1
168; MESA-NEXT: ret void
169;
170 store i8 %arg, i8 addrspace(1)* undef, align 1
171 ret void
172}
173
174define amdgpu_kernel void @kern_signext_i16(i16 signext %arg) #0 {
175; HSA-LABEL: @kern_signext_i16(
176; HSA-NEXT: [[KERN_SIGNEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
177; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_SIGNEXT_I16_KERNARG_SEGMENT]] to [[KERN_SIGNEXT_I16:%.*]] addrspace(4)*
178; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_SIGNEXT_I16_KERNARG_SEGMENT]], i64 0
179; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
180; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
181; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
182; HSA-NEXT: store i16 [[TMP4]], i16 addrspace(1)* undef, align 1
183; HSA-NEXT: ret void
184;
185; MESA-LABEL: @kern_signext_i16(
186; MESA-NEXT: [[KERN_SIGNEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
187; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I16_KERNARG_SEGMENT]], i64 36
188; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_SIGNEXT_I16:%.*]] addrspace(4)*
189; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
190; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
191; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !range !4, !invariant.load !0
192; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
193; MESA-NEXT: store i16 [[TMP5]], i16 addrspace(1)* undef, align 1
194; MESA-NEXT: ret void
195;
196 store i16 %arg, i16 addrspace(1)* undef, align 1
197 ret void
198}
199
200define amdgpu_kernel void @kern_i8_i8(i8 %arg0, i8 %arg1) {
201; HSA-LABEL: @kern_i8_i8(
202; HSA-NEXT: [[KERN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
203; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]] to [[KERN_I8_I8:%.*]] addrspace(4)*
204; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 0
205; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
206; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
207; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
208; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 0
209; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)*
210; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
211; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 8
212; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
213; HSA-NEXT: store volatile i8 [[TMP4]], i8 addrspace(1)* undef, align 1
214; HSA-NEXT: store volatile i8 [[TMP8]], i8 addrspace(1)* undef, align 1
215; HSA-NEXT: ret void
216;
217; MESA-LABEL: @kern_i8_i8(
218; MESA-NEXT: [[KERN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
219; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 36
220; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_I8_I8:%.*]] addrspace(4)*
221; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
222; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
223; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
224; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
225; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
226; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)*
227; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
228; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 8
229; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i8
230; MESA-NEXT: store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
231; MESA-NEXT: store volatile i8 [[TMP9]], i8 addrspace(1)* undef, align 1
232; MESA-NEXT: ret void
233;
234 store volatile i8 %arg0, i8 addrspace(1)* undef, align 1
235 store volatile i8 %arg1, i8 addrspace(1)* undef, align 1
236 ret void
237}
238
239define amdgpu_kernel void @kern_v3i8(<3 x i8> %arg) {
240; HSA-LABEL: @kern_v3i8(
241; HSA-NEXT: [[KERN_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
242; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_V3I8_KERNARG_SEGMENT]] to [[KERN_V3I8:%.*]] addrspace(4)*
243; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_V3I8_KERNARG_SEGMENT]], i64 0
244; HSA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
245; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
246; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i24
247; HSA-NEXT: [[ARG_LOAD:%.*]] = bitcast i24 [[TMP4]] to <3 x i8>
248; HSA-NEXT: store <3 x i8> [[ARG_LOAD]], <3 x i8> addrspace(1)* undef, align 4
249; HSA-NEXT: ret void
250;
251; MESA-LABEL: @kern_v3i8(
252; MESA-NEXT: [[KERN_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
253; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V3I8_KERNARG_SEGMENT]], i64 36
254; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_V3I8:%.*]] addrspace(4)*
255; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
256; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
257; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
258; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i24
259; MESA-NEXT: [[ARG_LOAD:%.*]] = bitcast i24 [[TMP5]] to <3 x i8>
260; MESA-NEXT: store <3 x i8> [[ARG_LOAD]], <3 x i8> addrspace(1)* undef, align 4
261; MESA-NEXT: ret void
262;
263 store <3 x i8> %arg, <3 x i8> addrspace(1)* undef, align 4
264 ret void
265}
266
267define amdgpu_kernel void @kern_i24(i24 %arg0) {
268; HSA-LABEL: @kern_i24(
269; HSA-NEXT: [[KERN_I24_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
270; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_I24_KERNARG_SEGMENT]] to [[KERN_I24:%.*]] addrspace(4)*
271; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_I24_KERNARG_SEGMENT]], i64 0
272; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
273; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
274; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i24
275; HSA-NEXT: store i24 [[TMP4]], i24 addrspace(1)* undef
276; HSA-NEXT: ret void
277;
278; MESA-LABEL: @kern_i24(
279; MESA-NEXT: [[KERN_I24_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
280; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I24_KERNARG_SEGMENT]], i64 36
281; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_I24:%.*]] addrspace(4)*
282; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
283; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
284; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
285; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i24
286; MESA-NEXT: store i24 [[TMP5]], i24 addrspace(1)* undef
287; MESA-NEXT: ret void
288;
289 store i24 %arg0, i24 addrspace(1)* undef
290 ret void
291}
292
293define amdgpu_kernel void @kern_i32(i32 %arg0) {
294; HSA-LABEL: @kern_i32(
295; HSA-NEXT: [[KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
296; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_I32_KERNARG_SEGMENT]] to [[KERN_I32:%.*]] addrspace(4)*
297; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_I32]], [[KERN_I32]] addrspace(4)* [[TMP1]], i32 0, i32 0
298; HSA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0
299; HSA-NEXT: store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef
300; HSA-NEXT: ret void
301;
302; MESA-LABEL: @kern_i32(
303; MESA-NEXT: [[KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
304; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_KERNARG_SEGMENT]], i64 36
305; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_I32:%.*]] addrspace(4)*
306; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_I32]], [[KERN_I32]] addrspace(4)* [[TMP2]], i32 0, i32 0
307; MESA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0
308; MESA-NEXT: store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef
309; MESA-NEXT: ret void
310;
311 store i32 %arg0, i32 addrspace(1)* undef
312 ret void
313}
314
315define amdgpu_kernel void @kern_f32(float %arg0) {
316; HSA-LABEL: @kern_f32(
317; HSA-NEXT: [[KERN_F32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
318; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_F32_KERNARG_SEGMENT]] to [[KERN_F32:%.*]] addrspace(4)*
319; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_F32]], [[KERN_F32]] addrspace(4)* [[TMP1]], i32 0, i32 0
320; HSA-NEXT: [[ARG0_LOAD:%.*]] = load float, float addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0
321; HSA-NEXT: store float [[ARG0_LOAD]], float addrspace(1)* undef
322; HSA-NEXT: ret void
323;
324; MESA-LABEL: @kern_f32(
325; MESA-NEXT: [[KERN_F32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
326; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_F32_KERNARG_SEGMENT]], i64 36
327; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_F32:%.*]] addrspace(4)*
328; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_F32]], [[KERN_F32]] addrspace(4)* [[TMP2]], i32 0, i32 0
329; MESA-NEXT: [[ARG0_LOAD:%.*]] = load float, float addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0
330; MESA-NEXT: store float [[ARG0_LOAD]], float addrspace(1)* undef
331; MESA-NEXT: ret void
332;
333 store float %arg0, float addrspace(1)* undef
334 ret void
335}
336
337define amdgpu_kernel void @kern_v3i32(<3 x i32> %arg0) {
338; HSA-LABEL: @kern_v3i32(
339; HSA-NEXT: [[KERN_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
340; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_V3I32_KERNARG_SEGMENT]] to [[KERN_V3I32:%.*]] addrspace(4)*
341; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_V3I32]], [[KERN_V3I32]] addrspace(4)* [[TMP1]], i32 0, i32 0
342; HSA-NEXT: [[TMP2:%.*]] = bitcast <3 x i32> addrspace(4)* [[ARG0_KERNARG_OFFSET]] to <4 x i32> addrspace(4)*
343; HSA-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP2]], align 16, !invariant.load !0
344; HSA-NEXT: [[ARG0_LOAD:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
345; HSA-NEXT: store <3 x i32> [[ARG0_LOAD]], <3 x i32> addrspace(1)* undef, align 4
346; HSA-NEXT: ret void
347;
348; MESA-LABEL: @kern_v3i32(
349; MESA-NEXT: [[KERN_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(52) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
350; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V3I32_KERNARG_SEGMENT]], i64 36
351; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_V3I32:%.*]] addrspace(4)*
352; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_V3I32]], [[KERN_V3I32]] addrspace(4)* [[TMP2]], i32 0, i32 0
353; MESA-NEXT: [[TMP3:%.*]] = bitcast <3 x i32> addrspace(4)* [[ARG0_KERNARG_OFFSET]] to <4 x i32> addrspace(4)*
354; MESA-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP3]], align 4, !invariant.load !0
355; MESA-NEXT: [[ARG0_LOAD:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
356; MESA-NEXT: store <3 x i32> [[ARG0_LOAD]], <3 x i32> addrspace(1)* undef, align 4
357; MESA-NEXT: ret void
358;
359 store <3 x i32> %arg0, <3 x i32> addrspace(1)* undef, align 4
360 ret void
361}
362
363define amdgpu_kernel void @kern_i32_v3i32(i32 %arg0, <3 x i32> %arg1) {
364; HSA-LABEL: @kern_i32_v3i32(
365; HSA-NEXT: [[KERN_I32_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
366; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_I32_V3I32_KERNARG_SEGMENT]] to [[KERN_I32_V3I32:%.*]] addrspace(4)*
367; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_I32_V3I32]], [[KERN_I32_V3I32]] addrspace(4)* [[TMP1]], i32 0, i32 0
368; HSA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0
369; HSA-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_I32_V3I32]], [[KERN_I32_V3I32]] addrspace(4)* [[TMP1]], i32 0, i32 1
370; HSA-NEXT: [[TMP2:%.*]] = bitcast <3 x i32> addrspace(4)* [[ARG1_KERNARG_OFFSET]] to <4 x i32> addrspace(4)*
371; HSA-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP2]], align 16, !invariant.load !0
372; HSA-NEXT: [[ARG1_LOAD:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
373; HSA-NEXT: store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef
374; HSA-NEXT: store <3 x i32> [[ARG1_LOAD]], <3 x i32> addrspace(1)* undef, align 4
375; HSA-NEXT: ret void
376;
377; MESA-LABEL: @kern_i32_v3i32(
378; MESA-NEXT: [[KERN_I32_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
379; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_V3I32_KERNARG_SEGMENT]], i64 36
380; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_I32_V3I32:%.*]] addrspace(4)*
381; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_I32_V3I32]], [[KERN_I32_V3I32]] addrspace(4)* [[TMP2]], i32 0, i32 0
382; MESA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0
383; MESA-NEXT: [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_I32_V3I32]], [[KERN_I32_V3I32]] addrspace(4)* [[TMP2]], i32 0, i32 1
384; MESA-NEXT: [[TMP3:%.*]] = bitcast <3 x i32> addrspace(4)* [[ARG1_KERNARG_OFFSET]] to <4 x i32> addrspace(4)*
385; MESA-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP3]], align 4, !invariant.load !0
386; MESA-NEXT: [[ARG1_LOAD:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
387; MESA-NEXT: store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef
388; MESA-NEXT: store <3 x i32> [[ARG1_LOAD]], <3 x i32> addrspace(1)* undef, align 4
389; MESA-NEXT: ret void
390;
391 store i32 %arg0, i32 addrspace(1)* undef
392 store <3 x i32> %arg1, <3 x i32> addrspace(1)* undef, align 4
393 ret void
394}
395
396%struct.a = type { i32, i8, [4 x i8] }
397%struct.b.packed = type { i8, i32, [3 x i16], <2 x double> }
398
399define amdgpu_kernel void @kern_struct_a(%struct.a %arg0) {
400; HSA-LABEL: @kern_struct_a(
401; HSA-NEXT: [[KERN_STRUCT_A_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
402; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_STRUCT_A_KERNARG_SEGMENT]] to [[KERN_STRUCT_A:%.*]] addrspace(4)*
403; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_STRUCT_A]], [[KERN_STRUCT_A]] addrspace(4)* [[TMP1]], i32 0, i32 0
404; HSA-NEXT: [[ARG0_LOAD:%.*]] = load [[STRUCT_A:%.*]], [[STRUCT_A]] addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0
405; HSA-NEXT: store [[STRUCT_A]] %arg0.load, [[STRUCT_A]] addrspace(1)* undef
406; HSA-NEXT: ret void
407;
408; MESA-LABEL: @kern_struct_a(
409; MESA-NEXT: [[KERN_STRUCT_A_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
410; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_A_KERNARG_SEGMENT]], i64 36
411; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_STRUCT_A:%.*]] addrspace(4)*
412; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_STRUCT_A]], [[KERN_STRUCT_A]] addrspace(4)* [[TMP2]], i32 0, i32 0
413; MESA-NEXT: [[ARG0_LOAD:%.*]] = load [[STRUCT_A:%.*]], [[STRUCT_A]] addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0
414; MESA-NEXT: store [[STRUCT_A]] %arg0.load, [[STRUCT_A]] addrspace(1)* undef
415; MESA-NEXT: ret void
416;
417 store %struct.a %arg0, %struct.a addrspace(1)* undef
418 ret void
419}
420
421define amdgpu_kernel void @kern_struct_b_packed(%struct.b.packed %arg0) #0 {
422; HSA-LABEL: @kern_struct_b_packed(
423; HSA-NEXT: [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
424; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT]] to [[KERN_STRUCT_B_PACKED:%.*]] addrspace(4)*
425; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_STRUCT_B_PACKED]], [[KERN_STRUCT_B_PACKED]] addrspace(4)* [[TMP1]], i32 0, i32 0
426; HSA-NEXT: [[ARG0_LOAD:%.*]] = load [[STRUCT_B_PACKED:%.*]], [[STRUCT_B_PACKED]] addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0
427; HSA-NEXT: store [[STRUCT_B_PACKED]] %arg0.load, [[STRUCT_B_PACKED]] addrspace(1)* undef
428; HSA-NEXT: ret void
429;
430; MESA-LABEL: @kern_struct_b_packed(
431; MESA-NEXT: [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
432; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT]], i64 36
433; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_STRUCT_B_PACKED:%.*]] addrspace(4)*
434; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_STRUCT_B_PACKED]], [[KERN_STRUCT_B_PACKED]] addrspace(4)* [[TMP2]], i32 0, i32 0
435; MESA-NEXT: [[ARG0_LOAD:%.*]] = load [[STRUCT_B_PACKED:%.*]], [[STRUCT_B_PACKED]] addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0
436; MESA-NEXT: store [[STRUCT_B_PACKED]] %arg0.load, [[STRUCT_B_PACKED]] addrspace(1)* undef
437; MESA-NEXT: ret void
438;
439 store %struct.b.packed %arg0, %struct.b.packed addrspace(1)* undef
440 ret void
441}
442
443define amdgpu_kernel void @kern_implicit_arg_num_bytes(i32 %arg0) #1 {
444; HSA-LABEL: @kern_implicit_arg_num_bytes(
445; HSA-NEXT: [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
446; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT]] to [[KERN_IMPLICIT_ARG_NUM_BYTES:%.*]] addrspace(4)*
447; HSA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_IMPLICIT_ARG_NUM_BYTES]], [[KERN_IMPLICIT_ARG_NUM_BYTES]] addrspace(4)* [[TMP1]], i32 0, i32 0
448; HSA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 16, !invariant.load !0
449; HSA-NEXT: store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef
450; HSA-NEXT: ret void
451;
452; MESA-LABEL: @kern_implicit_arg_num_bytes(
453; MESA-NEXT: [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(80) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
454; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT]], i64 36
455; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_IMPLICIT_ARG_NUM_BYTES:%.*]] addrspace(4)*
456; MESA-NEXT: [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_IMPLICIT_ARG_NUM_BYTES]], [[KERN_IMPLICIT_ARG_NUM_BYTES]] addrspace(4)* [[TMP2]], i32 0, i32 0
457; MESA-NEXT: [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET]], align 4, !invariant.load !0
458; MESA-NEXT: store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef
459; MESA-NEXT: ret void
460;
461 store i32 %arg0, i32 addrspace(1)* undef
462 ret void
463}
464
465define amdgpu_kernel void @kern_lds_ptr(i32 addrspace(3)* %lds) #0 {
466; HSA-LABEL: @kern_lds_ptr(
467; HSA-NEXT: [[KERN_LDS_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
468; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_LDS_PTR_KERNARG_SEGMENT]] to [[KERN_LDS_PTR:%.*]] addrspace(4)*
469; HSA-NEXT: [[LDS_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_LDS_PTR]], [[KERN_LDS_PTR]] addrspace(4)* [[TMP1]], i32 0, i32 0
470; HSA-NEXT: [[LDS_LOAD:%.*]] = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* [[LDS_KERNARG_OFFSET]], align 16, !invariant.load !0
471; HSA-NEXT: store i32 0, i32 addrspace(3)* [[LDS_LOAD]], align 4
472; HSA-NEXT: ret void
473;
474; MESA-LABEL: @kern_lds_ptr(
475; MESA-NEXT: [[KERN_LDS_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
476; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_LDS_PTR_KERNARG_SEGMENT]], i64 36
477; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_LDS_PTR:%.*]] addrspace(4)*
478; MESA-NEXT: [[LDS_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_LDS_PTR]], [[KERN_LDS_PTR]] addrspace(4)* [[TMP2]], i32 0, i32 0
479; MESA-NEXT: [[LDS_LOAD:%.*]] = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* [[LDS_KERNARG_OFFSET]], align 4, !invariant.load !0
480; MESA-NEXT: store i32 0, i32 addrspace(3)* [[LDS_LOAD]], align 4
481; MESA-NEXT: ret void
482;
483 store i32 0, i32 addrspace(3)* %lds, align 4
484 ret void
485}
486
487define amdgpu_kernel void @kern_lds_ptr_si(i32 addrspace(3)* %lds) #2 {
488; HSA-LABEL: @kern_lds_ptr_si(
489; HSA-NEXT: [[KERN_LDS_PTR_SI_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
490; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_LDS_PTR_SI_KERNARG_SEGMENT]] to [[KERN_LDS_PTR_SI:%.*]] addrspace(4)*
491; HSA-NEXT: store i32 0, i32 addrspace(3)* [[LDS:%.*]], align 4
492; HSA-NEXT: ret void
493;
494; MESA-LABEL: @kern_lds_ptr_si(
495; MESA-NEXT: [[KERN_LDS_PTR_SI_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
496; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_LDS_PTR_SI_KERNARG_SEGMENT]], i64 36
497; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_LDS_PTR_SI:%.*]] addrspace(4)*
498; MESA-NEXT: store i32 0, i32 addrspace(3)* [[LDS:%.*]], align 4
499; MESA-NEXT: ret void
500;
501 store i32 0, i32 addrspace(3)* %lds, align 4
502 ret void
503}
504
505define amdgpu_kernel void @kern_realign_i8_i8(i8 %arg0, i8 %arg1) #0 {
506; HSA-LABEL: @kern_realign_i8_i8(
507; HSA-NEXT: [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
508; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]] to [[KERN_REALIGN_I8_I8:%.*]] addrspace(4)*
509; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 0
510; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
511; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
512; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
513; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 0
514; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)*
515; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
516; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 8
517; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
518; HSA-NEXT: store volatile i8 [[TMP4]], i8 addrspace(1)* undef
519; HSA-NEXT: store volatile i8 [[TMP8]], i8 addrspace(1)* undef
520; HSA-NEXT: ret void
521;
522; MESA-LABEL: @kern_realign_i8_i8(
523; MESA-NEXT: [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
524; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 36
525; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I8_I8:%.*]] addrspace(4)*
526; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
527; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
528; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
529; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
530; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
531; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)*
532; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
533; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 8
534; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i8
535; MESA-NEXT: store volatile i8 [[TMP5]], i8 addrspace(1)* undef
536; MESA-NEXT: store volatile i8 [[TMP9]], i8 addrspace(1)* undef
537; MESA-NEXT: ret void
538;
539 store volatile i8 %arg0, i8 addrspace(1)* undef
540 store volatile i8 %arg1, i8 addrspace(1)* undef
541 ret void
542}
543
544define amdgpu_kernel void @kern_realign_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2) #0 {
545; HSA-LABEL: @kern_realign_i8_i8_i8(
546; HSA-NEXT: [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
547; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]] to [[KERN_REALIGN_I8_I8_I8:%.*]] addrspace(4)*
548; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0
549; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
550; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
551; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
552; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0
553; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)*
554; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
555; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 8
556; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
557; HSA-NEXT: [[TMP9:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0
558; HSA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP9]] to i32 addrspace(4)*
559; HSA-NEXT: [[TMP10:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
560; HSA-NEXT: [[TMP11:%.*]] = lshr i32 [[TMP10]], 16
561; HSA-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i8
562; HSA-NEXT: store volatile i8 [[TMP4]], i8 addrspace(1)* undef
563; HSA-NEXT: store volatile i8 [[TMP8]], i8 addrspace(1)* undef
564; HSA-NEXT: store volatile i8 [[TMP12]], i8 addrspace(1)* undef
565; HSA-NEXT: ret void
566;
567; MESA-LABEL: @kern_realign_i8_i8_i8(
568; MESA-NEXT: [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
569; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 36
570; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I8_I8_I8:%.*]] addrspace(4)*
571; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
572; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
573; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
574; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
575; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
576; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)*
577; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
578; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 8
579; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i8
580; MESA-NEXT: [[TMP10:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
581; MESA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP10]] to i32 addrspace(4)*
582; MESA-NEXT: [[TMP11:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
583; MESA-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 16
584; MESA-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i8
585; MESA-NEXT: store volatile i8 [[TMP5]], i8 addrspace(1)* undef
586; MESA-NEXT: store volatile i8 [[TMP9]], i8 addrspace(1)* undef
587; MESA-NEXT: store volatile i8 [[TMP13]], i8 addrspace(1)* undef
588; MESA-NEXT: ret void
589;
590 store volatile i8 %arg0, i8 addrspace(1)* undef
591 store volatile i8 %arg1, i8 addrspace(1)* undef
592 store volatile i8 %arg2, i8 addrspace(1)* undef
593 ret void
594}
595
596define amdgpu_kernel void @kern_realign_i8_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3) #0 {
597; HSA-LABEL: @kern_realign_i8_i8_i8_i8(
598; HSA-NEXT: [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
599; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]] to [[KERN_REALIGN_I8_I8_I8_I8:%.*]] addrspace(4)*
600; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
601; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
602; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
603; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
604; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
605; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)*
606; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
607; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 8
608; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
609; HSA-NEXT: [[TMP9:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
610; HSA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP9]] to i32 addrspace(4)*
611; HSA-NEXT: [[TMP10:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
612; HSA-NEXT: [[TMP11:%.*]] = lshr i32 [[TMP10]], 16
613; HSA-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i8
614; HSA-NEXT: [[TMP13:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
615; HSA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP13]] to i32 addrspace(4)*
616; HSA-NEXT: [[TMP14:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
617; HSA-NEXT: [[TMP15:%.*]] = lshr i32 [[TMP14]], 24
618; HSA-NEXT: [[TMP16:%.*]] = trunc i32 [[TMP15]] to i8
619; HSA-NEXT: store volatile i8 [[TMP4]], i8 addrspace(1)* undef
620; HSA-NEXT: store volatile i8 [[TMP8]], i8 addrspace(1)* undef
621; HSA-NEXT: store volatile i8 [[TMP12]], i8 addrspace(1)* undef
622; HSA-NEXT: store volatile i8 [[TMP16]], i8 addrspace(1)* undef
623; HSA-NEXT: ret void
624;
625; MESA-LABEL: @kern_realign_i8_i8_i8_i8(
626; MESA-NEXT: [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
627; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
628; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I8_I8_I8_I8:%.*]] addrspace(4)*
629; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
630; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
631; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
632; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
633; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
634; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)*
635; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
636; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 8
637; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i8
638; MESA-NEXT: [[TMP10:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
639; MESA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP10]] to i32 addrspace(4)*
640; MESA-NEXT: [[TMP11:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
641; MESA-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 16
642; MESA-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i8
643; MESA-NEXT: [[TMP14:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
644; MESA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP14]] to i32 addrspace(4)*
645; MESA-NEXT: [[TMP15:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
646; MESA-NEXT: [[TMP16:%.*]] = lshr i32 [[TMP15]], 24
647; MESA-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
648; MESA-NEXT: store volatile i8 [[TMP5]], i8 addrspace(1)* undef
649; MESA-NEXT: store volatile i8 [[TMP9]], i8 addrspace(1)* undef
650; MESA-NEXT: store volatile i8 [[TMP13]], i8 addrspace(1)* undef
651; MESA-NEXT: store volatile i8 [[TMP17]], i8 addrspace(1)* undef
652; MESA-NEXT: ret void
653;
654 store volatile i8 %arg0, i8 addrspace(1)* undef
655 store volatile i8 %arg1, i8 addrspace(1)* undef
656 store volatile i8 %arg2, i8 addrspace(1)* undef
657 store volatile i8 %arg3, i8 addrspace(1)* undef
658 ret void
659}
660
661define amdgpu_kernel void @kern_realign_i8_v3i8(i8 %arg0, <3 x i8> %arg1) #0 {
662; HSA-LABEL: @kern_realign_i8_v3i8(
663; HSA-NEXT: [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
664; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]] to [[KERN_REALIGN_I8_V3I8:%.*]] addrspace(4)*
665; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 0
666; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
667; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
668; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
669; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 4
670; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)*
671; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
672; HSA-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i24
673; HSA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i24 [[TMP7]] to <3 x i8>
674; HSA-NEXT: store volatile i8 [[TMP4]], i8 addrspace(1)* undef
675; HSA-NEXT: store volatile <3 x i8> [[ARG1_LOAD]], <3 x i8> addrspace(1)* undef
676; HSA-NEXT: ret void
677;
678; MESA-LABEL: @kern_realign_i8_v3i8(
679; MESA-NEXT: [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
680; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 36
681; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I8_V3I8:%.*]] addrspace(4)*
682; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
683; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
684; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
685; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
686; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 4
687; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)*
688; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
689; MESA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i24
690; MESA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i24 [[TMP8]] to <3 x i8>
691; MESA-NEXT: store volatile i8 [[TMP5]], i8 addrspace(1)* undef
692; MESA-NEXT: store volatile <3 x i8> [[ARG1_LOAD]], <3 x i8> addrspace(1)* undef
693; MESA-NEXT: ret void
694;
695 store volatile i8 %arg0, i8 addrspace(1)* undef
696 store volatile <3 x i8> %arg1, <3 x i8> addrspace(1)* undef
697 ret void
698}
699
700define amdgpu_kernel void @kern_realign_i8_i16(i8 %arg0, i16 %arg1) #0 {
701; HSA-LABEL: @kern_realign_i8_i16(
702; HSA-NEXT: [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
703; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]] to [[KERN_REALIGN_I8_I16:%.*]] addrspace(4)*
704; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 0
705; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
706; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
707; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
708; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 0
709; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)*
710; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
711; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
712; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i16
713; HSA-NEXT: store volatile i8 [[TMP4]], i8 addrspace(1)* undef
714; HSA-NEXT: store volatile i16 [[TMP8]], i16 addrspace(1)* undef
715; HSA-NEXT: ret void
716;
717; MESA-LABEL: @kern_realign_i8_i16(
718; MESA-NEXT: [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
719; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 36
720; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I8_I16:%.*]] addrspace(4)*
721; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
722; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
723; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
724; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
725; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
726; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)*
727; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
728; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 16
729; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16
730; MESA-NEXT: store volatile i8 [[TMP5]], i8 addrspace(1)* undef
731; MESA-NEXT: store volatile i16 [[TMP9]], i16 addrspace(1)* undef
732; MESA-NEXT: ret void
733;
734 store volatile i8 %arg0, i8 addrspace(1)* undef
735 store volatile i16 %arg1, i16 addrspace(1)* undef
736 ret void
737}
738
739define amdgpu_kernel void @kern_realign_i1_i1(i1 %arg0, i1 %arg1) #0 {
740; HSA-LABEL: @kern_realign_i1_i1(
741; HSA-NEXT: [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
742; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]] to [[KERN_REALIGN_I1_I1:%.*]] addrspace(4)*
743; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 0
744; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
745; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
746; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i1
747; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 0
748; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)*
749; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
750; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 8
751; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1
752; HSA-NEXT: store volatile i1 [[TMP4]], i1 addrspace(1)* undef
753; HSA-NEXT: store volatile i1 [[TMP8]], i1 addrspace(1)* undef
754; HSA-NEXT: ret void
755;
756; MESA-LABEL: @kern_realign_i1_i1(
757; MESA-NEXT: [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
758; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 36
759; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I1_I1:%.*]] addrspace(4)*
760; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
761; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
762; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
763; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
764; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
765; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)*
766; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
767; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 8
768; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i1
769; MESA-NEXT: store volatile i1 [[TMP5]], i1 addrspace(1)* undef
770; MESA-NEXT: store volatile i1 [[TMP9]], i1 addrspace(1)* undef
771; MESA-NEXT: ret void
772;
773 store volatile i1 %arg0, i1 addrspace(1)* undef
774 store volatile i1 %arg1, i1 addrspace(1)* undef
775 ret void
776}
777
778define amdgpu_kernel void @kern_realign_i1_i1_i1(i1 %arg0, i1 %arg1, i1 %arg2) #0 {
779; HSA-LABEL: @kern_realign_i1_i1_i1(
780; HSA-NEXT: [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
781; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]] to [[KERN_REALIGN_I1_I1_I1:%.*]] addrspace(4)*
782; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0
783; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
784; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
785; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i1
786; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0
787; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)*
788; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
789; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 8
790; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1
791; HSA-NEXT: [[TMP9:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0
792; HSA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP9]] to i32 addrspace(4)*
793; HSA-NEXT: [[TMP10:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
794; HSA-NEXT: [[TMP11:%.*]] = lshr i32 [[TMP10]], 16
795; HSA-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i1
796; HSA-NEXT: store volatile i1 [[TMP4]], i1 addrspace(1)* undef
797; HSA-NEXT: store volatile i1 [[TMP8]], i1 addrspace(1)* undef
798; HSA-NEXT: store volatile i1 [[TMP12]], i1 addrspace(1)* undef
799; HSA-NEXT: ret void
800;
801; MESA-LABEL: @kern_realign_i1_i1_i1(
802; MESA-NEXT: [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
803; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 36
804; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I1_I1_I1:%.*]] addrspace(4)*
805; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
806; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
807; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
808; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
809; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
810; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)*
811; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
812; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 8
813; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i1
814; MESA-NEXT: [[TMP10:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
815; MESA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP10]] to i32 addrspace(4)*
816; MESA-NEXT: [[TMP11:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
817; MESA-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 16
818; MESA-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i1
819; MESA-NEXT: store volatile i1 [[TMP5]], i1 addrspace(1)* undef
820; MESA-NEXT: store volatile i1 [[TMP9]], i1 addrspace(1)* undef
821; MESA-NEXT: store volatile i1 [[TMP13]], i1 addrspace(1)* undef
822; MESA-NEXT: ret void
823;
824 store volatile i1 %arg0, i1 addrspace(1)* undef
825 store volatile i1 %arg1, i1 addrspace(1)* undef
826 store volatile i1 %arg2, i1 addrspace(1)* undef
827 ret void
828}
829
830define amdgpu_kernel void @kern_realign_i1_i1_i1_i1(i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3) #0 {
831; HSA-LABEL: @kern_realign_i1_i1_i1_i1(
832; HSA-NEXT: [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
833; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]] to [[KERN_REALIGN_I1_I1_I1_I1:%.*]] addrspace(4)*
834; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
835; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
836; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
837; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i1
838; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
839; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)*
840; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
841; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 8
842; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1
843; HSA-NEXT: [[TMP9:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
844; HSA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP9]] to i32 addrspace(4)*
845; HSA-NEXT: [[TMP10:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
846; HSA-NEXT: [[TMP11:%.*]] = lshr i32 [[TMP10]], 16
847; HSA-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i1
848; HSA-NEXT: [[TMP13:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
849; HSA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP13]] to i32 addrspace(4)*
850; HSA-NEXT: [[TMP14:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
851; HSA-NEXT: [[TMP15:%.*]] = lshr i32 [[TMP14]], 24
852; HSA-NEXT: [[TMP16:%.*]] = trunc i32 [[TMP15]] to i1
853; HSA-NEXT: store volatile i1 [[TMP4]], i1 addrspace(1)* undef
854; HSA-NEXT: store volatile i1 [[TMP8]], i1 addrspace(1)* undef
855; HSA-NEXT: store volatile i1 [[TMP12]], i1 addrspace(1)* undef
856; HSA-NEXT: store volatile i1 [[TMP16]], i1 addrspace(1)* undef
857; HSA-NEXT: ret void
858;
859; MESA-LABEL: @kern_realign_i1_i1_i1_i1(
860; MESA-NEXT: [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
861; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 36
862; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I1_I1_I1_I1:%.*]] addrspace(4)*
863; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
864; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
865; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
866; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
867; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
868; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)*
869; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
870; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 8
871; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i1
872; MESA-NEXT: [[TMP10:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
873; MESA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP10]] to i32 addrspace(4)*
874; MESA-NEXT: [[TMP11:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
875; MESA-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 16
876; MESA-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i1
877; MESA-NEXT: [[TMP14:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
878; MESA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP14]] to i32 addrspace(4)*
879; MESA-NEXT: [[TMP15:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
880; MESA-NEXT: [[TMP16:%.*]] = lshr i32 [[TMP15]], 24
881; MESA-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i1
882; MESA-NEXT: store volatile i1 [[TMP5]], i1 addrspace(1)* undef
883; MESA-NEXT: store volatile i1 [[TMP9]], i1 addrspace(1)* undef
884; MESA-NEXT: store volatile i1 [[TMP13]], i1 addrspace(1)* undef
885; MESA-NEXT: store volatile i1 [[TMP17]], i1 addrspace(1)* undef
886; MESA-NEXT: ret void
887;
888 store volatile i1 %arg0, i1 addrspace(1)* undef
889 store volatile i1 %arg1, i1 addrspace(1)* undef
890 store volatile i1 %arg2, i1 addrspace(1)* undef
891 store volatile i1 %arg3, i1 addrspace(1)* undef
892 ret void
893}
894
895define amdgpu_kernel void @kern_realign_i1_v3i1(i1 %arg0, <3 x i1> %arg1) #0 {
896; HSA-LABEL: @kern_realign_i1_v3i1(
897; HSA-NEXT: [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
898; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]] to [[KERN_REALIGN_I1_V3I1:%.*]] addrspace(4)*
899; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 0
900; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
901; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
902; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i1
903; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 4
904; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)*
905; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
906; HSA-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i3
907; HSA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i3 [[TMP7]] to <3 x i1>
908; HSA-NEXT: store volatile i1 [[TMP4]], i1 addrspace(1)* undef
909; HSA-NEXT: store volatile <3 x i1> [[ARG1_LOAD]], <3 x i1> addrspace(1)* undef
910; HSA-NEXT: ret void
911;
912; MESA-LABEL: @kern_realign_i1_v3i1(
913; MESA-NEXT: [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
914; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 36
915; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I1_V3I1:%.*]] addrspace(4)*
916; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
917; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
918; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
919; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
920; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 4
921; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)*
922; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
923; MESA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i3
924; MESA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i3 [[TMP8]] to <3 x i1>
925; MESA-NEXT: store volatile i1 [[TMP5]], i1 addrspace(1)* undef
926; MESA-NEXT: store volatile <3 x i1> [[ARG1_LOAD]], <3 x i1> addrspace(1)* undef
927; MESA-NEXT: ret void
928;
929 store volatile i1 %arg0, i1 addrspace(1)* undef
930 store volatile <3 x i1> %arg1, <3 x i1> addrspace(1)* undef
931 ret void
932}
933
934define amdgpu_kernel void @kern_realign_i1_i16(i1 %arg0, i16 %arg1) #0 {
935; HSA-LABEL: @kern_realign_i1_i16(
936; HSA-NEXT: [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
937; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]] to [[KERN_REALIGN_I1_I16:%.*]] addrspace(4)*
938; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 0
939; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
940; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
941; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i1
942; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 0
943; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)*
944; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
945; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
946; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i16
947; HSA-NEXT: store volatile i1 [[TMP4]], i1 addrspace(1)* undef
948; HSA-NEXT: store volatile i16 [[TMP8]], i16 addrspace(1)* undef
949; HSA-NEXT: ret void
950;
951; MESA-LABEL: @kern_realign_i1_i16(
952; MESA-NEXT: [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
953; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 36
954; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I1_I16:%.*]] addrspace(4)*
955; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
956; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
957; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
958; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
959; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
960; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)*
961; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
962; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 16
963; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16
964; MESA-NEXT: store volatile i1 [[TMP5]], i1 addrspace(1)* undef
965; MESA-NEXT: store volatile i16 [[TMP9]], i16 addrspace(1)* undef
966; MESA-NEXT: ret void
967;
968 store volatile i1 %arg0, i1 addrspace(1)* undef
969 store volatile i16 %arg1, i16 addrspace(1)* undef
970 ret void
971}
972
973define amdgpu_kernel void @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3, i8 %arg4, i8 %arg5, i8 %arg6, i8 %arg7) #0 {
974; HSA-LABEL: @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(
975; HSA-NEXT: [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
976; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]] to [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8:%.*]] addrspace(4)*
977; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
978; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
979; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
980; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
981; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
982; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)*
983; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
984; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 8
985; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
986; HSA-NEXT: [[TMP9:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
987; HSA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP9]] to i32 addrspace(4)*
988; HSA-NEXT: [[TMP10:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
989; HSA-NEXT: [[TMP11:%.*]] = lshr i32 [[TMP10]], 16
990; HSA-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i8
991; HSA-NEXT: [[TMP13:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
992; HSA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP13]] to i32 addrspace(4)*
993; HSA-NEXT: [[TMP14:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
994; HSA-NEXT: [[TMP15:%.*]] = lshr i32 [[TMP14]], 24
995; HSA-NEXT: [[TMP16:%.*]] = trunc i32 [[TMP15]] to i8
996; HSA-NEXT: [[TMP17:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4
997; HSA-NEXT: [[ARG5_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP17]] to i32 addrspace(4)*
998; HSA-NEXT: [[TMP18:%.*]] = load i32, i32 addrspace(4)* [[ARG5_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
999; HSA-NEXT: [[TMP19:%.*]] = lshr i32 [[TMP18]], 8
1000; HSA-NEXT: [[TMP20:%.*]] = trunc i32 [[TMP19]] to i8
1001; HSA-NEXT: [[TMP21:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4
1002; HSA-NEXT: [[ARG6_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP21]] to i32 addrspace(4)*
1003; HSA-NEXT: [[TMP22:%.*]] = load i32, i32 addrspace(4)* [[ARG6_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
1004; HSA-NEXT: [[TMP23:%.*]] = lshr i32 [[TMP22]], 16
1005; HSA-NEXT: [[TMP24:%.*]] = trunc i32 [[TMP23]] to i8
1006; HSA-NEXT: [[TMP25:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4
1007; HSA-NEXT: [[ARG7_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP25]] to i32 addrspace(4)*
1008; HSA-NEXT: [[TMP26:%.*]] = load i32, i32 addrspace(4)* [[ARG7_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
1009; HSA-NEXT: [[TMP27:%.*]] = lshr i32 [[TMP26]], 24
1010; HSA-NEXT: [[TMP28:%.*]] = trunc i32 [[TMP27]] to i8
1011; HSA-NEXT: store volatile i8 [[TMP4]], i8 addrspace(1)* undef
1012; HSA-NEXT: store volatile i8 [[TMP8]], i8 addrspace(1)* undef
1013; HSA-NEXT: store volatile i8 [[TMP12]], i8 addrspace(1)* undef
1014; HSA-NEXT: store volatile i8 [[TMP16]], i8 addrspace(1)* undef
1015; HSA-NEXT: store volatile i8 [[TMP20]], i8 addrspace(1)* undef
1016; HSA-NEXT: store volatile i8 [[TMP24]], i8 addrspace(1)* undef
1017; HSA-NEXT: store volatile i8 [[TMP28]], i8 addrspace(1)* undef
1018; HSA-NEXT: ret void
1019;
1020; MESA-LABEL: @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(
1021; MESA-NEXT: [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1022; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
1023; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8:%.*]] addrspace(4)*
1024; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
1025; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
1026; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
1027; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
1028; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
1029; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)*
1030; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
1031; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 8
1032; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i8
1033; MESA-NEXT: [[TMP10:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
1034; MESA-NEXT: [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP10]] to i32 addrspace(4)*
1035; MESA-NEXT: [[TMP11:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
1036; MESA-NEXT: [[TMP12:%.*]] = lshr i32 [[TMP11]], 16
1037; MESA-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i8
1038; MESA-NEXT: [[TMP14:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
1039; MESA-NEXT: [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP14]] to i32 addrspace(4)*
1040; MESA-NEXT: [[TMP15:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
1041; MESA-NEXT: [[TMP16:%.*]] = lshr i32 [[TMP15]], 24
1042; MESA-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
1043; MESA-NEXT: [[TMP18:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 4
1044; MESA-NEXT: [[ARG5_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP18]] to i32 addrspace(4)*
1045; MESA-NEXT: [[TMP19:%.*]] = load i32, i32 addrspace(4)* [[ARG5_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
1046; MESA-NEXT: [[TMP20:%.*]] = lshr i32 [[TMP19]], 8
1047; MESA-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i8
1048; MESA-NEXT: [[TMP22:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 4
1049; MESA-NEXT: [[ARG6_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP22]] to i32 addrspace(4)*
1050; MESA-NEXT: [[TMP23:%.*]] = load i32, i32 addrspace(4)* [[ARG6_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
1051; MESA-NEXT: [[TMP24:%.*]] = lshr i32 [[TMP23]], 16
1052; MESA-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i8
1053; MESA-NEXT: [[TMP26:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 4
1054; MESA-NEXT: [[ARG7_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP26]] to i32 addrspace(4)*
1055; MESA-NEXT: [[TMP27:%.*]] = load i32, i32 addrspace(4)* [[ARG7_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
1056; MESA-NEXT: [[TMP28:%.*]] = lshr i32 [[TMP27]], 24
1057; MESA-NEXT: [[TMP29:%.*]] = trunc i32 [[TMP28]] to i8
1058; MESA-NEXT: store volatile i8 [[TMP5]], i8 addrspace(1)* undef
1059; MESA-NEXT: store volatile i8 [[TMP9]], i8 addrspace(1)* undef
1060; MESA-NEXT: store volatile i8 [[TMP13]], i8 addrspace(1)* undef
1061; MESA-NEXT: store volatile i8 [[TMP17]], i8 addrspace(1)* undef
1062; MESA-NEXT: store volatile i8 [[TMP21]], i8 addrspace(1)* undef
1063; MESA-NEXT: store volatile i8 [[TMP25]], i8 addrspace(1)* undef
1064; MESA-NEXT: store volatile i8 [[TMP29]], i8 addrspace(1)* undef
1065; MESA-NEXT: ret void
1066;
1067 store volatile i8 %arg0, i8 addrspace(1)* undef
1068 store volatile i8 %arg1, i8 addrspace(1)* undef
1069 store volatile i8 %arg2, i8 addrspace(1)* undef
1070 store volatile i8 %arg3, i8 addrspace(1)* undef
1071 store volatile i8 %arg5, i8 addrspace(1)* undef
1072 store volatile i8 %arg6, i8 addrspace(1)* undef
1073 store volatile i8 %arg7, i8 addrspace(1)* undef
1074 ret void
1075}
1076
1077define amdgpu_kernel void @kern_realign_f16_f16(half %arg0, half %arg1) #0 {
1078; HSA-LABEL: @kern_realign_f16_f16(
1079; HSA-NEXT: [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1080; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]] to [[KERN_REALIGN_F16_F16:%.*]] addrspace(4)*
1081; HSA-NEXT: [[TMP2:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 0
1082; HSA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP2]] to i32 addrspace(4)*
1083; HSA-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
1084; HSA-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
1085; HSA-NEXT: [[ARG0_LOAD:%.*]] = bitcast i16 [[TMP4]] to half
1086; HSA-NEXT: [[TMP5:%.*]] = getelementptr i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 0
1087; HSA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP5]] to i32 addrspace(4)*
1088; HSA-NEXT: [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 16, !invariant.load !0
1089; HSA-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
1090; HSA-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i16
1091; HSA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i16 [[TMP8]] to half
1092; HSA-NEXT: store volatile half [[ARG0_LOAD]], half addrspace(1)* undef
1093; HSA-NEXT: store volatile half [[ARG1_LOAD]], half addrspace(1)* undef
1094; HSA-NEXT: ret void
1095;
1096; MESA-LABEL: @kern_realign_f16_f16(
1097; MESA-NEXT: [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1098; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 36
1099; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_REALIGN_F16_F16:%.*]] addrspace(4)*
1100; MESA-NEXT: [[TMP3:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
1101; MESA-NEXT: [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP3]] to i32 addrspace(4)*
1102; MESA-NEXT: [[TMP4:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
1103; MESA-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
1104; MESA-NEXT: [[ARG0_LOAD:%.*]] = bitcast i16 [[TMP5]] to half
1105; MESA-NEXT: [[TMP6:%.*]] = getelementptr i8, i8 addrspace(4)* [[TMP1]], i64 0
1106; MESA-NEXT: [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = bitcast i8 addrspace(4)* [[TMP6]] to i32 addrspace(4)*
1107; MESA-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]], align 4, !invariant.load !0
1108; MESA-NEXT: [[TMP8:%.*]] = lshr i32 [[TMP7]], 16
1109; MESA-NEXT: [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16
1110; MESA-NEXT: [[ARG1_LOAD:%.*]] = bitcast i16 [[TMP9]] to half
1111; MESA-NEXT: store volatile half [[ARG0_LOAD]], half addrspace(1)* undef
1112; MESA-NEXT: store volatile half [[ARG1_LOAD]], half addrspace(1)* undef
1113; MESA-NEXT: ret void
1114;
1115 store volatile half %arg0, half addrspace(1)* undef
1116 store volatile half %arg1, half addrspace(1)* undef
1117 ret void
1118}
1119
1120define amdgpu_kernel void @kern_global_ptr(i8 addrspace(1)* %ptr) #0 {
1121; HSA-LABEL: @kern_global_ptr(
1122; HSA-NEXT: [[KERN_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1123; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_GLOBAL_PTR_KERNARG_SEGMENT]] to [[KERN_GLOBAL_PTR:%.*]] addrspace(4)*
1124; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_GLOBAL_PTR]], [[KERN_GLOBAL_PTR]] addrspace(4)* [[TMP1]], i32 0, i32 0
1125; HSA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 16, !invariant.load !0
1126; HSA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
1127; HSA-NEXT: ret void
1128;
1129; MESA-LABEL: @kern_global_ptr(
1130; MESA-NEXT: [[KERN_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1131; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
1132; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_GLOBAL_PTR:%.*]] addrspace(4)*
1133; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_GLOBAL_PTR]], [[KERN_GLOBAL_PTR]] addrspace(4)* [[TMP2]], i32 0, i32 0
1134; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 4, !invariant.load !0
1135; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
1136; MESA-NEXT: ret void
1137;
1138 store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
1139 ret void
1140}
1141
1142define amdgpu_kernel void @kern_global_ptr_dereferencable(i8 addrspace(1)* dereferenceable(42) %ptr) #0 {
1143; HSA-LABEL: @kern_global_ptr_dereferencable(
1144; HSA-NEXT: [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1145; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT]] to [[KERN_GLOBAL_PTR_DEREFERENCABLE:%.*]] addrspace(4)*
1146; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_GLOBAL_PTR_DEREFERENCABLE]], [[KERN_GLOBAL_PTR_DEREFERENCABLE]] addrspace(4)* [[TMP1]], i32 0, i32 0
1147; HSA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 16, !invariant.load !0, !dereferenceable !1
1148; HSA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
1149; HSA-NEXT: ret void
1150;
1151; MESA-LABEL: @kern_global_ptr_dereferencable(
1152; MESA-NEXT: [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1153; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT]], i64 36
1154; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_GLOBAL_PTR_DEREFERENCABLE:%.*]] addrspace(4)*
1155; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_GLOBAL_PTR_DEREFERENCABLE]], [[KERN_GLOBAL_PTR_DEREFERENCABLE]] addrspace(4)* [[TMP2]], i32 0, i32 0
1156; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 4, !invariant.load !0, !dereferenceable !5
1157; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
1158; MESA-NEXT: ret void
1159;
1160 store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
1161 ret void
1162}
1163
1164define amdgpu_kernel void @kern_global_ptr_dereferencable_or_null(i8 addrspace(1)* dereferenceable_or_null(128) %ptr) #0 {
1165; HSA-LABEL: @kern_global_ptr_dereferencable_or_null(
1166; HSA-NEXT: [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1167; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT]] to [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL:%.*]] addrspace(4)*
1168; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL]], [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL]] addrspace(4)* [[TMP1]], i32 0, i32 0
1169; HSA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 16, !invariant.load !0, !dereferenceable_or_null !2
1170; HSA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
1171; HSA-NEXT: ret void
1172;
1173; MESA-LABEL: @kern_global_ptr_dereferencable_or_null(
1174; MESA-NEXT: [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1175; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT]], i64 36
1176; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL:%.*]] addrspace(4)*
1177; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL]], [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL]] addrspace(4)* [[TMP2]], i32 0, i32 0
1178; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 4, !invariant.load !0, !dereferenceable_or_null !6
1179; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
1180; MESA-NEXT: ret void
1181;
1182 store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
1183 ret void
1184}
1185
1186define amdgpu_kernel void @kern_nonnull_global_ptr(i8 addrspace(1)* nonnull %ptr) #0 {
1187; HSA-LABEL: @kern_nonnull_global_ptr(
1188; HSA-NEXT: [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1189; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT]] to [[KERN_NONNULL_GLOBAL_PTR:%.*]] addrspace(4)*
1190; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_NONNULL_GLOBAL_PTR]], [[KERN_NONNULL_GLOBAL_PTR]] addrspace(4)* [[TMP1]], i32 0, i32 0
1191; HSA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 16, !invariant.load !0, !nonnull !0
1192; HSA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
1193; HSA-NEXT: ret void
1194;
1195; MESA-LABEL: @kern_nonnull_global_ptr(
1196; MESA-NEXT: [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1197; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
1198; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_NONNULL_GLOBAL_PTR:%.*]] addrspace(4)*
1199; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_NONNULL_GLOBAL_PTR]], [[KERN_NONNULL_GLOBAL_PTR]] addrspace(4)* [[TMP2]], i32 0, i32 0
1200; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 4, !invariant.load !0, !nonnull !0
1201; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
1202; MESA-NEXT: ret void
1203;
1204 store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
1205 ret void
1206}
1207
1208define amdgpu_kernel void @kern_align32_global_ptr(i8 addrspace(1)* align 1024 %ptr) #0 {
1209; HSA-LABEL: @kern_align32_global_ptr(
1210; HSA-NEXT: [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1211; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT]] to [[KERN_ALIGN32_GLOBAL_PTR:%.*]] addrspace(4)*
1212; HSA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_ALIGN32_GLOBAL_PTR]], [[KERN_ALIGN32_GLOBAL_PTR]] addrspace(4)* [[TMP1]], i32 0, i32 0
1213; HSA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 16, !invariant.load !0, !align !3
1214; HSA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
1215; HSA-NEXT: ret void
1216;
1217; MESA-LABEL: @kern_align32_global_ptr(
1218; MESA-NEXT: [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1219; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
1220; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_ALIGN32_GLOBAL_PTR:%.*]] addrspace(4)*
1221; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds [[KERN_ALIGN32_GLOBAL_PTR]], [[KERN_ALIGN32_GLOBAL_PTR]] addrspace(4)* [[TMP2]], i32 0, i32 0
1222; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET]], align 4, !invariant.load !0, !align !7
1223; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef
1224; MESA-NEXT: ret void
1225;
1226 store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
1227 ret void
1228}
1229
1230define amdgpu_kernel void @kern_noalias_global_ptr(i8 addrspace(1)* noalias %ptr) #0 {
1231; HSA-LABEL: @kern_noalias_global_ptr(
1232; HSA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1233; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT]] to [[KERN_NOALIAS_GLOBAL_PTR:%.*]] addrspace(4)*
1234; HSA-NEXT: store volatile i8 addrspace(1)* [[PTR:%.*]], i8 addrspace(1)* addrspace(1)* undef
1235; HSA-NEXT: ret void
1236;
1237; MESA-LABEL: @kern_noalias_global_ptr(
1238; MESA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1239; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
1240; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_NOALIAS_GLOBAL_PTR:%.*]] addrspace(4)*
1241; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR:%.*]], i8 addrspace(1)* addrspace(1)* undef
1242; MESA-NEXT: ret void
1243;
1244 store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
1245 ret void
1246}
1247
1248define amdgpu_kernel void @kern_noalias_global_ptr_x2(i8 addrspace(1)* noalias %ptr0, i8 addrspace(1)* noalias %ptr1) #0 {
1249; HSA-LABEL: @kern_noalias_global_ptr_x2(
1250; HSA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1251; HSA-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT]] to [[KERN_NOALIAS_GLOBAL_PTR_X2:%.*]] addrspace(4)*
1252; HSA-NEXT: store volatile i8 addrspace(1)* [[PTR0:%.*]], i8 addrspace(1)* addrspace(1)* undef
1253; HSA-NEXT: store volatile i8 addrspace(1)* [[PTR1:%.*]], i8 addrspace(1)* addrspace(1)* undef
1254; HSA-NEXT: ret void
1255;
1256; MESA-LABEL: @kern_noalias_global_ptr_x2(
1257; MESA-NEXT: [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(52) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1258; MESA-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT]], i64 36
1259; MESA-NEXT: [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[TMP1]] to [[KERN_NOALIAS_GLOBAL_PTR_X2:%.*]] addrspace(4)*
1260; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR0:%.*]], i8 addrspace(1)* addrspace(1)* undef
1261; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR1:%.*]], i8 addrspace(1)* addrspace(1)* undef
1262; MESA-NEXT: ret void
1263;
1264 store volatile i8 addrspace(1)* %ptr0, i8 addrspace(1)* addrspace(1)* undef
1265 store volatile i8 addrspace(1)* %ptr1, i8 addrspace(1)* addrspace(1)* undef
1266 ret void
1267}
1268
1269attributes #0 = { nounwind "target-cpu"="kaveri" }
1270attributes #1 = { nounwind "target-cpu"="kaveri" "amdgpu-implicitarg-num-bytes"="40" }
1271attributes #2 = { nounwind "target-cpu"="tahiti" }
1272
1273; HSA: 0 = !{}
1274; HSA: !1 = !{i64 42}
1275; HSA: !2 = !{i64 128}
1276; HSA: !3 = !{i64 1024}
1277
1278
1279; MESA: !0 = !{}
1280; MESA: !1 = !{i32 0, i32 256}
1281; MESA: !2 = !{i32 0, i32 65536}
1282; MESA: !3 = !{i32 -128, i32 128}
1283; MESA: !4 = !{i32 -32768, i32 32768}
1284; MESA: !5 = !{i64 42}
1285; MESA: !6 = !{i64 128}
1286; MESA: !7 = !{i64 1024}