; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL,PREGFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MOVREL,PREGFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE,PREGFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,IDXMODE,GFX9 %s

; Tests for indirect addressing on SI, which is implemented using dynamic
; indexing of vectors.
Matt Arsenault93401f42016-10-07 03:55:04 +00009; GCN-LABEL: {{^}}extract_w_offset:
10; GCN-DAG: s_load_dword [[IN:s[0-9]+]]
11; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0
12; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000
13; GCN-DAG: v_mov_b32_e32 [[BASEREG:v[0-9]+]], 2.0
14; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 1.0
Matt Arsenaultd486d3f2016-10-12 18:49:05 +000015
16; MOVREL-DAG: s_mov_b32 m0, [[IN]]
17; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, [[BASEREG]]
18
19; IDXMODE: s_set_gpr_idx_on [[IN]], src0{{$}}
20; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]]
21; IDXMODE-NEXT: s_set_gpr_idx_off
; Dynamic extract from a constant <4 x float> at index %in + 1; the CHECK
; lines above pin the MOVREL (m0 + v_movrels) vs. GPR-index-mode lowering.
Matt Arsenault3dbeefa2017-03-21 21:39:51 +000022define amdgpu_kernel void @extract_w_offset(float addrspace(1)* %out, i32 %in) {
Tom Stellardeef2ad92013-08-05 22:45:56 +000023entry:
Matt Arsenault28419272015-10-07 00:42:51 +000024 %idx = add i32 %in, 1
25 %elt = extractelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, i32 %idx
26 store float %elt, float addrspace(1)* %out
27 ret void
28}
29
30; XXX: Could do v_or_b32 directly
Matt Arsenault93401f42016-10-07 03:55:04 +000031; GCN-LABEL: {{^}}extract_w_offset_salu_use_vector:
Matt Arsenaultd486d3f2016-10-12 18:49:05 +000032; MOVREL: s_mov_b32 m0
Matt Arsenault93401f42016-10-07 03:55:04 +000033; GCN-DAG: s_or_b32
34; GCN-DAG: s_or_b32
35; GCN-DAG: s_or_b32
36; GCN-DAG: s_or_b32
37; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
38; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
39; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
40; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
Matt Arsenaultd486d3f2016-10-12 18:49:05 +000041
42; MOVREL: v_movrels_b32_e32
43
44; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, src0{{$}}
45; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
46; IDXMODE-NEXT: s_set_gpr_idx_off
; Same dynamic extract, but the vector is computed (or of a kernel arg with
; constants) so it is first materialized from SALU results into VGPRs.
Matt Arsenault3dbeefa2017-03-21 21:39:51 +000047define amdgpu_kernel void @extract_w_offset_salu_use_vector(i32 addrspace(1)* %out, i32 %in, <4 x i32> %or.val) {
Matt Arsenault28419272015-10-07 00:42:51 +000048entry:
49 %idx = add i32 %in, 1
50 %vec = or <4 x i32> %or.val, <i32 1, i32 2, i32 3, i32 4>
51 %elt = extractelement <4 x i32> %vec, i32 %idx
52 store i32 %elt, i32 addrspace(1)* %out
Tom Stellardeef2ad92013-08-05 22:45:56 +000053 ret void
54}
55
Matt Arsenault93401f42016-10-07 03:55:04 +000056; GCN-LABEL: {{^}}extract_wo_offset:
57; GCN-DAG: s_load_dword [[IN:s[0-9]+]]
58; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 4.0
59; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x40400000
60; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0
61; GCN-DAG: v_mov_b32_e32 [[BASEREG:v[0-9]+]], 1.0
Matt Arsenaultd486d3f2016-10-12 18:49:05 +000062
63; MOVREL-DAG: s_mov_b32 m0, [[IN]]
64; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, [[BASEREG]]
65
66; IDXMODE: s_set_gpr_idx_on [[IN]], src0{{$}}
67; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]]
68; IDXMODE-NEXT: s_set_gpr_idx_off
; Dynamic extract with the raw index (no added offset); base register for the
; relative move is the element-0 register (1.0) per the CHECKs above.
Matt Arsenault3dbeefa2017-03-21 21:39:51 +000069define amdgpu_kernel void @extract_wo_offset(float addrspace(1)* %out, i32 %in) {
Tom Stellardeef2ad92013-08-05 22:45:56 +000070entry:
Matt Arsenault28419272015-10-07 00:42:51 +000071 %elt = extractelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, i32 %in
72 store float %elt, float addrspace(1)* %out
Tom Stellardeef2ad92013-08-05 22:45:56 +000073 ret void
74}
75
Matt Arsenault93401f42016-10-07 03:55:04 +000076; GCN-LABEL: {{^}}extract_neg_offset_sgpr:
Tom Stellard8b0182a2015-04-23 20:32:01 +000077; The offset depends on the register that holds the first element of the vector.
Matt Arsenaultd486d3f2016-10-12 18:49:05 +000078; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
79; MOVREL: v_movrels_b32_e32 v{{[0-9]}}, v0
80
81; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
Matthias Braun325cd2c2016-11-11 01:34:21 +000082; IDXMODE: v_mov_b32_e32 v2, 2
83; IDXMODE: v_mov_b32_e32 v3, 3
Matt Arsenaultd486d3f2016-10-12 18:49:05 +000084; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], src0{{$}}
85; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
86; IDXMODE-NEXT: s_set_gpr_idx_off
; Extract with a uniform (SGPR) index biased by -512; the negative bias folds
; into the m0 / index-mode add (0xfe00) rather than the vector layout.
Matt Arsenault3dbeefa2017-03-21 21:39:51 +000087define amdgpu_kernel void @extract_neg_offset_sgpr(i32 addrspace(1)* %out, i32 %offset) {
Tom Stellard8b0182a2015-04-23 20:32:01 +000088entry:
89 %index = add i32 %offset, -512
90 %value = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
91 store i32 %value, i32 addrspace(1)* %out
92 ret void
93}
94
Matt Arsenault93401f42016-10-07 03:55:04 +000095; GCN-LABEL: {{^}}extract_neg_offset_sgpr_loaded:
Matt Arsenault28419272015-10-07 00:42:51 +000096; The offset depends on the register that holds the first element of the vector.
Matt Arsenaultd486d3f2016-10-12 18:49:05 +000097; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
98; MOVREL: v_movrels_b32_e32 v{{[0-9]}}, v0
99
100; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
Matthias Braun325cd2c2016-11-11 01:34:21 +0000101; IDXMODE: v_mov_b32_e32 v0,
Konstantin Zhuravlyov0a1a7b62016-11-17 16:41:49 +0000102; IDXMODE: v_mov_b32_e32 v1,
103; IDXMODE: v_mov_b32_e32 v2,
104; IDXMODE: v_mov_b32_e32 v3,
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000105; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], src0{{$}}
106; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
107; IDXMODE-NEXT: s_set_gpr_idx_off
; As above, but the indexed vector comes from kernel arguments (or of two
; loaded vectors), so all four elements must be copied into VGPRs first.
Matt Arsenault3dbeefa2017-03-21 21:39:51 +0000108define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(i32 addrspace(1)* %out, <4 x i32> %vec0, <4 x i32> %vec1, i32 %offset) {
Matt Arsenault28419272015-10-07 00:42:51 +0000109entry:
110 %index = add i32 %offset, -512
111 %or = or <4 x i32> %vec0, %vec1
112 %value = extractelement <4 x i32> %or, i32 %index
113 store i32 %value, i32 addrspace(1)* %out
114 ret void
115}
116
Matt Arsenault93401f42016-10-07 03:55:04 +0000117; GCN-LABEL: {{^}}extract_neg_offset_vgpr:
Tom Stellard8b0182a2015-04-23 20:32:01 +0000118; The offset depends on the register that holds the first element of the vector.
Matt Arsenaultcb540bc2016-07-19 00:35:03 +0000119
120; FIXME: The waitcnt for the argument load can go after the loop
Matt Arsenault93401f42016-10-07 03:55:04 +0000121; GCN: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
Mark Searles70359ac2017-06-02 14:19:25 +0000122; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]:
Matt Arsenault93401f42016-10-07 03:55:04 +0000123; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v{{[0-9]+}}
Changpeng Fangda38b5f2018-02-16 16:31:30 +0000124; GCN: s_and_saveexec_b64 vcc, vcc
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000125
126; MOVREL: s_add_i32 m0, [[READLANE]], 0xfffffe0
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000127; MOVREL: v_movrels_b32_e32 [[RESULT:v[0-9]+]], v1
128
129; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00
Changpeng Fangda38b5f2018-02-16 16:31:30 +0000130; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], src0
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000131; IDXMODE: v_mov_b32_e32 [[RESULT:v[0-9]+]], v1
Changpeng Fangda38b5f2018-02-16 16:31:30 +0000132; IDXMODE: s_set_gpr_idx_off
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000133
Matt Arsenault93401f42016-10-07 03:55:04 +0000134; GCN: s_cbranch_execnz
Matt Arsenaultcb540bc2016-07-19 00:35:03 +0000135
Matt Arsenault93401f42016-10-07 03:55:04 +0000136; GCN: buffer_store_dword [[RESULT]]
; Extract with a divergent (per-lane workitem id) index: lowering must build a
; waterfall loop (readfirstlane + s_and_saveexec + cbranch_execnz, see CHECKs).
Matt Arsenault3dbeefa2017-03-21 21:39:51 +0000137define amdgpu_kernel void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) {
Tom Stellard8b0182a2015-04-23 20:32:01 +0000138entry:
Matt Arsenault9c47dd52016-02-11 06:02:01 +0000139 %id = call i32 @llvm.amdgcn.workitem.id.x() #1
Tom Stellard8b0182a2015-04-23 20:32:01 +0000140 %index = add i32 %id, -512
141 %value = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
142 store i32 %value, i32 addrspace(1)* %out
143 ret void
144}
145
Matt Arsenault93401f42016-10-07 03:55:04 +0000146; GCN-LABEL: {{^}}extract_undef_offset_sgpr:
Philip Reames3580c902017-12-30 18:42:37 +0000147; undefined behavior, but shouldn't crash compiler
; Extract at an undef index: undefined behavior at runtime, but the compiler
; must not crash; no output is checked for this function.
Matt Arsenault3dbeefa2017-03-21 21:39:51 +0000148define amdgpu_kernel void @extract_undef_offset_sgpr(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
Matt Arsenault21a46252016-06-27 19:57:44 +0000149entry:
150 %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
151 %value = extractelement <4 x i32> %ld, i32 undef
152 store i32 %value, i32 addrspace(1)* %out
153 ret void
154}
155
Matt Arsenault93401f42016-10-07 03:55:04 +0000156; GCN-LABEL: {{^}}insert_undef_offset_sgpr_vector_src:
Philip Reames3580c902017-12-30 18:42:37 +0000157; undefined behavior, but shouldn't crash compiler
; Insert at an undef index: likewise only a no-crash test; the stored vector
; value is unspecified.
Matt Arsenault3dbeefa2017-03-21 21:39:51 +0000158define amdgpu_kernel void @insert_undef_offset_sgpr_vector_src(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
Matt Arsenault21a46252016-06-27 19:57:44 +0000159entry:
160 %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
161 %value = insertelement <4 x i32> %ld, i32 5, i32 undef
162 store <4 x i32> %value, <4 x i32> addrspace(1)* %out
163 ret void
164}
165
Matt Arsenault93401f42016-10-07 03:55:04 +0000166; GCN-LABEL: {{^}}insert_w_offset:
167; GCN-DAG: s_load_dword [[IN:s[0-9]+]]
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000168; MOVREL-DAG: s_mov_b32 m0, [[IN]]
Matt Arsenault93401f42016-10-07 03:55:04 +0000169; GCN-DAG: v_mov_b32_e32 v[[ELT0:[0-9]+]], 1.0
170; GCN-DAG: v_mov_b32_e32 v[[ELT1:[0-9]+]], 2.0
171; GCN-DAG: v_mov_b32_e32 v[[ELT2:[0-9]+]], 0x40400000
172; GCN-DAG: v_mov_b32_e32 v[[ELT3:[0-9]+]], 4.0
173; GCN-DAG: v_mov_b32_e32 v[[INS:[0-9]+]], 0x40a00000
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000174
175; MOVREL: v_movreld_b32_e32 v[[ELT1]], v[[INS]]
176; MOVREL: buffer_store_dwordx4 v{{\[}}[[ELT0]]:[[ELT3]]{{\]}}
; Dynamic insert of 5.0 into a constant <4 x float> at index %in + 1;
; CHECKs pin v_movreld writing relative to the element-1 register.
Matt Arsenault3dbeefa2017-03-21 21:39:51 +0000177define amdgpu_kernel void @insert_w_offset(<4 x float> addrspace(1)* %out, i32 %in) {
Tom Stellardeef2ad92013-08-05 22:45:56 +0000178entry:
179 %0 = add i32 %in, 1
180 %1 = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %0
Matt Arsenaultf403df32016-08-26 06:31:32 +0000181 store <4 x float> %1, <4 x float> addrspace(1)* %out
Tom Stellardeef2ad92013-08-05 22:45:56 +0000182 ret void
183}
184
Matt Arsenault93401f42016-10-07 03:55:04 +0000185; GCN-LABEL: {{^}}insert_wo_offset:
186; GCN: s_load_dword [[IN:s[0-9]+]]
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000187
188; MOVREL: s_mov_b32 m0, [[IN]]
189; MOVREL: v_movreld_b32_e32 v[[ELT0:[0-9]+]]
190
191; IDXMODE: s_set_gpr_idx_on [[IN]], dst
192; IDXMODE-NEXT: v_mov_b32_e32 v[[ELT0:[0-9]+]], v{{[0-9]+}}
193; IDXMODE-NEXT: s_set_gpr_idx_off
194
Matt Arsenault93401f42016-10-07 03:55:04 +0000195; GCN: buffer_store_dwordx4 v{{\[}}[[ELT0]]:
; Dynamic insert with the raw index; checks MOVREL (m0) vs. index-mode
; (s_set_gpr_idx_on ..., dst) forms of the indexed write.
Matt Arsenault3dbeefa2017-03-21 21:39:51 +0000196define amdgpu_kernel void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) {
Tom Stellardeef2ad92013-08-05 22:45:56 +0000197entry:
198 %0 = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %in
Matt Arsenaultf403df32016-08-26 06:31:32 +0000199 store <4 x float> %0, <4 x float> addrspace(1)* %out
Tom Stellardeef2ad92013-08-05 22:45:56 +0000200 ret void
201}
Tom Stellard8b0182a2015-04-23 20:32:01 +0000202
Matt Arsenault93401f42016-10-07 03:55:04 +0000203; GCN-LABEL: {{^}}insert_neg_offset_sgpr:
Tom Stellard8b0182a2015-04-23 20:32:01 +0000204; The offset depends on the register that holds the first element of the vector.
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000205; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
206; MOVREL: v_movreld_b32_e32 v0, 5
207
208; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
209; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
210; IDXMODE-NEXT: v_mov_b32_e32 v0, 5
211; IDXMODE-NEXT: s_set_gpr_idx_off
; Insert with a uniform index biased by -512 into a constant vector; the bias
; is folded into the m0 / index-register add (0xfe00).
Matt Arsenault3dbeefa2017-03-21 21:39:51 +0000212define amdgpu_kernel void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, i32 %offset) {
Tom Stellard8b0182a2015-04-23 20:32:01 +0000213entry:
214 %index = add i32 %offset, -512
215 %value = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 5, i32 %index
216 store <4 x i32> %value, <4 x i32> addrspace(1)* %out
217 ret void
218}
219
Matt Arsenault28419272015-10-07 00:42:51 +0000220; The vector indexed into is originally loaded into an SGPR rather
221; than built with a reg_sequence
222
Matt Arsenault93401f42016-10-07 03:55:04 +0000223; GCN-LABEL: {{^}}insert_neg_offset_sgpr_loadreg:
Matt Arsenault28419272015-10-07 00:42:51 +0000224; The offset depends on the register that holds the first element of the vector.
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000225; MOVREL: s_add_i32 m0, s{{[0-9]+}}, 0xfffffe{{[0-9a-z]+}}
226; MOVREL: v_movreld_b32_e32 v0, 5
227
228; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
229; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
230; IDXMODE-NEXT: v_mov_b32_e32 v0, 5
231; IDXMODE-NEXT: s_set_gpr_idx_off
; Same as insert_neg_offset_sgpr, but the indexed vector is a kernel argument
; loaded into SGPRs rather than built with a reg_sequence (see note above).
Matt Arsenault3dbeefa2017-03-21 21:39:51 +0000232define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %offset) {
Matt Arsenault28419272015-10-07 00:42:51 +0000233entry:
234 %index = add i32 %offset, -512
235 %value = insertelement <4 x i32> %vec, i32 5, i32 %index
236 store <4 x i32> %value, <4 x i32> addrspace(1)* %out
237 ret void
238}
239
Matt Arsenault93401f42016-10-07 03:55:04 +0000240; GCN-LABEL: {{^}}insert_neg_offset_vgpr:
Tom Stellard8b0182a2015-04-23 20:32:01 +0000241; The offset depends on the register that holds the first element of the vector.
Matt Arsenaultcb540bc2016-07-19 00:35:03 +0000242
Matt Arsenault93401f42016-10-07 03:55:04 +0000243; GCN-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], 1{{$}}
244; GCN-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], 2{{$}}
245; GCN-DAG: v_mov_b32_e32 [[VEC_ELT2:v[0-9]+]], 3{{$}}
246; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 4{{$}}
Matt Arsenaultcb540bc2016-07-19 00:35:03 +0000247
Matt Arsenault93401f42016-10-07 03:55:04 +0000248; GCN: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
Matt Arsenault93401f42016-10-07 03:55:04 +0000249; GCN: [[LOOPBB:BB[0-9]+_[0-9]+]]:
250; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]]
Changpeng Fangda38b5f2018-02-16 16:31:30 +0000251; GCN: s_and_saveexec_b64 vcc, vcc
Matt Arsenaultcb540bc2016-07-19 00:35:03 +0000252
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000253; MOVREL: s_add_i32 m0, [[READLANE]], 0xfffffe00
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000254; MOVREL: v_movreld_b32_e32 [[VEC_ELT0]], 5
255
256; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
Changpeng Fangda38b5f2018-02-16 16:31:30 +0000257; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000258; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, 5
Changpeng Fangda38b5f2018-02-16 16:31:30 +0000259; IDXMODE: s_set_gpr_idx_off
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000260
261; GCN: s_cbranch_execnz [[LOOPBB]]
Matt Arsenault93401f42016-10-07 03:55:04 +0000262; GCN: s_mov_b64 exec, [[SAVEEXEC]]
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000263
Matt Arsenault93401f42016-10-07 03:55:04 +0000264; GCN: buffer_store_dword
; Insert with a divergent index (workitem id - 512): requires the waterfall
; loop around the indexed write (readfirstlane / saveexec / cbranch_execnz).
Matt Arsenault3dbeefa2017-03-21 21:39:51 +0000265define amdgpu_kernel void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
Tom Stellard8b0182a2015-04-23 20:32:01 +0000266entry:
Matt Arsenault9c47dd52016-02-11 06:02:01 +0000267 %id = call i32 @llvm.amdgcn.workitem.id.x() #1
Tom Stellard8b0182a2015-04-23 20:32:01 +0000268 %index = add i32 %id, -512
Matt Arsenaultcb540bc2016-07-19 00:35:03 +0000269 %value = insertelement <4 x i32> <i32 1, i32 2, i32 3, i32 4>, i32 5, i32 %index
Tom Stellard8b0182a2015-04-23 20:32:01 +0000270 store <4 x i32> %value, <4 x i32> addrspace(1)* %out
271 ret void
272}
273
Matt Arsenault93401f42016-10-07 03:55:04 +0000274; GCN-LABEL: {{^}}insert_neg_inline_offset_vgpr:
Matt Arsenaultcb540bc2016-07-19 00:35:03 +0000275
Matt Arsenault93401f42016-10-07 03:55:04 +0000276; GCN-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], 1{{$}}
277; GCN-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], 2{{$}}
278; GCN-DAG: v_mov_b32_e32 [[VEC_ELT2:v[0-9]+]], 3{{$}}
279; GCN-DAG: v_mov_b32_e32 [[VEC_ELT3:v[0-9]+]], 4{{$}}
280; GCN-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x1f4{{$}}
Matt Arsenaultcb540bc2016-07-19 00:35:03 +0000281
Matt Arsenault93401f42016-10-07 03:55:04 +0000282; GCN: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
Matt Arsenaultcb540bc2016-07-19 00:35:03 +0000283
Tom Stellard8b0182a2015-04-23 20:32:01 +0000284; The offset depends on the register that holds the first element of the vector.
Matt Arsenault93401f42016-10-07 03:55:04 +0000285; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]]
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000286
287; MOVREL: s_add_i32 m0, [[READLANE]], -16
288; MOVREL: v_movreld_b32_e32 [[VEC_ELT0]], [[VAL]]
289
290; IDXMODE: s_add_i32 [[ADD_IDX:s[0-9]+]], [[READLANE]], -16
Changpeng Fangda38b5f2018-02-16 16:31:30 +0000291; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000292; IDXMODE: v_mov_b32_e32 [[VEC_ELT0]], [[VAL]]
Changpeng Fangda38b5f2018-02-16 16:31:30 +0000293; IDXMODE: s_set_gpr_idx_off
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000294
Matt Arsenault93401f42016-10-07 03:55:04 +0000295; GCN: s_cbranch_execnz
; Divergent insert with a small (-16) bias that fits an inline constant in the
; index add; the inserted value 500 (0x1f4) is not an inline immediate.
Matt Arsenault3dbeefa2017-03-21 21:39:51 +0000296define amdgpu_kernel void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
Tom Stellard8b0182a2015-04-23 20:32:01 +0000297entry:
Matt Arsenault9c47dd52016-02-11 06:02:01 +0000298 %id = call i32 @llvm.amdgcn.workitem.id.x() #1
Tom Stellard8b0182a2015-04-23 20:32:01 +0000299 %index = add i32 %id, -16
Matt Arsenaultcb540bc2016-07-19 00:35:03 +0000300 %value = insertelement <4 x i32> <i32 1, i32 2, i32 3, i32 4>, i32 500, i32 %index
Tom Stellard8b0182a2015-04-23 20:32:01 +0000301 store <4 x i32> %value, <4 x i32> addrspace(1)* %out
302 ret void
303}
304
Matt Arsenault9babdf42016-06-22 20:15:28 +0000305; When the block is split to insert the loop, make sure any other
306; places that need to be expanded in the same block are also handled.
307
Matt Arsenault93401f42016-10-07 03:55:04 +0000308; GCN-LABEL: {{^}}extract_vgpr_offset_multiple_in_block:
Matt Arsenault9babdf42016-06-22 20:15:28 +0000309
Matt Arsenaultcb540bc2016-07-19 00:35:03 +0000310; FIXME: Why is vector copied in between?
311
Matt Arsenault4e309b02017-07-29 01:03:53 +0000312; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]]
Matt Arsenault93401f42016-10-07 03:55:04 +0000313; GCN-DAG: s_mov_b32 [[S_ELT1:s[0-9]+]], 9
314; GCN-DAG: s_mov_b32 [[S_ELT0:s[0-9]+]], 7
315; GCN-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], [[S_ELT0]]
316; GCN-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], [[S_ELT1]]
Matt Arsenault9babdf42016-06-22 20:15:28 +0000317
Matt Arsenault93401f42016-10-07 03:55:04 +0000318; GCN: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec
Matt Arsenault9babdf42016-06-22 20:15:28 +0000319
Matt Arsenault93401f42016-10-07 03:55:04 +0000320; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
Mark Searles70359ac2017-06-02 14:19:25 +0000321; GCN-NEXT: s_waitcnt vmcnt(0)
Matt Arsenault93401f42016-10-07 03:55:04 +0000322; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
323; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
Changpeng Fangda38b5f2018-02-16 16:31:30 +0000324; GCN: s_and_saveexec_b64 vcc, vcc
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000325
326; MOVREL: s_mov_b32 m0, [[READLANE]]
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000327; MOVREL: v_movrels_b32_e32 [[MOVREL0:v[0-9]+]], [[VEC_ELT0]]
328
Changpeng Fangda38b5f2018-02-16 16:31:30 +0000329; IDXMODE: s_set_gpr_idx_on [[READLANE]], src0
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000330; IDXMODE: v_mov_b32_e32 [[MOVREL0:v[0-9]+]], [[VEC_ELT0]]
Changpeng Fangda38b5f2018-02-16 16:31:30 +0000331; IDXMODE: s_set_gpr_idx_off
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000332
Matt Arsenault93401f42016-10-07 03:55:04 +0000333; GCN-NEXT: s_xor_b64 exec, exec, vcc
334; GCN-NEXT: s_cbranch_execnz [[LOOP0]]
Matt Arsenault9babdf42016-06-22 20:15:28 +0000335
336; FIXME: Redundant copy
Matt Arsenault93401f42016-10-07 03:55:04 +0000337; GCN: s_mov_b64 exec, [[MASK]]
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000338
Matt Arsenault93401f42016-10-07 03:55:04 +0000339; GCN: v_mov_b32_e32 [[VEC_ELT1_2:v[0-9]+]], [[S_ELT1]]
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000340
Matt Arsenault93401f42016-10-07 03:55:04 +0000341; GCN: s_mov_b64 [[MASK2:s\[[0-9]+:[0-9]+\]]], exec
Matt Arsenault9babdf42016-06-22 20:15:28 +0000342
Matt Arsenault93401f42016-10-07 03:55:04 +0000343; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]:
344; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
345; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
Changpeng Fangda38b5f2018-02-16 16:31:30 +0000346; GCN: s_and_saveexec_b64 vcc, vcc
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000347
348; MOVREL: s_mov_b32 m0, [[READLANE]]
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000349; MOVREL-NEXT: v_movrels_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT1_2]]
350
Changpeng Fangda38b5f2018-02-16 16:31:30 +0000351; IDXMODE: s_set_gpr_idx_on [[READLANE]], src0
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000352; IDXMODE-NEXT: v_mov_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT1_2]]
Changpeng Fangda38b5f2018-02-16 16:31:30 +0000353; IDXMODE: s_set_gpr_idx_off
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000354
Matt Arsenault93401f42016-10-07 03:55:04 +0000355; GCN-NEXT: s_xor_b64 exec, exec, vcc
356; GCN: s_cbranch_execnz [[LOOP1]]
Matt Arsenault9babdf42016-06-22 20:15:28 +0000357
Matt Arsenault93401f42016-10-07 03:55:04 +0000358; GCN: buffer_store_dword [[MOVREL0]]
359; GCN: buffer_store_dword [[MOVREL1]]
; Two divergent extracts in one basic block: when the block is split to insert
; the first waterfall loop, the second indexed operation in the same block
; must also be expanded (two LOOP labels in the CHECKs). The inline asm pins
; a value in s4 that must stay live across both loops.
; NOTE(review): both stores target %out0 and %out1 appears unused — possibly
; %val1 was meant to go to %out1, but changing it would alter the checked
; output; confirm upstream before touching.
Matt Arsenault3dbeefa2017-03-21 21:39:51 +0000360define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
Matt Arsenault9babdf42016-06-22 20:15:28 +0000361entry:
362 %id = call i32 @llvm.amdgcn.workitem.id.x() #1
363 %id.ext = zext i32 %id to i64
364 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext
365 %idx0 = load volatile i32, i32 addrspace(1)* %gep
366 %idx1 = add i32 %idx0, 1
367 %val0 = extractelement <4 x i32> <i32 7, i32 9, i32 11, i32 13>, i32 %idx0
Matt Arsenault3c7581b2017-06-08 19:03:20 +0000368 %live.out.reg = call i32 asm sideeffect "s_mov_b32 $0, 17", "={s4}" ()
Matt Arsenault9babdf42016-06-22 20:15:28 +0000369 %val1 = extractelement <4 x i32> <i32 7, i32 9, i32 11, i32 13>, i32 %idx1
370 store volatile i32 %val0, i32 addrspace(1)* %out0
371 store volatile i32 %val1, i32 addrspace(1)* %out0
Matt Arsenault3cb4dde2016-06-22 23:40:57 +0000372 %cmp = icmp eq i32 %id, 0
373 br i1 %cmp, label %bb1, label %bb2
374
375bb1:
376 store volatile i32 %live.out.reg, i32 addrspace(1)* undef
377 br label %bb2
378
379bb2:
Matt Arsenault9babdf42016-06-22 20:15:28 +0000380 ret void
381}
382
Matt Arsenault93401f42016-10-07 03:55:04 +0000383; GCN-LABEL: {{^}}insert_vgpr_offset_multiple_in_block:
384; GCN-DAG: s_load_dwordx4 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT3:[0-9]+]]{{\]}}
Matt Arsenault4e309b02017-07-29 01:03:53 +0000385; GCN-DAG: {{buffer|flat|global}}_load_dword [[IDX0:v[0-9]+]]
Matt Arsenault93401f42016-10-07 03:55:04 +0000386; GCN-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62
Matt Arsenault9babdf42016-06-22 20:15:28 +0000387
Matt Arsenault93401f42016-10-07 03:55:04 +0000388; GCN-DAG: v_mov_b32_e32 v[[VEC_ELT3:[0-9]+]], s[[S_ELT3]]
389; GCN: v_mov_b32_e32 v[[VEC_ELT2:[0-9]+]], s{{[0-9]+}}
390; GCN: v_mov_b32_e32 v[[VEC_ELT1:[0-9]+]], s{{[0-9]+}}
391; GCN: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]
Matt Arsenault9babdf42016-06-22 20:15:28 +0000392
Matt Arsenault93401f42016-10-07 03:55:04 +0000393; GCN: [[LOOP0:BB[0-9]+_[0-9]+]]:
Mark Searles70359ac2017-06-02 14:19:25 +0000394; GCN-NEXT: s_waitcnt vmcnt(0)
Matt Arsenault93401f42016-10-07 03:55:04 +0000395; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
396; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
Changpeng Fangda38b5f2018-02-16 16:31:30 +0000397; GCN: s_and_saveexec_b64 vcc, vcc
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000398
399; MOVREL: s_mov_b32 m0, [[READLANE]]
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000400; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT0]], [[INS0]]
401
Changpeng Fangda38b5f2018-02-16 16:31:30 +0000402; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000403; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT0]], [[INS0]]
Changpeng Fangda38b5f2018-02-16 16:31:30 +0000404; IDXMODE: s_set_gpr_idx_off
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000405
Matt Arsenault93401f42016-10-07 03:55:04 +0000406; GCN-NEXT: s_xor_b64 exec, exec, vcc
407; GCN: s_cbranch_execnz [[LOOP0]]
Matt Arsenault9babdf42016-06-22 20:15:28 +0000408
409; FIXME: Redundant copy
Matt Arsenault93401f42016-10-07 03:55:04 +0000410; GCN: s_mov_b64 exec, [[MASK:s\[[0-9]+:[0-9]+\]]]
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000411
Matt Arsenault93401f42016-10-07 03:55:04 +0000412; GCN: s_mov_b64 [[MASK]], exec
Matt Arsenault9babdf42016-06-22 20:15:28 +0000413
Matt Arsenault93401f42016-10-07 03:55:04 +0000414; GCN: [[LOOP1:BB[0-9]+_[0-9]+]]:
415; GCN-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
416; GCN: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
Changpeng Fangda38b5f2018-02-16 16:31:30 +0000417; GCN: s_and_saveexec_b64 vcc, vcc
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000418
419; MOVREL: s_mov_b32 m0, [[READLANE]]
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000420; MOVREL-NEXT: v_movreld_b32_e32 v[[VEC_ELT1]], 63
421
Changpeng Fangda38b5f2018-02-16 16:31:30 +0000422; IDXMODE: s_set_gpr_idx_on [[READLANE]], dst
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000423; IDXMODE-NEXT: v_mov_b32_e32 v[[VEC_ELT1]], 63
Changpeng Fangda38b5f2018-02-16 16:31:30 +0000424; IDXMODE: s_set_gpr_idx_off
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000425
Matt Arsenault93401f42016-10-07 03:55:04 +0000426; GCN-NEXT: s_xor_b64 exec, exec, vcc
427; GCN: s_cbranch_execnz [[LOOP1]]
Matt Arsenault9babdf42016-06-22 20:15:28 +0000428
Matt Arsenault93401f42016-10-07 03:55:04 +0000429; GCN: buffer_store_dwordx4 v{{\[}}[[VEC_ELT0]]:
Matt Arsenault3cb4dde2016-06-22 23:40:57 +0000430
Matt Arsenault93401f42016-10-07 03:55:04 +0000431; GCN: buffer_store_dword [[INS0]]
; Insert variant of the block-splitting test: two divergent inserts (at %idx0
; and %idx0+1) into the same vector value, each lowered to its own waterfall
; loop; the asm-produced VGPR value (62) must survive into bb1.
Matt Arsenault3dbeefa2017-03-21 21:39:51 +0000432define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 {
Matt Arsenault9babdf42016-06-22 20:15:28 +0000433entry:
434 %id = call i32 @llvm.amdgcn.workitem.id.x() #1
435 %id.ext = zext i32 %id to i64
436 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext
437 %idx0 = load volatile i32, i32 addrspace(1)* %gep
438 %idx1 = add i32 %idx0, 1
Matt Arsenault3cb4dde2016-06-22 23:40:57 +0000439 %live.out.val = call i32 asm sideeffect "v_mov_b32 $0, 62", "=v"()
440 %vec1 = insertelement <4 x i32> %vec0, i32 %live.out.val, i32 %idx0
Matt Arsenault9babdf42016-06-22 20:15:28 +0000441 %vec2 = insertelement <4 x i32> %vec1, i32 63, i32 %idx1
442 store volatile <4 x i32> %vec2, <4 x i32> addrspace(1)* %out0
Matt Arsenault3cb4dde2016-06-22 23:40:57 +0000443 %cmp = icmp eq i32 %id, 0
444 br i1 %cmp, label %bb1, label %bb2
445
446bb1:
447 store volatile i32 %live.out.val, i32 addrspace(1)* undef
448 br label %bb2
449
450bb2:
Matt Arsenault9babdf42016-06-22 20:15:28 +0000451 ret void
452}
453
Matt Arsenault9babdf42016-06-22 20:15:28 +0000454
Matt Arsenault93401f42016-10-07 03:55:04 +0000455; GCN-LABEL: {{^}}insert_adjacent_blocks:
; Undef-index inserts in two sibling blocks feeding a phi: only checks that
; codegen handles adjacent blocks both needing expansion without crashing.
Matt Arsenault3dbeefa2017-03-21 21:39:51 +0000456define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) #0 {
Matt Arsenault9babdf42016-06-22 20:15:28 +0000457bb:
458 %tmp = icmp eq i32 %arg, 0
459 br i1 %tmp, label %bb1, label %bb4
460
461bb1: ; preds = %bb
462 %tmp2 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
463 %tmp3 = insertelement <4 x float> %tmp2, float %val0, i32 undef
Matt Arsenaultad55ee52016-12-06 01:02:51 +0000464 call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp3) #0 ; Prevent block optimize out
Matt Arsenault9babdf42016-06-22 20:15:28 +0000465 br label %bb7
466
467bb4: ; preds = %bb
468 %tmp5 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
469 %tmp6 = insertelement <4 x float> %tmp5, float %val0, i32 undef
Matt Arsenaultad55ee52016-12-06 01:02:51 +0000470 call void asm sideeffect "; reg use $0", "v"(<4 x float> %tmp6) #0 ; Prevent block optimize out
Matt Arsenault9babdf42016-06-22 20:15:28 +0000471 br label %bb7
472
473bb7: ; preds = %bb4, %bb1
474 %tmp8 = phi <4 x float> [ %tmp3, %bb1 ], [ %tmp6, %bb4 ]
475 store volatile <4 x float> %tmp8, <4 x float> addrspace(1)* undef
476 ret void
477}
478
479; FIXME: Should be able to fold zero input to movreld to inline imm?
480
Matt Arsenault93401f42016-10-07 03:55:04 +0000481; GCN-LABEL: {{^}}multi_same_block:
Matt Arsenault9babdf42016-06-22 20:15:28 +0000482
Matt Arsenault93401f42016-10-07 03:55:04 +0000483; GCN-DAG: v_mov_b32_e32 v[[VEC0_ELT0:[0-9]+]], 0x41880000
484; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41900000
485; GCN-DAG: v_mov_b32_e32 v[[VEC0_ELT2:[0-9]+]], 0x41980000
486; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41a00000
487; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41a80000
488; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41b00000
489; GCN-DAG: s_load_dword [[ARG:s[0-9]+]]
Matthias Braun325cd2c2016-11-11 01:34:21 +0000490; IDXMODE-DAG: s_add_i32 [[ARG_ADD:s[0-9]+]], [[ARG]], -16
Matt Arsenaultcb540bc2016-07-19 00:35:03 +0000491
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000492; MOVREL-DAG: s_add_i32 m0, [[ARG]], -16
493; MOVREL: v_movreld_b32_e32 v[[VEC0_ELT0]], 4.0
Matt Arsenault93401f42016-10-07 03:55:04 +0000494; GCN-NOT: m0
Matt Arsenaultcb540bc2016-07-19 00:35:03 +0000495
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000496; IDXMODE: s_set_gpr_idx_on [[ARG_ADD]], dst
497; IDXMODE: v_mov_b32_e32 v[[VEC0_ELT0]], 4.0
498; IDXMODE: s_set_gpr_idx_off
499
Matt Arsenault93401f42016-10-07 03:55:04 +0000500; GCN: v_mov_b32_e32 v[[VEC0_ELT2]], 0x4188cccd
501; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x4190cccd
502; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x4198cccd
503; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41a0cccd
504; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41a8cccd
505; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41b0cccd
Matt Arsenaultd486d3f2016-10-12 18:49:05 +0000506
507; MOVREL: v_movreld_b32_e32 v[[VEC0_ELT2]], -4.0
508
509; IDXMODE: s_set_gpr_idx_on [[ARG_ADD]], dst
510; IDXMODE: v_mov_b32_e32 v[[VEC0_ELT2]], -4.0
511; IDXMODE: s_set_gpr_idx_off
Matt Arsenault9babdf42016-06-22 20:15:28 +0000512
Matt Arsenault3f71c0e2017-11-29 00:55:57 +0000513; PREGFX9: s_mov_b32 m0, -1
514; GFX9-NOT: s_mov_b32 m0
Matt Arsenault93401f42016-10-07 03:55:04 +0000515; GCN: ds_write_b32
516; GCN: ds_write_b32
517; GCN: s_endpgm
; Two indexed inserts into different <6 x float> vectors in one block sharing
; the same (%arg - 16) index; CHECKs verify the index setup is materialized
; for each indexed write, and the LDS stores' m0 handling per generation.
Matt Arsenault3dbeefa2017-03-21 21:39:51 +0000518define amdgpu_kernel void @multi_same_block(i32 %arg) #0 {
Matt Arsenault9babdf42016-06-22 20:15:28 +0000519bb:
520 %tmp1 = add i32 %arg, -16
Matt Arsenaultcb540bc2016-07-19 00:35:03 +0000521 %tmp2 = insertelement <6 x float> <float 1.700000e+01, float 1.800000e+01, float 1.900000e+01, float 2.000000e+01, float 2.100000e+01, float 2.200000e+01>, float 4.000000e+00, i32 %tmp1
Matt Arsenault9babdf42016-06-22 20:15:28 +0000522 %tmp3 = add i32 %arg, -16
Matt Arsenaultcb540bc2016-07-19 00:35:03 +0000523 %tmp4 = insertelement <6 x float> <float 0x40311999A0000000, float 0x40321999A0000000, float 0x40331999A0000000, float 0x40341999A0000000, float 0x40351999A0000000, float 0x40361999A0000000>, float -4.0, i32 %tmp3
Matt Arsenault9babdf42016-06-22 20:15:28 +0000524 %tmp5 = bitcast <6 x float> %tmp2 to <6 x i32>
525 %tmp6 = extractelement <6 x i32> %tmp5, i32 1
526 %tmp7 = bitcast <6 x float> %tmp4 to <6 x i32>
527 %tmp8 = extractelement <6 x i32> %tmp7, i32 5
528 store volatile i32 %tmp6, i32 addrspace(3)* undef, align 4
529 store volatile i32 %tmp8, i32 addrspace(3)* undef, align 4
530 ret void
531}
532
; offset puts outside of superregister boundaries, so clamp to 1st element.
; NOTE(review): the clamp comment above looks like it belongs to the
; out-of-bounds test that follows; in this function %idx + 3 is the highest
; in-bounds index of the <4 x i32> vector, and the checks below expect the
; extract to read from the high element of the loaded register tuple.
; GCN-LABEL: {{^}}extract_largest_inbounds_offset:
; GCN-DAG: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]{{\]}}
; GCN-DAG: s_load_dword [[IDX:s[0-9]+]]
; MOVREL: s_mov_b32 m0, [[IDX]]
; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[HI_ELT]]

; IDXMODE: s_set_gpr_idx_on [[IDX]], src0
; IDXMODE: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], v[[HI_ELT]]
; IDXMODE: s_set_gpr_idx_off

; GCN: buffer_store_dword [[EXTRACT]]
define amdgpu_kernel void @extract_largest_inbounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
entry:
  %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
  %offset = add i32 %idx, 3
  %value = extractelement <4 x i32> %ld, i32 %offset
  store i32 %value, i32 addrspace(1)* %out
  ret void
}
553
; GCN-LABEL: {{^}}extract_out_of_bounds_offset:
; GCN-DAG: buffer_load_dwordx4 v{{\[}}[[LO_ELT:[0-9]+]]:[[HI_ELT:[0-9]+]]{{\]}}
; GCN-DAG: s_load_dword [[IDX:s[0-9]+]]
; MOVREL: s_add_i32 m0, [[IDX]], 4
; MOVREL: v_movrels_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]

; IDXMODE: s_add_i32 [[ADD_IDX:s[0-9]+]], [[IDX]], 4
; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], src0
; IDXMODE: v_mov_b32_e32 [[EXTRACT:v[0-9]+]], v[[LO_ELT]]
; IDXMODE: s_set_gpr_idx_off

; GCN: buffer_store_dword [[EXTRACT]]
; The constant part of the index (%idx + 4) is >= the vector length of the
; <4 x i32> load, so it cannot be folded away; the checks above expect the
; +4 to be added to the index register and the low element of the loaded
; register tuple to be used as the indirect base.
define amdgpu_kernel void @extract_out_of_bounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
entry:
  %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
  %offset = add i32 %idx, 4
  %value = extractelement <4 x i32> %ld, i32 %offset
  store i32 %value, i32 addrspace(1)* %out
  ret void
}
574
; Test that the or is folded into the base address register instead of
; added to m0
; The index is (%idx.in << 2) | 1; the GCN-NOT check below verifies the
; shifted value is not used again after the index register is set up, i.e.
; the constant |1 part is folded into the choice of base register rather
; than materialized as extra index arithmetic.

; GCN-LABEL: {{^}}extractelement_v4i32_or_index:
; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
; GCN-NOT: [[IDX_SHL]]

; MOVREL: s_mov_b32 m0, [[IDX_SHL]]
; MOVREL: v_movrels_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}

; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], src0
; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; IDXMODE: s_set_gpr_idx_off
define amdgpu_kernel void @extractelement_v4i32_or_index(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx.in) {
entry:
  %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
  %idx.shl = shl i32 %idx.in, 2
  %idx = or i32 %idx.shl, 1
  %value = extractelement <4 x i32> %ld, i32 %idx
  store i32 %value, i32 addrspace(1)* %out
  ret void
}
598
; Same or-folded index pattern as the extract test above, but for the
; insert (dst-indexing) direction: v_movreld / s_set_gpr_idx_on ... dst.
; GCN-LABEL: {{^}}insertelement_v4f32_or_index:
; GCN: s_load_dword [[IDX_IN:s[0-9]+]]
; GCN: s_lshl_b32 [[IDX_SHL:s[0-9]+]], [[IDX_IN]]
; GCN-NOT: [[IDX_SHL]]

; MOVREL: s_mov_b32 m0, [[IDX_SHL]]
; MOVREL: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}

; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], dst
; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; IDXMODE: s_set_gpr_idx_off
define amdgpu_kernel void @insertelement_v4f32_or_index(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %idx.in) nounwind {
  %idx.shl = shl i32 %idx.in, 2
  %idx = or i32 %idx.shl, 1
  %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %idx
  store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
  ret void
}
617
; GCN-LABEL: {{^}}broken_phi_bb:
; GCN: v_mov_b32_e32 [[PHIREG:v[0-9]+]], 8

; GCN: s_branch [[BB2:BB[0-9]+_[0-9]+]]

; GCN: {{^BB[0-9]+_[0-9]+}}:
; GCN: s_mov_b64 exec,

; GCN: [[BB2]]:
; GCN: v_cmp_le_i32_e32 vcc, s{{[0-9]+}}, [[PHIREG]]
; GCN: buffer_load_dword

; GCN: [[REGLOOP:BB[0-9]+_[0-9]+]]:
; MOVREL: v_movreld_b32_e32

; IDXMODE: s_set_gpr_idx_on
; IDXMODE: v_mov_b32_e32
; IDXMODE: s_set_gpr_idx_off

; GCN: s_cbranch_execnz [[REGLOOP]]
; The insertelement index (%vgpr) comes from a volatile global load, so it
; lives in a VGPR and may differ per lane; the checks above expect the
; indirect move to sit inside a loop guarded by s_cbranch_execnz with exec
; manipulation, processing the index uniformly per iteration.  The phi in
; %bb2 feeds the loop-exit comparison and is overwritten via %tmp7 each trip.
define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) #0 {
bb:
  br label %bb2

bb2:                                              ; preds = %bb4, %bb
  %tmp = phi i32 [ 8, %bb ], [ %tmp7, %bb4 ]
  %tmp3 = icmp slt i32 %tmp, %arg
  br i1 %tmp3, label %bb4, label %bb8

bb4:                                              ; preds = %bb2
  %vgpr = load volatile i32, i32 addrspace(1)* undef
  %tmp5 = insertelement <8 x i32> undef, i32 undef, i32 %vgpr
  %tmp6 = insertelement <8 x i32> %tmp5, i32 %arg1, i32 %vgpr
  %tmp7 = extractelement <8 x i32> %tmp6, i32 0
  br label %bb2

bb8:                                              ; preds = %bb2
  ret void
}
657
Matt Arsenault9c47dd52016-02-11 06:02:01 +0000658declare i32 @llvm.amdgcn.workitem.id.x() #1
Matt Arsenaultad55ee52016-12-06 01:02:51 +0000659declare void @llvm.amdgcn.s.barrier() #2
Matt Arsenault9c47dd52016-02-11 06:02:01 +0000660
Matt Arsenault9babdf42016-06-22 20:15:28 +0000661attributes #0 = { nounwind }
Tom Stellard8b0182a2015-04-23 20:32:01 +0000662attributes #1 = { nounwind readnone }
Matt Arsenaultad55ee52016-12-06 01:02:51 +0000663attributes #2 = { nounwind convergent }