blob: 805b2f5ad2bf8e4cd6b06cccf1d01205b363d616 [file] [log] [blame]
Matt Arsenault167601e2018-08-30 05:49:28 +00001; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
3
4; The SelectionDAG builder was using the IR value kind to decide how to
5; split the types for copyToRegs/copyFromRegs in all contexts. This
6; was incorrect if an ABI-like value, such as a call result, was used
7; outside of its defining block. The value in that case is not used
8; directly, but through another set of copies to potentially different
9; register types in the parent block.
10
11; This would then end up producing inconsistent pairs of copies with
12; the wrong sizes when the vector-typed result of the call was split
13; into multiple pieces, but was expected to be a single register in
14; the cross-block copy.
15;
16; This isn't exactly ideal for AMDGPU, since in reality the
17; intermediate vector register type is undesirable anyway, but it
18; requires more work to be able to split all vector copies in all
19; contexts.
20;
21; This was only an issue if the value was used directly in another
22; block. If there was an intermediate operation or a phi it was fine,
23; since that didn't look like an ABI copy.
24
25
; Test case: the <2 x float> returned by @func_v2f32 is produced in %bb0 but
; consumed only in %bb1, so the call result is used directly from a different
; block than the one containing the call.
26define float @call_split_type_used_outside_block_v2f32() #0 {
27; GCN-LABEL: call_split_type_used_outside_block_v2f32:
28; GCN: ; %bb.0: ; %bb0
29; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30; GCN-NEXT: s_mov_b32 s5, s32
31; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
32; GCN-NEXT: v_writelane_b32 v32, s33, 0
33; GCN-NEXT: v_writelane_b32 v32, s34, 1
34; GCN-NEXT: s_add_u32 s32, s32, 0x400
35; GCN-NEXT: v_writelane_b32 v32, s35, 2
36; GCN-NEXT: s_getpc_b64 s[6:7]
37; GCN-NEXT: s_add_u32 s6, s6, func_v2f32@rel32@lo+4
38; GCN-NEXT: s_addc_u32 s7, s7, func_v2f32@rel32@hi+4
39; GCN-NEXT: s_mov_b64 s[34:35], s[30:31]
40; GCN-NEXT: s_mov_b32 s33, s5
41; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
42; GCN-NEXT: s_mov_b32 s5, s33
43; GCN-NEXT: s_mov_b64 s[30:31], s[34:35]
44; GCN-NEXT: v_readlane_b32 s35, v32, 2
45; GCN-NEXT: v_readlane_b32 s34, v32, 1
46; GCN-NEXT: v_readlane_b32 s33, v32, 0
47; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
48; GCN-NEXT: s_sub_u32 s32, s32, 0x400
49; GCN-NEXT: s_waitcnt vmcnt(0)
50; GCN-NEXT: s_setpc_b64 s[30:31]
; The expected output above contains no v_mov between the s_swappc_b64 call
; and the s_setpc_b64 return.
; NOTE(review): presumably element 0 of the result is already in the return
; register after the call -- confirm against the calling convention.
; %bb0 performs only the call; the result stays live across the branch.
51bb0:
52 %split.ret.type = call <2 x float> @func_v2f32()
53 br label %bb1
54
; %bb1 is the sole user of the call result: it returns element 0.
55bb1:
56 %extract = extractelement <2 x float> %split.ret.type, i32 0
57 ret float %extract
58}
59
; Test case: same shape as the v2f32 variant, but the callee returns
; <3 x float>. The call is in %bb0 and its result is consumed only in %bb1.
60define float @call_split_type_used_outside_block_v3f32() #0 {
61; GCN-LABEL: call_split_type_used_outside_block_v3f32:
62; GCN: ; %bb.0: ; %bb0
63; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
64; GCN-NEXT: s_mov_b32 s5, s32
65; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
66; GCN-NEXT: v_writelane_b32 v32, s33, 0
67; GCN-NEXT: v_writelane_b32 v32, s34, 1
68; GCN-NEXT: s_add_u32 s32, s32, 0x400
69; GCN-NEXT: v_writelane_b32 v32, s35, 2
70; GCN-NEXT: s_getpc_b64 s[6:7]
71; GCN-NEXT: s_add_u32 s6, s6, func_v3f32@rel32@lo+4
72; GCN-NEXT: s_addc_u32 s7, s7, func_v3f32@rel32@hi+4
73; GCN-NEXT: s_mov_b64 s[34:35], s[30:31]
74; GCN-NEXT: s_mov_b32 s33, s5
75; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
76; GCN-NEXT: s_mov_b32 s5, s33
77; GCN-NEXT: s_mov_b64 s[30:31], s[34:35]
78; GCN-NEXT: v_readlane_b32 s35, v32, 2
79; GCN-NEXT: v_readlane_b32 s34, v32, 1
80; GCN-NEXT: v_readlane_b32 s33, v32, 0
81; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
82; GCN-NEXT: s_sub_u32 s32, s32, 0x400
83; GCN-NEXT: s_waitcnt vmcnt(0)
84; GCN-NEXT: s_setpc_b64 s[30:31]
; Apart from the callee symbol, the expected instruction sequence above is the
; same as in the v2f32 test: no v_mov appears between the call and the return.
; %bb0 performs only the call; the result stays live across the branch.
85bb0:
86 %split.ret.type = call <3 x float> @func_v3f32()
87 br label %bb1
88
; %bb1 is the sole user of the call result: it returns element 0.
89bb1:
90 %extract = extractelement <3 x float> %split.ret.type, i32 0
91 ret float %extract
92}
93
; Test case: the callee returns <4 x half>. The call is in %bb0 and element 0
; of its result is extracted and returned from %bb1.
94define half @call_split_type_used_outside_block_v4f16() #0 {
95; GCN-LABEL: call_split_type_used_outside_block_v4f16:
96; GCN: ; %bb.0: ; %bb0
97; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
98; GCN-NEXT: s_mov_b32 s5, s32
99; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
100; GCN-NEXT: v_writelane_b32 v32, s33, 0
101; GCN-NEXT: v_writelane_b32 v32, s34, 1
102; GCN-NEXT: s_add_u32 s32, s32, 0x400
103; GCN-NEXT: v_writelane_b32 v32, s35, 2
104; GCN-NEXT: s_getpc_b64 s[6:7]
105; GCN-NEXT: s_add_u32 s6, s6, func_v4f16@rel32@lo+4
106; GCN-NEXT: s_addc_u32 s7, s7, func_v4f16@rel32@hi+4
107; GCN-NEXT: s_mov_b64 s[34:35], s[30:31]
108; GCN-NEXT: s_mov_b32 s33, s5
109; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
110; GCN-NEXT: s_mov_b32 s5, s33
111; GCN-NEXT: s_mov_b64 s[30:31], s[34:35]
112; GCN-NEXT: v_readlane_b32 s35, v32, 2
113; GCN-NEXT: v_readlane_b32 s34, v32, 1
114; GCN-NEXT: v_readlane_b32 s33, v32, 0
115; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
116; GCN-NEXT: s_sub_u32 s32, s32, 0x400
117; GCN-NEXT: s_waitcnt vmcnt(0)
118; GCN-NEXT: s_setpc_b64 s[30:31]
; As in the float-vector tests, the expected output above has no v_mov between
; the s_swappc_b64 call and the s_setpc_b64 return.
; %bb0 performs only the call; the result stays live across the branch.
119bb0:
120 %split.ret.type = call <4 x half> @func_v4f16()
121 br label %bb1
122
; %bb1 is the sole user of the call result: it returns element 0.
123bb1:
124 %extract = extractelement <4 x half> %split.ret.type, i32 0
125 ret half %extract
126}
127
; Test case: the callee returns the aggregate { <4 x i32>, <4 x half> }. The
; call is in %bb0; %bb1 takes element 0 of each vector member and repacks the
; two scalars into the returned { i32, half }.
128define { i32, half } @call_split_type_used_outside_block_struct() #0 {
129; GCN-LABEL: call_split_type_used_outside_block_struct:
130; GCN: ; %bb.0: ; %bb0
131; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
132; GCN-NEXT: s_mov_b32 s5, s32
133; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
134; GCN-NEXT: v_writelane_b32 v32, s33, 0
135; GCN-NEXT: v_writelane_b32 v32, s34, 1
136; GCN-NEXT: s_add_u32 s32, s32, 0x400
137; GCN-NEXT: v_writelane_b32 v32, s35, 2
138; GCN-NEXT: s_getpc_b64 s[6:7]
139; GCN-NEXT: s_add_u32 s6, s6, func_struct@rel32@lo+4
140; GCN-NEXT: s_addc_u32 s7, s7, func_struct@rel32@hi+4
141; GCN-NEXT: s_mov_b64 s[34:35], s[30:31]
142; GCN-NEXT: s_mov_b32 s33, s5
143; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
144; GCN-NEXT: s_mov_b32 s5, s33
145; GCN-NEXT: s_mov_b64 s[30:31], s[34:35]
146; GCN-NEXT: v_readlane_b32 s35, v32, 2
147; GCN-NEXT: v_readlane_b32 s34, v32, 1
148; GCN-NEXT: v_readlane_b32 s33, v32, 0
149; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
150; GCN-NEXT: v_mov_b32_e32 v1, v4
151; GCN-NEXT: s_sub_u32 s32, s32, 0x400
152; GCN-NEXT: s_waitcnt vmcnt(0)
153; GCN-NEXT: s_setpc_b64 s[30:31]
; Unlike the plain vector tests, the expected output above contains one copy
; after the call: v_mov_b32_e32 v1, v4.
; NOTE(review): presumably the <4 x i32> member is returned in v0-v3 and the
; <4 x half> member starts at v4, so its first element must be moved to v1 for
; the { i32, half } return -- confirm against the calling convention.
; %bb0 performs only the call; the aggregate stays live across the branch.
154bb0:
155 %split.ret.type = call { <4 x i32>, <4 x half> } @func_struct()
156 br label %bb1
157
; %bb1 is the sole user of the call result.
158bb1:
; Split the aggregate into its two vector members.
159 %val0 = extractvalue { <4 x i32>, <4 x half> } %split.ret.type, 0
160 %val1 = extractvalue { <4 x i32>, <4 x half> } %split.ret.type, 1
; Take element 0 of each vector.
161 %extract0 = extractelement <4 x i32> %val0, i32 0
162 %extract1 = extractelement <4 x half> %val1, i32 0
; Build the { i32, half } return value from the two scalars.
163 %ins0 = insertvalue { i32, half } undef, i32 %extract0, 0
164 %ins1 = insertvalue { i32, half } %ins0, half %extract1, 1
165 ret { i32, half } %ins1
166}
167
168
; Bodiless declarations of the callees used by the tests; each returns the
; vector or aggregate type that its test consumes across a block boundary.
169declare <2 x float> @func_v2f32() #0
170declare <3 x float> @func_v3f32() #0
; NOTE(review): @func_v4f32 has no caller in the visible part of this file.
171declare <4 x float> @func_v4f32() #0
172declare <4 x half> @func_v4f16() #0
173
174declare { <4 x i32>, <4 x half> } @func_struct() #0
175
; Attribute group #0 is attached to every define and declare shown above.
176attributes #0 = { nounwind}