; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X64

; FIXME: If we are transferring XMM conversion results to MMX registers we could use the MMX equivalents
; (CVTPD2PI/CVTTPD2PI + CVTPS2PI/CVTTPS2PI) without affecting rounding/exceptions etc.
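; As a rough illustration of that FIXME (hand-written sketch, not current codegen; the
; exact register choices are assumptions), the X64 body of cvt_v2f64_v2i32 below could
; fold the cvtpd2dq + movdq2q pair into a single MMX-destination conversion:
;   cvtpd2pi %xmm0, %mm0  # 2 x f64 in %xmm0 -> 2 x i32 directly in an MMX register
;   paddd    %mm0, %mm0
;   movq     %mm0, (%rdi)
; with CVTTPD2PI, CVTPS2PI and CVTTPS2PI covering the truncating and single-precision cases.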

define void @cvt_v2f64_v2i32(<2 x double>, <1 x i64>*) nounwind {
; X86-LABEL: cvt_v2f64_v2i32:
; X86: # BB#0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $16, %esp
; X86-NEXT: movl 8(%ebp), %eax
; X86-NEXT: cvtpd2dq %xmm0, %xmm0
; X86-NEXT: movd %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X86-NEXT: movd %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
; X86-NEXT: paddd %mm0, %mm0
; X86-NEXT: movq %mm0, (%esp)
; X86-NEXT: movl (%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl %edx, 4(%eax)
; X86-NEXT: movl %ecx, (%eax)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: cvt_v2f64_v2i32:
; X64: # BB#0:
; X64-NEXT: cvtpd2dq %xmm0, %xmm0
; X64-NEXT: movdq2q %xmm0, %mm0
; X64-NEXT: paddd %mm0, %mm0
; X64-NEXT: movq %mm0, (%rdi)
; X64-NEXT: retq
  %3 = tail call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %0)
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  %5 = extractelement <2 x i64> %4, i32 0
  %6 = bitcast i64 %5 to x86_mmx
  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
  %8 = bitcast x86_mmx %7 to i64
  %9 = insertelement <1 x i64> undef, i64 %8, i32 0
  store <1 x i64> %9, <1 x i64>* %1
  ret void
}

define void @cvtt_v2f64_v2i32(<2 x double>, <1 x i64>*) nounwind {
; X86-LABEL: cvtt_v2f64_v2i32:
; X86: # BB#0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $16, %esp
; X86-NEXT: movl 8(%ebp), %eax
; X86-NEXT: cvttpd2dq %xmm0, %xmm0
; X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X86-NEXT: movd %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
; X86-NEXT: paddd %mm0, %mm0
; X86-NEXT: movq %mm0, (%esp)
; X86-NEXT: movl (%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl %edx, 4(%eax)
; X86-NEXT: movl %ecx, (%eax)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: cvtt_v2f64_v2i32:
; X64: # BB#0:
; X64-NEXT: cvttpd2dq %xmm0, %xmm0
; X64-NEXT: movdq2q %xmm0, %mm0
; X64-NEXT: paddd %mm0, %mm0
; X64-NEXT: movq %mm0, (%rdi)
; X64-NEXT: retq
  %3 = tail call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %0)
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  %5 = extractelement <2 x i64> %4, i32 0
  %6 = bitcast i64 %5 to x86_mmx
  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
  %8 = bitcast x86_mmx %7 to i64
  %9 = insertelement <1 x i64> undef, i64 %8, i32 0
  store <1 x i64> %9, <1 x i64>* %1
  ret void
}

define void @fptosi_v2f64_v2i32(<2 x double>, <1 x i64>*) nounwind {
; X86-LABEL: fptosi_v2f64_v2i32:
; X86: # BB#0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $16, %esp
; X86-NEXT: movl 8(%ebp), %eax
; X86-NEXT: cvttpd2dq %xmm0, %xmm0
; X86-NEXT: movlpd %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
; X86-NEXT: paddd %mm0, %mm0
; X86-NEXT: movq %mm0, (%esp)
; X86-NEXT: movl (%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl %edx, 4(%eax)
; X86-NEXT: movl %ecx, (%eax)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: fptosi_v2f64_v2i32:
; X64: # BB#0:
; X64-NEXT: cvttpd2dq %xmm0, %xmm0
; X64-NEXT: movlpd %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
; X64-NEXT: paddd %mm0, %mm0
; X64-NEXT: movq %mm0, (%rdi)
; X64-NEXT: retq
  %3 = fptosi <2 x double> %0 to <2 x i32>
  %4 = bitcast <2 x i32> %3 to x86_mmx
  %5 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %4, x86_mmx %4)
  %6 = bitcast x86_mmx %5 to i64
  %7 = insertelement <1 x i64> undef, i64 %6, i32 0
  store <1 x i64> %7, <1 x i64>* %1
  ret void
}

define void @cvt_v2f32_v2i32(<4 x float>, <1 x i64>*) nounwind {
; X86-LABEL: cvt_v2f32_v2i32:
; X86: # BB#0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $16, %esp
; X86-NEXT: movl 8(%ebp), %eax
; X86-NEXT: cvtps2dq %xmm0, %xmm0
; X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X86-NEXT: movd %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
; X86-NEXT: paddd %mm0, %mm0
; X86-NEXT: movq %mm0, (%esp)
; X86-NEXT: movl (%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl %edx, 4(%eax)
; X86-NEXT: movl %ecx, (%eax)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: cvt_v2f32_v2i32:
; X64: # BB#0:
; X64-NEXT: cvtps2dq %xmm0, %xmm0
; X64-NEXT: movdq2q %xmm0, %mm0
; X64-NEXT: paddd %mm0, %mm0
; X64-NEXT: movq %mm0, (%rdi)
; X64-NEXT: retq
  %3 = tail call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %0)
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  %5 = extractelement <2 x i64> %4, i32 0
  %6 = bitcast i64 %5 to x86_mmx
  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
  %8 = bitcast x86_mmx %7 to i64
  %9 = insertelement <1 x i64> undef, i64 %8, i32 0
  store <1 x i64> %9, <1 x i64>* %1
  ret void
}

define void @cvtt_v2f32_v2i32(<4 x float>, <1 x i64>*) nounwind {
; X86-LABEL: cvtt_v2f32_v2i32:
; X86: # BB#0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $16, %esp
; X86-NEXT: movl 8(%ebp), %eax
; X86-NEXT: cvttps2dq %xmm0, %xmm0
; X86-NEXT: movd %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X86-NEXT: movd %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
; X86-NEXT: paddd %mm0, %mm0
; X86-NEXT: movq %mm0, (%esp)
; X86-NEXT: movl (%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl %edx, 4(%eax)
; X86-NEXT: movl %ecx, (%eax)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: cvtt_v2f32_v2i32:
; X64: # BB#0:
; X64-NEXT: cvttps2dq %xmm0, %xmm0
; X64-NEXT: movdq2q %xmm0, %mm0
; X64-NEXT: paddd %mm0, %mm0
; X64-NEXT: movq %mm0, (%rdi)
; X64-NEXT: retq
  %3 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %0)
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  %5 = extractelement <2 x i64> %4, i32 0
  %6 = bitcast i64 %5 to x86_mmx
  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
  %8 = bitcast x86_mmx %7 to i64
  %9 = insertelement <1 x i64> undef, i64 %8, i32 0
  store <1 x i64> %9, <1 x i64>* %1
  ret void
}

define void @fptosi_v2f32_v2i32(<4 x float>, <1 x i64>*) nounwind {
; X86-LABEL: fptosi_v2f32_v2i32:
; X86: # BB#0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $16, %esp
; X86-NEXT: movl 8(%ebp), %eax
; X86-NEXT: cvttps2dq %xmm0, %xmm0
; X86-NEXT: movd %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X86-NEXT: movd %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
; X86-NEXT: paddd %mm0, %mm0
; X86-NEXT: movq %mm0, (%esp)
; X86-NEXT: movl (%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl %edx, 4(%eax)
; X86-NEXT: movl %ecx, (%eax)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: fptosi_v2f32_v2i32:
; X64: # BB#0:
; X64-NEXT: cvttps2dq %xmm0, %xmm0
; X64-NEXT: movdq2q %xmm0, %mm0
; X64-NEXT: paddd %mm0, %mm0
; X64-NEXT: movq %mm0, (%rdi)
; X64-NEXT: retq
  %3 = fptosi <4 x float> %0 to <4 x i32>
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  %5 = extractelement <2 x i64> %4, i32 0
  %6 = bitcast i64 %5 to x86_mmx
  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
  %8 = bitcast x86_mmx %7 to i64
  %9 = insertelement <1 x i64> undef, i64 %8, i32 0
  store <1 x i64> %9, <1 x i64>* %1
  ret void
}

; FIXME: If we are transferring MMX registers to XMM for conversion we could use the MMX equivalents
; (CVTPI2PD + CVTPI2PS) without affecting rounding/exceptions etc.
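; Sketch only (hand-written, register choices assumed, not current codegen): for
; sitofp_v2i32_v2f64 below, the X64 movq2dq + cvtdq2pd pair could instead become a
; single MMX-source conversion:
;   movq     (%rdi), %mm0
;   paddd    %mm0, %mm0
;   cvtpi2pd %mm0, %xmm0  # 2 x i32 in %mm0 -> 2 x f64 directly in %xmm0
; with CVTPI2PS handling the single-precision case.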

define <2 x double> @sitofp_v2i32_v2f64(<1 x i64>*) nounwind {
; X86-LABEL: sitofp_v2i32_v2f64:
; X86: # BB#0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: movl 8(%ebp), %eax
; X86-NEXT: movq (%eax), %mm0
; X86-NEXT: paddd %mm0, %mm0
; X86-NEXT: movq %mm0, (%esp)
; X86-NEXT: cvtdq2pd (%esp), %xmm0
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: sitofp_v2i32_v2f64:
; X64: # BB#0:
; X64-NEXT: movq (%rdi), %mm0
; X64-NEXT: paddd %mm0, %mm0
; X64-NEXT: movq2dq %mm0, %xmm0
; X64-NEXT: cvtdq2pd %xmm0, %xmm0
; X64-NEXT: retq
  %2 = bitcast <1 x i64>* %0 to x86_mmx*
  %3 = load x86_mmx, x86_mmx* %2, align 8
  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
  %5 = bitcast x86_mmx %4 to i64
  %6 = insertelement <2 x i64> undef, i64 %5, i32 0
  %7 = bitcast <2 x i64> %6 to <4 x i32>
  %8 = shufflevector <4 x i32> %7, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %9 = sitofp <2 x i32> %8 to <2 x double>
  ret <2 x double> %9
}
288define <4 x float> @sitofp_v2i32_v2f32(<1 x i64>*) nounwind {
289; X86-LABEL: sitofp_v2i32_v2f32:
290; X86: # BB#0:
291; X86-NEXT: pushl %ebp
292; X86-NEXT: movl %esp, %ebp
293; X86-NEXT: andl $-8, %esp
294; X86-NEXT: subl $8, %esp
295; X86-NEXT: movl 8(%ebp), %eax
296; X86-NEXT: movq (%eax), %mm0
297; X86-NEXT: paddd %mm0, %mm0
298; X86-NEXT: movq %mm0, (%esp)
299; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
300; X86-NEXT: cvtdq2ps %xmm0, %xmm0
301; X86-NEXT: movl %ebp, %esp
302; X86-NEXT: popl %ebp
303; X86-NEXT: retl
304;
305; X64-LABEL: sitofp_v2i32_v2f32:
306; X64: # BB#0:
307; X64-NEXT: movq (%rdi), %mm0
308; X64-NEXT: paddd %mm0, %mm0
309; X64-NEXT: movd %mm0, %rax
310; X64-NEXT: movd %rax, %xmm0
311; X64-NEXT: cvtdq2ps %xmm0, %xmm0
312; X64-NEXT: retq
313 %2 = bitcast <1 x i64>* %0 to x86_mmx*
314 %3 = load x86_mmx, x86_mmx* %2, align 8
315 %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
316 %5 = bitcast x86_mmx %4 to i64
317 %6 = insertelement <2 x i64> undef, i64 %5, i32 0
318 %7 = insertelement <2 x i64> %6, i64 0, i32 1
319 %8 = bitcast <2 x i64> %7 to <4 x i32>
320 %9 = tail call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %8)
321 ret <4 x float> %9
322}
323
Simon Pilgrimc6b55722017-03-10 16:59:43 +0000324declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
325declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>)
326declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>)
327declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>)
328declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>)
Simon Pilgrimed655f02017-03-10 17:23:55 +0000329declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>)