; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefixes=CHECK,X64

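; Descriptive note (not in the autogenerated output): the chain of MMX
; intrinsics below keeps all eight %mm registers live, so the all-zeros
; operand, bitcast (double 0.0 to x86_mmx), is materialized once via
; xorps+movdq2q and spilled. The "8-byte Folded Reload" checks expect the
; spill slot to be folded directly into paddw/pmuludq rather than the zero
; being rematerialized at each use.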
define double @mmx_zero(double, double, double, double) nounwind {
; X86-LABEL: mmx_zero:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $24, %esp
; X86-NEXT:    movq 8(%ebp), %mm0
; X86-NEXT:    movq 16(%ebp), %mm5
; X86-NEXT:    movq %mm5, {{[0-9]+}}(%esp) # 8-byte Spill
; X86-NEXT:    movq %mm0, %mm3
; X86-NEXT:    paddd %mm5, %mm3
; X86-NEXT:    xorps %xmm0, %xmm0
; X86-NEXT:    movdq2q %xmm0, %mm1
; X86-NEXT:    movq %mm1, (%esp) # 8-byte Spill
; X86-NEXT:    movq %mm3, %mm6
; X86-NEXT:    pmuludq %mm1, %mm6
; X86-NEXT:    movq 24(%ebp), %mm4
; X86-NEXT:    movq %mm6, %mm2
; X86-NEXT:    paddd %mm4, %mm2
; X86-NEXT:    paddw %mm2, %mm0
; X86-NEXT:    movq %mm5, %mm1
; X86-NEXT:    paddw %mm0, %mm1
; X86-NEXT:    movq 32(%ebp), %mm5
; X86-NEXT:    movq %mm1, %mm7
; X86-NEXT:    pmuludq %mm5, %mm7
; X86-NEXT:    paddw %mm4, %mm7
; X86-NEXT:    paddw %mm7, %mm5
; X86-NEXT:    paddw %mm5, %mm2
; X86-NEXT:    paddw %mm2, %mm0
; X86-NEXT:    paddw %mm6, %mm0
; X86-NEXT:    pmuludq %mm3, %mm0
; X86-NEXT:    paddw (%esp), %mm0 # 8-byte Folded Reload
; X86-NEXT:    paddw %mm1, %mm0
; X86-NEXT:    pmuludq %mm7, %mm0
; X86-NEXT:    pmuludq {{[0-9]+}}(%esp), %mm0 # 8-byte Folded Reload
; X86-NEXT:    paddw %mm5, %mm0
; X86-NEXT:    paddw %mm2, %mm0
; X86-NEXT:    movq2dq %mm0, %xmm0
; X86-NEXT:    movsd %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    fldl {{[0-9]+}}(%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: mmx_zero:
; X64:       # %bb.0:
; X64-NEXT:    movdq2q %xmm0, %mm0
; X64-NEXT:    movdq2q %xmm1, %mm5
; X64-NEXT:    movq %mm5, -{{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT:    movq %mm0, %mm3
; X64-NEXT:    paddd %mm5, %mm3
; X64-NEXT:    xorps %xmm0, %xmm0
; X64-NEXT:    movdq2q %xmm0, %mm1
; X64-NEXT:    movq %mm1, -{{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT:    movq %mm3, %mm6
; X64-NEXT:    pmuludq %mm1, %mm6
; X64-NEXT:    movdq2q %xmm2, %mm4
; X64-NEXT:    movq %mm6, %mm2
; X64-NEXT:    paddd %mm4, %mm2
; X64-NEXT:    paddw %mm2, %mm0
; X64-NEXT:    movq %mm5, %mm1
; X64-NEXT:    paddw %mm0, %mm1
; X64-NEXT:    movdq2q %xmm3, %mm5
; X64-NEXT:    movq %mm1, %mm7
; X64-NEXT:    pmuludq %mm5, %mm7
; X64-NEXT:    paddw %mm4, %mm7
; X64-NEXT:    paddw %mm7, %mm5
; X64-NEXT:    paddw %mm5, %mm2
; X64-NEXT:    paddw %mm2, %mm0
; X64-NEXT:    paddw %mm6, %mm0
; X64-NEXT:    pmuludq %mm3, %mm0
; X64-NEXT:    paddw -{{[0-9]+}}(%rsp), %mm0 # 8-byte Folded Reload
; X64-NEXT:    paddw %mm1, %mm0
; X64-NEXT:    pmuludq %mm7, %mm0
; X64-NEXT:    pmuludq -{{[0-9]+}}(%rsp), %mm0 # 8-byte Folded Reload
; X64-NEXT:    paddw %mm5, %mm0
; X64-NEXT:    paddw %mm2, %mm0
; X64-NEXT:    movq2dq %mm0, %xmm0
; X64-NEXT:    retq
  %5 = bitcast double %0 to x86_mmx
  %6 = bitcast double %1 to x86_mmx
  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %6)
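  ; Note: both zero operands below (%8 and %21) are the same constant
  ; expression, bitcast (double 0.0 to x86_mmx), which the checks above
  ; expect to be built once and reused from the spill slot.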
  %8 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %7, x86_mmx bitcast (double 0.000000e+00 to x86_mmx))
  %9 = bitcast double %2 to x86_mmx
  %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %8, x86_mmx %9)
  %11 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %5, x86_mmx %10)
  %12 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %6, x86_mmx %11)
  %13 = bitcast double %3 to x86_mmx
  %14 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %12, x86_mmx %13)
  %15 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %14, x86_mmx %9)
  %16 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %15, x86_mmx %13)
  %17 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %16, x86_mmx %10)
  %18 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %17, x86_mmx %11)
  %19 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %18, x86_mmx %8)
  %20 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %19, x86_mmx %7)
  %21 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %20, x86_mmx bitcast (double 0.000000e+00 to x86_mmx))
  %22 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %21, x86_mmx %12)
  %23 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %22, x86_mmx %15)
  %24 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %23, x86_mmx %6)
  %25 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %24, x86_mmx %16)
  %26 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %25, x86_mmx %17)
  %27 = bitcast x86_mmx %26 to double
  ret double %27
}

declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx, x86_mmx)