; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefixes=CHECK,X64

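; The IR below chains MMX padd.d/padd.w/pmulu.dq intrinsics with a constant-zero
; x86_mmx operand (a bitcast of double 0.0). The checks expect that zero to be
; materialized via xorps + movdq2q, and intermediate MMX values to be spilled and
; reloaded (with folded memory operands) under register pressure.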
define double @mmx_zero(double, double, double, double) nounwind {
; X86-LABEL: mmx_zero:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $24, %esp
; X86-NEXT: movq 8(%ebp), %mm0
; X86-NEXT: movq 16(%ebp), %mm5
; X86-NEXT: movq %mm5, {{[0-9]+}}(%esp) # 8-byte Spill
; X86-NEXT: movq %mm0, %mm3
; X86-NEXT: paddd %mm5, %mm3
; X86-NEXT: xorps %xmm0, %xmm0
; X86-NEXT: movdq2q %xmm0, %mm1
; X86-NEXT: movq %mm1, (%esp) # 8-byte Spill
; X86-NEXT: movq %mm3, %mm6
; X86-NEXT: pmuludq %mm1, %mm6
; X86-NEXT: movq 24(%ebp), %mm4
; X86-NEXT: movq %mm6, %mm2
; X86-NEXT: paddd %mm4, %mm2
; X86-NEXT: paddw %mm2, %mm0
; X86-NEXT: movq %mm5, %mm1
; X86-NEXT: paddw %mm0, %mm1
; X86-NEXT: movq 32(%ebp), %mm5
; X86-NEXT: movq %mm1, %mm7
; X86-NEXT: pmuludq %mm5, %mm7
; X86-NEXT: paddw %mm4, %mm7
; X86-NEXT: paddw %mm7, %mm5
; X86-NEXT: paddw %mm5, %mm2
; X86-NEXT: paddw %mm2, %mm0
; X86-NEXT: paddw %mm6, %mm0
; X86-NEXT: pmuludq %mm3, %mm0
; X86-NEXT: paddw (%esp), %mm0 # 8-byte Folded Reload
; X86-NEXT: paddw %mm1, %mm0
; X86-NEXT: pmuludq %mm7, %mm0
; X86-NEXT: pmuludq {{[0-9]+}}(%esp), %mm0 # 8-byte Folded Reload
; X86-NEXT: paddw %mm5, %mm0
; X86-NEXT: paddw %mm2, %mm0
; X86-NEXT: movq2dq %mm0, %xmm0
; X86-NEXT: movsd %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: fldl {{[0-9]+}}(%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: mmx_zero:
; X64: # %bb.0:
; X64-NEXT: movdq2q %xmm0, %mm0
; X64-NEXT: movdq2q %xmm1, %mm5
; X64-NEXT: movq %mm5, -{{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: movq %mm0, %mm3
; X64-NEXT: paddd %mm5, %mm3
; X64-NEXT: xorps %xmm0, %xmm0
; X64-NEXT: movdq2q %xmm0, %mm1
; X64-NEXT: movq %mm1, -{{[0-9]+}}(%rsp) # 8-byte Spill
; X64-NEXT: movq %mm3, %mm6
; X64-NEXT: pmuludq %mm1, %mm6
; X64-NEXT: movdq2q %xmm2, %mm4
; X64-NEXT: movq %mm6, %mm2
; X64-NEXT: paddd %mm4, %mm2
; X64-NEXT: paddw %mm2, %mm0
; X64-NEXT: movq %mm5, %mm1
; X64-NEXT: paddw %mm0, %mm1
; X64-NEXT: movdq2q %xmm3, %mm5
; X64-NEXT: movq %mm1, %mm7
; X64-NEXT: pmuludq %mm5, %mm7
; X64-NEXT: paddw %mm4, %mm7
; X64-NEXT: paddw %mm7, %mm5
; X64-NEXT: paddw %mm5, %mm2
; X64-NEXT: paddw %mm2, %mm0
; X64-NEXT: paddw %mm6, %mm0
; X64-NEXT: pmuludq %mm3, %mm0
; X64-NEXT: paddw -{{[0-9]+}}(%rsp), %mm0 # 8-byte Folded Reload
; X64-NEXT: paddw %mm1, %mm0
; X64-NEXT: pmuludq %mm7, %mm0
; X64-NEXT: pmuludq -{{[0-9]+}}(%rsp), %mm0 # 8-byte Folded Reload
; X64-NEXT: paddw %mm5, %mm0
; X64-NEXT: paddw %mm2, %mm0
; X64-NEXT: movq2dq %mm0, %xmm0
; X64-NEXT: retq
  %5 = bitcast double %0 to x86_mmx
  %6 = bitcast double %1 to x86_mmx
  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %6)
  %8 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %7, x86_mmx bitcast (double 0.000000e+00 to x86_mmx))
  %9 = bitcast double %2 to x86_mmx
  %10 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %8, x86_mmx %9)
  %11 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %5, x86_mmx %10)
  %12 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %6, x86_mmx %11)
  %13 = bitcast double %3 to x86_mmx
  %14 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %12, x86_mmx %13)
  %15 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %14, x86_mmx %9)
  %16 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %15, x86_mmx %13)
  %17 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %16, x86_mmx %10)
  %18 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %17, x86_mmx %11)
  %19 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %18, x86_mmx %8)
  %20 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %19, x86_mmx %7)
  %21 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %20, x86_mmx bitcast (double 0.000000e+00 to x86_mmx))
  %22 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %21, x86_mmx %12)
  %23 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %22, x86_mmx %15)
  %24 = tail call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %23, x86_mmx %6)
  %25 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %24, x86_mmx %16)
  %26 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %25, x86_mmx %17)
  %27 = bitcast x86_mmx %26 to double
  ret double %27
}

declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx)
declare x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx, x86_mmx)