llvm/lib/Target/X86/README-MMX.txt - toolchain/llvm-project - Gitiles

 //===---------------------------------------------------------------------===//
 // Random ideas for the X86 backend: MMX-specific stuff.
 //===---------------------------------------------------------------------===//

 //===---------------------------------------------------------------------===//

 This:

 #include <mmintrin.h>

 __v2si qux(int A) {
   return (__v2si){ 0, A };
 }

 is compiled into:

 _qux:
         subl $28, %esp
         movl 32(%esp), %eax
         movd %eax, %mm0
         movq %mm0, (%esp)
         movl (%esp), %eax
         movl %eax, 20(%esp)
         movq %mm0, 8(%esp)
         movl 12(%esp), %eax
         movl %eax, 16(%esp)
         movq 16(%esp), %mm0
         addl $28, %esp
         ret

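 Yuck! The value is stored to and reloaded from the stack several times before
 it finally ends up back in %mm0.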

 GCC gives us:

 _qux:
         subl    $12, %esp
         movl    16(%esp), %eax
         movl    20(%esp), %edx
         movl    $0, (%eax)
         movl    %edx, 4(%eax)
         addl    $12, %esp
         ret     $4

 //===---------------------------------------------------------------------===//

 int main() {
   __m64 A[1] = { _mm_cvtsi32_si64(1)  };
   __m64 B[1] = { _mm_cvtsi32_si64(10) };
   __m64 sum = _mm_cvtsi32_si64(0);

   sum = __builtin_ia32_paddq(__builtin_ia32_paddq(A[0], B[0]), sum);

   printf("Sum = %d\n", _mm_cvtsi64_si32(sum));
   return 0;
 }

 Generates:

         movl $11, %eax
 ###     movd %eax, %mm0
 ###     movq %mm0, 8(%esp)
 ###     movl 8(%esp), %eax
         movl %eax, 4(%esp)
         movl $_str, (%esp)
         call L_printf$stub
         xorl %eax, %eax
         addl $28, %esp

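 The instructions marked with ### are unnecessary: the constant 11 is already in
 %eax, so moving it through %mm0 and the stack just reloads the same value.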
	//===---------------------------------------------------------------------===//
	// Random ideas for the X86 backend: MMX-specific stuff.
	//===---------------------------------------------------------------------===//

	//===---------------------------------------------------------------------===//

	This:

	#include <mmintrin.h>

	__v2si qux(int A) {
	return (__v2si){ 0, A };
	}

	is compiled into:

	_qux:
	subl $28, %esp
	movl 32(%esp), %eax
	movd %eax, %mm0
	movq %mm0, (%esp)
	movl (%esp), %eax
	movl %eax, 20(%esp)
	movq %mm0, 8(%esp)
	movl 12(%esp), %eax
	movl %eax, 16(%esp)
	movq 16(%esp), %mm0
	addl $28, %esp
	ret

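	Yuck! The value is stored to and reloaded from the stack several times before
	it finally ends up back in %mm0.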

	GCC gives us:

	_qux:
	subl $12, %esp
	movl 16(%esp), %eax
	movl 20(%esp), %edx
	movl $0, (%eax)
	movl %edx, 4(%eax)
	addl $12, %esp
	ret $4

	//===---------------------------------------------------------------------===//

	int main() {
	__m64 A[1] = { _mm_cvtsi32_si64(1) };
	__m64 B[1] = { _mm_cvtsi32_si64(10) };
	__m64 sum = _mm_cvtsi32_si64(0);

	sum = __builtin_ia32_paddq(__builtin_ia32_paddq(A[0], B[0]), sum);

	printf("Sum = %d\n", _mm_cvtsi64_si32(sum));
	return 0;
	}

	Generates:

	movl $11, %eax
	### movd %eax, %mm0
	### movq %mm0, 8(%esp)
	### movl 8(%esp), %eax
	movl %eax, 4(%esp)
	movl $_str, (%esp)
	call L_printf$stub
	xorl %eax, %eax
	addl $28, %esp

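	The instructions marked with ### are unnecessary: the constant 11 is already in
	%eax, so moving it through %mm0 and the stack just reloads the same value.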