It's not necessary to do rounding for alloca operations when the requested
alignment is equal to the stack alignment.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@40004 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Target/X86/Makefile b/lib/Target/X86/Makefile
new file mode 100644
index 0000000..5416cdb
--- /dev/null
+++ b/lib/Target/X86/Makefile
@@ -0,0 +1,20 @@
+##===- lib/Target/X86/Makefile -----------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file was developed by the LLVM research group and is distributed under
+# the University of Illinois Open Source License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../..
+LIBRARYNAME = LLVMX86
+TARGET = X86
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = X86GenRegisterInfo.h.inc X86GenRegisterNames.inc \
+ X86GenRegisterInfo.inc X86GenInstrNames.inc \
+ X86GenInstrInfo.inc X86GenAsmWriter.inc \
+ X86GenAsmWriter1.inc X86GenDAGISel.inc \
+ X86GenCallingConv.inc X86GenSubtarget.inc
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/X86/README-FPStack.txt b/lib/Target/X86/README-FPStack.txt
new file mode 100644
index 0000000..d94fa02
--- /dev/null
+++ b/lib/Target/X86/README-FPStack.txt
@@ -0,0 +1,99 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the X86 backend: FP stack related stuff
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+
+Some targets (e.g. Athlons) prefer ffreep to fstp ST(0):
+http://gcc.gnu.org/ml/gcc-patches/2004-04/msg00659.html
+
+//===---------------------------------------------------------------------===//
+
+On darwin/x86, we should codegen:
+
+ ret double 0.000000e+00
+
+as fld0/ret, not as:
+
+ movl $0, 4(%esp)
+ movl $0, (%esp)
+ fldl (%esp)
+ ...
+ ret
+
+//===---------------------------------------------------------------------===//
+
+This should use fiadd on chips where it is profitable:
+double foo(double P, int *I) { return P+*I; }
+
+We have fiadd patterns now, but the following patterns have the same cost and
+complexity. We need a way to specify that the latter is more profitable.
+
+def FpADD32m : FpI<(ops RFP:$dst, RFP:$src1, f32mem:$src2), OneArgFPRW,
+ [(set RFP:$dst, (fadd RFP:$src1,
+ (extloadf64f32 addr:$src2)))]>;
+ // ST(0) = ST(0) + [mem32]
+
+def FpIADD32m : FpI<(ops RFP:$dst, RFP:$src1, i32mem:$src2), OneArgFPRW,
+ [(set RFP:$dst, (fadd RFP:$src1,
+ (X86fild addr:$src2, i32)))]>;
+ // ST(0) = ST(0) + [mem32int]
+
+//===---------------------------------------------------------------------===//
+
+The FP stackifier needs to be global. Also, it should handle simple permutations
+to reduce the number of shuffle instructions, e.g. turning:
+
+fld P -> fld Q
+fld Q fld P
+fxch
+
+or:
+
+fxch -> fucomi
+fucomi jl X
+jg X
+
+Ideas:
+http://gcc.gnu.org/ml/gcc-patches/2004-11/msg02410.html
+
+
+//===---------------------------------------------------------------------===//
+
+Add a target specific hook to DAG combiner to handle SINT_TO_FP and
+FP_TO_SINT when the source operand is already in memory.
+
+//===---------------------------------------------------------------------===//
+
+Open code rint, floor, ceil, trunc:
+http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02006.html
+http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02011.html
+
+Open code the sincos[f] libcall.
+
+//===---------------------------------------------------------------------===//
+
+None of the FPStack instructions are handled in
+X86RegisterInfo::foldMemoryOperand, which prevents the spiller from
+folding spill code into the instructions.
+
+//===---------------------------------------------------------------------===//
+
+Currently the x86 codegen isn't very good at mixing SSE and FPStack
+code:
+
+unsigned int foo(double x) { return x; }
+
+foo:
+ subl $20, %esp
+ movsd 24(%esp), %xmm0
+ movsd %xmm0, 8(%esp)
+ fldl 8(%esp)
+ fisttpll (%esp)
+ movl (%esp), %eax
+ addl $20, %esp
+ ret
+
+This will be solved when we go to a dynamic programming based isel.
+
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/X86/README-MMX.txt b/lib/Target/X86/README-MMX.txt
new file mode 100644
index 0000000..57c7c3f
--- /dev/null
+++ b/lib/Target/X86/README-MMX.txt
@@ -0,0 +1,69 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the X86 backend: MMX-specific stuff.
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+
+This:
+
+#include <mmintrin.h>
+
+__v2si qux(int A) {
+ return (__v2si){ 0, A };
+}
+
+is compiled into:
+
+_qux:
+ subl $28, %esp
+ movl 32(%esp), %eax
+ movd %eax, %mm0
+ movq %mm0, (%esp)
+ movl (%esp), %eax
+ movl %eax, 20(%esp)
+ movq %mm0, 8(%esp)
+ movl 12(%esp), %eax
+ movl %eax, 16(%esp)
+ movq 16(%esp), %mm0
+ addl $28, %esp
+ ret
+
+Yuck!
+
+GCC gives us:
+
+_qux:
+ subl $12, %esp
+ movl 16(%esp), %eax
+ movl 20(%esp), %edx
+ movl $0, (%eax)
+ movl %edx, 4(%eax)
+ addl $12, %esp
+ ret $4
+
+//===---------------------------------------------------------------------===//
+
+#include <mmintrin.h>
+#include <stdio.h>
+
+int main() {
+ __m64 A[1] = { _mm_cvtsi32_si64(1) };
+ __m64 B[1] = { _mm_cvtsi32_si64(10) };
+ __m64 sum = _mm_cvtsi32_si64(0);
+
+ sum = __builtin_ia32_paddq(__builtin_ia32_paddq(A[0], B[0]), sum);
+
+ printf("Sum = %d\n", _mm_cvtsi64_si32(sum));
+ return 0;
+}
+
+Generates:
+
+ movl $11, %eax
+### movd %eax, %mm0
+### movq %mm0, 8(%esp)
+### movl 8(%esp), %eax
+ movl %eax, 4(%esp)
+ movl $_str, (%esp)
+ call L_printf$stub
+ xorl %eax, %eax
+ addl $28, %esp
+
+The instructions marked with ### are unnecessary.
diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt
new file mode 100644
index 0000000..f4b54c4
--- /dev/null
+++ b/lib/Target/X86/README-SSE.txt
@@ -0,0 +1,629 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the X86 backend: SSE-specific stuff.
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+
+Expand libm rounding functions inline: Significant speedups possible.
+http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
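+
+For example, calls like the one below (an illustrative case, not from this
+note) currently become libcalls and could be open-coded instead:
+
+#include <math.h>
+double round_down(double x) {
+  return floor(x);   /* today a call to floor(); candidate for inline expansion */
+}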
+
+//===---------------------------------------------------------------------===//
+
+When compiled with unsafe math enabled, "main" should enable SSE DAZ mode and
+other fast SSE modes.
+
+//===---------------------------------------------------------------------===//
+
+Think about doing i64 math in SSE regs.
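+
+A minimal illustration of the kind of i64 arithmetic in question (hypothetical
+example; on x86-32 this is an add/adc pair in integer registers today, but it
+could be a single paddq on an XMM register):
+
+long long add64(long long a, long long b) {
+  return a + b;
+}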
+
+//===---------------------------------------------------------------------===//
+
+This testcase should have no SSE instructions in it, and only one load from
+a constant pool:
+
+double %test3(bool %B) {
+ %C = select bool %B, double 123.412, double 523.01123123
+ ret double %C
+}
+
+Currently, the select is being lowered, which prevents the dag combiner from
+turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'.
+
+The pattern isel got this one right.
+
+//===---------------------------------------------------------------------===//
+
+SSE doesn't have [mem] op= reg instructions. If we have an SSE instruction
+like this:
+
+ X += y
+
+and the register allocator decides to spill X, it is cheaper to emit this as:
+
+Y += [xslot]
+store Y -> [xslot]
+
+than as:
+
+tmp = [xslot]
+tmp += y
+store tmp -> [xslot]
+
+...and this uses one fewer register (so this should be done at load folding
+time, not at spiller time). *Note* however that this can only be done
+if Y is dead. Here's a testcase:
+
+%.str_3 = external global [15 x sbyte] ; <[15 x sbyte]*> [#uses=0]
+implementation ; Functions:
+declare void %printf(int, ...)
+void %main() {
+build_tree.exit:
+ br label %no_exit.i7
+no_exit.i7: ; preds = %no_exit.i7, %build_tree.exit
+ %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.34.i18, %no_exit.i7 ] ; <double> [#uses=1]
+ %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.28.i16, %no_exit.i7 ] ; <double> [#uses=1]
+ %tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00
+ %tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00
+ br bool false, label %Compute_Tree.exit23, label %no_exit.i7
+Compute_Tree.exit23: ; preds = %no_exit.i7
+ tail call void (int, ...)* %printf( int 0 )
+ store double %tmp.34.i18, double* null
+ ret void
+}
+
+We currently emit:
+
+.BBmain_1:
+ xorpd %XMM1, %XMM1
+ addsd %XMM0, %XMM1
+*** movsd %XMM2, QWORD PTR [%ESP + 8]
+*** addsd %XMM2, %XMM1
+*** movsd QWORD PTR [%ESP + 8], %XMM2
+ jmp .BBmain_1 # no_exit.i7
+
+This is a bugpoint-reduced testcase, which is why it doesn't make much sense
+(e.g. it's an infinite loop). :)
+
+//===---------------------------------------------------------------------===//
+
+SSE should implement 'select_cc' using 'emulated conditional moves' that use
+pcmp/pand/pandn/por to do a selection instead of a conditional branch:
+
+double %X(double %Y, double %Z, double %A, double %B) {
+ %C = setlt double %A, %B
+ %z = add double %Z, 0.0 ;; select operand is not a load
+ %D = select bool %C, double %Y, double %z
+ ret double %D
+}
+
+We currently emit:
+
+_X:
+ subl $12, %esp
+ xorpd %xmm0, %xmm0
+ addsd 24(%esp), %xmm0
+ movsd 32(%esp), %xmm1
+ movsd 16(%esp), %xmm2
+ ucomisd 40(%esp), %xmm1
+ jb LBB_X_2
+LBB_X_1:
+ movsd %xmm0, %xmm2
+LBB_X_2:
+ movsd %xmm2, (%esp)
+ fldl (%esp)
+ addl $12, %esp
+ ret
+
+//===---------------------------------------------------------------------===//
+
+It's not clear whether we should use pxor or xorps / xorpd to clear XMM
+registers. The choice may depend on subtarget information. We should do some
+more experiments on different x86 machines.
+
+//===---------------------------------------------------------------------===//
+
+Currently the x86 codegen isn't very good at mixing SSE and FPStack
+code:
+
+unsigned int foo(double x) { return x; }
+
+foo:
+ subl $20, %esp
+ movsd 24(%esp), %xmm0
+ movsd %xmm0, 8(%esp)
+ fldl 8(%esp)
+ fisttpll (%esp)
+ movl (%esp), %eax
+ addl $20, %esp
+ ret
+
+This will be solved when we go to a dynamic programming based isel.
+
+//===---------------------------------------------------------------------===//
+
+Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
+feasible.
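+
+An illustrative case (not from this note): a small fixed-size copy like this
+could become a few 128-bit loads/stores instead of a memcpy call or a
+rep/movsl sequence:
+
+#include <string.h>
+struct block { char data[64]; };
+void copy_block(struct block *dst, const struct block *src) {
+  memcpy(dst, src, sizeof *dst);   /* 64 bytes: four 128-bit moves */
+}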
+
+//===---------------------------------------------------------------------===//
+
+Teach the coalescer to commute 2-addr instructions, allowing us to eliminate
+the reg-reg copy in this example:
+
+float foo(int *x, float *y, unsigned c) {
+ float res = 0.0;
+ unsigned i;
+ for (i = 0; i < c; i++) {
+ float xx = (float)x[i];
+ xx = xx * y[i];
+ xx += res;
+ res = xx;
+ }
+ return res;
+}
+
+LBB_foo_3: # no_exit
+ cvtsi2ss %XMM0, DWORD PTR [%EDX + 4*%ESI]
+ mulss %XMM0, DWORD PTR [%EAX + 4*%ESI]
+ addss %XMM0, %XMM1
+ inc %ESI
+ cmp %ESI, %ECX
+**** movaps %XMM1, %XMM0
+ jb LBB_foo_3 # no_exit
+
+//===---------------------------------------------------------------------===//
+
+Codegen:
+ if (copysign(1.0, x) == copysign(1.0, y))
+into:
+ if (x^y & mask)
+when using SSE.
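+
+A scalar sketch of the bit-level equivalence (the function and names below are
+made up for illustration; the actual lowering would use SSE registers and a
+sign-bit mask):
+
+#include <stdint.h>
+#include <string.h>
+int same_sign(double x, double y) {
+  uint64_t bx, by;
+  memcpy(&bx, &x, sizeof bx);   /* reinterpret the FP bit patterns */
+  memcpy(&by, &y, sizeof by);
+  /* copysign(1.0, x) == copysign(1.0, y) iff the sign bits match */
+  return ((bx ^ by) & 0x8000000000000000ULL) == 0;
+}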
+
+//===---------------------------------------------------------------------===//
+
+Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
+of a v4sf value.
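+
+For example (illustrative only), this intrinsic use should select movhps to
+update the upper half rather than a shuffle sequence:
+
+#include <xmmintrin.h>
+__m128 set_upper(__m128 v, const __m64 *p) {
+  return _mm_loadh_pi(v, p);   /* update the upper 64 bits of v from memory */
+}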
+
+//===---------------------------------------------------------------------===//
+
+Better codegen for vector_shuffles like { x, 0, 0, 0 } or { x, 0, x, 0 }.
+Perhaps use pxor / xorp* to clear an XMM register first?
+
+//===---------------------------------------------------------------------===//
+
+How to decide when to use the "floating point version" of logical ops? Here are
+some code fragments:
+
+ movaps LCPI5_5, %xmm2
+ divps %xmm1, %xmm2
+ mulps %xmm2, %xmm3
+ mulps 8656(%ecx), %xmm3
+ addps 8672(%ecx), %xmm3
+ andps LCPI5_6, %xmm2
+ andps LCPI5_1, %xmm3
+ por %xmm2, %xmm3
+ movdqa %xmm3, (%edi)
+
+ movaps LCPI5_5, %xmm1
+ divps %xmm0, %xmm1
+ mulps %xmm1, %xmm3
+ mulps 8656(%ecx), %xmm3
+ addps 8672(%ecx), %xmm3
+ andps LCPI5_6, %xmm1
+ andps LCPI5_1, %xmm3
+ orps %xmm1, %xmm3
+ movaps %xmm3, 112(%esp)
+ movaps %xmm3, (%ebx)
+
+Due to some minor source change, the latter case ended up using orps and movaps
+instead of por and movdqa. Does it matter?
+
+//===---------------------------------------------------------------------===//
+
+X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
+to choose between movaps, movapd, and movdqa based on types of source and
+destination?
+
+How about andps, andpd, and pand? Do we really care about the type of the packed
+elements? If not, why not always use the "ps" variants, which are likely to be
+shorter?
+
+//===---------------------------------------------------------------------===//
+
+External test Nurbs exposed some problems. Look for
+__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
+emits:
+
+ movaps (%edx), %xmm2 #59.21
+ movaps (%edx), %xmm5 #60.21
+ movaps (%edx), %xmm4 #61.21
+ movaps (%edx), %xmm3 #62.21
+ movl 40(%ecx), %ebp #69.49
+ shufps $0, %xmm2, %xmm5 #60.21
+ movl 100(%esp), %ebx #69.20
+ movl (%ebx), %edi #69.20
+ imull %ebp, %edi #69.49
+ addl (%eax), %edi #70.33
+ shufps $85, %xmm2, %xmm4 #61.21
+ shufps $170, %xmm2, %xmm3 #62.21
+ shufps $255, %xmm2, %xmm2 #63.21
+ lea (%ebp,%ebp,2), %ebx #69.49
+ negl %ebx #69.49
+ lea -3(%edi,%ebx), %ebx #70.33
+ shll $4, %ebx #68.37
+ addl 32(%ecx), %ebx #68.37
+ testb $15, %bl #91.13
+ jne L_B1.24 # Prob 5% #91.13
+
+This is the llvm code after instruction scheduling:
+
+cond_next140 (0xa910740, LLVM BB @0xa90beb0):
+ %reg1078 = MOV32ri -3
+ %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
+ %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
+ %reg1080 = IMUL32rr %reg1079, %reg1037
+ %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
+ %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
+ %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
+ %reg1082 = SHL32ri %reg1038, 4
+ %reg1039 = ADD32rr %reg1036, %reg1082
+ %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
+ %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
+ %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
+ %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
+ %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
+ %reg1040 = MOV32rr %reg1039
+ %reg1084 = AND32ri8 %reg1039, 15
+ CMP32ri8 %reg1084, 0
+ JE mbb<cond_next204,0xa914d30>
+
+Still ok. After register allocation:
+
+cond_next140 (0xa910740, LLVM BB @0xa90beb0):
+ %EAX = MOV32ri -3
+ %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
+ ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
+ %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
+ %EDX = MOV32rm %EDX, 1, %NOREG, 40
+ IMUL32rr %EAX<def&use>, %EDX
+ %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
+ %ESI = MOV32rm %ESI, 1, %NOREG, 0
+ MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
+ %EAX = LEA32r %ESI, 1, %EAX, -3
+ %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
+ %ESI = MOV32rm %ESI, 1, %NOREG, 32
+ %EDI = MOV32rr %EAX
+ SHL32ri %EDI<def&use>, 4
+ ADD32rr %EDI<def&use>, %ESI
+ %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
+ %XMM1 = MOVAPSrr %XMM0
+ SHUFPSrr %XMM1<def&use>, %XMM1, 170
+ %XMM2 = MOVAPSrr %XMM0
+ SHUFPSrr %XMM2<def&use>, %XMM2, 0
+ %XMM3 = MOVAPSrr %XMM0
+ SHUFPSrr %XMM3<def&use>, %XMM3, 255
+ SHUFPSrr %XMM0<def&use>, %XMM0, 85
+ %EBX = MOV32rr %EDI
+ AND32ri8 %EBX<def&use>, 15
+ CMP32ri8 %EBX, 0
+ JE mbb<cond_next204,0xa914d30>
+
+This looks really bad. The problem is that shufps is a destructive opcode: since
+the same value appears as operand two of more than one shufps, a number of
+copies are needed. Note that icc suffers from the same problem. Either the
+instruction selector should select pshufd, or the register allocator should
+perform the two-address to three-address transformation.
+
+It also exposes some other problems. See MOV32ri -3 and the spills.
+
+//===---------------------------------------------------------------------===//
+
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500
+
+LLVM is producing bad code.
+
+LBB_main_4: # cond_true44
+ addps %xmm1, %xmm2
+ subps %xmm3, %xmm2
+ movaps (%ecx), %xmm4
+ movaps %xmm2, %xmm1
+ addps %xmm4, %xmm1
+ addl $16, %ecx
+ incl %edx
+ cmpl $262144, %edx
+ movaps %xmm3, %xmm2
+ movaps %xmm4, %xmm3
+ jne LBB_main_4 # cond_true44
+
+There are two problems. 1) There is no need for two loop induction variables; we
+can compare against 262144 * 16. 2) A known register coalescer issue. We should
+be able to eliminate one of the movaps:
+
+ addps %xmm2, %xmm1 <=== Commute!
+ subps %xmm3, %xmm1
+ movaps (%ecx), %xmm4
+ movaps %xmm1, %xmm1 <=== Eliminate!
+ addps %xmm4, %xmm1
+ addl $16, %ecx
+ incl %edx
+ cmpl $262144, %edx
+ movaps %xmm3, %xmm2
+ movaps %xmm4, %xmm3
+ jne LBB_main_4 # cond_true44
+
+//===---------------------------------------------------------------------===//
+
+Consider:
+
+__m128 test(float a) {
+ return _mm_set_ps(0.0, 0.0, 0.0, a*a);
+}
+
+This compiles into:
+
+movss 4(%esp), %xmm1
+mulss %xmm1, %xmm1
+xorps %xmm0, %xmm0
+movss %xmm1, %xmm0
+ret
+
+Because mulss doesn't modify the top 3 elements, the top elements of
+xmm1 are already zero'd. We could compile this to:
+
+movss 4(%esp), %xmm0
+mulss %xmm0, %xmm0
+ret
+
+//===---------------------------------------------------------------------===//
+
+Here's a sick and twisted idea. Consider code like this:
+
+__m128 test(__m128 a) {
+ float b = *(float*)&a;
+ ...
+ return _mm_set_ps(0.0, 0.0, 0.0, b);
+}
+
+This might compile to this code:
+
+movaps c(%esp), %xmm1
+xorps %xmm0, %xmm0
+movss %xmm1, %xmm0
+ret
+
+Now consider if the ... code caused xmm1 to get spilled. This might produce
+this code:
+
+movaps c(%esp), %xmm1
+movaps %xmm1, c2(%esp)
+...
+
+xorps %xmm0, %xmm0
+movaps c2(%esp), %xmm1
+movss %xmm1, %xmm0
+ret
+
+However, since the reload is only used by these instructions, we could
+"fold" it into the uses, producing something like this:
+
+movaps c(%esp), %xmm1
+movaps %xmm1, c2(%esp)
+...
+
+movss c2(%esp), %xmm0
+ret
+
+... saving two instructions.
+
+The basic idea is that a reload from a spill slot can, if only one 4-byte
+chunk is used, bring in 3 zeros plus the one element instead of all 4 elements.
+This can be used to simplify a variety of shuffle operations, where the
+elements are fixed zeros.
+
+//===---------------------------------------------------------------------===//
+
+For this:
+
+#include <emmintrin.h>
+void test(__m128d *r, __m128d *A, double B) {
+ *r = _mm_loadl_pd(*A, &B);
+}
+
+We generate:
+
+ subl $12, %esp
+ movsd 24(%esp), %xmm0
+ movsd %xmm0, (%esp)
+ movl 20(%esp), %eax
+ movapd (%eax), %xmm0
+ movlpd (%esp), %xmm0
+ movl 16(%esp), %eax
+ movapd %xmm0, (%eax)
+ addl $12, %esp
+ ret
+
+icc generates:
+
+ movl 4(%esp), %edx #3.6
+ movl 8(%esp), %eax #3.6
+ movapd (%eax), %xmm0 #4.22
+ movlpd 12(%esp), %xmm0 #4.8
+ movapd %xmm0, (%edx) #4.3
+ ret #5.1
+
+So icc is smart enough to know that B is already in memory, so it doesn't load
+it and store it back to the stack.
+
+//===---------------------------------------------------------------------===//
+
+__m128d test1( __m128d A, __m128d B) {
+ return _mm_shuffle_pd(A, B, 0x3);
+}
+
+compiles to
+
+shufpd $3, %xmm1, %xmm0
+
+Perhaps it's better to use unpckhpd instead?
+
+unpckhpd %xmm1, %xmm0
+
+We don't know if unpckhpd is faster, but it is shorter.
+
+//===---------------------------------------------------------------------===//
+
+This generates ugly code, probably due to costs being off or something:
+
+void %test(float* %P, <4 x float>* %P2 ) {
+ %xFloat0.688 = load float* %P
+ %loadVector37.712 = load <4 x float>* %P2
+ %inFloat3.713 = insertelement <4 x float> %loadVector37.712, float 0.000000e+00, uint 3
+ store <4 x float> %inFloat3.713, <4 x float>* %P2
+ ret void
+}
+
+Generates:
+
+_test:
+ pxor %xmm0, %xmm0
+ movd %xmm0, %eax ;; EAX = 0!
+ movl 8(%esp), %ecx
+ movaps (%ecx), %xmm0
+ pinsrw $6, %eax, %xmm0
+ shrl $16, %eax ;; EAX = 0 again!
+ pinsrw $7, %eax, %xmm0
+ movaps %xmm0, (%ecx)
+ ret
+
+It would be better to generate:
+
+_test:
+ movl 8(%esp), %ecx
+ movaps (%ecx), %xmm0
+ xor %eax, %eax
+ pinsrw $6, %eax, %xmm0
+ pinsrw $7, %eax, %xmm0
+ movaps %xmm0, (%ecx)
+ ret
+
+or use pxor (to make a zero vector) and shuffle (to insert it).
+
+//===---------------------------------------------------------------------===//
+
+Some useful information in the Apple Altivec / SSE Migration Guide:
+
+http://developer.apple.com/documentation/Performance/Conceptual/
+Accelerate_sse_migration/index.html
+
+e.g. SSE select using and, andnot, or. Various SSE compare translations.
+
+//===---------------------------------------------------------------------===//
+
+Add hooks to commute some CMPP operations.
+
+//===---------------------------------------------------------------------===//
+
+Apply the same transformation that merged four float into a single 128-bit load
+to loads from constant pool.
+
+//===---------------------------------------------------------------------===//
+
+Floating point max / min are commutable when -enable-unsafe-fp-math is
+specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
+nodes which are selected to max / min instructions that are marked commutable.
+
+//===---------------------------------------------------------------------===//
+
+We should compile this:
+#include <xmmintrin.h>
+typedef union {
+ int i[4];
+ float f[4];
+ __m128 v;
+} vector4_t;
+void swizzle (const void *a, vector4_t * b, vector4_t * c) {
+ b->v = _mm_loadl_pi (b->v, (__m64 *) a);
+ c->v = _mm_loadl_pi (c->v, ((__m64 *) a) + 1);
+}
+
+to:
+
+_swizzle:
+ movl 4(%esp), %eax
+ movl 8(%esp), %edx
+ movl 12(%esp), %ecx
+ movlps (%eax), %xmm0
+ movlps %xmm0, (%edx)
+ movlps 8(%eax), %xmm0
+ movlps %xmm0, (%ecx)
+ ret
+
+not:
+
+swizzle:
+ movl 8(%esp), %eax
+ movaps (%eax), %xmm0
+ movl 4(%esp), %ecx
+ movlps (%ecx), %xmm0
+ movaps %xmm0, (%eax)
+ movl 12(%esp), %eax
+ movaps (%eax), %xmm0
+ movlps 8(%ecx), %xmm0
+ movaps %xmm0, (%eax)
+ ret
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+#include <emmintrin.h>
+__m128i test(long long i) { return _mm_cvtsi64x_si128(i); }
+
+Should turn into a single 'movq %rdi, %xmm0' instruction. Instead, we
+get this (on x86-64):
+
+_test:
+ movd %rdi, %xmm1
+ xorps %xmm0, %xmm0
+ movsd %xmm1, %xmm0
+ ret
+
+The LLVM IR is:
+
+target triple = "x86_64-apple-darwin8"
+define <2 x i64> @test(i64 %i) {
+entry:
+ %tmp10 = insertelement <2 x i64> undef, i64 %i, i32 0
+ %tmp11 = insertelement <2 x i64> %tmp10, i64 0, i32 1
+ ret <2 x i64> %tmp11
+}
+
+//===---------------------------------------------------------------------===//
+
+These functions should produce the same code:
+
+#include <emmintrin.h>
+
+typedef long long __m128i __attribute__ ((__vector_size__ (16)));
+
+int foo(__m128i* val) {
+ return __builtin_ia32_vec_ext_v4si(*val, 1);
+}
+int bar(__m128i* val) {
+ union vs {
+ __m128i *_v;
+ int* _s;
+ } v = {val};
+ return v._s[1];
+}
+
+We currently produce (with -m64):
+
+_foo:
+ pshufd $1, (%rdi), %xmm0
+ movd %xmm0, %eax
+ ret
+_bar:
+ movl 4(%rdi), %eax
+ ret
+
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/X86/README-X86-64.txt b/lib/Target/X86/README-X86-64.txt
new file mode 100644
index 0000000..191904a
--- /dev/null
+++ b/lib/Target/X86/README-X86-64.txt
@@ -0,0 +1,223 @@
+//===- README-X86-64.txt - Notes for X86-64 code gen ----------------------===//
+
+Implement different PIC models? Right now we only support Mac OS X with the
+small PIC code model.
+
+//===---------------------------------------------------------------------===//
+
+Make use of "Red Zone".
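+
+For example, a leaf function with small locals (illustrative only) could keep
+them in the 128 bytes below %rsp without adjusting the stack pointer at all:
+
+int leaf(int x) {
+  int tmp[4] = { x, x + 1, x + 2, x + 3 };   /* small locals, no calls */
+  return tmp[0] + tmp[3];
+}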
+
+//===---------------------------------------------------------------------===//
+
+Implement __int128 and long double support.
+
+//===---------------------------------------------------------------------===//
+
+For this:
+
+extern void xx(void);
+void bar(void) {
+ xx();
+}
+
+gcc compiles to:
+
+.globl _bar
+_bar:
+ jmp _xx
+
+We need to do the tailcall optimization as well.
+
+//===---------------------------------------------------------------------===//
+
+AMD64 Optimization Manual 8.2 has some nice information about optimizing integer
+multiplication by a constant. How much of it applies to Intel's X86-64
+implementation? There are definite trade-offs to consider: latency vs. register
+pressure vs. code size.
+
+//===---------------------------------------------------------------------===//
+
+Are we better off using branches instead of cmove to implement FP to
+unsigned i64?
+
+_conv:
+ ucomiss LC0(%rip), %xmm0
+ cvttss2siq %xmm0, %rdx
+ jb L3
+ subss LC0(%rip), %xmm0
+ movabsq $-9223372036854775808, %rax
+ cvttss2siq %xmm0, %rdx
+ xorq %rax, %rdx
+L3:
+ movq %rdx, %rax
+ ret
+
+instead of
+
+_conv:
+ movss LCPI1_0(%rip), %xmm1
+ cvttss2siq %xmm0, %rcx
+ movaps %xmm0, %xmm2
+ subss %xmm1, %xmm2
+ cvttss2siq %xmm2, %rax
+ movabsq $-9223372036854775808, %rdx
+ xorq %rdx, %rax
+ ucomiss %xmm1, %xmm0
+ cmovb %rcx, %rax
+ ret
+
+Seems like the jb branch has a high likelihood of being taken. It would have
+saved a few instructions.
+
+//===---------------------------------------------------------------------===//
+
+Poor codegen:
+
+int X[2];
+int b;
+void test(void) {
+ memset(X, b, 2*sizeof(X[0]));
+}
+
+llc:
+ movq _b@GOTPCREL(%rip), %rax
+ movzbq (%rax), %rax
+ movq %rax, %rcx
+ shlq $8, %rcx
+ orq %rax, %rcx
+ movq %rcx, %rax
+ shlq $16, %rax
+ orq %rcx, %rax
+ movq %rax, %rcx
+ shlq $32, %rcx
+ movq _X@GOTPCREL(%rip), %rdx
+ orq %rax, %rcx
+ movq %rcx, (%rdx)
+ ret
+
+gcc:
+ movq _b@GOTPCREL(%rip), %rax
+ movabsq $72340172838076673, %rdx
+ movzbq (%rax), %rax
+ imulq %rdx, %rax
+ movq _X@GOTPCREL(%rip), %rdx
+ movq %rax, (%rdx)
+ ret
+
+//===---------------------------------------------------------------------===//
+
+The vararg function prologue can be further optimized. Currently all XMM
+registers are stored into the register save area. Most of these stores can be
+eliminated, since the upper bound on the number of XMM registers used is passed
+in %al. gcc produces something like the following:
+
+ movzbl %al, %edx
+ leaq 0(,%rdx,4), %rax
+ leaq 4+L2(%rip), %rdx
+ leaq 239(%rsp), %rax
+ jmp *%rdx
+ movaps %xmm7, -15(%rax)
+ movaps %xmm6, -31(%rax)
+ movaps %xmm5, -47(%rax)
+ movaps %xmm4, -63(%rax)
+ movaps %xmm3, -79(%rax)
+ movaps %xmm2, -95(%rax)
+ movaps %xmm1, -111(%rax)
+ movaps %xmm0, -127(%rax)
+L2:
+
+It jumps over the movaps that do not need to be stored. It is hard to see this
+being significant, as it adds 5 instructions (including an indirect branch) to
+avoid executing 0 to 8 stores in the function prologue.
+
+Perhaps we can optimize for the common case where no XMM registers are used for
+parameter passing, i.e. if %al == 0, jump over all the stores. Or, in the case
+of a leaf function where we can determine that no XMM input parameter is needed,
+avoid emitting the stores at all.
+
+//===---------------------------------------------------------------------===//
+
+AMD64 has a complex calling convention for aggregate passing by value:
+
+1. If the size of an object is larger than two eightbytes, or in C++, is a non-
+ POD structure or union type, or contains unaligned fields, it has class
+ MEMORY.
+2. Both eightbytes get initialized to class NO_CLASS.
+3. Each field of an object is classified recursively so that always two fields
+ are considered. The resulting class is calculated according to the classes
+ of the fields in the eightbyte:
+ (a) If both classes are equal, this is the resulting class.
+ (b) If one of the classes is NO_CLASS, the resulting class is the other
+ class.
+ (c) If one of the classes is MEMORY, the result is the MEMORY class.
+ (d) If one of the classes is INTEGER, the result is the INTEGER.
+ (e) If one of the classes is X87, X87UP, COMPLEX_X87 class, MEMORY is used as
+ class.
+ (f) Otherwise class SSE is used.
+4. Then a post merger cleanup is done:
+ (a) If one of the classes is MEMORY, the whole argument is passed in memory.
+ (b) If SSEUP is not preceded by SSE, it is converted to SSE.
+
+Currently the llvm frontend does not handle this correctly.
+
+Problem 1:
+ typedef struct { int i; double d; } QuadWordS;
+It is currently passed in two i64 integer registers. However, a gcc-compiled
+callee expects the second element 'd' to be passed in XMM0.
+
+Problem 2:
+ typedef struct { int32_t i; float j; double d; } QuadWordS;
+The size of the first two fields == i64, so they will be combined and passed in
+an integer register, RDI. The third field is still passed in XMM0.
+
+Problem 3:
+ typedef struct { int64_t i; int8_t j; int64_t d; } S;
+ void test(S s)
+The size of this aggregate is greater than two i64s, so it should be passed in
+memory. Currently llvm breaks this down and passes it in three integer
+registers.
+
+Problem 4:
+Taking problem 3 one step further: a function expects an aggregate value
+in memory followed by more parameter(s) passed in register(s).
+ void test(S s, int b)
+
+LLVM IR does not allow parameter passing by aggregates; therefore the frontend
+must break the aggregate value (in problems 3 and 4) into a number of scalar
+values:
+ void %test(long %s.i, byte %s.j, long %s.d);
+
+However, if the backend were to lower this code literally it would pass the 3
+values in integer registers. To force it to be passed in memory, the frontend
+should change the function signature to:
+ void %test(long %undef1, long %undef2, long %undef3, long %undef4,
+ long %undef5, long %undef6,
+ long %s.i, byte %s.j, long %s.d);
+And the callee would look something like this:
+ call void %test( undef, undef, undef, undef, undef, undef,
+ %tmp.s.i, %tmp.s.j, %tmp.s.d );
+The first 6 undef parameters would exhaust the 6 integer registers used for
+parameter passing. The following three integer values would then be forced into
+memory.
+
+For problem 4, the parameter 'b' would be moved to the front of the parameter
+list so it will be passed in a register:
+ void %test(int %b,
+ long %undef1, long %undef2, long %undef3, long %undef4,
+ long %undef5, long %undef6,
+ long %s.i, byte %s.j, long %s.d);
+
+//===---------------------------------------------------------------------===//
+
+Right now the asm printer assumes GlobalAddresses are accessed via RIP-relative
+addressing. Therefore, it is not possible to generate this:
+ movabsq $__ZTV10polynomialIdE+16, %rax
+
+That is ok for now since we currently only support the small code model, so the
+above is selected as:
+ leaq __ZTV10polynomialIdE+16(%rip), %rax
+
+This is probably slightly slower but is much shorter than movabsq. However, if
+we were to support medium or larger code models, we would need to use the movabs
+instruction. We should probably introduce something like AbsoluteAddress to
+distinguish it from GlobalAddress so the asm printer and JIT code emitter can
+do the right thing.
diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt
new file mode 100644
index 0000000..f15090a
--- /dev/null
+++ b/lib/Target/X86/README.txt
@@ -0,0 +1,1150 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the X86 backend.
+//===---------------------------------------------------------------------===//
+
+Missing features:
+ - Support for SSE4: http://www.intel.com/software/penryn
+http://softwarecommunity.intel.com/isn/Downloads/Intel%20SSE4%20Programming%20Reference.pdf
+ - Support for 3DNow!
+ - Weird ABIs?
+
+//===---------------------------------------------------------------------===//
+
+Add MUL2U and MUL2S nodes to represent a multiply that returns both the
+Hi and Lo parts (a combination of MUL and MULH[SU] in one node). Add this to
+X86, and make the dag combiner produce it when needed. This will eliminate one
+imul from the code generated for:
+
+long long test(long long X, long long Y) { return X*Y; }
+
+by using the EAX result from the mul. We should add a similar node for
+DIVREM.
+
+another case is:
+
+long long test(int X, int Y) { return (long long)X*Y; }
+
+... which should only be one imul instruction.
+
+or:
+
+unsigned long long int t2(unsigned int a, unsigned int b) {
+ return (unsigned long long)a * b;
+}
+
+... which should be one mul instruction.
+
+
+This can be done with a custom expander, but it would be nice to move this to
+generic code.
+
+//===---------------------------------------------------------------------===//
+
+CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move. The X86
+backend knows how to three-addressify this shift, but it appears the register
+allocator isn't even asking it to do so in this case. We should investigate
+why this isn't happening, it could have significant impact on other important
+cases for X86 as well.
+
+//===---------------------------------------------------------------------===//
+
+This should be one DIV/IDIV instruction, not a libcall:
+
+unsigned test(unsigned long long X, unsigned Y) {
+ return X/Y;
+}
+
+This can be done trivially with a custom legalizer. What about overflow
+though? http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
+
+//===---------------------------------------------------------------------===//
+
+Improvements to the multiply -> shift/add algorithm:
+http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
+
+//===---------------------------------------------------------------------===//
+
+Improve code like this (occurs fairly frequently, e.g. in LLVM):
+long long foo(int x) { return 1LL << x; }
+
+http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
+http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
+http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
+
+Another useful one would be ~0ULL >> X and ~0ULL << X.
+
+One better solution for 1LL << x is:
+ xorl %eax, %eax
+ xorl %edx, %edx
+ testb $32, %cl
+ sete %al
+ setne %dl
+ sall %cl, %eax
+ sall %cl, %edx
+
+But that requires good 8-bit subreg support.
+
+64-bit shifts (in general) expand to really bad code. Instead of using
+cmovs, we should expand to a conditional branch like GCC produces.
+
+//===---------------------------------------------------------------------===//
+
+Compile this:
+_Bool f(_Bool a) { return a!=1; }
+
+into:
+ movzbl %dil, %eax
+ xorl $1, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Some isel ideas:
+
+1. A dynamic programming based approach when compile time is not an
+ issue.
+2. Code duplication (addressing mode) during isel.
+3. Other ideas from "Register-Sensitive Selection, Duplication, and
+ Sequencing of Instructions".
+4. Scheduling for reduced register pressure. E.g. "Minimum Register
+ Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
+ and other related papers.
+ http://citeseer.ist.psu.edu/govindarajan01minimum.html
+
+//===---------------------------------------------------------------------===//
+
+Should we promote i16 to i32 to avoid partial register update stalls?
+
+//===---------------------------------------------------------------------===//
+
+Leave any_extend as a pseudo instruction and hint to the register
+allocator. Delay codegen until post register allocation.
+
+//===---------------------------------------------------------------------===//
+
+Count leading zeros and count trailing zeros:
+
+int clz(int X) { return __builtin_clz(X); }
+int ctz(int X) { return __builtin_ctz(X); }
+
+$ gcc t.c -S -o - -O3 -fomit-frame-pointer -masm=intel
+clz:
+ bsr %eax, DWORD PTR [%esp+4]
+ xor %eax, 31
+ ret
+ctz:
+ bsf %eax, DWORD PTR [%esp+4]
+ ret
+
+However, check that these are defined for 0 and 32. Our intrinsics are; GCC's
+aren't.
+
+Another example (use predsimplify to eliminate a select):
+
+int foo (unsigned long j) {
+ if (j)
+ return __builtin_ffs (j) - 1;
+ else
+ return 0;
+}
+
+//===---------------------------------------------------------------------===//
+
+It appears icc uses push for parameter passing. Need to investigate.
+
+//===---------------------------------------------------------------------===//
+
+Only use inc/neg/not instructions on processors where they are faster than
+add/sub/xor. They are slower on the P4 due to only updating some processor
+flags.
+
+//===---------------------------------------------------------------------===//
+
+The instruction selector sometimes misses folding a load into a compare. The
+pattern is written as (cmp reg, (load p)). Because the compare isn't
+commutative, it is not matched with the load on both sides. The dag combiner
+should be made smart enough to canonicalize the load into the RHS of a compare
+when it can invert the result of the compare for free.
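+
+An illustrative case (not from this note) where the load lands on the LHS of
+the compare:
+
+int test(int *p, int x) {
+  return *p < x;   /* (setlt (load p), x): load is not on the RHS of the cmp */
+}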
+
+//===---------------------------------------------------------------------===//
+
+How about intrinsics? An example is:
+ *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));
+
+compiles to
+ pmuludq (%eax), %xmm0
+ movl 8(%esp), %eax
+ movdqa (%eax), %xmm1
+ pmulhuw %xmm0, %xmm1
+
+The transformation probably requires an X86-specific pass or a DAG combiner
+target-specific hook.
+
+//===---------------------------------------------------------------------===//
+
+In many cases, LLVM generates code like this:
+
+_test:
+ movl 8(%esp), %eax
+ cmpl %eax, 4(%esp)
+ setl %al
+ movzbl %al, %eax
+ ret
+
+On some processors (which ones?), it is more efficient to do this:
+
+_test:
+ movl 8(%esp), %ebx
+ xor %eax, %eax
+ cmpl %ebx, 4(%esp)
+ setl %al
+ ret
+
+Doing this correctly is tricky though, as the xor clobbers the flags.
+
+//===---------------------------------------------------------------------===//
+
+We should generate bts/btr/etc instructions on targets where they are cheap or
+when codesize is important. e.g., for:
+
+void setbit(int *target, int bit) {
+ *target |= (1 << bit);
+}
+void clearbit(int *target, int bit) {
+ *target &= ~(1 << bit);
+}
+
+//===---------------------------------------------------------------------===//
+
+Instead of the following for memset char*, 1, 10:
+
+ movl $16843009, 4(%edx)
+ movl $16843009, (%edx)
+ movw $257, 8(%edx)
+
+It might be better to generate
+
+ movl $16843009, %eax
+ movl %eax, 4(%edx)
+ movl %eax, (%edx)
+ movw %ax, 8(%edx)
+
+when we can spare a register. It reduces code size.
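+
+For reference, the source in question is simply something like (illustrative):
+
+#include <string.h>
+void f(char *p) {
+  memset(p, 1, 10);
+}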
+
+//===---------------------------------------------------------------------===//
+
+Evaluate what the best way to codegen sdiv X, (2^C) is. For X/8, we currently
+get this:
+
+int %test1(int %X) {
+ %Y = div int %X, 8
+ ret int %Y
+}
+
+_test1:
+ movl 4(%esp), %eax
+ movl %eax, %ecx
+ sarl $31, %ecx
+ shrl $29, %ecx
+ addl %ecx, %eax
+ sarl $3, %eax
+ ret
+
+GCC knows several different ways to codegen it, one of which is this:
+
+_test1:
+ movl 4(%esp), %eax
+ cmpl $-1, %eax
+ leal 7(%eax), %ecx
+ cmovle %ecx, %eax
+ sarl $3, %eax
+ ret
+
+which is probably slower, but it's interesting at least :)
+
+//===---------------------------------------------------------------------===//
+
+The first BB of this code:
+
+declare bool %foo()
+int %bar() {
+ %V = call bool %foo()
+ br bool %V, label %T, label %F
+T:
+ ret int 1
+F:
+ call bool %foo()
+ ret int 12
+}
+
+compiles to:
+
+_bar:
+ subl $12, %esp
+ call L_foo$stub
+ xorb $1, %al
+ testb %al, %al
+ jne LBB_bar_2 # F
+
+It would be better to emit "cmp %al, 1" than an xor and test.
+
+//===---------------------------------------------------------------------===//
+
+Enable X86InstrInfo::convertToThreeAddress().
+
+//===---------------------------------------------------------------------===//
+
+We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl.
+We should leave these as libcalls for everything over a much lower threshold,
+since libc is hand tuned for medium and large mem ops (avoiding RFO for large
+stores, TLB preheating, etc.).
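+
+An illustrative example of a copy that is better left as a call into libc's
+tuned memcpy than expanded to rep/movsl:
+
+#include <string.h>
+void copy_big(char *dst, const char *src) {
+  memcpy(dst, src, 1 << 20);   /* 1MB: leave this as a libcall */
+}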
+
+//===---------------------------------------------------------------------===//
+
+Optimize this into something reasonable:
+ x * copysign(1.0, y) * copysign(1.0, z)
+
+//===---------------------------------------------------------------------===//
+
+Optimize copysign(x, *y) to use an integer load from y.
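+
+That is, for code like this (illustrative), the sign of *y could be picked up
+with an integer load and mask rather than loading the double into an FP/SSE
+register:
+
+#include <math.h>
+double apply_sign(double x, double *y) {
+  return copysign(x, *y);
+}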
+
+//===---------------------------------------------------------------------===//
+
+%X = weak global int 0
+
+void %foo(int %N) {
+ %N = cast int %N to uint
+ %tmp.24 = setgt int %N, 0
+ br bool %tmp.24, label %no_exit, label %return
+
+no_exit:
+ %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
+ %i.0.0 = cast uint %indvar to int
+ volatile store int %i.0.0, int* %X
+ %indvar.next = add uint %indvar, 1
+ %exitcond = seteq uint %indvar.next, %N
+ br bool %exitcond, label %return, label %no_exit
+
+return:
+ ret void
+}
+
+compiles into:
+
+ .text
+ .align 4
+ .globl _foo
+_foo:
+ movl 4(%esp), %eax
+ cmpl $1, %eax
+ jl LBB_foo_4 # return
+LBB_foo_1: # no_exit.preheader
+ xorl %ecx, %ecx
+LBB_foo_2: # no_exit
+ movl L_X$non_lazy_ptr, %edx
+ movl %ecx, (%edx)
+ incl %ecx
+ cmpl %eax, %ecx
+ jne LBB_foo_2 # no_exit
+LBB_foo_3: # return.loopexit
+LBB_foo_4: # return
+ ret
+
+We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
+rematerialization is implemented. This can be accomplished with 1) a target-
+dependent LICM pass or 2) making the SelectionDAG represent the whole function.
+
+//===---------------------------------------------------------------------===//
+
+The following tests perform worse with LSR:
+
+lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
+
+//===---------------------------------------------------------------------===//
+
+We are generating far worse code than gcc:
+
+volatile short X, Y;
+
+void foo(int N) {
+ int i;
+ for (i = 0; i < N; i++) { X = i; Y = i*4; }
+}
+
+LBB1_1: #bb.preheader
+ xorl %ecx, %ecx
+ xorw %dx, %dx
+LBB1_2: #bb
+ movl L_X$non_lazy_ptr, %esi
+ movw %dx, (%esi)
+ movw %dx, %si
+ shlw $2, %si
+ movl L_Y$non_lazy_ptr, %edi
+ movw %si, (%edi)
+ incl %ecx
+ incw %dx
+ cmpl %eax, %ecx
+ jne LBB1_2 #bb
+
+vs.
+
+ xorl %edx, %edx
+ movl L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi
+ movl L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx
+L4:
+ movw %dx, (%esi)
+ leal 0(,%edx,4), %eax
+ movw %ax, (%ecx)
+ addl $1, %edx
+ cmpl %edx, %edi
+ jne L4
+
+There are 3 issues:
+
+1. Lack of post regalloc LICM.
+2. Poor sub-regclass support. That leads to the inability to promote the 16-bit
+ arithmetic ops to 32-bit and make use of leal.
+3. LSR is unable to reuse the IV for a different type (i16 vs. i32) even though
+ the cast would be free.
+
+//===---------------------------------------------------------------------===//
+
+Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
+FR64 to VR128.
+
+//===---------------------------------------------------------------------===//
+
+mov $reg, 48(%esp)
+...
+leal 48(%esp), %eax
+mov %eax, (%esp)
+call _foo
+
+Obviously it would have been better for the first mov (or any op) to store
+directly into %esp[0] if there are no other uses.
+
+//===---------------------------------------------------------------------===//
+
+Adding to the list of cmp / test poor codegen issues:
+
+int test(__m128 *A, __m128 *B) {
+ if (_mm_comige_ss(*A, *B))
+ return 3;
+ else
+ return 4;
+}
+
+_test:
+ movl 8(%esp), %eax
+ movaps (%eax), %xmm0
+ movl 4(%esp), %eax
+ movaps (%eax), %xmm1
+ comiss %xmm0, %xmm1
+ setae %al
+ movzbl %al, %ecx
+ movl $3, %eax
+ movl $4, %edx
+ cmpl $0, %ecx
+ cmove %edx, %eax
+ ret
+
+Note the setae, movzbl, cmpl, and cmove can be replaced with a single cmovae.
+There are a number of issues. 1) We are introducing a setcc between the result
+of the intrinsic call and the select. 2) The intrinsic is expected to produce an
+i32 value, so an any_extend (which becomes a zero extend) is added.
+
+We probably need some kind of target DAG combine hook to fix this.
+
+//===---------------------------------------------------------------------===//
+
+We generate significantly worse code for this than GCC:
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
+http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
+
+There is also one case we do worse on PPC.
+
+//===---------------------------------------------------------------------===//
+
+If shorter, we should use things like:
+movzwl %ax, %eax
+instead of:
+andl $65535, %eax
+
+The former can also be used when the two-addressy nature of the 'and' would
+require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
+
+//===---------------------------------------------------------------------===//
+
+Bad codegen:
+
+char foo(int x) { return x; }
+
+_foo:
+ movl 4(%esp), %eax
+ shll $24, %eax
+ sarl $24, %eax
+ ret
+
+SIGN_EXTEND_INREG can be implemented as (sext (trunc)) to take advantage of
+sub-registers.
+
+//===---------------------------------------------------------------------===//
+
+Consider this:
+
+typedef struct pair { float A, B; } pair;
+void pairtest(pair P, float *FP) {
+ *FP = P.A+P.B;
+}
+
+We currently generate this code with llvmgcc4:
+
+_pairtest:
+ movl 8(%esp), %eax
+ movl 4(%esp), %ecx
+ movd %eax, %xmm0
+ movd %ecx, %xmm1
+ addss %xmm0, %xmm1
+ movl 12(%esp), %eax
+ movss %xmm1, (%eax)
+ ret
+
+we should be able to generate:
+_pairtest:
+ movss 4(%esp), %xmm0
+ movl 12(%esp), %eax
+ addss 8(%esp), %xmm0
+ movss %xmm0, (%eax)
+ ret
+
+The issue is that llvmgcc4 is forcing the struct to memory, then passing it as
+integer chunks. It does this so that structs like {short,short} are passed in
+a single 32-bit integer stack slot. We should handle the safe cases above much
+nicer, while still handling the hard cases.
+
+While true in general, in this specific case we could do better by promoting
+load int + bitcast to float -> load float. This basically needs alignment info;
+the code is already implemented (but disabled) in the dag combiner.
+
+//===---------------------------------------------------------------------===//
+
+Another instruction selector deficiency:
+
+void %bar() {
+ %tmp = load int (int)** %foo
+ %tmp = tail call int %tmp( int 3 )
+ ret void
+}
+
+_bar:
+ subl $12, %esp
+ movl L_foo$non_lazy_ptr, %eax
+ movl (%eax), %eax
+ call *%eax
+ addl $12, %esp
+ ret
+
+The current isel scheme will not allow the load to be folded in the call since
+the load's chain result is read by the callseq_start.
+
+//===---------------------------------------------------------------------===//
+
+Don't forget to find a way to squash noop truncates in the JIT environment.
+
+//===---------------------------------------------------------------------===//
+
+Implement anyext in the same manner as truncate that would allow them to be
+eliminated.
+
+//===---------------------------------------------------------------------===//
+
+How about implementing truncate / anyext as a property of machine instruction
+operand? i.e. Print as 32-bit super-class register / 16-bit sub-class register.
+Do this for the cases where a truncate / anyext is guaranteed to be eliminated.
+For IA32 that is truncate from 32 to 16 and anyext from 16 to 32.
+
+//===---------------------------------------------------------------------===//
+
+For this:
+
+int test(int a)
+{
+ return a * 3;
+}
+
+We currently emit:
+ imull $3, 4(%esp), %eax
+
+Perhaps this is what we really should generate? Is imull three or four
+cycles? Note: ICC generates this:
+ movl 4(%esp), %eax
+ leal (%eax,%eax,2), %eax
+
+The current instruction priority is based on pattern complexity. The former is
+more "complex" because it folds a load so the latter will not be emitted.
+
+Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
+should always try to match LEA first since the LEA matching code does some
+estimate to determine whether the match is profitable.
+
+However, if we care more about code size, then imull is better. It's two bytes
+shorter than movl + leal.
+
+//===---------------------------------------------------------------------===//
+
+Implement CTTZ, CTLZ with bsf and bsr.
+
+//===---------------------------------------------------------------------===//
+
+It appears gcc places string data with linkonce linkage in
+.section __TEXT,__const_coal,coalesced instead of
+.section __DATA,__const_coal,coalesced.
+Take a look at darwin.h; there are other Darwin assembler directives that we
+do not make use of.
+
+//===---------------------------------------------------------------------===//
+
+int %foo(int* %a, int %t) {
+entry:
+ br label %cond_true
+
+cond_true: ; preds = %cond_true, %entry
+ %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ]
+ %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ]
+ %tmp2 = getelementptr int* %a, int %x.0.0
+ %tmp3 = load int* %tmp2 ; <int> [#uses=1]
+ %tmp5 = add int %t_addr.0.0, %x.0.0 ; <int> [#uses=1]
+ %tmp7 = add int %tmp5, %tmp3 ; <int> [#uses=2]
+ %tmp9 = add int %x.0.0, 1 ; <int> [#uses=2]
+ %tmp = setgt int %tmp9, 39 ; <bool> [#uses=1]
+ br bool %tmp, label %bb12, label %cond_true
+
+bb12: ; preds = %cond_true
+ ret int %tmp7
+}
+
+is pessimized by -loop-reduce and -indvars
+
+//===---------------------------------------------------------------------===//
+
+u32 to float conversion improvement:
+
+float uint32_2_float( unsigned u ) {
+ float fl = (int) (u & 0xffff);
+ float fh = (int) (u >> 16);
+ fh *= 0x1.0p16f;
+ return fh + fl;
+}
+
+00000000 subl $0x04,%esp
+00000003 movl 0x08(%esp,1),%eax
+00000007 movl %eax,%ecx
+00000009 shrl $0x10,%ecx
+0000000c cvtsi2ss %ecx,%xmm0
+00000010 andl $0x0000ffff,%eax
+00000015 cvtsi2ss %eax,%xmm1
+00000019 mulss 0x00000078,%xmm0
+00000021 addss %xmm1,%xmm0
+00000025 movss %xmm0,(%esp,1)
+0000002a flds (%esp,1)
+0000002d addl $0x04,%esp
+00000030 ret
+
+//===---------------------------------------------------------------------===//
+
+When using the fastcc ABI, align the stack slot of an argument of type double on
+an 8-byte boundary to improve performance.
+
+//===---------------------------------------------------------------------===//
+
+Codegen:
+
+int f(int a, int b) {
+ if (a == 4 || a == 6)
+ b++;
+ return b;
+}
+
+
+as:
+
+or eax, 2
+cmp eax, 6
+jz label
+
+//===---------------------------------------------------------------------===//
+
+GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
+simplifications for integer "x cmp y ? a : b". For example, instead of:
+
+int G;
+void f(int X, int Y) {
+ G = X < 0 ? 14 : 13;
+}
+
+compiling to:
+
+_f:
+ movl $14, %eax
+ movl $13, %ecx
+ movl 4(%esp), %edx
+ testl %edx, %edx
+ cmovl %eax, %ecx
+ movl %ecx, _G
+ ret
+
+it could be:
+_f:
+ movl 4(%esp), %eax
+ sarl $31, %eax
+ notl %eax
+ addl $14, %eax
+ movl %eax, _G
+ ret
+
+etc.
+
+//===---------------------------------------------------------------------===//
+
+Currently we don't have elimination of redundant stack manipulations. Consider
+the code:
+
+int %main() {
+entry:
+ call fastcc void %test1( )
+ call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
+ ret int 0
+}
+
+declare fastcc void %test1()
+
+declare fastcc void %test2(sbyte*)
+
+
+This currently compiles to:
+
+ subl $16, %esp
+ call _test5
+ addl $12, %esp
+ subl $16, %esp
+ movl $_test5, (%esp)
+ call _test6
+ addl $12, %esp
+
+The add/sub pair is really unneeded here.
+
+//===---------------------------------------------------------------------===//
+
+We currently compile sign_extend_inreg into two shifts:
+
+long foo(long X) {
+ return (long)(signed char)X;
+}
+
+becomes:
+
+_foo:
+ movl 4(%esp), %eax
+ shll $24, %eax
+ sarl $24, %eax
+ ret
+
+This could be:
+
+_foo:
+ movsbl 4(%esp),%eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+Consider the expansion of:
+
+uint %test3(uint %X) {
+ %tmp1 = rem uint %X, 255
+ ret uint %tmp1
+}
+
+Currently it compiles to:
+
+...
+ movl $2155905153, %ecx
+ movl 8(%esp), %esi
+ movl %esi, %eax
+ mull %ecx
+...
+
+This could be "reassociated" into:
+
+ movl $2155905153, %eax
+ movl 8(%esp), %ecx
+ mull %ecx
+
+to avoid the copy. In fact, the existing two-address stuff would do this
+except that mul isn't a commutative 2-addr instruction. I guess this has
+to be done at isel time based on the #uses of the mul?
+
+//===---------------------------------------------------------------------===//
+
+Make sure the instruction which starts a loop does not cross a cacheline
+boundary. This requires knowing the exact length of each machine instruction.
+That is somewhat complicated, but doable. Example 256.bzip2:
+
+In the new trace, the hot loop has an instruction which crosses a cacheline
+boundary. In addition to potential cache misses, this can't help decoding as I
+imagine there has to be some kind of complicated decoder reset and realignment
+to grab the bytes from the next cacheline.
+
+532 532 0x3cfc movb (1809(%esp, %esi), %bl <<<--- spans 2 64 byte lines
+942 942 0x3d03 movl %dh, (1809(%esp, %esi)
+937 937 0x3d0a incl %esi
+3 3 0x3d0b cmpb %bl, %dl
+27 27 0x3d0d jnz 0x000062db <main+11707>
+
+//===---------------------------------------------------------------------===//
+
+In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
+
+//===---------------------------------------------------------------------===//
+
+This could be a single 16-bit load.
+
+int f(char *p) {
+ if ((p[0] == 1) & (p[1] == 2)) return 1;
+ return 0;
+}
+
+//===---------------------------------------------------------------------===//
+
+We should inline lrintf and probably other libc functions.
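+
+For example (illustrative), this is a libcall today but could be a single
+cvtss2si, which also rounds according to the current rounding mode:
+
+#include <math.h>
+long nearest(float x) {
+  return lrintf(x);
+}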
+
+//===---------------------------------------------------------------------===//
+
+Start using the flags more. For example, compile:
+
+int add_zf(int *x, int y, int a, int b) {
+ if ((*x += y) == 0)
+ return a;
+ else
+ return b;
+}
+
+to:
+ addl %esi, (%rdi)
+ movl %edx, %eax
+ cmovne %ecx, %eax
+ ret
+instead of:
+
+_add_zf:
+ addl (%rdi), %esi
+ movl %esi, (%rdi)
+ testl %esi, %esi
+ cmove %edx, %ecx
+ movl %ecx, %eax
+ ret
+
+and:
+
+int add_zf(int *x, int y, int a, int b) {
+ if ((*x + y) < 0)
+ return a;
+ else
+ return b;
+}
+
+to:
+
+add_zf:
+ addl (%rdi), %esi
+ movl %edx, %eax
+ cmovns %ecx, %eax
+ ret
+
+instead of:
+
+_add_zf:
+ addl (%rdi), %esi
+ testl %esi, %esi
+ cmovs %edx, %ecx
+ movl %ecx, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+This:
+#include <math.h>
+int foo(double X) { return isnan(X); }
+
+compiles to (-m64):
+
+_foo:
+ pxor %xmm1, %xmm1
+ ucomisd %xmm1, %xmm0
+ setp %al
+ movzbl %al, %eax
+ ret
+
+The pxor is not needed; we could compare the value against itself.
+
+//===---------------------------------------------------------------------===//
+
+These two functions have identical effects:
+
+unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
+unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
+
+We currently compile them to:
+
+_f:
+ movl 4(%esp), %eax
+ movl %eax, %ecx
+ incl %ecx
+ movl 8(%esp), %edx
+ cmpl %edx, %ecx
+ jne LBB1_2 #UnifiedReturnBlock
+LBB1_1: #cond_true
+ addl $2, %eax
+ ret
+LBB1_2: #UnifiedReturnBlock
+ movl %ecx, %eax
+ ret
+_f2:
+ movl 4(%esp), %eax
+ movl %eax, %ecx
+ incl %ecx
+ cmpl 8(%esp), %ecx
+ sete %cl
+ movzbl %cl, %ecx
+ leal 1(%ecx,%eax), %eax
+ ret
+
+both of which are inferior to GCC's:
+
+_f:
+ movl 4(%esp), %edx
+ leal 1(%edx), %eax
+ addl $2, %edx
+ cmpl 8(%esp), %eax
+ cmove %edx, %eax
+ ret
+_f2:
+ movl 4(%esp), %eax
+ addl $1, %eax
+ xorl %edx, %edx
+ cmpl 8(%esp), %eax
+ sete %dl
+ addl %edx, %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+void test(int X) {
+ if (X) abort();
+}
+
+is currently compiled to:
+
+_test:
+ subl $12, %esp
+ cmpl $0, 16(%esp)
+ jne LBB1_1
+ addl $12, %esp
+ ret
+LBB1_1:
+ call L_abort$stub
+
+It would be better to produce:
+
+_test:
+ subl $12, %esp
+ cmpl $0, 16(%esp)
+ jne L_abort$stub
+ addl $12, %esp
+ ret
+
+This can be applied to any no-return function call that takes no arguments etc.
+Alternatively, the stack save/restore logic could be shrink-wrapped, producing
+something like this:
+
+_test:
+ cmpl $0, 4(%esp)
+ jne LBB1_1
+ ret
+LBB1_1:
+ subl $12, %esp
+ call L_abort$stub
+
+Both are useful in different situations. Finally, it could be shrink-wrapped
+and tail called, like this:
+
+_test:
+ cmpl $0, 4(%esp)
+ jne LBB1_1
+ ret
+LBB1_1:
+ pop %eax # realign stack.
+ call L_abort$stub
+
+Though this probably isn't worth it.
+
+//===---------------------------------------------------------------------===//
+
+We need to teach the codegen to convert two-address INC instructions to LEA
+when the flags are dead. For example, on X86-64, compile:
+
+int foo(int A, int B) {
+ return A+1;
+}
+
+to:
+
+_foo:
+ leal 1(%edi), %eax
+ ret
+
+instead of:
+
+_foo:
+ incl %edi
+ movl %edi, %eax
+ ret
+
+Another example is:
+
+;; X's live range extends beyond the shift, so the register allocator
+;; cannot coalesce it with Y. Because of this, a copy needs to be
+;; emitted before the shift to save the register value before it is
+;; clobbered. However, this copy is not needed if the register
+;; allocator turns the shift into an LEA. This also occurs for ADD.
+
+; Check that the shift gets turned into an LEA.
+; RUN: llvm-upgrade < %s | llvm-as | llc -march=x86 -x86-asm-syntax=intel | \
+; RUN: not grep {mov E.X, E.X}
+
+%G = external global int
+
+int %test1(int %X, int %Y) {
+ %Z = add int %X, %Y
+ volatile store int %Y, int* %G
+ volatile store int %Z, int* %G
+ ret int %X
+}
+
+int %test2(int %X) {
+ %Z = add int %X, 1 ;; inc
+ volatile store int %Z, int* %G
+ ret int %X
+}
+
+//===---------------------------------------------------------------------===//
+
+We use push/pop of stack space around calls in situations where we don't have to.
+Call to f below produces:
+ subl $16, %esp <<<<<
+ movl %eax, (%esp)
+ call L_f$stub
+ addl $16, %esp <<<<<
+The stack push/pop can be moved into the prolog/epilog. The backend does this
+because it is building the frame pointer, but that should not be sufficient;
+only the use of alloca should cause it to do this.
+(There are other issues shown by this code, but this is one.)
+
+typedef struct _range_t {
+ float fbias;
+ float fscale;
+ int ibias;
+ int iscale;
+ int ishift;
+ unsigned char lut[];
+} range_t;
+
+struct _decode_t {
+ int type:4;
+ int unit:4;
+ int alpha:8;
+ int N:8;
+ int bpc:8;
+ int bpp:16;
+ int skip:8;
+ int swap:8;
+ const range_t*const*range;
+};
+
+typedef struct _decode_t decode_t;
+
+extern int f(const decode_t* decode);
+
+int decode_byte (const decode_t* decode) {
+ if (decode->swap != 0)
+ return f(decode);
+ return 0;
+}
+
+
+//===---------------------------------------------------------------------===//
+
+This:
+#include <xmmintrin.h>
+unsigned test(float f) {
+ return _mm_cvtsi128_si32( (__m128i) _mm_set_ss( f ));
+}
+
+Compiles to:
+_test:
+ movss 4(%esp), %xmm0
+ movd %xmm0, %eax
+ ret
+
+It should compile to a move from the stack slot directly into eax. DAGCombine
+has this xform, but it is currently disabled until the alignment fields of
+the load/store nodes are trustworthy.
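+
+Roughly, a sketch of the desired output (not what the backend produces today):
+
+_test:
+ movl 4(%esp), %eax
+ ret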
+
+//===---------------------------------------------------------------------===//
+
+Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
+a neg instead of a sub instruction. Consider:
+
+int test(char X) { return 7-X; }
+
+we currently produce:
+_test:
+ movl $7, %eax
+ movsbl 4(%esp), %ecx
+ subl %ecx, %eax
+ ret
+
+We would use one fewer register if codegen'd as:
+
+ movsbl 4(%esp), %eax
+ neg %eax
+ add $7, %eax
+ ret
+
+Note that this isn't beneficial if the load can be folded into the sub. In
+this case, we want a sub:
+
+int test(int X) { return 7-X; }
+_test:
+ movl $7, %eax
+ subl 4(%esp), %eax
+ ret
+
+//===---------------------------------------------------------------------===//
+
+For code like:
+phi (undef, x)
+
+We get an implicit def on the undef side. If the phi is spilled, we then get:
+implicitdef xmm1
+store xmm1 -> stack
+
+It should be possible to teach the x86 backend to "fold" the store into the
+implicitdef, which just deletes the implicit def.
+
+These instructions should go away:
+#IMPLICIT_DEF %xmm1
+movaps %xmm1, 192(%esp)
+movaps %xmm1, 224(%esp)
+movaps %xmm1, 176(%esp)
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
new file mode 100644
index 0000000..c7663be
--- /dev/null
+++ b/lib/Target/X86/X86.h
@@ -0,0 +1,66 @@
+//===-- X86.h - Top-level interface for X86 representation ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the x86
+// target library, as used by the LLVM JIT.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_X86_H
+#define TARGET_X86_H
+
+#include <iosfwd>
+
+namespace llvm {
+
+class X86TargetMachine;
+class FunctionPassManager;
+class FunctionPass;
+class MachineCodeEmitter;
+
+/// createX86ISelDag - This pass converts a legalized DAG into a
+/// X86-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *createX86ISelDag(X86TargetMachine &TM, bool Fast);
+
+/// createX86FloatingPointStackifierPass - This function returns a pass which
+/// converts floating point register references and pseudo instructions into
+/// floating point stack references and physical instructions.
+///
+FunctionPass *createX86FloatingPointStackifierPass();
+
+/// createX86CodePrinterPass - Returns a pass that prints the X86
+/// assembly code for a MachineFunction to the given output stream,
+/// using the given target machine description.
+///
+FunctionPass *createX86CodePrinterPass(std::ostream &o, X86TargetMachine &tm);
+
+/// createX86CodeEmitterPass - Return a pass that emits the collected X86 code
+/// to the specified MCE object.
+FunctionPass *createX86CodeEmitterPass(X86TargetMachine &TM,
+ MachineCodeEmitter &MCE);
+
+/// createX86EmitCodeToMemory - Returns a pass that converts a register
+/// allocated function into raw machine code in a dynamically
+/// allocated chunk of memory.
+///
+FunctionPass *createEmitX86CodeToMemory();
+
+} // End llvm namespace
+
+// Defines symbolic names for X86 registers. This defines a mapping from
+// register name to register number.
+//
+#include "X86GenRegisterNames.inc"
+
+// Defines symbolic names for the X86 instructions.
+//
+#include "X86GenInstrNames.inc"
+
+#endif
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
new file mode 100644
index 0000000..98362c8
--- /dev/null
+++ b/lib/Target/X86/X86.td
@@ -0,0 +1,150 @@
+//===- X86.td - Target definition file for the Intel X86 arch ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a target description file for the Intel i386 architecture, referred to
+// here as the "X86" architecture.
+//
+//===----------------------------------------------------------------------===//
+
+// Get the target-independent interfaces which we are implementing...
+//
+include "../Target.td"
+
+//===----------------------------------------------------------------------===//
+// X86 Subtarget features.
+//===----------------------------------------------------------------------===//
+
+def FeatureMMX : SubtargetFeature<"mmx","X86SSELevel", "MMX",
+ "Enable MMX instructions">;
+def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1",
+ "Enable SSE instructions",
+ [FeatureMMX]>;
+def FeatureSSE2 : SubtargetFeature<"sse2", "X86SSELevel", "SSE2",
+ "Enable SSE2 instructions",
+ [FeatureSSE1]>;
+def FeatureSSE3 : SubtargetFeature<"sse3", "X86SSELevel", "SSE3",
+ "Enable SSE3 instructions",
+ [FeatureSSE2]>;
+def FeatureSSSE3 : SubtargetFeature<"ssse3", "X86SSELevel", "SSSE3",
+ "Enable SSSE3 instructions",
+ [FeatureSSE3]>;
+def Feature3DNow : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow",
+ "Enable 3DNow! instructions">;
+def Feature3DNowA : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA",
+ "Enable 3DNow! Athlon instructions",
+ [Feature3DNow]>;
+def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true",
+ "Support 64-bit instructions",
+ [FeatureSSE2]>;
+
+//===----------------------------------------------------------------------===//
+// X86 processors supported.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"generic", []>;
+def : Proc<"i386", []>;
+def : Proc<"i486", []>;
+def : Proc<"pentium", []>;
+def : Proc<"pentium-mmx", [FeatureMMX]>;
+def : Proc<"i686", []>;
+def : Proc<"pentiumpro", []>;
+def : Proc<"pentium2", [FeatureMMX]>;
+def : Proc<"pentium3", [FeatureSSE1]>;
+def : Proc<"pentium-m", [FeatureSSE2]>;
+def : Proc<"pentium4", [FeatureSSE2]>;
+def : Proc<"x86-64", [Feature64Bit]>;
+def : Proc<"yonah", [FeatureSSE3]>;
+def : Proc<"prescott", [FeatureSSE3]>;
+def : Proc<"nocona", [FeatureSSE3]>;
+def : Proc<"core2", [FeatureSSSE3]>;
+
+def : Proc<"k6", [FeatureMMX]>;
+def : Proc<"k6-2", [FeatureMMX, Feature3DNow]>;
+def : Proc<"k6-3", [FeatureMMX, Feature3DNow]>;
+def : Proc<"athlon", [FeatureMMX, Feature3DNowA]>;
+def : Proc<"athlon-tbird", [FeatureMMX, Feature3DNowA]>;
+def : Proc<"athlon-4", [FeatureSSE1, Feature3DNowA]>;
+def : Proc<"athlon-xp", [FeatureSSE1, Feature3DNowA]>;
+def : Proc<"athlon-mp", [FeatureSSE1, Feature3DNowA]>;
+def : Proc<"k8", [Feature3DNowA, Feature64Bit]>;
+def : Proc<"opteron", [Feature3DNowA, Feature64Bit]>;
+def : Proc<"athlon64", [Feature3DNowA, Feature64Bit]>;
+def : Proc<"athlon-fx", [Feature3DNowA, Feature64Bit]>;
+
+def : Proc<"winchip-c6", [FeatureMMX]>;
+def : Proc<"winchip2", [FeatureMMX, Feature3DNow]>;
+def : Proc<"c3", [FeatureMMX, Feature3DNow]>;
+def : Proc<"c3-2", [FeatureSSE1]>;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "X86RegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "X86InstrInfo.td"
+
+def X86InstrInfo : InstrInfo {
+
+ // Define how we want to lay out our target-specific information field. This
+ // should be kept up-to-date with the fields in the X86InstrInfo.h file.
+ let TSFlagsFields = ["FormBits",
+ "hasOpSizePrefix",
+ "hasAdSizePrefix",
+ "Prefix",
+ "hasREX_WPrefix",
+ "ImmTypeBits",
+ "FPFormBits",
+ "Opcode"];
+ let TSFlagsShifts = [0,
+ 6,
+ 7,
+ 8,
+ 12,
+ 13,
+ 16,
+ 24];
+}
+
+//===----------------------------------------------------------------------===//
+// Calling Conventions
+//===----------------------------------------------------------------------===//
+
+include "X86CallingConv.td"
+
+
+//===----------------------------------------------------------------------===//
+// Assembly Printers
+//===----------------------------------------------------------------------===//
+
+// The X86 target supports two different syntaxes for emitting machine code.
+// This is controlled by the -x86-asm-syntax={att|intel} flag.
+def ATTAsmWriter : AsmWriter {
+ string AsmWriterClassName = "ATTAsmPrinter";
+ int Variant = 0;
+}
+def IntelAsmWriter : AsmWriter {
+ string AsmWriterClassName = "IntelAsmPrinter";
+ int Variant = 1;
+}
+
+
+def X86 : Target {
+ // Information about the instructions...
+ let InstructionSet = X86InstrInfo;
+
+ let AssemblyWriters = [ATTAsmWriter, IntelAsmWriter];
+}
diff --git a/lib/Target/X86/X86ATTAsmPrinter.cpp b/lib/Target/X86/X86ATTAsmPrinter.cpp
new file mode 100755
index 0000000..e97babe
--- /dev/null
+++ b/lib/Target/X86/X86ATTAsmPrinter.cpp
@@ -0,0 +1,607 @@
+//===-- X86ATTAsmPrinter.cpp - Convert X86 LLVM code to AT&T assembly -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to AT&T format assembly
+// language. This printer is the output mechanism used by `llc'.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "X86ATTAsmPrinter.h"
+#include "X86.h"
+#include "X86COFF.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86TargetMachine.h"
+#include "X86TargetAsmInfo.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Module.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(EmittedInsts, "Number of machine instrs printed");
+
+static std::string computePICLabel(unsigned FnNum,
+ const TargetAsmInfo *TAI,
+ const X86Subtarget* Subtarget) {
+ std::string label;
+ if (Subtarget->isTargetDarwin())
+ label = "\"L" + utostr_32(FnNum) + "$pb\"";
+ else if (Subtarget->isTargetELF())
+ label = ".Lllvm$" + utostr_32(FnNum) + "$piclabel";
+ else
+ assert(0 && "Don't know how to print PIC label!\n");
+
+ return label;
+}
+
+/// getSectionForFunction - Return the section that we should emit the
+/// specified function body into.
+std::string X86ATTAsmPrinter::getSectionForFunction(const Function &F) const {
+ switch (F.getLinkage()) {
+ default: assert(0 && "Unknown linkage type!");
+ case Function::InternalLinkage:
+ case Function::DLLExportLinkage:
+ case Function::ExternalLinkage:
+ return TAI->getTextSection();
+ case Function::WeakLinkage:
+ case Function::LinkOnceLinkage:
+ if (Subtarget->isTargetDarwin()) {
+ return ".section __TEXT,__textcoal_nt,coalesced,pure_instructions";
+ } else if (Subtarget->isTargetCygMing()) {
+ return "\t.section\t.text$linkonce." + CurrentFnName + ",\"ax\"";
+ } else {
+ return "\t.section\t.llvm.linkonce.t." + CurrentFnName +
+ ",\"ax\",@progbits";
+ }
+ }
+}
+
+/// runOnMachineFunction - This uses the printMachineInstruction()
+/// method to print assembly for each instruction.
+///
+bool X86ATTAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+ if (TAI->doesSupportDebugInformation()) {
+ // Let PassManager know we need debug information and relay
+ // the MachineModuleInfo address on to DwarfWriter.
+ DW.SetModuleInfo(&getAnalysis<MachineModuleInfo>());
+ }
+
+ SetupMachineFunction(MF);
+ O << "\n\n";
+
+ // Print out constants referenced by the function
+ EmitConstantPool(MF.getConstantPool());
+
+ // Print out labels for the function.
+ const Function *F = MF.getFunction();
+ unsigned CC = F->getCallingConv();
+
+ // Populate the function information map. We only want to populate it for
+ // stdcall and fastcall functions right now.
+ if (CC == CallingConv::X86_StdCall || CC == CallingConv::X86_FastCall)
+ FunctionInfoMap[F] = *MF.getInfo<X86MachineFunctionInfo>();
+
+ X86SharedAsmPrinter::decorateName(CurrentFnName, F);
+
+ SwitchToTextSection(getSectionForFunction(*F).c_str(), F);
+
+ switch (F->getLinkage()) {
+ default: assert(0 && "Unknown linkage type!");
+ case Function::InternalLinkage: // Symbols default to internal.
+ EmitAlignment(4, F); // FIXME: This should be parameterized somewhere.
+ break;
+ case Function::DLLExportLinkage:
+ DLLExportedFns.insert(Mang->makeNameProper(F->getName(), ""));
+ //FALLS THROUGH
+ case Function::ExternalLinkage:
+ EmitAlignment(4, F); // FIXME: This should be parameterized somewhere.
+ O << "\t.globl\t" << CurrentFnName << "\n";
+ break;
+ case Function::LinkOnceLinkage:
+ case Function::WeakLinkage:
+ if (Subtarget->isTargetDarwin()) {
+ O << "\t.globl\t" << CurrentFnName << "\n";
+ O << "\t.weak_definition\t" << CurrentFnName << "\n";
+ } else if (Subtarget->isTargetCygMing()) {
+ EmitAlignment(4, F); // FIXME: This should be parameterized somewhere.
+ O << "\t.globl " << CurrentFnName << "\n";
+ O << "\t.linkonce discard\n";
+ } else {
+ EmitAlignment(4, F); // FIXME: This should be parameterized somewhere.
+ O << "\t.weak " << CurrentFnName << "\n";
+ }
+ break;
+ }
+ if (F->hasHiddenVisibility()) {
+ if (const char *Directive = TAI->getHiddenDirective())
+ O << Directive << CurrentFnName << "\n";
+ } else if (F->hasProtectedVisibility()) {
+ if (const char *Directive = TAI->getProtectedDirective())
+ O << Directive << CurrentFnName << "\n";
+ }
+
+ if (Subtarget->isTargetELF())
+ O << "\t.type " << CurrentFnName << ",@function\n";
+ else if (Subtarget->isTargetCygMing()) {
+ O << "\t.def\t " << CurrentFnName
+ << ";\t.scl\t" <<
+ (F->getLinkage() == Function::InternalLinkage ? COFF::C_STAT : COFF::C_EXT)
+ << ";\t.type\t" << (COFF::DT_FCN << COFF::N_BTSHFT)
+ << ";\t.endef\n";
+ }
+
+ O << CurrentFnName << ":\n";
+ // Add a workaround for linkonce linkage on Cygwin/MinGW
+ if (Subtarget->isTargetCygMing() &&
+ (F->getLinkage() == Function::LinkOnceLinkage ||
+ F->getLinkage() == Function::WeakLinkage))
+ O << "Lllvm$workaround$fake$stub$" << CurrentFnName << ":\n";
+
+ if (TAI->doesSupportDebugInformation()) {
+ // Emit pre-function debug information.
+ DW.BeginFunction(&MF);
+ }
+
+ // Print out code for the function.
+ for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+ I != E; ++I) {
+ // Print a label for the basic block.
+ if (I->pred_begin() != I->pred_end()) {
+ printBasicBlockLabel(I, true);
+ O << '\n';
+ }
+ for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
+ II != E; ++II) {
+ // Print the assembly for the instruction.
+ O << "\t";
+ printMachineInstruction(II);
+ }
+ }
+
+ if (TAI->hasDotTypeDotSizeDirective())
+ O << "\t.size " << CurrentFnName << ", .-" << CurrentFnName << "\n";
+
+ if (TAI->doesSupportDebugInformation()) {
+ // Emit post-function debug information.
+ DW.EndFunction();
+ }
+
+ // Print out jump tables referenced by the function.
+ EmitJumpTableInfo(MF.getJumpTableInfo(), MF);
+
+ // We didn't modify anything.
+ return false;
+}
+
+static inline bool printGOT(TargetMachine &TM, const X86Subtarget* ST) {
+ return ST->isPICStyleGOT() && TM.getRelocationModel() == Reloc::PIC_;
+}
+
+static inline bool printStub(TargetMachine &TM, const X86Subtarget* ST) {
+ return ST->isPICStyleStub() && TM.getRelocationModel() != Reloc::Static;
+}
+
+void X86ATTAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
+ const char *Modifier, bool NotRIPRel) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ const MRegisterInfo &RI = *TM.getRegisterInfo();
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register: {
+ assert(MRegisterInfo::isPhysicalRegister(MO.getReg()) &&
+ "Virtual registers should not make it this far!");
+ O << '%';
+ unsigned Reg = MO.getReg();
+ if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) {
+ MVT::ValueType VT = (strcmp(Modifier+6,"64") == 0) ?
+ MVT::i64 : ((strcmp(Modifier+6, "32") == 0) ? MVT::i32 :
+ ((strcmp(Modifier+6,"16") == 0) ? MVT::i16 : MVT::i8));
+ Reg = getX86SubSuperRegister(Reg, VT);
+ }
+ for (const char *Name = RI.get(Reg).Name; *Name; ++Name)
+ O << (char)tolower(*Name);
+ return;
+ }
+
+ case MachineOperand::MO_Immediate:
+ if (!Modifier ||
+ (strcmp(Modifier, "debug") && strcmp(Modifier, "mem")))
+ O << '$';
+ O << MO.getImmedValue();
+ return;
+ case MachineOperand::MO_MachineBasicBlock:
+ printBasicBlockLabel(MO.getMachineBasicBlock());
+ return;
+ case MachineOperand::MO_JumpTableIndex: {
+ bool isMemOp = Modifier && !strcmp(Modifier, "mem");
+ if (!isMemOp) O << '$';
+ O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber() << "_"
+ << MO.getJumpTableIndex();
+
+ if (TM.getRelocationModel() == Reloc::PIC_) {
+ if (Subtarget->isPICStyleStub())
+ O << "-\"" << TAI->getPrivateGlobalPrefix() << getFunctionNumber()
+ << "$pb\"";
+ else if (Subtarget->isPICStyleGOT())
+ O << "@GOTOFF";
+ }
+
+ if (isMemOp && Subtarget->isPICStyleRIPRel() && !NotRIPRel)
+ O << "(%rip)";
+ return;
+ }
+ case MachineOperand::MO_ConstantPoolIndex: {
+ bool isMemOp = Modifier && !strcmp(Modifier, "mem");
+ if (!isMemOp) O << '$';
+ O << TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_"
+ << MO.getConstantPoolIndex();
+
+ if (TM.getRelocationModel() == Reloc::PIC_) {
+ if (Subtarget->isPICStyleStub())
+ O << "-\"" << TAI->getPrivateGlobalPrefix() << getFunctionNumber()
+ << "$pb\"";
+ else if (Subtarget->isPICStyleGOT())
+ O << "@GOTOFF";
+ }
+
+ int Offset = MO.getOffset();
+ if (Offset > 0)
+ O << "+" << Offset;
+ else if (Offset < 0)
+ O << Offset;
+
+ if (isMemOp && Subtarget->isPICStyleRIPRel() && !NotRIPRel)
+ O << "(%rip)";
+ return;
+ }
+ case MachineOperand::MO_GlobalAddress: {
+ bool isCallOp = Modifier && !strcmp(Modifier, "call");
+ bool isMemOp = Modifier && !strcmp(Modifier, "mem");
+ bool needCloseParen = false;
+
+ GlobalValue *GV = MO.getGlobal();
+ GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+ bool isThreadLocal = GVar && GVar->isThreadLocal();
+
+ std::string Name = Mang->getValueName(GV);
+ X86SharedAsmPrinter::decorateName(Name, GV);
+
+ if (!isMemOp && !isCallOp)
+ O << '$';
+ else if (Name[0] == '$') {
+ // The name begins with a dollar-sign. In order to avoid having it look
+ // like an integer immediate to the assembler, enclose it in parens.
+ O << '(';
+ needCloseParen = true;
+ }
+
+ if (printStub(TM, Subtarget)) {
+ // Link-once, declaration-only, or weakly-linked global variables need
+ // non-lazily-resolved stubs
+ if (GV->isDeclaration() ||
+ GV->hasWeakLinkage() ||
+ GV->hasLinkOnceLinkage()) {
+ // Dynamically-resolved functions need a stub for the function.
+ if (isCallOp && isa<Function>(GV)) {
+ FnStubs.insert(Name);
+ O << TAI->getPrivateGlobalPrefix() << Name << "$stub";
+ } else {
+ GVStubs.insert(Name);
+ O << TAI->getPrivateGlobalPrefix() << Name << "$non_lazy_ptr";
+ }
+ } else {
+ if (GV->hasDLLImportLinkage())
+ O << "__imp_";
+ O << Name;
+ }
+
+ if (!isCallOp && TM.getRelocationModel() == Reloc::PIC_)
+ O << "-\"" << TAI->getPrivateGlobalPrefix() << getFunctionNumber()
+ << "$pb\"";
+ } else {
+ if (GV->hasDLLImportLinkage()) {
+ O << "__imp_";
+ }
+ O << Name;
+
+ if (isCallOp && isa<Function>(GV)) {
+ if (printGOT(TM, Subtarget)) {
+ // Assemble call via PLT for non-local symbols
+ if (!(GV->hasHiddenVisibility() || GV->hasProtectedVisibility()) ||
+ GV->isDeclaration())
+ O << "@PLT";
+ }
+ if (Subtarget->isTargetCygMing() && GV->isDeclaration())
+ // Save function name for later type emission
+ FnStubs.insert(Name);
+ }
+ }
+
+ if (GV->hasExternalWeakLinkage())
+ ExtWeakSymbols.insert(GV);
+
+ int Offset = MO.getOffset();
+ if (Offset > 0)
+ O << "+" << Offset;
+ else if (Offset < 0)
+ O << Offset;
+
+ if (isThreadLocal) {
+ if (TM.getRelocationModel() == Reloc::PIC_)
+ O << "@TLSGD"; // general dynamic TLS model
+ else
+ if (GV->isDeclaration())
+ O << "@INDNTPOFF"; // initial exec TLS model
+ else
+ O << "@NTPOFF"; // local exec TLS model
+ } else if (isMemOp) {
+ if (printGOT(TM, Subtarget)) {
+ if (Subtarget->GVRequiresExtraLoad(GV, TM, false))
+ O << "@GOT";
+ else
+ O << "@GOTOFF";
+ } else if (Subtarget->isPICStyleRIPRel() && !NotRIPRel) {
+ if ((GV->isDeclaration() ||
+ GV->hasWeakLinkage() ||
+ GV->hasLinkOnceLinkage()) &&
+ TM.getRelocationModel() != Reloc::Static)
+ O << "@GOTPCREL";
+
+ if (needCloseParen) {
+ needCloseParen = false;
+ O << ')';
+ }
+
+ // Use %rip when possible to reduce code size, except when the index or
+ // base register is also part of the address, e.g.
+ // foo(%rip)(%rcx,%rax,4) is not legal.
+ O << "(%rip)";
+ }
+ }
+
+ if (needCloseParen)
+ O << ')';
+
+ return;
+ }
+ case MachineOperand::MO_ExternalSymbol: {
+ bool isCallOp = Modifier && !strcmp(Modifier, "call");
+ bool needCloseParen = false;
+ std::string Name(TAI->getGlobalPrefix());
+ Name += MO.getSymbolName();
+ if (isCallOp && printStub(TM, Subtarget)) {
+ FnStubs.insert(Name);
+ O << TAI->getPrivateGlobalPrefix() << Name << "$stub";
+ return;
+ }
+ if (!isCallOp)
+ O << '$';
+ else if (Name[0] == '$') {
+ // The name begins with a dollar-sign. In order to avoid having it look
+ // like an integer immediate to the assembler, enclose it in parens.
+ O << '(';
+ needCloseParen = true;
+ }
+
+ O << Name;
+
+ if (printGOT(TM, Subtarget)) {
+ std::string GOTName(TAI->getGlobalPrefix());
+ GOTName+="_GLOBAL_OFFSET_TABLE_";
+ if (Name == GOTName)
+ // HACK! Emit an extra offset from the PC while printing the GOT offset to
+ // compensate for the size of the popl instruction. The resulting code
+ // should look like:
+ // call .piclabel
+ // piclabel:
+ // popl %some_register
+ // addl $_GLOBAL_OFFSET_TABLE_ + [.-piclabel], %some_register
+ O << " + [.-"
+ << computePICLabel(getFunctionNumber(), TAI, Subtarget) << "]";
+
+ if (isCallOp)
+ O << "@PLT";
+ }
+
+ if (needCloseParen)
+ O << ')';
+
+ if (!isCallOp && Subtarget->isPICStyleRIPRel())
+ O << "(%rip)";
+
+ return;
+ }
+ default:
+ O << "<unknown operand type>"; return;
+ }
+}
+
+void X86ATTAsmPrinter::printSSECC(const MachineInstr *MI, unsigned Op) {
+ unsigned char value = MI->getOperand(Op).getImmedValue();
+ assert(value <= 7 && "Invalid ssecc argument!");
+ switch (value) {
+ case 0: O << "eq"; break;
+ case 1: O << "lt"; break;
+ case 2: O << "le"; break;
+ case 3: O << "unord"; break;
+ case 4: O << "neq"; break;
+ case 5: O << "nlt"; break;
+ case 6: O << "nle"; break;
+ case 7: O << "ord"; break;
+ }
+}
+
+void X86ATTAsmPrinter::printMemReference(const MachineInstr *MI, unsigned Op,
+ const char *Modifier){
+ assert(isMem(MI, Op) && "Invalid memory reference!");
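+ // An AT&T memory operand prints as disp(base,index,scale); the machine
+ // operands are laid out as base register, scale, index register, and
+ // displacement (see isMem() in X86AsmPrinter.h).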
+ MachineOperand BaseReg = MI->getOperand(Op);
+ MachineOperand IndexReg = MI->getOperand(Op+2);
+ const MachineOperand &DispSpec = MI->getOperand(Op+3);
+
+ bool NotRIPRel = IndexReg.getReg() || BaseReg.getReg();
+ if (DispSpec.isGlobalAddress() ||
+ DispSpec.isConstantPoolIndex() ||
+ DispSpec.isJumpTableIndex()) {
+ printOperand(MI, Op+3, "mem", NotRIPRel);
+ } else {
+ int DispVal = DispSpec.getImmedValue();
+ if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg()))
+ O << DispVal;
+ }
+
+ if (IndexReg.getReg() || BaseReg.getReg()) {
+ unsigned ScaleVal = MI->getOperand(Op+1).getImmedValue();
+ unsigned BaseRegOperand = 0, IndexRegOperand = 2;
+
+ // There are cases where we can end up with ESP/RSP in the indexreg slot.
+ // If this happens, swap the base/index register to support assemblers that
+ // don't work when the index is *SP.
+ if (IndexReg.getReg() == X86::ESP || IndexReg.getReg() == X86::RSP) {
+ assert(ScaleVal == 1 && "Scale not supported for stack pointer!");
+ std::swap(BaseReg, IndexReg);
+ std::swap(BaseRegOperand, IndexRegOperand);
+ }
+
+ O << "(";
+ if (BaseReg.getReg())
+ printOperand(MI, Op+BaseRegOperand, Modifier);
+
+ if (IndexReg.getReg()) {
+ O << ",";
+ printOperand(MI, Op+IndexRegOperand, Modifier);
+ if (ScaleVal != 1)
+ O << "," << ScaleVal;
+ }
+ O << ")";
+ }
+}
+
+void X86ATTAsmPrinter::printPICLabel(const MachineInstr *MI, unsigned Op) {
+ std::string label = computePICLabel(getFunctionNumber(), TAI, Subtarget);
+ O << label << "\n" << label << ":";
+}
+
+
+bool X86ATTAsmPrinter::printAsmMRegister(const MachineOperand &MO,
+ const char Mode) {
+ const MRegisterInfo &RI = *TM.getRegisterInfo();
+ unsigned Reg = MO.getReg();
+ switch (Mode) {
+ default: return true; // Unknown mode.
+ case 'b': // Print QImode register
+ Reg = getX86SubSuperRegister(Reg, MVT::i8);
+ break;
+ case 'h': // Print QImode high register
+ Reg = getX86SubSuperRegister(Reg, MVT::i8, true);
+ break;
+ case 'w': // Print HImode register
+ Reg = getX86SubSuperRegister(Reg, MVT::i16);
+ break;
+ case 'k': // Print SImode register
+ Reg = getX86SubSuperRegister(Reg, MVT::i32);
+ break;
+ }
+
+ O << '%';
+ for (const char *Name = RI.get(Reg).Name; *Name; ++Name)
+ O << (char)tolower(*Name);
+ return false;
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool X86ATTAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode) {
+ // Does this asm operand have a single letter operand modifier?
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
+ default: return true; // Unknown modifier.
+ case 'c': // Don't print "$" before a global var name or constant.
+ printOperand(MI, OpNo, "mem");
+ return false;
+ case 'b': // Print QImode register
+ case 'h': // Print QImode high register
+ case 'w': // Print HImode register
+ case 'k': // Print SImode register
+ if (MI->getOperand(OpNo).isReg())
+ return printAsmMRegister(MI->getOperand(OpNo), ExtraCode[0]);
+ printOperand(MI, OpNo);
+ return false;
+
+ case 'P': // Don't print @PLT, but do print as memory.
+ printOperand(MI, OpNo, "mem");
+ return false;
+ }
+ }
+
+ printOperand(MI, OpNo);
+ return false;
+}
+
+bool X86ATTAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+ unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode) {
+ if (ExtraCode && ExtraCode[0])
+ return true; // Unknown modifier.
+ printMemReference(MI, OpNo);
+ return false;
+}
+
+/// printMachineInstruction -- Print out a single X86 LLVM instruction
+/// MI in AT&T syntax to the current output stream.
+///
+void X86ATTAsmPrinter::printMachineInstruction(const MachineInstr *MI) {
+ ++EmittedInsts;
+
+ // See if a truncate instruction can be turned into a nop.
+ switch (MI->getOpcode()) {
+ default: break;
+ case X86::TRUNC_64to32:
+ case X86::TRUNC_64to16:
+ case X86::TRUNC_32to16:
+ case X86::TRUNC_32to8:
+ case X86::TRUNC_16to8:
+ case X86::TRUNC_32_to8:
+ case X86::TRUNC_16_to8: {
+ const MachineOperand &MO0 = MI->getOperand(0);
+ const MachineOperand &MO1 = MI->getOperand(1);
+ unsigned Reg0 = MO0.getReg();
+ unsigned Reg1 = MO1.getReg();
+ unsigned Opc = MI->getOpcode();
+ if (Opc == X86::TRUNC_64to32)
+ Reg1 = getX86SubSuperRegister(Reg1, MVT::i32);
+ else if (Opc == X86::TRUNC_32to16 || Opc == X86::TRUNC_64to16)
+ Reg1 = getX86SubSuperRegister(Reg1, MVT::i16);
+ else
+ Reg1 = getX86SubSuperRegister(Reg1, MVT::i8);
+ O << TAI->getCommentString() << " TRUNCATE ";
+ if (Reg0 != Reg1)
+ O << "\n\t";
+ break;
+ }
+ case X86::PsMOVZX64rr32:
+ O << TAI->getCommentString() << " ZERO-EXTEND " << "\n\t";
+ break;
+ }
+
+ // Call the autogenerated instruction printer routines.
+ printInstruction(MI);
+}
+
+// Include the auto-generated portion of the assembly writer.
+#include "X86GenAsmWriter.inc"
+
diff --git a/lib/Target/X86/X86ATTAsmPrinter.h b/lib/Target/X86/X86ATTAsmPrinter.h
new file mode 100755
index 0000000..a3bdce9
--- /dev/null
+++ b/lib/Target/X86/X86ATTAsmPrinter.h
@@ -0,0 +1,87 @@
+//===-- X86ATTAsmPrinter.h - Convert X86 LLVM code to AT&T assembly -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// AT&T assembly code printer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86ATTASMPRINTER_H
+#define X86ATTASMPRINTER_H
+
+#include "X86AsmPrinter.h"
+#include "llvm/CodeGen/ValueTypes.h"
+
+namespace llvm {
+
+struct X86ATTAsmPrinter : public X86SharedAsmPrinter {
+ X86ATTAsmPrinter(std::ostream &O, X86TargetMachine &TM, const TargetAsmInfo *T)
+ : X86SharedAsmPrinter(O, TM, T) { }
+
+ virtual const char *getPassName() const {
+ return "X86 AT&T-Style Assembly Printer";
+ }
+
+ /// printInstruction - This method is automatically generated by tablegen
+ /// from the instruction set description. This method returns true if the
+ /// machine instruction was sufficiently described to print it, otherwise it
+ /// returns false.
+ bool printInstruction(const MachineInstr *MI);
+
+ // These methods are used by the tablegen'erated instruction printer.
+ void printOperand(const MachineInstr *MI, unsigned OpNo,
+ const char *Modifier = 0, bool NotRIPRel = false);
+ void printi8mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printi16mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printi32mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printi64mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printi128mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printf32mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printf64mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printf128mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo);
+ }
+ void printlea64_32mem(const MachineInstr *MI, unsigned OpNo) {
+ printMemReference(MI, OpNo, "subreg64");
+ }
+
+ bool printAsmMRegister(const MachineOperand &MO, const char Mode);
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode);
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode);
+
+ void printMachineInstruction(const MachineInstr *MI);
+ void printSSECC(const MachineInstr *MI, unsigned Op);
+ void printMemReference(const MachineInstr *MI, unsigned Op,
+ const char *Modifier=NULL);
+ void printPICLabel(const MachineInstr *MI, unsigned Op);
+ bool runOnMachineFunction(MachineFunction &F);
+
+ /// getSectionForFunction - Return the section that we should emit the
+ /// specified function body into.
+ virtual std::string getSectionForFunction(const Function &F) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
new file mode 100644
index 0000000..59b9b1f
--- /dev/null
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -0,0 +1,409 @@
+//===-- X86AsmPrinter.cpp - Convert X86 LLVM IR to X86 assembly -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the shared superclass printer that converts from our
+// internal representation of machine-dependent LLVM code to Intel and AT&T
+// format assembly language. This printer is the output mechanism used by `llc'.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86AsmPrinter.h"
+#include "X86ATTAsmPrinter.h"
+#include "X86COFF.h"
+#include "X86IntelAsmPrinter.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/Module.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Type.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+static X86MachineFunctionInfo calculateFunctionInfo(const Function *F,
+ const TargetData *TD) {
+ X86MachineFunctionInfo Info;
+ uint64_t Size = 0;
+
+ switch (F->getCallingConv()) {
+ case CallingConv::X86_StdCall:
+ Info.setDecorationStyle(StdCall);
+ break;
+ case CallingConv::X86_FastCall:
+ Info.setDecorationStyle(FastCall);
+ break;
+ default:
+ return Info;
+ }
+
+ for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+ AI != AE; ++AI)
+ // Size should be aligned to DWORD boundary
+ Size += ((TD->getTypeSize(AI->getType()) + 3)/4)*4;
+
+ // We're not supporting tooooo huge arguments :)
+ Info.setBytesToPopOnReturn((unsigned int)Size);
+ return Info;
+}
+
+
+/// decorateName - Query FunctionInfoMap and use this information for various
+/// name decoration.
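+/// For example, with 12 bytes of arguments a stdcall function foo is typically
+/// emitted as "_foo@12" on MinGW, and its fastcall counterpart as "@foo@12".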
+void X86SharedAsmPrinter::decorateName(std::string &Name,
+ const GlobalValue *GV) {
+ const Function *F = dyn_cast<Function>(GV);
+ if (!F) return;
+
+ // We don't want to decorate non-stdcall or non-fastcall functions right now
+ unsigned CC = F->getCallingConv();
+ if (CC != CallingConv::X86_StdCall && CC != CallingConv::X86_FastCall)
+ return;
+
+ // Decorate names only when targeting Cygwin/MinGW32
+ if (!Subtarget->isTargetCygMing())
+ return;
+
+ FMFInfoMap::const_iterator info_item = FunctionInfoMap.find(F);
+
+ const X86MachineFunctionInfo *Info;
+ if (info_item == FunctionInfoMap.end()) {
+ // Calculate the appropriate function info and populate the map
+ FunctionInfoMap[F] = calculateFunctionInfo(F, TM.getTargetData());
+ Info = &FunctionInfoMap[F];
+ } else {
+ Info = &info_item->second;
+ }
+
+ const FunctionType *FT = F->getFunctionType();
+ switch (Info->getDecorationStyle()) {
+ case None:
+ break;
+ case StdCall:
+ // "Pure" variadic functions do not receive @0 suffix.
+ if (!FT->isVarArg() || (FT->getNumParams() == 0) ||
+ (FT->getNumParams() == 1 && FT->isStructReturn()))
+ Name += '@' + utostr_32(Info->getBytesToPopOnReturn());
+ break;
+ case FastCall:
+ // "Pure" variadic functions do not receive @0 suffix.
+ if (!FT->isVarArg() || (FT->getNumParams() == 0) ||
+ (FT->getNumParams() == 1 && FT->isStructReturn()))
+ Name += '@' + utostr_32(Info->getBytesToPopOnReturn());
+
+ if (Name[0] == '_') {
+ Name[0] = '@';
+ } else {
+ Name = '@' + Name;
+ }
+ break;
+ default:
+ assert(0 && "Unsupported DecorationStyle");
+ }
+}
+
+/// doInitialization
+bool X86SharedAsmPrinter::doInitialization(Module &M) {
+ if (TAI->doesSupportDebugInformation()) {
+ // Emit initial debug information.
+ DW.BeginModule(&M);
+ }
+
+ AsmPrinter::doInitialization(M);
+
+ // Darwin wants symbols to be quoted if they have complex names.
+ if (Subtarget->isTargetDarwin())
+ Mang->setUseQuotes(true);
+
+ return false;
+}
+
+bool X86SharedAsmPrinter::doFinalization(Module &M) {
+ // Note: this code is not shared by the Intel printer as it is too different
+ // from how MASM does things. When making changes here don't forget to look
+ // at X86IntelAsmPrinter::doFinalization().
+ const TargetData *TD = TM.getTargetData();
+
+ // Print out module-level global variables here.
+ for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I) {
+ if (!I->hasInitializer())
+ continue; // External globals require no code
+
+ // Check to see if this is a special global used by LLVM, if so, emit it.
+ if (EmitSpecialLLVMGlobal(I)) {
+ if (Subtarget->isTargetDarwin() &&
+ TM.getRelocationModel() == Reloc::Static) {
+ if (I->getName() == "llvm.global_ctors")
+ O << ".reference .constructors_used\n";
+ else if (I->getName() == "llvm.global_dtors")
+ O << ".reference .destructors_used\n";
+ }
+ continue;
+ }
+
+ std::string name = Mang->getValueName(I);
+ Constant *C = I->getInitializer();
+ const Type *Type = C->getType();
+ unsigned Size = TD->getTypeSize(Type);
+ unsigned Align = TD->getPreferredAlignmentLog(I);
+
+ if (I->hasHiddenVisibility()) {
+ if (const char *Directive = TAI->getHiddenDirective())
+ O << Directive << name << "\n";
+ } else if (I->hasProtectedVisibility()) {
+ if (const char *Directive = TAI->getProtectedDirective())
+ O << Directive << name << "\n";
+ }
+
+ if (Subtarget->isTargetELF())
+ O << "\t.type " << name << ",@object\n";
+
+ if (C->isNullValue()) {
+ if (I->hasExternalLinkage()) {
+ if (const char *Directive = TAI->getZeroFillDirective()) {
+ O << "\t.globl\t" << name << "\n";
+ O << Directive << "__DATA__, __common, " << name << ", "
+ << Size << ", " << Align << "\n";
+ continue;
+ }
+ }
+
+ if (!I->hasSection() && !I->isThreadLocal() &&
+ (I->hasInternalLinkage() || I->hasWeakLinkage() ||
+ I->hasLinkOnceLinkage())) {
+ if (Size == 0) Size = 1; // .comm Foo, 0 is undefined, avoid it.
+ if (!NoZerosInBSS && TAI->getBSSSection())
+ SwitchToDataSection(TAI->getBSSSection(), I);
+ else
+ SwitchToDataSection(TAI->getDataSection(), I);
+ if (TAI->getLCOMMDirective() != NULL) {
+ if (I->hasInternalLinkage()) {
+ O << TAI->getLCOMMDirective() << name << "," << Size;
+ if (Subtarget->isTargetDarwin())
+ O << "," << Align;
+ } else
+ O << TAI->getCOMMDirective() << name << "," << Size;
+ } else {
+ if (!Subtarget->isTargetCygMing()) {
+ if (I->hasInternalLinkage())
+ O << "\t.local\t" << name << "\n";
+ }
+ O << TAI->getCOMMDirective() << name << "," << Size;
+ if (TAI->getCOMMDirectiveTakesAlignment())
+ O << "," << (TAI->getAlignmentIsInBytes() ? (1 << Align) : Align);
+ }
+ O << "\t\t" << TAI->getCommentString() << " " << I->getName() << "\n";
+ continue;
+ }
+ }
+
+ switch (I->getLinkage()) {
+ case GlobalValue::LinkOnceLinkage:
+ case GlobalValue::WeakLinkage:
+ if (Subtarget->isTargetDarwin()) {
+ O << "\t.globl " << name << "\n"
+ << "\t.weak_definition " << name << "\n";
+ SwitchToDataSection(".section __DATA,__const_coal,coalesced", I);
+ } else if (Subtarget->isTargetCygMing()) {
+ std::string SectionName(".section\t.data$linkonce." +
+ name +
+ ",\"aw\"");
+ SwitchToDataSection(SectionName.c_str(), I);
+ O << "\t.globl " << name << "\n"
+ << "\t.linkonce same_size\n";
+ } else {
+ std::string SectionName("\t.section\t.llvm.linkonce.d." +
+ name +
+ ",\"aw\",@progbits");
+ SwitchToDataSection(SectionName.c_str(), I);
+ O << "\t.weak " << name << "\n";
+ }
+ break;
+ case GlobalValue::AppendingLinkage:
+ // FIXME: appending linkage variables should go into a section of
+ // their name or something. For now, just emit them as external.
+ case GlobalValue::DLLExportLinkage:
+ DLLExportedGVs.insert(Mang->makeNameProper(I->getName(),""));
+ // FALL THROUGH
+ case GlobalValue::ExternalLinkage:
+ // If external or appending, declare as a global symbol
+ O << "\t.globl " << name << "\n";
+ // FALL THROUGH
+ case GlobalValue::InternalLinkage: {
+ if (I->isConstant()) {
+ const ConstantArray *CVA = dyn_cast<ConstantArray>(C);
+ if (TAI->getCStringSection() && CVA && CVA->isCString()) {
+ SwitchToDataSection(TAI->getCStringSection(), I);
+ break;
+ }
+ }
+ // FIXME: special handling for ".ctors" & ".dtors" sections
+ if (I->hasSection() &&
+ (I->getSection() == ".ctors" ||
+ I->getSection() == ".dtors")) {
+ std::string SectionName = ".section " + I->getSection();
+
+ if (Subtarget->isTargetCygMing()) {
+ SectionName += ",\"aw\"";
+ } else {
+ assert(!Subtarget->isTargetDarwin());
+ SectionName += ",\"aw\",@progbits";
+ }
+
+ SwitchToDataSection(SectionName.c_str());
+ } else {
+ if (C->isNullValue() && !NoZerosInBSS && TAI->getBSSSection())
+ SwitchToDataSection(I->isThreadLocal() ? TAI->getTLSBSSSection() :
+ TAI->getBSSSection(), I);
+ else if (!I->isConstant())
+ SwitchToDataSection(I->isThreadLocal() ? TAI->getTLSDataSection() :
+ TAI->getDataSection(), I);
+ else if (I->isThreadLocal())
+ SwitchToDataSection(TAI->getTLSDataSection());
+ else {
+ // Read-only data.
+ bool HasReloc = C->ContainsRelocations();
+ if (HasReloc &&
+ Subtarget->isTargetDarwin() &&
+ TM.getRelocationModel() != Reloc::Static)
+ SwitchToDataSection("\t.const_data\n");
+ else if (!HasReloc && Size == 4 &&
+ TAI->getFourByteConstantSection())
+ SwitchToDataSection(TAI->getFourByteConstantSection(), I);
+ else if (!HasReloc && Size == 8 &&
+ TAI->getEightByteConstantSection())
+ SwitchToDataSection(TAI->getEightByteConstantSection(), I);
+ else if (!HasReloc && Size == 16 &&
+ TAI->getSixteenByteConstantSection())
+ SwitchToDataSection(TAI->getSixteenByteConstantSection(), I);
+ else if (TAI->getReadOnlySection())
+ SwitchToDataSection(TAI->getReadOnlySection(), I);
+ else
+ SwitchToDataSection(TAI->getDataSection(), I);
+ }
+ }
+
+ break;
+ }
+ default:
+ assert(0 && "Unknown linkage type!");
+ }
+
+ EmitAlignment(Align, I);
+ O << name << ":\t\t\t\t" << TAI->getCommentString() << " " << I->getName()
+ << "\n";
+ if (TAI->hasDotTypeDotSizeDirective())
+ O << "\t.size " << name << ", " << Size << "\n";
+ // If the initializer is an extern weak symbol, remember to emit the weak
+ // reference!
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+ if (GV->hasExternalWeakLinkage())
+ ExtWeakSymbols.insert(GV);
+
+ EmitGlobalConstant(C);
+ }
+
+ // Output linker support code for dllexported globals
+ if (DLLExportedGVs.begin() != DLLExportedGVs.end()) {
+ SwitchToDataSection(".section .drectve");
+ }
+
+ for (std::set<std::string>::iterator i = DLLExportedGVs.begin(),
+ e = DLLExportedGVs.end();
+ i != e; ++i) {
+ O << "\t.ascii \" -export:" << *i << ",data\"\n";
+ }
+
+ if (DLLExportedFns.begin() != DLLExportedFns.end()) {
+ SwitchToDataSection(".section .drectve");
+ }
+
+ for (std::set<std::string>::iterator i = DLLExportedFns.begin(),
+ e = DLLExportedFns.end();
+ i != e; ++i) {
+ O << "\t.ascii \" -export:" << *i << "\"\n";
+ }
+
+ if (Subtarget->isTargetDarwin()) {
+ SwitchToDataSection("");
+
+ // Output stubs for dynamically-linked functions
+ unsigned j = 1;
+ for (std::set<std::string>::iterator i = FnStubs.begin(), e = FnStubs.end();
+ i != e; ++i, ++j) {
+ SwitchToDataSection(".section __IMPORT,__jump_table,symbol_stubs,"
+ "self_modifying_code+pure_instructions,5", 0);
+ O << "L" << *i << "$stub:\n";
+ O << "\t.indirect_symbol " << *i << "\n";
+ O << "\thlt ; hlt ; hlt ; hlt ; hlt\n";
+ }
+
+ O << "\n";
+
+ // Output stubs for external and common global variables.
+ if (GVStubs.begin() != GVStubs.end())
+ SwitchToDataSection(
+ ".section __IMPORT,__pointers,non_lazy_symbol_pointers");
+ for (std::set<std::string>::iterator i = GVStubs.begin(), e = GVStubs.end();
+ i != e; ++i) {
+ O << "L" << *i << "$non_lazy_ptr:\n";
+ O << "\t.indirect_symbol " << *i << "\n";
+ O << "\t.long\t0\n";
+ }
+
+ // Emit final debug information.
+ DW.EndModule();
+
+ // Funny Darwin hack: This flag tells the linker that no global symbols
+ // contain code that falls through to other global symbols (e.g. the obvious
+ // implementation of multiple entry points). If this doesn't occur, the
+ // linker can safely perform dead code stripping. Since LLVM never
+ // generates code that does this, it is always safe to set.
+ O << "\t.subsections_via_symbols\n";
+ } else if (Subtarget->isTargetCygMing()) {
+ // Emit type information for external functions
+ for (std::set<std::string>::iterator i = FnStubs.begin(), e = FnStubs.end();
+ i != e; ++i) {
+ O << "\t.def\t " << *i
+ << ";\t.scl\t" << COFF::C_EXT
+ << ";\t.type\t" << (COFF::DT_FCN << COFF::N_BTSHFT)
+ << ";\t.endef\n";
+ }
+
+ // Emit final debug information.
+ DW.EndModule();
+ } else if (Subtarget->isTargetELF()) {
+ // Emit final debug information.
+ DW.EndModule();
+ }
+
+ AsmPrinter::doFinalization(M);
+ return false; // success
+}
+
+/// createX86CodePrinterPass - Returns a pass that prints the X86 assembly code
+/// for a MachineFunction to the given output stream, using the given target
+/// machine description.
+///
+FunctionPass *llvm::createX86CodePrinterPass(std::ostream &o,
+ X86TargetMachine &tm) {
+ const X86Subtarget *Subtarget = &tm.getSubtarget<X86Subtarget>();
+
+ if (Subtarget->isFlavorIntel()) {
+ return new X86IntelAsmPrinter(o, tm, tm.getTargetAsmInfo());
+ } else {
+ return new X86ATTAsmPrinter(o, tm, tm.getTargetAsmInfo());
+ }
+}
diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h
new file mode 100755
index 0000000..45be89e
--- /dev/null
+++ b/lib/Target/X86/X86AsmPrinter.h
@@ -0,0 +1,97 @@
+//===-- X86AsmPrinter.h - Convert X86 LLVM code to Intel assembly ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the shared superclass printer that converts from our
+// internal representation of machine-dependent LLVM code to Intel and AT&T
+// format assembly language. This printer is the output mechanism used by `llc'.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86ASMPRINTER_H
+#define X86ASMPRINTER_H
+
+#include "X86.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86TargetMachine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/Support/Compiler.h"
+#include <set>
+
+
+namespace llvm {
+
+struct VISIBILITY_HIDDEN X86SharedAsmPrinter : public AsmPrinter {
+ DwarfWriter DW;
+
+ X86SharedAsmPrinter(std::ostream &O, X86TargetMachine &TM,
+ const TargetAsmInfo *T)
+ : AsmPrinter(O, TM, T), DW(O, this, T) {
+ Subtarget = &TM.getSubtarget<X86Subtarget>();
+ }
+
+ // We have to propagate some information about the MachineFunction to the
+ // AsmPrinter. This is fine while we are printing the function itself, since
+ // we have access to the MachineFunction and can get the appropriate
+ // MachineFunctionInfo. Unfortunately, it is not possible when we are only
+ // printing a reference to a Function (e.g. calling it), because there is no
+ // way to get the corresponding MachineFunction; it may not even have been
+ // created yet. That is why we collect the necessary information in an
+ // additional map.
+ //
+ // This map is used, for example, for name decoration of stdcall and fastcall
+ // functions, since the decoration depends on the arguments' size.
+ typedef std::map<const Function*, X86MachineFunctionInfo> FMFInfoMap;
+ FMFInfoMap FunctionInfoMap;
+
+ void decorateName(std::string& Name, const GlobalValue* GV);
+
+ bool doInitialization(Module &M);
+ bool doFinalization(Module &M);
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ if (Subtarget->isTargetDarwin() ||
+ Subtarget->isTargetELF() ||
+ Subtarget->isTargetCygMing()) {
+ AU.addRequired<MachineModuleInfo>();
+ }
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ const X86Subtarget *Subtarget;
+
+ // Necessary for Darwin to print out the appropriate types of linker stubs
+ std::set<std::string> FnStubs, GVStubs, LinkOnceStubs;
+
+ // Necessary for dllexport support
+ std::set<std::string> DLLExportedFns, DLLExportedGVs;
+
+ inline static bool isScale(const MachineOperand &MO) {
+ return MO.isImmediate() &&
+ (MO.getImmedValue() == 1 || MO.getImmedValue() == 2 ||
+ MO.getImmedValue() == 4 || MO.getImmedValue() == 8);
+ }
+
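+ // An X86 memory reference is encoded as four consecutive machine operands:
+ // a base register (or frame index), a scale immediate (1, 2, 4, or 8), an
+ // index register, and a displacement that may be an immediate or a symbolic
+ // address (global, constant pool, or jump table entry).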
+ inline static bool isMem(const MachineInstr *MI, unsigned Op) {
+ if (MI->getOperand(Op).isFrameIndex()) return true;
+ return Op+4 <= MI->getNumOperands() &&
+ MI->getOperand(Op ).isRegister() && isScale(MI->getOperand(Op+1)) &&
+ MI->getOperand(Op+2).isRegister() &&
+ (MI->getOperand(Op+3).isImmediate() ||
+ MI->getOperand(Op+3).isGlobalAddress() ||
+ MI->getOperand(Op+3).isConstantPoolIndex() ||
+ MI->getOperand(Op+3).isJumpTableIndex());
+ }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/X86/X86COFF.h b/lib/Target/X86/X86COFF.h
new file mode 100644
index 0000000..75892ef
--- /dev/null
+++ b/lib/Target/X86/X86COFF.h
@@ -0,0 +1,95 @@
+//===--- X86COFF.h - Some definitions from COFF documentation -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by Anton Korobeynikov and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file just defines some symbols found in COFF documentation. They are
+// used to emit function type information for COFF targets (Cygwin/Mingw32).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86COFF_H
+#define X86COFF_H
+
+namespace COFF
+{
+/// Storage class tells where and what the symbol represents
+enum StorageClass {
+ C_EFCN = -1, ///< Physical end of function
+ C_NULL = 0, ///< No symbol
+ C_AUTO = 1, ///< Automatic variable
+ C_EXT = 2, ///< External symbol
+ C_STAT = 3, ///< Static
+ C_REG = 4, ///< Register variable
+ C_EXTDEF = 5, ///< External definition
+ C_LABEL = 6, ///< Label
+ C_ULABEL = 7, ///< Undefined label
+ C_MOS = 8, ///< Member of structure
+ C_ARG = 9, ///< Function argument
+ C_STRTAG = 10, ///< Structure tag
+ C_MOU = 11, ///< Member of union
+ C_UNTAG = 12, ///< Union tag
+ C_TPDEF = 13, ///< Type definition
+ C_USTATIC = 14, ///< Undefined static
+ C_ENTAG = 15, ///< Enumeration tag
+ C_MOE = 16, ///< Member of enumeration
+ C_REGPARM = 17, ///< Register parameter
+ C_FIELD = 18, ///< Bit field
+
+ C_BLOCK = 100, ///< ".bb" or ".eb" - beginning or end of block
+ C_FCN = 101, ///< ".bf" or ".ef" - beginning or end of function
+ C_EOS = 102, ///< End of structure
+ C_FILE = 103, ///< File name
+ C_LINE = 104, ///< Line number, reformatted as symbol
+ C_ALIAS = 105, ///< Duplicate tag
+ C_HIDDEN = 106 ///< External symbol in dmert public lib
+};
+
+/// The type of the symbol. This is made up of a base type and a derived type.
+/// For example, pointer to int is "pointer to T" and "int"
+enum SymbolType {
+ T_NULL = 0, ///< No type info
+ T_ARG = 1, ///< Void function argument (only used by compiler)
+ T_VOID = 1, ///< The same as above. Just named differently in some specs.
+ T_CHAR = 2, ///< Character
+ T_SHORT = 3, ///< Short integer
+ T_INT = 4, ///< Integer
+ T_LONG = 5, ///< Long integer
+ T_FLOAT = 6, ///< Floating point
+ T_DOUBLE = 7, ///< Double-precision floating point
+ T_STRUCT = 8, ///< Structure
+ T_UNION = 9, ///< Union
+ T_ENUM = 10, ///< Enumeration
+ T_MOE = 11, ///< Member of enumeration
+ T_UCHAR = 12, ///< Unsigned character
+ T_USHORT = 13, ///< Unsigned short
+ T_UINT = 14, ///< Unsigned integer
+ T_ULONG = 15 ///< Unsigned long
+};
+
+/// Derived type of symbol
+enum SymbolDerivedType {
+ DT_NON = 0, ///< No derived type
+ DT_PTR = 1, ///< Pointer to T
+ DT_FCN = 2, ///< Function returning T
+ DT_ARY = 3 ///< Array of T
+};
+
+/// Masks for extracting parts of type
+enum SymbolTypeMasks {
+ N_BTMASK = 017, ///< Mask for base type
+ N_TMASK = 060 ///< Mask for derived type
+};
+
+/// Offsets of parts of type
+enum Shifts {
+ N_BTSHFT = 4 ///< Type is formed as (base + (derived << N_BTSHFT))
+};
+
+}
+
+#endif // X86COFF_H
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
new file mode 100644
index 0000000..39811bd7
--- /dev/null
+++ b/lib/Target/X86/X86CallingConv.td
@@ -0,0 +1,172 @@
+//===- X86CallingConv.td - Calling Conventions for X86 32/64 ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the X86-32 and X86-64
+// architectures.
+//
+//===----------------------------------------------------------------------===//
+
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+class CCIfSubtarget<string F, CCAction A>
+ : CCIf<!strconcat("State.getTarget().getSubtarget<X86Subtarget>().", F), A>;
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// Return-value conventions common to all X86 CC's.
+def RetCC_X86Common : CallingConv<[
+ // Scalar values are returned in AX first, then DX.
+ CCIfType<[i8] , CCAssignToReg<[AL]>>,
+ CCIfType<[i16], CCAssignToReg<[AX]>>,
+ CCIfType<[i32], CCAssignToReg<[EAX, EDX]>>,
+ CCIfType<[i64], CCAssignToReg<[RAX, RDX]>>,
+
+ // Vector types are returned in XMM0 and XMM1, when they fit. If the target
+ // doesn't have XMM registers, it won't have vector types.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0,XMM1]>>,
+
+ // MMX vector types are always returned in MM0. If the target doesn't have
+ // MM0, it doesn't support these vector types.
+ CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToReg<[MM0]>>
+]>;
+
+// X86-32 C return-value convention.
+def RetCC_X86_32_C : CallingConv<[
+ // The X86-32 calling convention returns FP values in ST0, otherwise it is the
+ // same as the common X86 calling conv.
+ CCIfType<[f32], CCAssignToReg<[ST0]>>,
+ CCIfType<[f64], CCAssignToReg<[ST0]>>,
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-32 FastCC return-value convention.
+def RetCC_X86_32_Fast : CallingConv<[
+ // The X86-32 fastcc returns FP values in XMM0 if the target has SSE2,
+ // otherwise it is the same as the C calling convention.
+ CCIfType<[f32], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0]>>>,
+ CCIfType<[f64], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0]>>>,
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-64 C return-value convention.
+def RetCC_X86_64_C : CallingConv<[
+ // The X86-64 calling convention always returns FP values in XMM0.
+ CCIfType<[f32], CCAssignToReg<[XMM0]>>,
+ CCIfType<[f64], CCAssignToReg<[XMM0]>>,
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
+
+
+// This is the root return-value convention for the X86-32 backend.
+def RetCC_X86_32 : CallingConv<[
+ // If FastCC, use RetCC_X86_32_Fast.
+ CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>,
+ // Otherwise, use RetCC_X86_32_C.
+ CCDelegateTo<RetCC_X86_32_C>
+]>;
+
+// This is the root return-value convention for the X86-64 backend.
+def RetCC_X86_64 : CallingConv<[
+ // Always just the same as C calling conv for X86-64.
+ CCDelegateTo<RetCC_X86_64_C>
+]>;
+
+// This is the return-value convention used for the entire X86 backend.
+def RetCC_X86 : CallingConv<[
+ CCIfSubtarget<"is64Bit()", CCDelegateTo<RetCC_X86_64>>,
+ CCDelegateTo<RetCC_X86_32>
+]>;
+
+//===----------------------------------------------------------------------===//
+// X86-64 Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+
+def CC_X86_64_C : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ CCIfStruct<CCStructAssign<[RDI, RSI, RDX, RCX, R8, R9 ]>>,
+
+ // The first 6 integer arguments are passed in integer registers.
+ CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D, R9D]>>,
+ CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>,
+
+ // The first 8 FP/Vector arguments are passed in XMM registers.
+ CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>,
+
+  // The first 6 MMX vector arguments are passed in GPRs.
+ CCIfType<[v8i8, v4i16, v2i32, v1i64],
+ CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>,
+
+ // Integer/FP values get stored in stack slots that are 8 bytes in size and
+ // 8-byte aligned if there are no more registers to hold them.
+ CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+
+ // Vectors get 16-byte stack slots that are 16-byte aligned.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+ // __m64 vectors get 8-byte stack slots that are 8-byte aligned.
+ CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 8>>
+]>;
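+
+// Illustrative example (a sketch, not exhaustive): under CC_X86_64_C, for
+// f(int, double, char*) the i32 lands in EDI, the f64 in XMM0, and the i64
+// pointer in RSI; a seventh integer argument would spill to an 8-byte,
+// 8-byte-aligned stack slot.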
+
+
+//===----------------------------------------------------------------------===//
+// X86 C Calling Convention
+//===----------------------------------------------------------------------===//
+
+/// CC_X86_32_Common - In all X86-32 calling conventions, extra integers and FP
+/// values are spilled on the stack, and the first 4 vector values go in XMM
+/// regs.
+def CC_X86_32_Common : CallingConv<[
+ // Integer/Float values get stored in stack slots that are 4 bytes in
+ // size and 4-byte aligned.
+ CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+
+ // Doubles get 8-byte slots that are 4-byte aligned.
+ CCIfType<[f64], CCAssignToStack<8, 4>>,
+
+ // The first 4 vector arguments are passed in XMM registers.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+
+ // Other vectors get 16-byte stack slots that are 16-byte aligned.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+ // __m64 vectors get 8-byte stack slots that are 8-byte aligned. They are
+ // passed in the parameter area.
+ CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 8>>
+]>;
+
+def CC_X86_32_C : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // The first 3 integer arguments, if marked 'inreg' and if the call is not
+ // a vararg call, are passed in integer registers.
+ CCIfNotVarArg<CCIfInReg<CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
+
+
+def CC_X86_32_FastCall : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // The first 2 integer arguments are passed in ECX/EDX
+ CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
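+
+// Illustrative example (a sketch): under CC_X86_32_FastCall, for
+// f(int a, int b, int c), a goes in ECX, b in EDX, and c falls through to
+// CC_X86_32_Common, where it takes a 4-byte stack slot.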
diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp
new file mode 100644
index 0000000..8b22634
--- /dev/null
+++ b/lib/Target/X86/X86CodeEmitter.cpp
@@ -0,0 +1,824 @@
+//===-- X86/X86CodeEmitter.cpp - Convert X86 code to machine code ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the pass that transforms the X86 machine instructions into
+// relocatable machine code.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-emitter"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "X86Relocations.h"
+#include "X86.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/MachineCodeEmitter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Function.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+STATISTIC(NumEmitted, "Number of machine instructions emitted");
+
+namespace {
+ class VISIBILITY_HIDDEN Emitter : public MachineFunctionPass {
+ const X86InstrInfo *II;
+ const TargetData *TD;
+ TargetMachine &TM;
+ MachineCodeEmitter &MCE;
+ bool Is64BitMode;
+ public:
+ static char ID;
+ explicit Emitter(TargetMachine &tm, MachineCodeEmitter &mce)
+ : MachineFunctionPass((intptr_t)&ID), II(0), TD(0), TM(tm),
+ MCE(mce), Is64BitMode(false) {}
+ Emitter(TargetMachine &tm, MachineCodeEmitter &mce,
+ const X86InstrInfo &ii, const TargetData &td, bool is64)
+ : MachineFunctionPass((intptr_t)&ID), II(&ii), TD(&td), TM(tm),
+ MCE(mce), Is64BitMode(is64) {}
+
+ bool runOnMachineFunction(MachineFunction &MF);
+
+ virtual const char *getPassName() const {
+ return "X86 Machine Code Emitter";
+ }
+
+ void emitInstruction(const MachineInstr &MI);
+
+ private:
+ void emitPCRelativeBlockAddress(MachineBasicBlock *MBB);
+ void emitPCRelativeValue(intptr_t Address);
+ void emitGlobalAddressForCall(GlobalValue *GV, bool DoesntNeedStub);
+ void emitGlobalAddressForPtr(GlobalValue *GV, unsigned Reloc,
+ int Disp = 0, unsigned PCAdj = 0);
+ void emitExternalSymbolAddress(const char *ES, unsigned Reloc);
+ void emitConstPoolAddress(unsigned CPI, unsigned Reloc, int Disp = 0,
+ unsigned PCAdj = 0);
+ void emitJumpTableAddress(unsigned JTI, unsigned Reloc, unsigned PCAdj = 0);
+
+ void emitDisplacementField(const MachineOperand *RelocOp, int DispVal,
+ unsigned PCAdj = 0);
+
+ void emitRegModRMByte(unsigned ModRMReg, unsigned RegOpcodeField);
+ void emitSIBByte(unsigned SS, unsigned Index, unsigned Base);
+ void emitConstant(uint64_t Val, unsigned Size);
+
+ void emitMemModRMByte(const MachineInstr &MI,
+ unsigned Op, unsigned RegOpcodeField,
+ unsigned PCAdj = 0);
+
+ unsigned getX86RegNum(unsigned RegNo);
+ bool isX86_64ExtendedReg(const MachineOperand &MO);
+ unsigned determineREX(const MachineInstr &MI);
+ };
+ char Emitter::ID = 0;
+}
+
+/// createX86CodeEmitterPass - Return a pass that emits the collected X86 code
+/// to the specified MCE object.
+FunctionPass *llvm::createX86CodeEmitterPass(X86TargetMachine &TM,
+ MachineCodeEmitter &MCE) {
+ return new Emitter(TM, MCE);
+}
+
+bool Emitter::runOnMachineFunction(MachineFunction &MF) {
+  assert((MF.getTarget().getRelocationModel() == Reloc::Default ||
+          MF.getTarget().getRelocationModel() == Reloc::Static) &&
+         "JIT relocation model must be set to static or default!");
+ II = ((X86TargetMachine&)MF.getTarget()).getInstrInfo();
+ TD = ((X86TargetMachine&)MF.getTarget()).getTargetData();
+ Is64BitMode =
+ ((X86TargetMachine&)MF.getTarget()).getSubtarget<X86Subtarget>().is64Bit();
+
+ do {
+ MCE.startFunction(MF);
+ for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
+ MBB != E; ++MBB) {
+ MCE.StartMachineBasicBlock(MBB);
+ for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end();
+ I != E; ++I)
+ emitInstruction(*I);
+ }
+ } while (MCE.finishFunction(MF));
+
+ return false;
+}
+
+/// emitPCRelativeValue - Emit a PC relative address.
+///
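+/// For example, emitting a reference to a target at 0x2010 while the current
+/// PC is 0x2000 produces the little-endian word 0xC (0x2010 - 0x2000 - 4).
+///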
+void Emitter::emitPCRelativeValue(intptr_t Address) {
+ MCE.emitWordLE(Address-MCE.getCurrentPCValue()-4);
+}
+
+/// emitPCRelativeBlockAddress - This method keeps track of the information
+/// necessary to resolve the address of this block later and emits a dummy
+/// value.
+///
+void Emitter::emitPCRelativeBlockAddress(MachineBasicBlock *MBB) {
+ // Remember where this reference was and where it is to so we can
+ // deal with it later.
+ MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(),
+ X86::reloc_pcrel_word, MBB));
+ MCE.emitWordLE(0);
+}
+
+/// emitGlobalAddressForCall - Emit the specified address to the code stream
+/// assuming this is part of a function call, which is PC relative.
+///
+void Emitter::emitGlobalAddressForCall(GlobalValue *GV, bool DoesntNeedStub) {
+ MCE.addRelocation(MachineRelocation::getGV(MCE.getCurrentPCOffset(),
+ X86::reloc_pcrel_word, GV, 0,
+ DoesntNeedStub));
+ MCE.emitWordLE(0);
+}
+
+/// emitGlobalAddressForPtr - Emit the specified address to the code stream,
+/// assuming this is part of a "take the address of a global" instruction.
+///
+void Emitter::emitGlobalAddressForPtr(GlobalValue *GV, unsigned Reloc,
+ int Disp /* = 0 */,
+ unsigned PCAdj /* = 0 */) {
+ MCE.addRelocation(MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
+ GV, PCAdj));
+ if (Reloc == X86::reloc_absolute_dword)
+ MCE.emitWordLE(0);
+ MCE.emitWordLE(Disp); // The relocated value will be added to the displacement
+}
+
+/// emitExternalSymbolAddress - Arrange for the address of an external symbol to
+/// be emitted to the current location in the function, and allow it to be PC
+/// relative.
+void Emitter::emitExternalSymbolAddress(const char *ES, unsigned Reloc) {
+ MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
+ Reloc, ES));
+ if (Reloc == X86::reloc_absolute_dword)
+ MCE.emitWordLE(0);
+ MCE.emitWordLE(0);
+}
+
+/// emitConstPoolAddress - Arrange for the address of a constant pool entry
+/// to be emitted to the current location in the function, and allow it to be PC
+/// relative.
+void Emitter::emitConstPoolAddress(unsigned CPI, unsigned Reloc,
+ int Disp /* = 0 */,
+ unsigned PCAdj /* = 0 */) {
+ MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
+ Reloc, CPI, PCAdj));
+ if (Reloc == X86::reloc_absolute_dword)
+ MCE.emitWordLE(0);
+ MCE.emitWordLE(Disp); // The relocated value will be added to the displacement
+}
+
+/// emitJumpTableAddress - Arrange for the address of a jump table to
+/// be emitted to the current location in the function, and allow it to be PC
+/// relative.
+void Emitter::emitJumpTableAddress(unsigned JTI, unsigned Reloc,
+ unsigned PCAdj /* = 0 */) {
+ MCE.addRelocation(MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(),
+ Reloc, JTI, PCAdj));
+ if (Reloc == X86::reloc_absolute_dword)
+ MCE.emitWordLE(0);
+ MCE.emitWordLE(0); // The relocated value will be added to the displacement
+}
+
+/// N86 namespace - Native X86 Register numbers... used by X86 backend.
+///
+namespace N86 {
+ enum {
+ EAX = 0, ECX = 1, EDX = 2, EBX = 3, ESP = 4, EBP = 5, ESI = 6, EDI = 7
+ };
+}
+
+// getX86RegNum - This function maps LLVM register identifiers to their X86
+// specific numbering, which is used in various places when encoding instructions.
+//
+unsigned Emitter::getX86RegNum(unsigned RegNo) {
+ switch(RegNo) {
+ case X86::RAX: case X86::EAX: case X86::AX: case X86::AL: return N86::EAX;
+ case X86::RCX: case X86::ECX: case X86::CX: case X86::CL: return N86::ECX;
+ case X86::RDX: case X86::EDX: case X86::DX: case X86::DL: return N86::EDX;
+ case X86::RBX: case X86::EBX: case X86::BX: case X86::BL: return N86::EBX;
+ case X86::RSP: case X86::ESP: case X86::SP: case X86::SPL: case X86::AH:
+ return N86::ESP;
+ case X86::RBP: case X86::EBP: case X86::BP: case X86::BPL: case X86::CH:
+ return N86::EBP;
+ case X86::RSI: case X86::ESI: case X86::SI: case X86::SIL: case X86::DH:
+ return N86::ESI;
+ case X86::RDI: case X86::EDI: case X86::DI: case X86::DIL: case X86::BH:
+ return N86::EDI;
+
+ case X86::R8: case X86::R8D: case X86::R8W: case X86::R8B:
+ return N86::EAX;
+ case X86::R9: case X86::R9D: case X86::R9W: case X86::R9B:
+ return N86::ECX;
+ case X86::R10: case X86::R10D: case X86::R10W: case X86::R10B:
+ return N86::EDX;
+ case X86::R11: case X86::R11D: case X86::R11W: case X86::R11B:
+ return N86::EBX;
+ case X86::R12: case X86::R12D: case X86::R12W: case X86::R12B:
+ return N86::ESP;
+ case X86::R13: case X86::R13D: case X86::R13W: case X86::R13B:
+ return N86::EBP;
+ case X86::R14: case X86::R14D: case X86::R14W: case X86::R14B:
+ return N86::ESI;
+ case X86::R15: case X86::R15D: case X86::R15W: case X86::R15B:
+ return N86::EDI;
+
+ case X86::ST0: case X86::ST1: case X86::ST2: case X86::ST3:
+ case X86::ST4: case X86::ST5: case X86::ST6: case X86::ST7:
+ return RegNo-X86::ST0;
+
+ case X86::XMM0: case X86::XMM1: case X86::XMM2: case X86::XMM3:
+ case X86::XMM4: case X86::XMM5: case X86::XMM6: case X86::XMM7:
+ return II->getRegisterInfo().getDwarfRegNum(RegNo) -
+ II->getRegisterInfo().getDwarfRegNum(X86::XMM0);
+ case X86::XMM8: case X86::XMM9: case X86::XMM10: case X86::XMM11:
+ case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15:
+ return II->getRegisterInfo().getDwarfRegNum(RegNo) -
+ II->getRegisterInfo().getDwarfRegNum(X86::XMM8);
+
+ default:
+ assert(MRegisterInfo::isVirtualRegister(RegNo) &&
+ "Unknown physical register!");
+ assert(0 && "Register allocator hasn't allocated reg correctly yet!");
+ return 0;
+ }
+}
+
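+// ModRMByte - Pack the three ModR/M fields into a single byte: mod in bits
+// 7-6, reg/opcode in bits 5-3, r/m in bits 2-0. For example, ModRMByte(3, 0, 1)
+// yields 0xC1 (register-direct form with r/m = ECX in N86 numbering).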
+inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode,
+ unsigned RM) {
+ assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
+ return RM | (RegOpcode << 3) | (Mod << 6);
+}
+
+void Emitter::emitRegModRMByte(unsigned ModRMReg, unsigned RegOpcodeFld){
+ MCE.emitByte(ModRMByte(3, RegOpcodeFld, getX86RegNum(ModRMReg)));
+}
+
+void Emitter::emitSIBByte(unsigned SS, unsigned Index, unsigned Base) {
+ // SIB byte is in the same format as the ModRMByte...
+ MCE.emitByte(ModRMByte(SS, Index, Base));
+}
+
+void Emitter::emitConstant(uint64_t Val, unsigned Size) {
+ // Output the constant in little endian byte order...
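+  // For example, emitConstant(0x12345678, 4) emits the bytes 78 56 34 12.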
+ for (unsigned i = 0; i != Size; ++i) {
+ MCE.emitByte(Val & 255);
+ Val >>= 8;
+ }
+}
+
+/// isDisp8 - Return true if this signed displacement fits in an 8-bit
+/// sign-extended field.
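+/// For example, displacements in [-128, 127] qualify, while 128 or -129 do not.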
+static bool isDisp8(int Value) {
+ return Value == (signed char)Value;
+}
+
+void Emitter::emitDisplacementField(const MachineOperand *RelocOp,
+ int DispVal, unsigned PCAdj) {
+ // If this is a simple integer displacement that doesn't require a relocation,
+ // emit it now.
+ if (!RelocOp) {
+ emitConstant(DispVal, 4);
+ return;
+ }
+
+ // Otherwise, this is something that requires a relocation. Emit it as such
+ // now.
+ if (RelocOp->isGlobalAddress()) {
+ // In 64-bit static small code model, we could potentially emit absolute.
+ // But it's probably not beneficial.
+ // 89 05 00 00 00 00 mov %eax,0(%rip) # PC-relative
+ // 89 04 25 00 00 00 00 mov %eax,0x0 # Absolute
+ unsigned rt= Is64BitMode ? X86::reloc_pcrel_word : X86::reloc_absolute_word;
+ emitGlobalAddressForPtr(RelocOp->getGlobal(), rt,
+ RelocOp->getOffset(), PCAdj);
+ } else if (RelocOp->isConstantPoolIndex()) {
+ // Must be in 64-bit mode.
+ emitConstPoolAddress(RelocOp->getConstantPoolIndex(), X86::reloc_pcrel_word,
+ RelocOp->getOffset(), PCAdj);
+ } else if (RelocOp->isJumpTableIndex()) {
+ // Must be in 64-bit mode.
+ emitJumpTableAddress(RelocOp->getJumpTableIndex(), X86::reloc_pcrel_word,
+ PCAdj);
+ } else {
+ assert(0 && "Unknown value to relocate!");
+ }
+}
+
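+// emitMemModRMByte - Emit the ModR/M byte (plus SIB byte and displacement, as
+// needed) for the memory operand starting at operand index Op. As a rough
+// example, a [EAX+8] operand with RegOpcodeField == 0 comes out as the two
+// bytes 0x40 0x08 (mod=01, reg=000, r/m=EAX, followed by the disp8).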
+void Emitter::emitMemModRMByte(const MachineInstr &MI,
+ unsigned Op, unsigned RegOpcodeField,
+ unsigned PCAdj) {
+ const MachineOperand &Op3 = MI.getOperand(Op+3);
+ int DispVal = 0;
+ const MachineOperand *DispForReloc = 0;
+
+ // Figure out what sort of displacement we have to handle here.
+ if (Op3.isGlobalAddress()) {
+ DispForReloc = &Op3;
+ } else if (Op3.isConstantPoolIndex()) {
+ if (Is64BitMode) {
+ DispForReloc = &Op3;
+ } else {
+ DispVal += MCE.getConstantPoolEntryAddress(Op3.getConstantPoolIndex());
+ DispVal += Op3.getOffset();
+ }
+ } else if (Op3.isJumpTableIndex()) {
+ if (Is64BitMode) {
+ DispForReloc = &Op3;
+ } else {
+ DispVal += MCE.getJumpTableEntryAddress(Op3.getJumpTableIndex());
+ }
+ } else {
+ DispVal = Op3.getImm();
+ }
+
+ const MachineOperand &Base = MI.getOperand(Op);
+ const MachineOperand &Scale = MI.getOperand(Op+1);
+ const MachineOperand &IndexReg = MI.getOperand(Op+2);
+
+ unsigned BaseReg = Base.getReg();
+
+ // Is a SIB byte needed?
+ if (IndexReg.getReg() == 0 &&
+ (BaseReg == 0 || getX86RegNum(BaseReg) != N86::ESP)) {
+ if (BaseReg == 0) { // Just a displacement?
+ // Emit special case [disp32] encoding
+ MCE.emitByte(ModRMByte(0, RegOpcodeField, 5));
+
+ emitDisplacementField(DispForReloc, DispVal, PCAdj);
+ } else {
+ unsigned BaseRegNo = getX86RegNum(BaseReg);
+ if (!DispForReloc && DispVal == 0 && BaseRegNo != N86::EBP) {
+        // Emit the simple indirect register encoding, e.g. [EAX].
+ MCE.emitByte(ModRMByte(0, RegOpcodeField, BaseRegNo));
+ } else if (!DispForReloc && isDisp8(DispVal)) {
+ // Emit the disp8 encoding... [REG+disp8]
+ MCE.emitByte(ModRMByte(1, RegOpcodeField, BaseRegNo));
+ emitConstant(DispVal, 1);
+ } else {
+ // Emit the most general non-SIB encoding: [REG+disp32]
+ MCE.emitByte(ModRMByte(2, RegOpcodeField, BaseRegNo));
+ emitDisplacementField(DispForReloc, DispVal, PCAdj);
+ }
+ }
+
+ } else { // We need a SIB byte, so start by outputting the ModR/M byte first
+ assert(IndexReg.getReg() != X86::ESP &&
+ IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!");
+
+ bool ForceDisp32 = false;
+ bool ForceDisp8 = false;
+ if (BaseReg == 0) {
+ // If there is no base register, we emit the special case SIB byte with
+ // MOD=0, BASE=5, to JUST get the index, scale, and displacement.
+ MCE.emitByte(ModRMByte(0, RegOpcodeField, 4));
+ ForceDisp32 = true;
+ } else if (DispForReloc) {
+ // Emit the normal disp32 encoding.
+ MCE.emitByte(ModRMByte(2, RegOpcodeField, 4));
+ ForceDisp32 = true;
+ } else if (DispVal == 0 && getX86RegNum(BaseReg) != N86::EBP) {
+ // Emit no displacement ModR/M byte
+ MCE.emitByte(ModRMByte(0, RegOpcodeField, 4));
+ } else if (isDisp8(DispVal)) {
+ // Emit the disp8 encoding...
+ MCE.emitByte(ModRMByte(1, RegOpcodeField, 4));
+ ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP
+ } else {
+ // Emit the normal disp32 encoding...
+ MCE.emitByte(ModRMByte(2, RegOpcodeField, 4));
+ }
+
+ // Calculate what the SS field value should be...
+ static const unsigned SSTable[] = { ~0, 0, 1, ~0, 2, ~0, ~0, ~0, 3 };
+ unsigned SS = SSTable[Scale.getImm()];
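+    // (Scale values 1, 2, 4, 8 map to SS encodings 0, 1, 2, 3 respectively.)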
+
+ if (BaseReg == 0) {
+ // Handle the SIB byte for the case where there is no base. The
+ // displacement has already been output.
+ assert(IndexReg.getReg() && "Index register must be specified!");
+ emitSIBByte(SS, getX86RegNum(IndexReg.getReg()), 5);
+ } else {
+ unsigned BaseRegNo = getX86RegNum(BaseReg);
+ unsigned IndexRegNo;
+ if (IndexReg.getReg())
+ IndexRegNo = getX86RegNum(IndexReg.getReg());
+ else
+ IndexRegNo = 4; // For example [ESP+1*<noreg>+4]
+ emitSIBByte(SS, IndexRegNo, BaseRegNo);
+ }
+
+ // Do we need to output a displacement?
+ if (ForceDisp8) {
+ emitConstant(DispVal, 1);
+ } else if (DispVal != 0 || ForceDisp32) {
+ emitDisplacementField(DispForReloc, DispVal, PCAdj);
+ }
+ }
+}
+
+static unsigned sizeOfImm(const TargetInstrDescriptor *Desc) {
+ switch (Desc->TSFlags & X86II::ImmMask) {
+ case X86II::Imm8: return 1;
+ case X86II::Imm16: return 2;
+ case X86II::Imm32: return 4;
+ case X86II::Imm64: return 8;
+ default: assert(0 && "Immediate size not set!");
+ return 0;
+ }
+}
+
+/// isX86_64ExtendedReg - Is the MachineOperand an x86-64 extended register?
+/// e.g. r8, xmm8, etc.
+bool Emitter::isX86_64ExtendedReg(const MachineOperand &MO) {
+ if (!MO.isRegister()) return false;
+ unsigned RegNo = MO.getReg();
+ int DWNum = II->getRegisterInfo().getDwarfRegNum(RegNo);
+ if (DWNum >= II->getRegisterInfo().getDwarfRegNum(X86::R8) &&
+ DWNum <= II->getRegisterInfo().getDwarfRegNum(X86::R15))
+ return true;
+ if (DWNum >= II->getRegisterInfo().getDwarfRegNum(X86::XMM8) &&
+ DWNum <= II->getRegisterInfo().getDwarfRegNum(X86::XMM15))
+ return true;
+ return false;
+}
+
+inline static bool isX86_64TruncToByte(unsigned oc) {
+ return (oc == X86::TRUNC_64to8 || oc == X86::TRUNC_32to8 ||
+ oc == X86::TRUNC_16to8);
+}
+
+
+inline static bool isX86_64NonExtLowByteReg(unsigned reg) {
+ return (reg == X86::SPL || reg == X86::BPL ||
+ reg == X86::SIL || reg == X86::DIL);
+}
+
+/// determineREX - Determine if the MachineInstr has to be encoded with an X86-64
+/// REX prefix which specifies 1) 64-bit instructions, 2) non-default operand
+/// size, and 3) use of X86-64 extended registers.
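+/// The emitted prefix byte is 0x40 | REX, i.e. 0100WRXB: bit 3 (W) selects
+/// 64-bit operand size, bit 2 (R) extends the ModR/M reg field, bit 1 (X) the
+/// SIB index field, and bit 0 (B) the ModR/M r/m or SIB base field.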
+unsigned Emitter::determineREX(const MachineInstr &MI) {
+ unsigned REX = 0;
+ const TargetInstrDescriptor *Desc = MI.getInstrDescriptor();
+ unsigned Opcode = Desc->Opcode;
+
+  // Pseudo instructions do not need a REX prefix byte.
+ if ((Desc->TSFlags & X86II::FormMask) == X86II::Pseudo)
+ return 0;
+ if (Desc->TSFlags & X86II::REX_W)
+ REX |= 1 << 3;
+
+ unsigned NumOps = Desc->numOperands;
+ if (NumOps) {
+ bool isTwoAddr = NumOps > 1 &&
+ Desc->getOperandConstraint(1, TOI::TIED_TO) != -1;
+
+ // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix.
+ bool isTrunc8 = isX86_64TruncToByte(Opcode);
+ unsigned i = isTwoAddr ? 1 : 0;
+ for (unsigned e = NumOps; i != e; ++i) {
+ const MachineOperand& MO = MI.getOperand(i);
+ if (MO.isRegister()) {
+ unsigned Reg = MO.getReg();
+        // Truncates to byte are actually movb instructions; the real source
+        // operand is the low byte of the register.
+ if (isTrunc8 && i == 1)
+ Reg = getX86SubSuperRegister(Reg, MVT::i8);
+ if (isX86_64NonExtLowByteReg(Reg))
+ REX |= 0x40;
+ }
+ }
+
+ switch (Desc->TSFlags & X86II::FormMask) {
+ case X86II::MRMInitReg:
+ if (isX86_64ExtendedReg(MI.getOperand(0)))
+ REX |= (1 << 0) | (1 << 2);
+ break;
+ case X86II::MRMSrcReg: {
+ if (isX86_64ExtendedReg(MI.getOperand(0)))
+ REX |= 1 << 2;
+ i = isTwoAddr ? 2 : 1;
+ for (unsigned e = NumOps; i != e; ++i) {
+ const MachineOperand& MO = MI.getOperand(i);
+ if (isX86_64ExtendedReg(MO))
+ REX |= 1 << 0;
+ }
+ break;
+ }
+ case X86II::MRMSrcMem: {
+ if (isX86_64ExtendedReg(MI.getOperand(0)))
+ REX |= 1 << 2;
+ unsigned Bit = 0;
+ i = isTwoAddr ? 2 : 1;
+ for (; i != NumOps; ++i) {
+ const MachineOperand& MO = MI.getOperand(i);
+ if (MO.isRegister()) {
+ if (isX86_64ExtendedReg(MO))
+ REX |= 1 << Bit;
+ Bit++;
+ }
+ }
+ break;
+ }
+ case X86II::MRM0m: case X86II::MRM1m:
+ case X86II::MRM2m: case X86II::MRM3m:
+ case X86II::MRM4m: case X86II::MRM5m:
+ case X86II::MRM6m: case X86II::MRM7m:
+ case X86II::MRMDestMem: {
+ unsigned e = isTwoAddr ? 5 : 4;
+ i = isTwoAddr ? 1 : 0;
+ if (NumOps > e && isX86_64ExtendedReg(MI.getOperand(e)))
+ REX |= 1 << 2;
+ unsigned Bit = 0;
+ for (; i != e; ++i) {
+ const MachineOperand& MO = MI.getOperand(i);
+ if (MO.isRegister()) {
+ if (isX86_64ExtendedReg(MO))
+ REX |= 1 << Bit;
+ Bit++;
+ }
+ }
+ break;
+ }
+ default: {
+ if (isX86_64ExtendedReg(MI.getOperand(0)))
+ REX |= 1 << 0;
+ i = isTwoAddr ? 2 : 1;
+ for (unsigned e = NumOps; i != e; ++i) {
+ const MachineOperand& MO = MI.getOperand(i);
+ if (isX86_64ExtendedReg(MO))
+ REX |= 1 << 2;
+ }
+ break;
+ }
+ }
+ }
+ return REX;
+}
+
+void Emitter::emitInstruction(const MachineInstr &MI) {
+ NumEmitted++; // Keep track of the # of mi's emitted
+
+ const TargetInstrDescriptor *Desc = MI.getInstrDescriptor();
+ unsigned Opcode = Desc->Opcode;
+
+ // Emit the repeat opcode prefix as needed.
+ if ((Desc->TSFlags & X86II::Op0Mask) == X86II::REP) MCE.emitByte(0xF3);
+
+ // Emit the operand size opcode prefix as needed.
+ if (Desc->TSFlags & X86II::OpSize) MCE.emitByte(0x66);
+
+ // Emit the address size opcode prefix as needed.
+ if (Desc->TSFlags & X86II::AdSize) MCE.emitByte(0x67);
+
+ bool Need0FPrefix = false;
+ switch (Desc->TSFlags & X86II::Op0Mask) {
+ case X86II::TB:
+ Need0FPrefix = true; // Two-byte opcode prefix
+ break;
+ case X86II::T8:
+ MCE.emitByte(0x0F);
+ MCE.emitByte(0x38);
+ break;
+ case X86II::TA:
+ MCE.emitByte(0x0F);
+ MCE.emitByte(0x3A);
+ break;
+ case X86II::REP: break; // already handled.
+ case X86II::XS: // F3 0F
+ MCE.emitByte(0xF3);
+ Need0FPrefix = true;
+ break;
+ case X86II::XD: // F2 0F
+ MCE.emitByte(0xF2);
+ Need0FPrefix = true;
+ break;
+ case X86II::D8: case X86II::D9: case X86II::DA: case X86II::DB:
+ case X86II::DC: case X86II::DD: case X86II::DE: case X86II::DF:
+ MCE.emitByte(0xD8+
+ (((Desc->TSFlags & X86II::Op0Mask)-X86II::D8)
+ >> X86II::Op0Shift));
+ break; // Two-byte opcode prefix
+ default: assert(0 && "Invalid prefix!");
+ case 0: break; // No prefix!
+ }
+
+ if (Is64BitMode) {
+ // REX prefix
+ unsigned REX = determineREX(MI);
+ if (REX)
+ MCE.emitByte(0x40 | REX);
+ }
+
+ // 0x0F escape code must be emitted just before the opcode.
+ if (Need0FPrefix)
+ MCE.emitByte(0x0F);
+
+ // If this is a two-address instruction, skip one of the register operands.
+ unsigned NumOps = Desc->numOperands;
+ unsigned CurOp = 0;
+ if (NumOps > 1 && Desc->getOperandConstraint(1, TOI::TIED_TO) != -1)
+ CurOp++;
+
+ unsigned char BaseOpcode = II->getBaseOpcodeFor(Desc);
+ switch (Desc->TSFlags & X86II::FormMask) {
+ default: assert(0 && "Unknown FormMask value in X86 MachineCodeEmitter!");
+ case X86II::Pseudo:
+#ifndef NDEBUG
+ switch (Opcode) {
+ default:
+ assert(0 && "psuedo instructions should be removed before code emission");
+ case TargetInstrInfo::INLINEASM:
+ assert(0 && "JIT does not support inline asm!\n");
+ case TargetInstrInfo::LABEL:
+ assert(0 && "JIT does not support meta labels!\n");
+ case X86::IMPLICIT_USE:
+ case X86::IMPLICIT_DEF:
+ case X86::IMPLICIT_DEF_GR8:
+ case X86::IMPLICIT_DEF_GR16:
+ case X86::IMPLICIT_DEF_GR32:
+ case X86::IMPLICIT_DEF_GR64:
+ case X86::IMPLICIT_DEF_FR32:
+ case X86::IMPLICIT_DEF_FR64:
+ case X86::IMPLICIT_DEF_VR64:
+ case X86::IMPLICIT_DEF_VR128:
+ case X86::FP_REG_KILL:
+ break;
+ }
+#endif
+ CurOp = NumOps;
+ break;
+
+ case X86II::RawFrm:
+ MCE.emitByte(BaseOpcode);
+ if (CurOp != NumOps) {
+ const MachineOperand &MO = MI.getOperand(CurOp++);
+ if (MO.isMachineBasicBlock()) {
+ emitPCRelativeBlockAddress(MO.getMachineBasicBlock());
+ } else if (MO.isGlobalAddress()) {
+ bool NeedStub = Is64BitMode ||
+ Opcode == X86::TAILJMPd ||
+ Opcode == X86::TAILJMPr || Opcode == X86::TAILJMPm;
+ emitGlobalAddressForCall(MO.getGlobal(), !NeedStub);
+ } else if (MO.isExternalSymbol()) {
+ emitExternalSymbolAddress(MO.getSymbolName(), X86::reloc_pcrel_word);
+ } else if (MO.isImmediate()) {
+ emitConstant(MO.getImm(), sizeOfImm(Desc));
+ } else {
+ assert(0 && "Unknown RawFrm operand!");
+ }
+ }
+ break;
+
+ case X86II::AddRegFrm:
+ MCE.emitByte(BaseOpcode + getX86RegNum(MI.getOperand(CurOp++).getReg()));
+
+ if (CurOp != NumOps) {
+ const MachineOperand &MO1 = MI.getOperand(CurOp++);
+ unsigned Size = sizeOfImm(Desc);
+ if (MO1.isImmediate())
+ emitConstant(MO1.getImm(), Size);
+ else {
+ unsigned rt = Is64BitMode ? X86::reloc_pcrel_word : X86::reloc_absolute_word;
+ if (Opcode == X86::MOV64ri)
+ rt = X86::reloc_absolute_dword; // FIXME: add X86II flag?
+ if (MO1.isGlobalAddress())
+ emitGlobalAddressForPtr(MO1.getGlobal(), rt, MO1.getOffset());
+ else if (MO1.isExternalSymbol())
+ emitExternalSymbolAddress(MO1.getSymbolName(), rt);
+ else if (MO1.isConstantPoolIndex())
+ emitConstPoolAddress(MO1.getConstantPoolIndex(), rt);
+ else if (MO1.isJumpTableIndex())
+ emitJumpTableAddress(MO1.getJumpTableIndex(), rt);
+ }
+ }
+ break;
+
+ case X86II::MRMDestReg: {
+ MCE.emitByte(BaseOpcode);
+ emitRegModRMByte(MI.getOperand(CurOp).getReg(),
+ getX86RegNum(MI.getOperand(CurOp+1).getReg()));
+ CurOp += 2;
+ if (CurOp != NumOps)
+ emitConstant(MI.getOperand(CurOp++).getImm(), sizeOfImm(Desc));
+ break;
+ }
+ case X86II::MRMDestMem: {
+ MCE.emitByte(BaseOpcode);
+ emitMemModRMByte(MI, CurOp, getX86RegNum(MI.getOperand(CurOp+4).getReg()));
+ CurOp += 5;
+ if (CurOp != NumOps)
+ emitConstant(MI.getOperand(CurOp++).getImm(), sizeOfImm(Desc));
+ break;
+ }
+
+ case X86II::MRMSrcReg:
+ MCE.emitByte(BaseOpcode);
+ emitRegModRMByte(MI.getOperand(CurOp+1).getReg(),
+ getX86RegNum(MI.getOperand(CurOp).getReg()));
+ CurOp += 2;
+ if (CurOp != NumOps)
+ emitConstant(MI.getOperand(CurOp++).getImm(), sizeOfImm(Desc));
+ break;
+
+ case X86II::MRMSrcMem: {
+ unsigned PCAdj = (CurOp+5 != NumOps) ? sizeOfImm(Desc) : 0;
+
+ MCE.emitByte(BaseOpcode);
+ emitMemModRMByte(MI, CurOp+1, getX86RegNum(MI.getOperand(CurOp).getReg()),
+ PCAdj);
+ CurOp += 5;
+ if (CurOp != NumOps)
+ emitConstant(MI.getOperand(CurOp++).getImm(), sizeOfImm(Desc));
+ break;
+ }
+
+ case X86II::MRM0r: case X86II::MRM1r:
+ case X86II::MRM2r: case X86II::MRM3r:
+ case X86II::MRM4r: case X86II::MRM5r:
+ case X86II::MRM6r: case X86II::MRM7r:
+ MCE.emitByte(BaseOpcode);
+ emitRegModRMByte(MI.getOperand(CurOp++).getReg(),
+ (Desc->TSFlags & X86II::FormMask)-X86II::MRM0r);
+
+ if (CurOp != NumOps) {
+ const MachineOperand &MO1 = MI.getOperand(CurOp++);
+ unsigned Size = sizeOfImm(Desc);
+ if (MO1.isImmediate())
+ emitConstant(MO1.getImm(), Size);
+ else {
+ unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
+ : X86::reloc_absolute_word;
+ if (Opcode == X86::MOV64ri32)
+ rt = X86::reloc_absolute_word; // FIXME: add X86II flag?
+ if (MO1.isGlobalAddress())
+ emitGlobalAddressForPtr(MO1.getGlobal(), rt, MO1.getOffset());
+ else if (MO1.isExternalSymbol())
+ emitExternalSymbolAddress(MO1.getSymbolName(), rt);
+ else if (MO1.isConstantPoolIndex())
+ emitConstPoolAddress(MO1.getConstantPoolIndex(), rt);
+ else if (MO1.isJumpTableIndex())
+ emitJumpTableAddress(MO1.getJumpTableIndex(), rt);
+ }
+ }
+ break;
+
+ case X86II::MRM0m: case X86II::MRM1m:
+ case X86II::MRM2m: case X86II::MRM3m:
+ case X86II::MRM4m: case X86II::MRM5m:
+ case X86II::MRM6m: case X86II::MRM7m: {
+ unsigned PCAdj = (CurOp+4 != NumOps) ?
+ (MI.getOperand(CurOp+4).isImmediate() ? sizeOfImm(Desc) : 4) : 0;
+
+ MCE.emitByte(BaseOpcode);
+ emitMemModRMByte(MI, CurOp, (Desc->TSFlags & X86II::FormMask)-X86II::MRM0m,
+ PCAdj);
+ CurOp += 4;
+
+ if (CurOp != NumOps) {
+ const MachineOperand &MO = MI.getOperand(CurOp++);
+ unsigned Size = sizeOfImm(Desc);
+ if (MO.isImmediate())
+ emitConstant(MO.getImm(), Size);
+ else {
+ unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
+ : X86::reloc_absolute_word;
+ if (Opcode == X86::MOV64mi32)
+ rt = X86::reloc_absolute_word; // FIXME: add X86II flag?
+ if (MO.isGlobalAddress())
+ emitGlobalAddressForPtr(MO.getGlobal(), rt, MO.getOffset());
+ else if (MO.isExternalSymbol())
+ emitExternalSymbolAddress(MO.getSymbolName(), rt);
+ else if (MO.isConstantPoolIndex())
+ emitConstPoolAddress(MO.getConstantPoolIndex(), rt);
+ else if (MO.isJumpTableIndex())
+ emitJumpTableAddress(MO.getJumpTableIndex(), rt);
+ }
+ }
+ break;
+ }
+
+ case X86II::MRMInitReg:
+ MCE.emitByte(BaseOpcode);
+ // Duplicate register, used by things like MOV8r0 (aka xor reg,reg).
+ emitRegModRMByte(MI.getOperand(CurOp).getReg(),
+ getX86RegNum(MI.getOperand(CurOp).getReg()));
+ ++CurOp;
+ break;
+ }
+
+  assert(((Desc->Flags & M_VARIABLE_OPS) != 0 || CurOp == NumOps) &&
+         "Unknown encoding!");
+}
diff --git a/lib/Target/X86/X86ELFWriterInfo.cpp b/lib/Target/X86/X86ELFWriterInfo.cpp
new file mode 100644
index 0000000..f8f8d48
--- /dev/null
+++ b/lib/Target/X86/X86ELFWriterInfo.cpp
@@ -0,0 +1,18 @@
+//===-- X86ELFWriterInfo.cpp - ELF Writer Info for the X86 backend --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by Bill Wendling and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements ELF writer information for the X86 backend.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ELFWriterInfo.h"
+using namespace llvm;
+
+X86ELFWriterInfo::X86ELFWriterInfo() : TargetELFWriterInfo(EM_386) {}
+X86ELFWriterInfo::~X86ELFWriterInfo() {}
diff --git a/lib/Target/X86/X86ELFWriterInfo.h b/lib/Target/X86/X86ELFWriterInfo.h
new file mode 100644
index 0000000..eb564fb
--- /dev/null
+++ b/lib/Target/X86/X86ELFWriterInfo.h
@@ -0,0 +1,29 @@
+//===-- X86ELFWriterInfo.h - ELF Writer Info for X86 ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by Bill Wendling and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements ELF writer information for the X86 backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_ELF_WRITER_INFO_H
+#define X86_ELF_WRITER_INFO_H
+
+#include "llvm/Target/TargetELFWriterInfo.h"
+
+namespace llvm {
+
+ class X86ELFWriterInfo : public TargetELFWriterInfo {
+ public:
+ X86ELFWriterInfo();
+ virtual ~X86ELFWriterInfo();
+ };
+
+} // end llvm namespace
+
+#endif // X86_ELF_WRITER_INFO_H
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
new file mode 100644
index 0000000..c293a32
--- /dev/null
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -0,0 +1,882 @@
+//===-- X86FloatingPoint.cpp - Floating point Reg -> Stack converter ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which converts floating point instructions from
+// virtual registers into register stack instructions. This pass uses live
+// variable information to indicate where the FPn registers are used and their
+// lifetimes.
+//
+// This pass is hampered by the lack of decent CFG manipulation routines for
+// machine code. In particular, this wants to be able to split critical edges
+// as necessary, traverse the machine basic block CFG in depth-first order, and
+// allow there to be multiple machine basic blocks for each LLVM basicblock
+// (needed for critical edge splitting).
+//
+// In particular, this pass currently barfs on critical edges. Because of this,
+// it requires the instruction selector to insert FP_REG_KILL instructions on
+// the exits of any basic block that has critical edges going from it, or which
+// branches to a critical basic block.
+//
+// FIXME: this is not implemented yet. The stackifier pass only works on local
+// basic blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-codegen"
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include <algorithm>
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumFXCH, "Number of fxch instructions inserted");
+STATISTIC(NumFP , "Number of floating point instructions");
+
+namespace {
+ struct VISIBILITY_HIDDEN FPS : public MachineFunctionPass {
+ static char ID;
+ FPS() : MachineFunctionPass((intptr_t)&ID) {}
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+ virtual const char *getPassName() const { return "X86 FP Stackifier"; }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<LiveVariables>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ private:
+ const TargetInstrInfo *TII; // Machine instruction info.
+ LiveVariables *LV; // Live variable info for current function...
+ MachineBasicBlock *MBB; // Current basic block
+ unsigned Stack[8]; // FP<n> Registers in each stack slot...
+ unsigned RegMap[8]; // Track which stack slot contains each register
+ unsigned StackTop; // The current top of the FP stack.
+
+ void dumpStack() const {
+ cerr << "Stack contents:";
+ for (unsigned i = 0; i != StackTop; ++i) {
+ cerr << " FP" << Stack[i];
+ assert(RegMap[Stack[i]] == i && "Stack[] doesn't match RegMap[]!");
+ }
+ cerr << "\n";
+ }
+ private:
+ // getSlot - Return the stack slot number a particular register number is
+ // in...
+ unsigned getSlot(unsigned RegNo) const {
+ assert(RegNo < 8 && "Regno out of range!");
+ return RegMap[RegNo];
+ }
+
+ // getStackEntry - Return the X86::FP<n> register in register ST(i)
+ unsigned getStackEntry(unsigned STi) const {
+ assert(STi < StackTop && "Access past stack top!");
+ return Stack[StackTop-1-STi];
+ }
+
+ // getSTReg - Return the X86::ST(i) register which contains the specified
+ // FP<RegNo> register
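+    // For example, with StackTop == 3, the register in stack slot 1 lives in
+    // hardware register ST(1).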
+ unsigned getSTReg(unsigned RegNo) const {
+ return StackTop - 1 - getSlot(RegNo) + llvm::X86::ST0;
+ }
+
+ // pushReg - Push the specified FP<n> register onto the stack
+ void pushReg(unsigned Reg) {
+ assert(Reg < 8 && "Register number out of range!");
+ assert(StackTop < 8 && "Stack overflow!");
+ Stack[StackTop] = Reg;
+ RegMap[Reg] = StackTop++;
+ }
+
+ bool isAtTop(unsigned RegNo) const { return getSlot(RegNo) == StackTop-1; }
+ void moveToTop(unsigned RegNo, MachineBasicBlock::iterator &I) {
+ if (!isAtTop(RegNo)) {
+ unsigned STReg = getSTReg(RegNo);
+ unsigned RegOnTop = getStackEntry(0);
+
+ // Swap the slots the regs are in
+ std::swap(RegMap[RegNo], RegMap[RegOnTop]);
+
+ // Swap stack slot contents
+ assert(RegMap[RegOnTop] < StackTop);
+ std::swap(Stack[RegMap[RegOnTop]], Stack[StackTop-1]);
+
+        // Emit an fxch to update the runtime processor's version of the state
+ BuildMI(*MBB, I, TII->get(X86::XCH_F)).addReg(STReg);
+ NumFXCH++;
+ }
+ }
+
+ void duplicateToTop(unsigned RegNo, unsigned AsReg, MachineInstr *I) {
+ unsigned STReg = getSTReg(RegNo);
+ pushReg(AsReg); // New register on top of stack
+
+ BuildMI(*MBB, I, TII->get(X86::LD_Frr)).addReg(STReg);
+ }
+
+ // popStackAfter - Pop the current value off of the top of the FP stack
+ // after the specified instruction.
+ void popStackAfter(MachineBasicBlock::iterator &I);
+
+ // freeStackSlotAfter - Free the specified register from the register stack,
+ // so that it is no longer in a register. If the register is currently at
+ // the top of the stack, we just pop the current instruction, otherwise we
+ // store the current top-of-stack into the specified slot, then pop the top
+ // of stack.
+ void freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned Reg);
+
+ bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
+
+ void handleZeroArgFP(MachineBasicBlock::iterator &I);
+ void handleOneArgFP(MachineBasicBlock::iterator &I);
+ void handleOneArgFPRW(MachineBasicBlock::iterator &I);
+ void handleTwoArgFP(MachineBasicBlock::iterator &I);
+ void handleCompareFP(MachineBasicBlock::iterator &I);
+ void handleCondMovFP(MachineBasicBlock::iterator &I);
+ void handleSpecialFP(MachineBasicBlock::iterator &I);
+ };
+ char FPS::ID = 0;
+}
+
+FunctionPass *llvm::createX86FloatingPointStackifierPass() { return new FPS(); }
+
+/// runOnMachineFunction - Loop over all of the basic blocks, transforming FP
+/// register references into FP stack references.
+///
+bool FPS::runOnMachineFunction(MachineFunction &MF) {
+ // We only need to run this pass if there are any FP registers used in this
+ // function. If it is all integer, there is nothing for us to do!
+ bool FPIsUsed = false;
+
+ assert(X86::FP6 == X86::FP0+6 && "Register enums aren't sorted right!");
+ for (unsigned i = 0; i <= 6; ++i)
+ if (MF.isPhysRegUsed(X86::FP0+i)) {
+ FPIsUsed = true;
+ break;
+ }
+
+ // Early exit.
+ if (!FPIsUsed) return false;
+
+ TII = MF.getTarget().getInstrInfo();
+ LV = &getAnalysis<LiveVariables>();
+ StackTop = 0;
+
+ // Process the function in depth first order so that we process at least one
+ // of the predecessors for every reachable block in the function.
+ std::set<MachineBasicBlock*> Processed;
+ MachineBasicBlock *Entry = MF.begin();
+
+ bool Changed = false;
+ for (df_ext_iterator<MachineBasicBlock*, std::set<MachineBasicBlock*> >
+ I = df_ext_begin(Entry, Processed), E = df_ext_end(Entry, Processed);
+ I != E; ++I)
+ Changed |= processBasicBlock(MF, **I);
+
+ return Changed;
+}
+
+/// processBasicBlock - Loop over all of the instructions in the basic block,
+/// transforming FP instructions into their stack form.
+///
+bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
+ bool Changed = false;
+ MBB = &BB;
+
+ for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
+ MachineInstr *MI = I;
+ unsigned Flags = MI->getInstrDescriptor()->TSFlags;
+ if ((Flags & X86II::FPTypeMask) == X86II::NotFP)
+ continue; // Efficiently ignore non-fp insts!
+
+ MachineInstr *PrevMI = 0;
+ if (I != BB.begin())
+ PrevMI = prior(I);
+
+ ++NumFP; // Keep track of # of pseudo instrs
+ DOUT << "\nFPInst:\t" << *MI;
+
+ // Get dead variables list now because the MI pointer may be deleted as part
+ // of processing!
+ SmallVector<unsigned, 8> DeadRegs;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isDead())
+ DeadRegs.push_back(MO.getReg());
+ }
+
+ switch (Flags & X86II::FPTypeMask) {
+ case X86II::ZeroArgFP: handleZeroArgFP(I); break;
+ case X86II::OneArgFP: handleOneArgFP(I); break; // fstp ST(0)
+ case X86II::OneArgFPRW: handleOneArgFPRW(I); break; // ST(0) = fsqrt(ST(0))
+ case X86II::TwoArgFP: handleTwoArgFP(I); break;
+ case X86II::CompareFP: handleCompareFP(I); break;
+ case X86II::CondMovFP: handleCondMovFP(I); break;
+ case X86II::SpecialFP: handleSpecialFP(I); break;
+ default: assert(0 && "Unknown FP Type!");
+ }
+
+ // Check to see if any of the values defined by this instruction are dead
+ // after definition. If so, pop them.
+ for (unsigned i = 0, e = DeadRegs.size(); i != e; ++i) {
+ unsigned Reg = DeadRegs[i];
+ if (Reg >= X86::FP0 && Reg <= X86::FP6) {
+ DOUT << "Register FP#" << Reg-X86::FP0 << " is dead!\n";
+ freeStackSlotAfter(I, Reg-X86::FP0);
+ }
+ }
+
+    // If -debug, print out all of the instructions this was expanded to.
+ DEBUG(
+ MachineBasicBlock::iterator PrevI(PrevMI);
+ if (I == PrevI) {
+ cerr << "Just deleted pseudo instruction\n";
+ } else {
+ MachineBasicBlock::iterator Start = I;
+ // Rewind to first instruction newly inserted.
+ while (Start != BB.begin() && prior(Start) != PrevI) --Start;
+ cerr << "Inserted instructions:\n\t";
+ Start->print(*cerr.stream(), &MF.getTarget());
+ while (++Start != next(I));
+ }
+ dumpStack();
+ );
+
+ Changed = true;
+ }
+
+ assert(StackTop == 0 && "Stack not empty at end of basic block?");
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// Efficient Lookup Table Support
+//===----------------------------------------------------------------------===//
+
+namespace {
+ struct TableEntry {
+ unsigned from;
+ unsigned to;
+ bool operator<(const TableEntry &TE) const { return from < TE.from; }
+ friend bool operator<(const TableEntry &TE, unsigned V) {
+ return TE.from < V;
+ }
+ friend bool operator<(unsigned V, const TableEntry &TE) {
+ return V < TE.from;
+ }
+ };
+}
+
+static bool TableIsSorted(const TableEntry *Table, unsigned NumEntries) {
+ for (unsigned i = 0; i != NumEntries-1; ++i)
+ if (!(Table[i] < Table[i+1])) return false;
+ return true;
+}
+
+static int Lookup(const TableEntry *Table, unsigned N, unsigned Opcode) {
+ const TableEntry *I = std::lower_bound(Table, Table+N, Opcode);
+ if (I != Table+N && I->from == Opcode)
+ return I->to;
+ return -1;
+}
+
+#define ARRAY_SIZE(TABLE) \
+ (sizeof(TABLE)/sizeof(TABLE[0]))
+
+#ifdef NDEBUG
+#define ASSERT_SORTED(TABLE)
+#else
+#define ASSERT_SORTED(TABLE) \
+ { static bool TABLE##Checked = false; \
+ if (!TABLE##Checked) { \
+ assert(TableIsSorted(TABLE, ARRAY_SIZE(TABLE)) && \
+ "All lookup tables must be sorted for efficient access!"); \
+ TABLE##Checked = true; \
+ } \
+ }
+#endif
+
+//===----------------------------------------------------------------------===//
+// Register File -> Register Stack Mapping Methods
+//===----------------------------------------------------------------------===//
+
+// OpcodeTable - Sorted map of register instructions to their stack version.
+// The first element is a register-file pseudo instruction, the second is the
+// concrete X86 instruction which uses the register stack.
+//
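+// For example, the register-file pseudo ADD_Fp32m lowers to the concrete
+// stack instruction ADD_F32m.
+//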
+static const TableEntry OpcodeTable[] = {
+ { X86::ABS_Fp32 , X86::ABS_F },
+ { X86::ABS_Fp64 , X86::ABS_F },
+ { X86::ADD_Fp32m , X86::ADD_F32m },
+ { X86::ADD_Fp64m , X86::ADD_F64m },
+ { X86::ADD_Fp64m32 , X86::ADD_F32m },
+ { X86::ADD_FpI16m32 , X86::ADD_FI16m },
+ { X86::ADD_FpI16m64 , X86::ADD_FI16m },
+ { X86::ADD_FpI32m32 , X86::ADD_FI32m },
+ { X86::ADD_FpI32m64 , X86::ADD_FI32m },
+ { X86::CHS_Fp32 , X86::CHS_F },
+ { X86::CHS_Fp64 , X86::CHS_F },
+ { X86::CMOVBE_Fp32 , X86::CMOVBE_F },
+ { X86::CMOVBE_Fp64 , X86::CMOVBE_F },
+ { X86::CMOVB_Fp32 , X86::CMOVB_F },
+ { X86::CMOVB_Fp64 , X86::CMOVB_F },
+ { X86::CMOVE_Fp32 , X86::CMOVE_F },
+ { X86::CMOVE_Fp64 , X86::CMOVE_F },
+ { X86::CMOVNBE_Fp32 , X86::CMOVNBE_F },
+ { X86::CMOVNBE_Fp64 , X86::CMOVNBE_F },
+ { X86::CMOVNB_Fp32 , X86::CMOVNB_F },
+ { X86::CMOVNB_Fp64 , X86::CMOVNB_F },
+ { X86::CMOVNE_Fp32 , X86::CMOVNE_F },
+ { X86::CMOVNE_Fp64 , X86::CMOVNE_F },
+ { X86::CMOVNP_Fp32 , X86::CMOVNP_F },
+ { X86::CMOVNP_Fp64 , X86::CMOVNP_F },
+ { X86::CMOVP_Fp32 , X86::CMOVP_F },
+ { X86::CMOVP_Fp64 , X86::CMOVP_F },
+ { X86::COS_Fp32 , X86::COS_F },
+ { X86::COS_Fp64 , X86::COS_F },
+ { X86::DIVR_Fp32m , X86::DIVR_F32m },
+ { X86::DIVR_Fp64m , X86::DIVR_F64m },
+ { X86::DIVR_Fp64m32 , X86::DIVR_F32m },
+ { X86::DIVR_FpI16m32, X86::DIVR_FI16m},
+ { X86::DIVR_FpI16m64, X86::DIVR_FI16m},
+ { X86::DIVR_FpI32m32, X86::DIVR_FI32m},
+ { X86::DIVR_FpI32m64, X86::DIVR_FI32m},
+ { X86::DIV_Fp32m , X86::DIV_F32m },
+ { X86::DIV_Fp64m , X86::DIV_F64m },
+ { X86::DIV_Fp64m32 , X86::DIV_F32m },
+ { X86::DIV_FpI16m32 , X86::DIV_FI16m },
+ { X86::DIV_FpI16m64 , X86::DIV_FI16m },
+ { X86::DIV_FpI32m32 , X86::DIV_FI32m },
+ { X86::DIV_FpI32m64 , X86::DIV_FI32m },
+ { X86::ILD_Fp16m32 , X86::ILD_F16m },
+ { X86::ILD_Fp16m64 , X86::ILD_F16m },
+ { X86::ILD_Fp32m32 , X86::ILD_F32m },
+ { X86::ILD_Fp32m64 , X86::ILD_F32m },
+ { X86::ILD_Fp64m32 , X86::ILD_F64m },
+ { X86::ILD_Fp64m64 , X86::ILD_F64m },
+ { X86::ISTT_Fp16m32 , X86::ISTT_FP16m},
+ { X86::ISTT_Fp16m64 , X86::ISTT_FP16m},
+ { X86::ISTT_Fp32m32 , X86::ISTT_FP32m},
+ { X86::ISTT_Fp32m64 , X86::ISTT_FP32m},
+ { X86::ISTT_Fp64m32 , X86::ISTT_FP64m},
+ { X86::ISTT_Fp64m64 , X86::ISTT_FP64m},
+ { X86::IST_Fp16m32 , X86::IST_F16m },
+ { X86::IST_Fp16m64 , X86::IST_F16m },
+ { X86::IST_Fp32m32 , X86::IST_F32m },
+ { X86::IST_Fp32m64 , X86::IST_F32m },
+ { X86::IST_Fp64m32 , X86::IST_FP64m },
+ { X86::IST_Fp64m64 , X86::IST_FP64m },
+ { X86::LD_Fp032 , X86::LD_F0 },
+ { X86::LD_Fp064 , X86::LD_F0 },
+ { X86::LD_Fp132 , X86::LD_F1 },
+ { X86::LD_Fp164 , X86::LD_F1 },
+ { X86::LD_Fp32m , X86::LD_F32m },
+ { X86::LD_Fp64m , X86::LD_F64m },
+ { X86::MUL_Fp32m , X86::MUL_F32m },
+ { X86::MUL_Fp64m , X86::MUL_F64m },
+ { X86::MUL_Fp64m32 , X86::MUL_F32m },
+ { X86::MUL_FpI16m32 , X86::MUL_FI16m },
+ { X86::MUL_FpI16m64 , X86::MUL_FI16m },
+ { X86::MUL_FpI32m32 , X86::MUL_FI32m },
+ { X86::MUL_FpI32m64 , X86::MUL_FI32m },
+ { X86::SIN_Fp32 , X86::SIN_F },
+ { X86::SIN_Fp64 , X86::SIN_F },
+ { X86::SQRT_Fp32 , X86::SQRT_F },
+ { X86::SQRT_Fp64 , X86::SQRT_F },
+ { X86::ST_Fp32m , X86::ST_F32m },
+ { X86::ST_Fp64m , X86::ST_F64m },
+ { X86::ST_Fp64m32 , X86::ST_F32m },
+ { X86::SUBR_Fp32m , X86::SUBR_F32m },
+ { X86::SUBR_Fp64m , X86::SUBR_F64m },
+ { X86::SUBR_Fp64m32 , X86::SUBR_F32m },
+ { X86::SUBR_FpI16m32, X86::SUBR_FI16m},
+ { X86::SUBR_FpI16m64, X86::SUBR_FI16m},
+ { X86::SUBR_FpI32m32, X86::SUBR_FI32m},
+ { X86::SUBR_FpI32m64, X86::SUBR_FI32m},
+ { X86::SUB_Fp32m , X86::SUB_F32m },
+ { X86::SUB_Fp64m , X86::SUB_F64m },
+ { X86::SUB_Fp64m32 , X86::SUB_F32m },
+ { X86::SUB_FpI16m32 , X86::SUB_FI16m },
+ { X86::SUB_FpI16m64 , X86::SUB_FI16m },
+ { X86::SUB_FpI32m32 , X86::SUB_FI32m },
+ { X86::SUB_FpI32m64 , X86::SUB_FI32m },
+ { X86::TST_Fp32 , X86::TST_F },
+ { X86::TST_Fp64 , X86::TST_F },
+ { X86::UCOM_FpIr32 , X86::UCOM_FIr },
+ { X86::UCOM_FpIr64 , X86::UCOM_FIr },
+ { X86::UCOM_Fpr32 , X86::UCOM_Fr },
+ { X86::UCOM_Fpr64 , X86::UCOM_Fr },
+};
+
+static unsigned getConcreteOpcode(unsigned Opcode) {
+ ASSERT_SORTED(OpcodeTable);
+ int Opc = Lookup(OpcodeTable, ARRAY_SIZE(OpcodeTable), Opcode);
+ assert(Opc != -1 && "FP Stack instruction not in OpcodeTable!");
+ return Opc;
+}
+
+//===----------------------------------------------------------------------===//
+// Helper Methods
+//===----------------------------------------------------------------------===//
+
+// PopTable - Sorted map of instructions to their popping version. The first
+// element is an instruction, the second is the version which pops.
+//
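+// For example, the plain store ST_F32m maps to its popping form ST_FP32m.
+//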
+static const TableEntry PopTable[] = {
+ { X86::ADD_FrST0 , X86::ADD_FPrST0 },
+
+ { X86::DIVR_FrST0, X86::DIVR_FPrST0 },
+ { X86::DIV_FrST0 , X86::DIV_FPrST0 },
+
+ { X86::IST_F16m , X86::IST_FP16m },
+ { X86::IST_F32m , X86::IST_FP32m },
+
+ { X86::MUL_FrST0 , X86::MUL_FPrST0 },
+
+ { X86::ST_F32m , X86::ST_FP32m },
+ { X86::ST_F64m , X86::ST_FP64m },
+ { X86::ST_Frr , X86::ST_FPrr },
+
+ { X86::SUBR_FrST0, X86::SUBR_FPrST0 },
+ { X86::SUB_FrST0 , X86::SUB_FPrST0 },
+
+ { X86::UCOM_FIr , X86::UCOM_FIPr },
+
+ { X86::UCOM_FPr , X86::UCOM_FPPr },
+ { X86::UCOM_Fr , X86::UCOM_FPr },
+};
+
+/// popStackAfter - Pop the current value off of the top of the FP stack after
+/// the specified instruction. This attempts to be sneaky and combine the pop
+/// into the instruction itself if possible. The iterator is left pointing to
+/// the last instruction, be it a new pop instruction inserted, or the old
+/// instruction if it was modified in place.
+///
+void FPS::popStackAfter(MachineBasicBlock::iterator &I) {
+ ASSERT_SORTED(PopTable);
+ assert(StackTop > 0 && "Cannot pop empty stack!");
+ RegMap[Stack[--StackTop]] = ~0; // Update state
+
+ // Check to see if there is a popping version of this instruction...
+ int Opcode = Lookup(PopTable, ARRAY_SIZE(PopTable), I->getOpcode());
+ if (Opcode != -1) {
+ I->setInstrDescriptor(TII->get(Opcode));
+ if (Opcode == X86::UCOM_FPPr)
+ I->RemoveOperand(0);
+ } else { // Insert an explicit pop
+ I = BuildMI(*MBB, ++I, TII->get(X86::ST_FPrr)).addReg(X86::ST0);
+ }
+}
+
+/// freeStackSlotAfter - Free the specified register from the register stack, so
+/// that it is no longer in a register. If the register is currently at the top
+/// of the stack, we just pop the current instruction, otherwise we store the
+/// current top-of-stack into the specified slot, then pop the top of stack.
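+///
+/// For example (a sketch): with the stack holding FP0, FP3, FP2 (FP2 on top),
+/// freeing FP0 emits "fstp ST(2)", leaving FP3 on top and FP2 below it.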
+void FPS::freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned FPRegNo) {
+ if (getStackEntry(0) == FPRegNo) { // already at the top of stack? easy.
+ popStackAfter(I);
+ return;
+ }
+
+ // Otherwise, store the top of stack into the dead slot, killing the operand
+ // without having to add in an explicit xchg then pop.
+ //
+ unsigned STReg = getSTReg(FPRegNo);
+ unsigned OldSlot = getSlot(FPRegNo);
+ unsigned TopReg = Stack[StackTop-1];
+ Stack[OldSlot] = TopReg;
+ RegMap[TopReg] = OldSlot;
+ RegMap[FPRegNo] = ~0;
+ Stack[--StackTop] = ~0;
+ I = BuildMI(*MBB, ++I, TII->get(X86::ST_FPrr)).addReg(STReg);
+}
+
+
+static unsigned getFPReg(const MachineOperand &MO) {
+ assert(MO.isRegister() && "Expected an FP register!");
+ unsigned Reg = MO.getReg();
+ assert(Reg >= X86::FP0 && Reg <= X86::FP6 && "Expected FP register!");
+ return Reg - X86::FP0;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Instruction transformation implementation
+//===----------------------------------------------------------------------===//
+
+/// handleZeroArgFP - ST(0) = fld0 ST(0) = flds <mem>
+///
+void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) {
+ MachineInstr *MI = I;
+ unsigned DestReg = getFPReg(MI->getOperand(0));
+
+ // Change from the pseudo instruction to the concrete instruction.
+ MI->RemoveOperand(0); // Remove the explicit ST(0) operand
+ MI->setInstrDescriptor(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+ // Result gets pushed on the stack.
+ pushReg(DestReg);
+}
+
+/// handleOneArgFP - fst <mem>, ST(0)
+///
+void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) {
+ MachineInstr *MI = I;
+ unsigned NumOps = MI->getInstrDescriptor()->numOperands;
+ assert((NumOps == 5 || NumOps == 1) &&
+ "Can only handle fst* & ftst instructions!");
+
+ // Is this the last use of the source register?
+ unsigned Reg = getFPReg(MI->getOperand(NumOps-1));
+ bool KillsSrc = LV->KillsRegister(MI, X86::FP0+Reg);
+
+  // FISTP64m is strange because there isn't a non-popping version.
+  // If we have one _and_ we don't want to pop the operand, duplicate the value
+  // on the stack instead of moving it. This ensures that popping the value is
+ // always ok.
+ // Ditto FISTTP16m, FISTTP32m, FISTTP64m.
+ //
+ if (!KillsSrc &&
+ (MI->getOpcode() == X86::IST_Fp64m32 ||
+ MI->getOpcode() == X86::ISTT_Fp16m32 ||
+ MI->getOpcode() == X86::ISTT_Fp32m32 ||
+ MI->getOpcode() == X86::ISTT_Fp64m32 ||
+ MI->getOpcode() == X86::IST_Fp64m64 ||
+ MI->getOpcode() == X86::ISTT_Fp16m64 ||
+ MI->getOpcode() == X86::ISTT_Fp32m64 ||
+ MI->getOpcode() == X86::ISTT_Fp64m64)) {
+ duplicateToTop(Reg, 7 /*temp register*/, I);
+ } else {
+ moveToTop(Reg, I); // Move to the top of the stack...
+ }
+
+ // Convert from the pseudo instruction to the concrete instruction.
+ MI->RemoveOperand(NumOps-1); // Remove explicit ST(0) operand
+ MI->setInstrDescriptor(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+ if (MI->getOpcode() == X86::IST_FP64m ||
+ MI->getOpcode() == X86::ISTT_FP16m ||
+ MI->getOpcode() == X86::ISTT_FP32m ||
+ MI->getOpcode() == X86::ISTT_FP64m) {
+ assert(StackTop > 0 && "Stack empty??");
+ --StackTop;
+ } else if (KillsSrc) { // Last use of operand?
+ popStackAfter(I);
+ }
+}
+
+
+/// handleOneArgFPRW: Handle instructions that read from the top of stack and
+/// replace the value with a newly computed value. These instructions may have
+/// non-fp operands after their FP operands.
+///
+/// Examples:
+/// R1 = fchs R2
+/// R1 = fadd R2, [mem]
+///
+void FPS::handleOneArgFPRW(MachineBasicBlock::iterator &I) {
+ MachineInstr *MI = I;
+ unsigned NumOps = MI->getInstrDescriptor()->numOperands;
+ assert(NumOps >= 2 && "FPRW instructions must have 2 ops!!");
+
+ // Is this the last use of the source register?
+ unsigned Reg = getFPReg(MI->getOperand(1));
+ bool KillsSrc = LV->KillsRegister(MI, X86::FP0+Reg);
+
+ if (KillsSrc) {
+ // If this is the last use of the source register, just make sure it's on
+ // the top of the stack.
+ moveToTop(Reg, I);
+ assert(StackTop > 0 && "Stack cannot be empty!");
+ --StackTop;
+ pushReg(getFPReg(MI->getOperand(0)));
+ } else {
+ // If this is not the last use of the source register, _copy_ it to the top
+ // of the stack.
+ duplicateToTop(Reg, getFPReg(MI->getOperand(0)), I);
+ }
+
+ // Change from the pseudo instruction to the concrete instruction.
+ MI->RemoveOperand(1); // Drop the source operand.
+ MI->RemoveOperand(0); // Drop the destination operand.
+ MI->setInstrDescriptor(TII->get(getConcreteOpcode(MI->getOpcode())));
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define tables of various ways to map pseudo instructions
+//
+
+// ForwardST0Table - Map: A = B op C into: ST(0) = ST(0) op ST(i)
+static const TableEntry ForwardST0Table[] = {
+ { X86::ADD_Fp32 , X86::ADD_FST0r },
+ { X86::ADD_Fp64 , X86::ADD_FST0r },
+ { X86::DIV_Fp32 , X86::DIV_FST0r },
+ { X86::DIV_Fp64 , X86::DIV_FST0r },
+ { X86::MUL_Fp32 , X86::MUL_FST0r },
+ { X86::MUL_Fp64 , X86::MUL_FST0r },
+ { X86::SUB_Fp32 , X86::SUB_FST0r },
+ { X86::SUB_Fp64 , X86::SUB_FST0r },
+};
+
+// ReverseST0Table - Map: A = B op C into: ST(0) = ST(i) op ST(0)
+static const TableEntry ReverseST0Table[] = {
+ { X86::ADD_Fp32 , X86::ADD_FST0r }, // commutative
+ { X86::ADD_Fp64 , X86::ADD_FST0r }, // commutative
+ { X86::DIV_Fp32 , X86::DIVR_FST0r },
+ { X86::DIV_Fp64 , X86::DIVR_FST0r },
+ { X86::MUL_Fp32 , X86::MUL_FST0r }, // commutative
+ { X86::MUL_Fp64 , X86::MUL_FST0r }, // commutative
+ { X86::SUB_Fp32 , X86::SUBR_FST0r },
+ { X86::SUB_Fp64 , X86::SUBR_FST0r },
+};
+
+// ForwardSTiTable - Map: A = B op C into: ST(i) = ST(0) op ST(i)
+static const TableEntry ForwardSTiTable[] = {
+ { X86::ADD_Fp32 , X86::ADD_FrST0 }, // commutative
+ { X86::ADD_Fp64 , X86::ADD_FrST0 }, // commutative
+ { X86::DIV_Fp32 , X86::DIVR_FrST0 },
+ { X86::DIV_Fp64 , X86::DIVR_FrST0 },
+ { X86::MUL_Fp32 , X86::MUL_FrST0 }, // commutative
+ { X86::MUL_Fp64 , X86::MUL_FrST0 }, // commutative
+ { X86::SUB_Fp32 , X86::SUBR_FrST0 },
+ { X86::SUB_Fp64 , X86::SUBR_FrST0 },
+};
+
+// ReverseSTiTable - Map: A = B op C into: ST(i) = ST(i) op ST(0)
+static const TableEntry ReverseSTiTable[] = {
+ { X86::ADD_Fp32 , X86::ADD_FrST0 },
+ { X86::ADD_Fp64 , X86::ADD_FrST0 },
+ { X86::DIV_Fp32 , X86::DIV_FrST0 },
+ { X86::DIV_Fp64 , X86::DIV_FrST0 },
+ { X86::MUL_Fp32 , X86::MUL_FrST0 },
+ { X86::MUL_Fp64 , X86::MUL_FrST0 },
+ { X86::SUB_Fp32 , X86::SUB_FrST0 },
+ { X86::SUB_Fp64 , X86::SUB_FrST0 },
+};
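+
+// Editorial note (derived from the tables above, not in the original file):
+// taking SUB as the example, the four concrete forms encode
+//   SUB_FST0r  :  ST(0) = ST(0) - ST(i)
+//   SUBR_FST0r :  ST(0) = ST(i) - ST(0)
+//   SUBR_FrST0 :  ST(i) = ST(0) - ST(i)
+//   SUB_FrST0  :  ST(i) = ST(i) - ST(0)
+// handleTwoArgFP picks among these based on which operand sits at TOS and
+// which operand is killed.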
+
+
+/// handleTwoArgFP - Handle instructions like FADD and friends, which are
+/// virtual instructions that need to be simplified and possibly transformed.
+///
+/// Result: ST(0) = fsub ST(0), ST(i)
+/// ST(i) = fsub ST(0), ST(i)
+/// ST(0) = fsubr ST(0), ST(i)
+/// ST(i) = fsubr ST(0), ST(i)
+///
+void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) {
+ ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table);
+ ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable);
+ MachineInstr *MI = I;
+
+ unsigned NumOperands = MI->getInstrDescriptor()->numOperands;
+ assert(NumOperands == 3 && "Illegal TwoArgFP instruction!");
+ unsigned Dest = getFPReg(MI->getOperand(0));
+ unsigned Op0 = getFPReg(MI->getOperand(NumOperands-2));
+ unsigned Op1 = getFPReg(MI->getOperand(NumOperands-1));
+ bool KillsOp0 = LV->KillsRegister(MI, X86::FP0+Op0);
+ bool KillsOp1 = LV->KillsRegister(MI, X86::FP0+Op1);
+
+ unsigned TOS = getStackEntry(0);
+
+ // One of our operands must be on the top of the stack. If neither is yet, we
+ // need to move one.
+ if (Op0 != TOS && Op1 != TOS) { // No operand at TOS?
+ // We can choose to move either operand to the top of the stack. If one of
+ // the operands is killed by this instruction, we want that one so that we
+ // can update right on top of the old version.
+ if (KillsOp0) {
+ moveToTop(Op0, I); // Move dead operand to TOS.
+ TOS = Op0;
+ } else if (KillsOp1) {
+ moveToTop(Op1, I);
+ TOS = Op1;
+ } else {
+ // All of the operands are live after this instruction executes, so we
+ // cannot update on top of any operand. Because of this, we must
+ // duplicate one of the stack elements to the top. It doesn't matter
+ // which one we pick.
+ //
+ duplicateToTop(Op0, Dest, I);
+ Op0 = TOS = Dest;
+ KillsOp0 = true;
+ }
+ } else if (!KillsOp0 && !KillsOp1) {
+ // If we DO have one of our operands at the top of the stack, but we don't
+ // have a dead operand, we must duplicate one of the operands to a new slot
+ // on the stack.
+ duplicateToTop(Op0, Dest, I);
+ Op0 = TOS = Dest;
+ KillsOp0 = true;
+ }
+
+ // Now we know that one of our operands is on the top of the stack, and at
+ // least one of our operands is killed by this instruction.
+ assert((TOS == Op0 || TOS == Op1) && (KillsOp0 || KillsOp1) &&
+ "Stack conditions not set up right!");
+
+ // We decide which form to use based on what is on the top of the stack, and
+ // which operand is killed by this instruction.
+ const TableEntry *InstTable;
+ bool isForward = TOS == Op0;
+ bool updateST0 = (TOS == Op0 && !KillsOp1) || (TOS == Op1 && !KillsOp0);
+ if (updateST0) {
+ if (isForward)
+ InstTable = ForwardST0Table;
+ else
+ InstTable = ReverseST0Table;
+ } else {
+ if (isForward)
+ InstTable = ForwardSTiTable;
+ else
+ InstTable = ReverseSTiTable;
+ }
+
+ int Opcode = Lookup(InstTable, ARRAY_SIZE(ForwardST0Table), MI->getOpcode());
+ assert(Opcode != -1 && "Unknown TwoArgFP pseudo instruction!");
+
+ // NotTOS - The register which is not on the top of stack...
+ unsigned NotTOS = (TOS == Op0) ? Op1 : Op0;
+
+ // Replace the old instruction with a new instruction
+ MBB->remove(I++);
+ I = BuildMI(*MBB, I, TII->get(Opcode)).addReg(getSTReg(NotTOS));
+
+ // If both operands are killed, pop one off of the stack in addition to
+ // overwriting the other one.
+ if (KillsOp0 && KillsOp1 && Op0 != Op1) {
+ assert(!updateST0 && "Should have updated other operand!");
+ popStackAfter(I); // Pop the top of stack
+ }
+
+ // Update stack information so that we know the destination register is now on
+ // the stack.
+ unsigned UpdatedSlot = getSlot(updateST0 ? TOS : NotTOS);
+ assert(UpdatedSlot < StackTop && Dest < 7);
+ Stack[UpdatedSlot] = Dest;
+ RegMap[Dest] = UpdatedSlot;
+ delete MI; // Remove the old instruction
+}
+
+/// handleCompareFP - Handle FUCOM and FUCOMI instructions, which have two FP
+/// register arguments and no explicit destinations.
+///
+void FPS::handleCompareFP(MachineBasicBlock::iterator &I) {
+ ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table);
+ ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable);
+ MachineInstr *MI = I;
+
+ unsigned NumOperands = MI->getInstrDescriptor()->numOperands;
+ assert(NumOperands == 2 && "Illegal FUCOM* instruction!");
+ unsigned Op0 = getFPReg(MI->getOperand(NumOperands-2));
+ unsigned Op1 = getFPReg(MI->getOperand(NumOperands-1));
+ bool KillsOp0 = LV->KillsRegister(MI, X86::FP0+Op0);
+ bool KillsOp1 = LV->KillsRegister(MI, X86::FP0+Op1);
+
+ // Make sure the first operand is on the top of the stack; the other one can
+ // be anywhere.
+ moveToTop(Op0, I);
+
+ // Change from the pseudo instruction to the concrete instruction.
+ MI->getOperand(0).setReg(getSTReg(Op1));
+ MI->RemoveOperand(1);
+ MI->setInstrDescriptor(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+ // If any of the operands are killed by this instruction, free them.
+ if (KillsOp0) freeStackSlotAfter(I, Op0);
+ if (KillsOp1 && Op0 != Op1) freeStackSlotAfter(I, Op1);
+}
+
+/// handleCondMovFP - Handle two address conditional move instructions. These
+/// instructions move a st(i) register to st(0) iff a condition is true. These
+/// instructions require that the first operand is at the top of the stack, but
+/// otherwise don't modify the stack at all.
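+/// For example (editorial sketch): a two-address CMOVB pseudo becomes the
+/// concrete "fcmovb %st(i), %st(0)" once its first operand has been shuffled
+/// to ST(0); if the second (source) operand dies here, its stack slot is
+/// freed afterwards.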
+void FPS::handleCondMovFP(MachineBasicBlock::iterator &I) {
+ MachineInstr *MI = I;
+
+ unsigned Op0 = getFPReg(MI->getOperand(0));
+ unsigned Op1 = getFPReg(MI->getOperand(2));
+ bool KillsOp1 = LV->KillsRegister(MI, X86::FP0+Op1);
+
+ // The first operand *must* be on the top of the stack.
+ moveToTop(Op0, I);
+
+ // Change the second operand to the stack register that the operand is in.
+ // Change from the pseudo instruction to the concrete instruction.
+ MI->RemoveOperand(0);
+ MI->RemoveOperand(1);
+ MI->getOperand(0).setReg(getSTReg(Op1));
+ MI->setInstrDescriptor(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+ // If we kill the second operand, make sure to pop it from the stack.
+ if (Op0 != Op1 && KillsOp1) {
+ // Get this value off of the register stack.
+ freeStackSlotAfter(I, Op1);
+ }
+}
+
+
+/// handleSpecialFP - Handle special instructions which behave unlike other
+/// floating point instructions. This is primarily intended for use by pseudo
+/// instructions.
+///
+void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
+ MachineInstr *MI = I;
+ switch (MI->getOpcode()) {
+ default: assert(0 && "Unknown SpecialFP instruction!");
+ case X86::FpGETRESULT32: // Appears immediately after a call returning FP type!
+ case X86::FpGETRESULT64: // Appears immediately after a call returning FP type!
+ assert(StackTop == 0 && "Stack should be empty after a call!");
+ pushReg(getFPReg(MI->getOperand(0)));
+ break;
+ case X86::FpSETRESULT32:
+ case X86::FpSETRESULT64:
+ assert(StackTop == 1 && "Stack should have one element on it to return!");
+ --StackTop; // "Forget" we have something on the top of stack!
+ break;
+ case X86::MOV_Fp3232:
+ case X86::MOV_Fp3264:
+ case X86::MOV_Fp6432:
+ case X86::MOV_Fp6464: {
+ unsigned SrcReg = getFPReg(MI->getOperand(1));
+ unsigned DestReg = getFPReg(MI->getOperand(0));
+
+ if (LV->KillsRegister(MI, X86::FP0+SrcReg)) {
+ // If the input operand is killed, we can just change the owner of the
+ // incoming stack slot to the result register.
+ unsigned Slot = getSlot(SrcReg);
+ assert(Slot < 7 && DestReg < 7 && "FpMOV operands invalid!");
+ Stack[Slot] = DestReg;
+ RegMap[DestReg] = Slot;
+
+ } else {
+ // For FMOV we just duplicate the specified value to a new stack slot.
+ // This could be made better, but would require substantial changes.
+ duplicateToTop(SrcReg, DestReg, I);
+ }
+ break;
+ }
+ }
+
+ I = MBB->erase(I); // Remove the pseudo instruction
+ --I;
+}
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
new file mode 100644
index 0000000..8b1690c
--- /dev/null
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -0,0 +1,1342 @@
+//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by Evan Cheng and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a DAG pattern matching instruction selector for X86,
+// converting from a legalized dag to an X86 dag.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-isel"
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86ISelLowering.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Type.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/SSARegMap.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include <queue>
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumFPKill , "Number of FP_REG_KILL instructions added");
+STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
+
+
+//===----------------------------------------------------------------------===//
+// Pattern Matcher Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+ /// X86ISelAddressMode - This corresponds to X86AddressMode, but uses
+ /// SDOperand's instead of register numbers for the leaves of the matched
+ /// tree.
+ struct X86ISelAddressMode {
+ enum {
+ RegBase,
+ FrameIndexBase
+ } BaseType;
+
+ struct { // This is really a union, discriminated by BaseType!
+ SDOperand Reg;
+ int FrameIndex;
+ } Base;
+
+ bool isRIPRel; // RIP relative?
+ unsigned Scale;
+ SDOperand IndexReg;
+ unsigned Disp;
+ GlobalValue *GV;
+ Constant *CP;
+ const char *ES;
+ int JT;
+ unsigned Align; // CP alignment.
+
+ X86ISelAddressMode()
+ : BaseType(RegBase), isRIPRel(false), Scale(1), IndexReg(), Disp(0),
+ GV(0), CP(0), ES(0), JT(-1), Align(0) {
+ }
+ };
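+
+ // Editorial example (not in the original source): the x86 address
+ //   movl 16(%esi,%edi,4), %eax
+ // decomposes into this structure as Base.Reg = %esi, IndexReg = %edi,
+ // Scale = 4 and Disp = 16; GV/CP/ES/JT hold a symbolic displacement when
+ // one is folded in instead of a plain constant.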
+}
+
+namespace {
+ //===--------------------------------------------------------------------===//
+ /// ISel - X86 specific code to select X86 machine instructions for
+ /// SelectionDAG operations.
+ ///
+ class VISIBILITY_HIDDEN X86DAGToDAGISel : public SelectionDAGISel {
+ /// ContainsFPCode - Every instruction we select that uses or defines an FP
+ /// register should set this to true.
+ bool ContainsFPCode;
+
+ /// FastISel - Enable fast(er) instruction selection.
+ ///
+ bool FastISel;
+
+ /// TM - Keep a reference to X86TargetMachine.
+ ///
+ X86TargetMachine &TM;
+
+ /// X86Lowering - This object fully describes how to lower LLVM code to an
+ /// X86-specific SelectionDAG.
+ X86TargetLowering X86Lowering;
+
+ /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const X86Subtarget *Subtarget;
+
+ /// GlobalBaseReg - keeps track of the virtual register mapped onto global
+ /// base register.
+ unsigned GlobalBaseReg;
+
+ public:
+ X86DAGToDAGISel(X86TargetMachine &tm, bool fast)
+ : SelectionDAGISel(X86Lowering),
+ ContainsFPCode(false), FastISel(fast), TM(tm),
+ X86Lowering(*TM.getTargetLowering()),
+ Subtarget(&TM.getSubtarget<X86Subtarget>()) {}
+
+ virtual bool runOnFunction(Function &Fn) {
+ // Make sure we re-emit a set of the global base reg if necessary
+ GlobalBaseReg = 0;
+ return SelectionDAGISel::runOnFunction(Fn);
+ }
+
+ virtual const char *getPassName() const {
+ return "X86 DAG->DAG Instruction Selection";
+ }
+
+ /// InstructionSelectBasicBlock - This callback is invoked by
+ /// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+ virtual void InstructionSelectBasicBlock(SelectionDAG &DAG);
+
+ virtual void EmitFunctionEntryCode(Function &Fn, MachineFunction &MF);
+
+ virtual bool CanBeFoldedBy(SDNode *N, SDNode *U, SDNode *Root);
+
+// Include the pieces autogenerated from the target description.
+#include "X86GenDAGISel.inc"
+
+ private:
+ SDNode *Select(SDOperand N);
+
+ bool MatchAddress(SDOperand N, X86ISelAddressMode &AM,
+ bool isRoot = true, unsigned Depth = 0);
+ bool SelectAddr(SDOperand Op, SDOperand N, SDOperand &Base,
+ SDOperand &Scale, SDOperand &Index, SDOperand &Disp);
+ bool SelectLEAAddr(SDOperand Op, SDOperand N, SDOperand &Base,
+ SDOperand &Scale, SDOperand &Index, SDOperand &Disp);
+ bool SelectScalarSSELoad(SDOperand Op, SDOperand Pred,
+ SDOperand N, SDOperand &Base, SDOperand &Scale,
+ SDOperand &Index, SDOperand &Disp,
+ SDOperand &InChain, SDOperand &OutChain);
+ bool TryFoldLoad(SDOperand P, SDOperand N,
+ SDOperand &Base, SDOperand &Scale,
+ SDOperand &Index, SDOperand &Disp);
+ void InstructionSelectPreprocess(SelectionDAG &DAG);
+
+ /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+ /// inline asm expressions.
+ virtual bool SelectInlineAsmMemoryOperand(const SDOperand &Op,
+ char ConstraintCode,
+ std::vector<SDOperand> &OutOps,
+ SelectionDAG &DAG);
+
+ void EmitSpecialCodeForMain(MachineBasicBlock *BB, MachineFrameInfo *MFI);
+
+ inline void getAddressOperands(X86ISelAddressMode &AM, SDOperand &Base,
+ SDOperand &Scale, SDOperand &Index,
+ SDOperand &Disp) {
+ Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) ?
+ CurDAG->getTargetFrameIndex(AM.Base.FrameIndex, TLI.getPointerTy()) :
+ AM.Base.Reg;
+ Scale = getI8Imm(AM.Scale);
+ Index = AM.IndexReg;
+ // These are 32-bit even in 64-bit mode since RIP relative offset
+ // is 32-bit.
+ if (AM.GV)
+ Disp = CurDAG->getTargetGlobalAddress(AM.GV, MVT::i32, AM.Disp);
+ else if (AM.CP)
+ Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Align, AM.Disp);
+ else if (AM.ES)
+ Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32);
+ else if (AM.JT != -1)
+ Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32);
+ else
+ Disp = getI32Imm(AM.Disp);
+ }
+
+ /// getI8Imm - Return a target constant with the specified value, of type
+ /// i8.
+ inline SDOperand getI8Imm(unsigned Imm) {
+ return CurDAG->getTargetConstant(Imm, MVT::i8);
+ }
+
+ /// getI16Imm - Return a target constant with the specified value, of type
+ /// i16.
+ inline SDOperand getI16Imm(unsigned Imm) {
+ return CurDAG->getTargetConstant(Imm, MVT::i16);
+ }
+
+ /// getI32Imm - Return a target constant with the specified value, of type
+ /// i32.
+ inline SDOperand getI32Imm(unsigned Imm) {
+ return CurDAG->getTargetConstant(Imm, MVT::i32);
+ }
+
+ /// getGlobalBaseReg - insert code into the entry mbb to materialize the PIC
+ /// base register. Return the virtual register that holds this value.
+ SDNode *getGlobalBaseReg();
+
+#ifndef NDEBUG
+ unsigned Indent;
+#endif
+ };
+}
+
+static SDNode *findFlagUse(SDNode *N) {
+ unsigned FlagResNo = N->getNumValues()-1;
+ for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
+ SDNode *User = *I;
+ for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
+ SDOperand Op = User->getOperand(i);
+ if (Op.Val == N && Op.ResNo == FlagResNo)
+ return User;
+ }
+ }
+ return NULL;
+}
+
+static void findNonImmUse(SDNode *Use, SDNode* Def, SDNode *ImmedUse,
+ SDNode *Root, SDNode *Skip, bool &found,
+ std::set<SDNode *> &Visited) {
+ if (found ||
+ Use->getNodeId() > Def->getNodeId() ||
+ !Visited.insert(Use).second)
+ return;
+
+ for (unsigned i = 0, e = Use->getNumOperands(); !found && i != e; ++i) {
+ SDNode *N = Use->getOperand(i).Val;
+ if (N == Skip)
+ continue;
+ if (N == Def) {
+ if (Use == ImmedUse)
+ continue; // Immediate use is ok.
+ if (Use == Root) {
+ assert(Use->getOpcode() == ISD::STORE ||
+ Use->getOpcode() == X86ISD::CMP);
+ continue;
+ }
+ found = true;
+ break;
+ }
+ findNonImmUse(N, Def, ImmedUse, Root, Skip, found, Visited);
+ }
+}
+
+/// isNonImmUse - Start searching from Root up the DAG to check if Def can
+/// be reached. Return true if that's the case. However, ignore direct uses
+/// by ImmedUse (which would be U in the example illustrated in
+/// CanBeFoldedBy) and by Root (which can happen in the store case).
+/// FIXME: to be really generic, we should allow direct use by any node
+/// that is being folded. But realistically, since we only fold loads which
+/// have one non-chain use, we only need to watch out for the load/op/store
+/// and load/op/cmp cases, where the root (store / cmp) may reach the load via
+/// its chain operand.
+static inline bool isNonImmUse(SDNode *Root, SDNode *Def, SDNode *ImmedUse,
+ SDNode *Skip = NULL) {
+ std::set<SDNode *> Visited;
+ bool found = false;
+ findNonImmUse(Root, Def, ImmedUse, Root, Skip, found, Visited);
+ return found;
+}
+
+
+bool X86DAGToDAGISel::CanBeFoldedBy(SDNode *N, SDNode *U, SDNode *Root) {
+ if (FastISel) return false;
+
+ // If U can somehow reach N through another path then U can't fold N or
+ // it will create a cycle. E.g. in the following diagram, U can reach N
+ // through X. If N is folded into U, then X is both a predecessor and
+ // a successor of U.
+ //
+ // [ N ]
+ // ^ ^
+ // | |
+ // / \---
+ // / [X]
+ // | ^
+ // [U]--------|
+
+ if (isNonImmUse(Root, N, U))
+ return false;
+
+ // If U produces a flag, then it gets (even more) interesting. Since it
+ // would have been "glued" together with its flag use, we need to check if
+ // it might reach N:
+ //
+ // [ N ]
+ // ^ ^
+ // | |
+ // [U] \--
+ // ^ [TF]
+ // | ^
+ // | |
+ // \ /
+ // [FU]
+ //
+ // If FU (the flag use) indirectly reaches N (the load), and U folds N (call
+ // it NU), then TF is a predecessor of FU and a successor of NU. But since
+ // NU and FU are flagged together, this effectively creates a cycle.
+ bool HasFlagUse = false;
+ MVT::ValueType VT = Root->getValueType(Root->getNumValues()-1);
+ while ((VT == MVT::Flag && !Root->use_empty())) {
+ SDNode *FU = findFlagUse(Root);
+ if (FU == NULL)
+ break;
+ else {
+ Root = FU;
+ HasFlagUse = true;
+ }
+ VT = Root->getValueType(Root->getNumValues()-1);
+ }
+
+ if (HasFlagUse)
+ return !isNonImmUse(Root, N, Root, U);
+ return true;
+}
+
+/// MoveBelowTokenFactor - Replace the TokenFactor's use of the load with the
+/// load's chain operand and move the load below the TokenFactor. Replace the
+/// store's chain operand with the load's chain result.
+static void MoveBelowTokenFactor(SelectionDAG &DAG, SDOperand Load,
+ SDOperand Store, SDOperand TF) {
+ std::vector<SDOperand> Ops;
+ for (unsigned i = 0, e = TF.Val->getNumOperands(); i != e; ++i)
+ if (Load.Val == TF.Val->getOperand(i).Val)
+ Ops.push_back(Load.Val->getOperand(0));
+ else
+ Ops.push_back(TF.Val->getOperand(i));
+ DAG.UpdateNodeOperands(TF, &Ops[0], Ops.size());
+ DAG.UpdateNodeOperands(Load, TF, Load.getOperand(1), Load.getOperand(2));
+ DAG.UpdateNodeOperands(Store, Load.getValue(1), Store.getOperand(1),
+ Store.getOperand(2), Store.getOperand(3));
+}
+
+/// InstructionSelectPreprocess - Preprocess the DAG to allow the instruction
+/// selector to pick more load-modify-store instructions. This is a common
+/// case:
+///
+/// [Load chain]
+/// ^
+/// |
+/// [Load]
+/// ^ ^
+/// | |
+/// / \-
+/// / |
+/// [TokenFactor] [Op]
+/// ^ ^
+/// | |
+/// \ /
+/// \ /
+/// [Store]
+///
+/// The fact that the store's chain operand != the load's chain output will
+/// prevent the (store (op (load))) instruction from being selected. We can
+/// transform it to:
+///
+/// [Load chain]
+/// ^
+/// |
+/// [TokenFactor]
+/// ^
+/// |
+/// [Load]
+/// ^ ^
+/// | |
+/// | \-
+/// | |
+/// | [Op]
+/// | ^
+/// | |
+/// \ /
+/// \ /
+/// [Store]
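+///
+/// Editorial example (an assumption about typical input, not from the
+/// original comment): IR of the form
+///   %tmp = load i32* %p
+///   %sum = add i32 %tmp, %x
+///   store i32 %sum, i32* %p
+/// whose store chain goes through a TokenFactor can, after this rewrite, be
+/// matched as a single read-modify-write "addl %reg, (mem)" instruction.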
+void X86DAGToDAGISel::InstructionSelectPreprocess(SelectionDAG &DAG) {
+ for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
+ E = DAG.allnodes_end(); I != E; ++I) {
+ if (!ISD::isNON_TRUNCStore(I))
+ continue;
+ SDOperand Chain = I->getOperand(0);
+ if (Chain.Val->getOpcode() != ISD::TokenFactor)
+ continue;
+
+ SDOperand N1 = I->getOperand(1);
+ SDOperand N2 = I->getOperand(2);
+ if (MVT::isFloatingPoint(N1.getValueType()) ||
+ MVT::isVector(N1.getValueType()) ||
+ !N1.hasOneUse())
+ continue;
+
+ bool RModW = false;
+ SDOperand Load;
+ unsigned Opcode = N1.Val->getOpcode();
+ switch (Opcode) {
+ case ISD::ADD:
+ case ISD::MUL:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::ADDC:
+ case ISD::ADDE: {
+ SDOperand N10 = N1.getOperand(0);
+ SDOperand N11 = N1.getOperand(1);
+ if (ISD::isNON_EXTLoad(N10.Val))
+ RModW = true;
+ else if (ISD::isNON_EXTLoad(N11.Val)) {
+ RModW = true;
+ std::swap(N10, N11);
+ }
+ RModW = RModW && N10.Val->isOperand(Chain.Val) && N10.hasOneUse() &&
+ (N10.getOperand(1) == N2) &&
+ (N10.Val->getValueType(0) == N1.getValueType());
+ if (RModW)
+ Load = N10;
+ break;
+ }
+ case ISD::SUB:
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::ROTL:
+ case ISD::ROTR:
+ case ISD::SUBC:
+ case ISD::SUBE:
+ case X86ISD::SHLD:
+ case X86ISD::SHRD: {
+ SDOperand N10 = N1.getOperand(0);
+ if (ISD::isNON_EXTLoad(N10.Val))
+ RModW = N10.Val->isOperand(Chain.Val) && N10.hasOneUse() &&
+ (N10.getOperand(1) == N2) &&
+ (N10.Val->getValueType(0) == N1.getValueType());
+ if (RModW)
+ Load = N10;
+ break;
+ }
+ }
+
+ if (RModW) {
+ MoveBelowTokenFactor(DAG, Load, SDOperand(I, 0), Chain);
+ ++NumLoadMoved;
+ }
+ }
+}
+
+/// InstructionSelectBasicBlock - This callback is invoked by SelectionDAGISel
+/// when it has created a SelectionDAG for us to codegen.
+void X86DAGToDAGISel::InstructionSelectBasicBlock(SelectionDAG &DAG) {
+ DEBUG(BB->dump());
+ MachineFunction::iterator FirstMBB = BB;
+
+ if (!FastISel)
+ InstructionSelectPreprocess(DAG);
+
+ // Codegen the basic block.
+#ifndef NDEBUG
+ DOUT << "===== Instruction selection begins:\n";
+ Indent = 0;
+#endif
+ DAG.setRoot(SelectRoot(DAG.getRoot()));
+#ifndef NDEBUG
+ DOUT << "===== Instruction selection ends:\n";
+#endif
+
+ DAG.RemoveDeadNodes();
+
+ // Emit machine code to BB.
+ ScheduleAndEmitDAG(DAG);
+
+ // If we are emitting FP stack code, scan the basic block to determine if this
+ // block defines any FP values. If so, put an FP_REG_KILL instruction before
+ // the terminator of the block.
+ if (!Subtarget->hasSSE2()) {
+ // Note that FP stack instructions *are* used in SSE code when returning
+ // values, but these are not live out of the basic block, so we don't need
+ // an FP_REG_KILL in this case either.
+ bool ContainsFPCode = false;
+
+ // Scan all of the machine instructions in these MBBs, checking for FP
+ // stores.
+ MachineFunction::iterator MBBI = FirstMBB;
+ do {
+ for (MachineBasicBlock::iterator I = MBBI->begin(), E = MBBI->end();
+ !ContainsFPCode && I != E; ++I) {
+ if (I->getNumOperands() != 0 && I->getOperand(0).isRegister()) {
+ const TargetRegisterClass *clas;
+ for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op) {
+ if (I->getOperand(op).isRegister() && I->getOperand(op).isDef() &&
+ MRegisterInfo::isVirtualRegister(I->getOperand(op).getReg()) &&
+ ((clas = RegMap->getRegClass(I->getOperand(op).getReg())) ==
+ X86::RFP32RegisterClass ||
+ clas == X86::RFP64RegisterClass)) {
+ ContainsFPCode = true;
+ break;
+ }
+ }
+ }
+ }
+ } while (!ContainsFPCode && &*(MBBI++) != BB);
+
+ // Check PHI nodes in successor blocks. These PHI's will be lowered to have
+ // a copy of the input value in this block.
+ if (!ContainsFPCode) {
+ // Final check, check LLVM BB's that are successors to the LLVM BB
+ // corresponding to BB for FP PHI nodes.
+ const BasicBlock *LLVMBB = BB->getBasicBlock();
+ const PHINode *PN;
+ for (succ_const_iterator SI = succ_begin(LLVMBB), E = succ_end(LLVMBB);
+ !ContainsFPCode && SI != E; ++SI) {
+ for (BasicBlock::const_iterator II = SI->begin();
+ (PN = dyn_cast<PHINode>(II)); ++II) {
+ if (PN->getType()->isFloatingPoint()) {
+ ContainsFPCode = true;
+ break;
+ }
+ }
+ }
+ }
+
+ // Finally, if we found any FP code, emit the FP_REG_KILL instruction.
+ if (ContainsFPCode) {
+ BuildMI(*BB, BB->getFirstTerminator(),
+ TM.getInstrInfo()->get(X86::FP_REG_KILL));
+ ++NumFPKill;
+ }
+ }
+}
+
+/// EmitSpecialCodeForMain - Emit any code that needs to be executed only in
+/// the main function.
+void X86DAGToDAGISel::EmitSpecialCodeForMain(MachineBasicBlock *BB,
+ MachineFrameInfo *MFI) {
+ const TargetInstrInfo *TII = TM.getInstrInfo();
+ if (Subtarget->isTargetCygMing())
+ BuildMI(BB, TII->get(X86::CALLpcrel32)).addExternalSymbol("__main");
+
+ // Switch the FPU to 64-bit precision mode for better compatibility and speed.
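+ // Editorial sketch of the emitted sequence (the slot offset N is
+ // illustrative):
+ //   fnstcw  N(%esp)        ; spill the FPU control word to a 2-byte slot
+ //   movb    $2, N+1(%esp)  ; rewrite its high byte (precision control bits)
+ //   fldcw   N(%esp)        ; reload the modified control word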
+ int CWFrameIdx = MFI->CreateStackObject(2, 2);
+ addFrameReference(BuildMI(BB, TII->get(X86::FNSTCW16m)), CWFrameIdx);
+
+ // Set the high part to be 64-bit precision.
+ addFrameReference(BuildMI(BB, TII->get(X86::MOV8mi)),
+ CWFrameIdx, 1).addImm(2);
+
+ // Reload the modified control word now.
+ addFrameReference(BuildMI(BB, TII->get(X86::FLDCW16m)), CWFrameIdx);
+}
+
+void X86DAGToDAGISel::EmitFunctionEntryCode(Function &Fn, MachineFunction &MF) {
+ // If this is main, emit special code for main.
+ MachineBasicBlock *BB = MF.begin();
+ if (Fn.hasExternalLinkage() && Fn.getName() == "main")
+ EmitSpecialCodeForMain(BB, MF.getFrameInfo());
+}
+
+/// MatchAddress - Add the specified node to the specified addressing mode,
+/// returning true if it cannot be done. This just pattern matches for the
+/// addressing mode
+bool X86DAGToDAGISel::MatchAddress(SDOperand N, X86ISelAddressMode &AM,
+ bool isRoot, unsigned Depth) {
+ if (Depth > 5) {
+ // Default, generate it as a register.
+ AM.BaseType = X86ISelAddressMode::RegBase;
+ AM.Base.Reg = N;
+ return false;
+ }
+
+ // RIP relative addressing: %rip + 32-bit displacement!
+ if (AM.isRIPRel) {
+ if (!AM.ES && AM.JT != -1 && N.getOpcode() == ISD::Constant) {
+ int64_t Val = cast<ConstantSDNode>(N)->getSignExtended();
+ if (isInt32(AM.Disp + Val)) {
+ AM.Disp += Val;
+ return false;
+ }
+ }
+ return true;
+ }
+
+ int id = N.Val->getNodeId();
+ bool Available = isSelected(id);
+
+ switch (N.getOpcode()) {
+ default: break;
+ case ISD::Constant: {
+ int64_t Val = cast<ConstantSDNode>(N)->getSignExtended();
+ if (isInt32(AM.Disp + Val)) {
+ AM.Disp += Val;
+ return false;
+ }
+ break;
+ }
+
+ case X86ISD::Wrapper: {
+ bool is64Bit = Subtarget->is64Bit();
+ // Under X86-64 non-small code model, GV (and friends) are 64-bits.
+ if (is64Bit && TM.getCodeModel() != CodeModel::Small)
+ break;
+ if (AM.GV != 0 || AM.CP != 0 || AM.ES != 0 || AM.JT != -1)
+ break;
+ // Fold the GlobalAddress or ConstantPool as a displacement if the value is
+ // not already available in a register, or if it is but both the base and
+ // index components have already been picked (so the register could not be
+ // fit into the addressing mode anyway).
+ if (!Available || (AM.Base.Reg.Val && AM.IndexReg.Val)) {
+ bool isStatic = TM.getRelocationModel() == Reloc::Static;
+ SDOperand N0 = N.getOperand(0);
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
+ GlobalValue *GV = G->getGlobal();
+ bool isAbs32 = !is64Bit || isStatic;
+ if (isAbs32 || isRoot) {
+ AM.GV = GV;
+ AM.Disp += G->getOffset();
+ AM.isRIPRel = !isAbs32;
+ return false;
+ }
+ } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
+ if (!is64Bit || isStatic || isRoot) {
+ AM.CP = CP->getConstVal();
+ AM.Align = CP->getAlignment();
+ AM.Disp += CP->getOffset();
+ AM.isRIPRel = !isStatic;
+ return false;
+ }
+ } else if (ExternalSymbolSDNode *S =dyn_cast<ExternalSymbolSDNode>(N0)) {
+ if (isStatic || isRoot) {
+ AM.ES = S->getSymbol();
+ AM.isRIPRel = !isStatic;
+ return false;
+ }
+ } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
+ if (isStatic || isRoot) {
+ AM.JT = J->getIndex();
+ AM.isRIPRel = !isStatic;
+ return false;
+ }
+ }
+ }
+ break;
+ }
+
+ case ISD::FrameIndex:
+ if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base.Reg.Val == 0) {
+ AM.BaseType = X86ISelAddressMode::FrameIndexBase;
+ AM.Base.FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
+ return false;
+ }
+ break;
+
+ case ISD::SHL:
+ if (!Available && AM.IndexReg.Val == 0 && AM.Scale == 1)
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.Val->getOperand(1))) {
+ unsigned Val = CN->getValue();
+ if (Val == 1 || Val == 2 || Val == 3) {
+ AM.Scale = 1 << Val;
+ SDOperand ShVal = N.Val->getOperand(0);
+
+ // Okay, we know that we have a scale by now. However, if the scaled
+ // value is an add of something and a constant, we can fold the
+ // constant into the disp field here.
+ if (ShVal.Val->getOpcode() == ISD::ADD && ShVal.hasOneUse() &&
+ isa<ConstantSDNode>(ShVal.Val->getOperand(1))) {
+ AM.IndexReg = ShVal.Val->getOperand(0);
+ ConstantSDNode *AddVal =
+ cast<ConstantSDNode>(ShVal.Val->getOperand(1));
+ uint64_t Disp = AM.Disp + (AddVal->getValue() << Val);
+ if (isInt32(Disp))
+ AM.Disp = Disp;
+ else
+ AM.IndexReg = ShVal;
+ } else {
+ AM.IndexReg = ShVal;
+ }
+ return false;
+ }
+ }
+ break;
+
+ case ISD::MUL:
+ // X*[3,5,9] -> X+X*[2,4,8]
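+ // For example (editorial note): x*9 with x in %eax can then be selected as
+ //   leal (%eax,%eax,8), %ecx
+ // i.e. Base = Index = x and Scale = 8.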
+ if (!Available &&
+ AM.BaseType == X86ISelAddressMode::RegBase &&
+ AM.Base.Reg.Val == 0 &&
+ AM.IndexReg.Val == 0) {
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.Val->getOperand(1)))
+ if (CN->getValue() == 3 || CN->getValue() == 5 || CN->getValue() == 9) {
+ AM.Scale = unsigned(CN->getValue())-1;
+
+ SDOperand MulVal = N.Val->getOperand(0);
+ SDOperand Reg;
+
+ // Okay, we know that we have a scale by now. However, if the scaled
+ // value is an add of something and a constant, we can fold the
+ // constant into the disp field here.
+ if (MulVal.Val->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
+ isa<ConstantSDNode>(MulVal.Val->getOperand(1))) {
+ Reg = MulVal.Val->getOperand(0);
+ ConstantSDNode *AddVal =
+ cast<ConstantSDNode>(MulVal.Val->getOperand(1));
+ uint64_t Disp = AM.Disp + AddVal->getValue() * CN->getValue();
+ if (isInt32(Disp))
+ AM.Disp = Disp;
+ else
+ Reg = N.Val->getOperand(0);
+ } else {
+ Reg = N.Val->getOperand(0);
+ }
+
+ AM.IndexReg = AM.Base.Reg = Reg;
+ return false;
+ }
+ }
+ break;
+
+ case ISD::ADD:
+ if (!Available) {
+ X86ISelAddressMode Backup = AM;
+ if (!MatchAddress(N.Val->getOperand(0), AM, false, Depth+1) &&
+ !MatchAddress(N.Val->getOperand(1), AM, false, Depth+1))
+ return false;
+ AM = Backup;
+ if (!MatchAddress(N.Val->getOperand(1), AM, false, Depth+1) &&
+ !MatchAddress(N.Val->getOperand(0), AM, false, Depth+1))
+ return false;
+ AM = Backup;
+ }
+ break;
+
+ case ISD::OR:
+ // Handle "X | C" as "X + C" iff X is known to have C bits clear.
+ if (!Available) {
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ X86ISelAddressMode Backup = AM;
+ // Start with the LHS as an addr mode.
+ if (!MatchAddress(N.getOperand(0), AM, false) &&
+ // Address could not have picked a GV address for the displacement.
+ AM.GV == NULL &&
+ // On x86-64, the resultant disp must fit in 32-bits.
+ isInt32(AM.Disp + CN->getSignExtended()) &&
+ // Check to see if the LHS & C is zero.
+ CurDAG->MaskedValueIsZero(N.getOperand(0), CN->getValue())) {
+ AM.Disp += CN->getValue();
+ return false;
+ }
+ AM = Backup;
+ }
+ }
+ break;
+ }
+
+ // Is the base register already occupied?
+ if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base.Reg.Val) {
+ // If so, check to see if the scale index register is set.
+ if (AM.IndexReg.Val == 0) {
+ AM.IndexReg = N;
+ AM.Scale = 1;
+ return false;
+ }
+
+ // Otherwise, we cannot select it.
+ return true;
+ }
+
+ // Default, generate it as a register.
+ AM.BaseType = X86ISelAddressMode::RegBase;
+ AM.Base.Reg = N;
+ return false;
+}
+
+/// SelectAddr - Returns true if it is able to pattern-match an addressing
+/// mode. It returns by reference the operands which make up the maximal
+/// addressing mode it can match.
+bool X86DAGToDAGISel::SelectAddr(SDOperand Op, SDOperand N, SDOperand &Base,
+ SDOperand &Scale, SDOperand &Index,
+ SDOperand &Disp) {
+ X86ISelAddressMode AM;
+ if (MatchAddress(N, AM))
+ return false;
+
+ MVT::ValueType VT = N.getValueType();
+ if (AM.BaseType == X86ISelAddressMode::RegBase) {
+ if (!AM.Base.Reg.Val)
+ AM.Base.Reg = CurDAG->getRegister(0, VT);
+ }
+
+ if (!AM.IndexReg.Val)
+ AM.IndexReg = CurDAG->getRegister(0, VT);
+
+ getAddressOperands(AM, Base, Scale, Index, Disp);
+ return true;
+}
+
+/// isZeroNode - Returns true if Elt is a constant zero or a floating point
+/// constant +0.0.
+static inline bool isZeroNode(SDOperand Elt) {
+ return ((isa<ConstantSDNode>(Elt) &&
+ cast<ConstantSDNode>(Elt)->getValue() == 0) ||
+ (isa<ConstantFPSDNode>(Elt) &&
+ cast<ConstantFPSDNode>(Elt)->isExactlyValue(0.0)));
+}
+
+
+/// SelectScalarSSELoad - Match a scalar SSE load. In particular, we want to
+/// match a load whose top elements are either undef or zeros. The load flavor
+/// is derived from the type of N, which is either v4f32 or v2f64.
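+/// For instance (editorial note): (v4f32 (scalar_to_vector (load addr))) can
+/// be matched so the load folds directly into a movss from memory, provided
+/// the load has no other uses and folding it would not create a cycle
+/// (see CanBeFoldedBy).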
+bool X86DAGToDAGISel::SelectScalarSSELoad(SDOperand Op, SDOperand Pred,
+ SDOperand N, SDOperand &Base,
+ SDOperand &Scale, SDOperand &Index,
+ SDOperand &Disp, SDOperand &InChain,
+ SDOperand &OutChain) {
+ if (N.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ InChain = N.getOperand(0).getValue(1);
+ if (ISD::isNON_EXTLoad(InChain.Val) &&
+ InChain.getValue(0).hasOneUse() &&
+ N.hasOneUse() &&
+ CanBeFoldedBy(N.Val, Pred.Val, Op.Val)) {
+ LoadSDNode *LD = cast<LoadSDNode>(InChain);
+ if (!SelectAddr(Op, LD->getBasePtr(), Base, Scale, Index, Disp))
+ return false;
+ OutChain = LD->getChain();
+ return true;
+ }
+ }
+
+ // Also handle the case where we explicitly require zeros in the top
+ // elements. This is a vector shuffle from the zero vector.
+ if (N.getOpcode() == ISD::VECTOR_SHUFFLE && N.Val->hasOneUse() &&
+ N.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
+ N.getOperand(1).getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ N.getOperand(1).Val->hasOneUse() &&
+ ISD::isNON_EXTLoad(N.getOperand(1).getOperand(0).Val) &&
+ N.getOperand(1).getOperand(0).hasOneUse()) {
+ // Check to see if the BUILD_VECTOR is building a zero vector.
+ SDOperand BV = N.getOperand(0);
+ for (unsigned i = 0, e = BV.getNumOperands(); i != e; ++i)
+ if (!isZeroNode(BV.getOperand(i)) &&
+ BV.getOperand(i).getOpcode() != ISD::UNDEF)
+ return false; // Not a zero/undef vector.
+ // Check to see if the shuffle mask is 4/L/L/L or 2/L, where L is something
+ // from the LHS.
+ unsigned VecWidth = BV.getNumOperands();
+ SDOperand ShufMask = N.getOperand(2);
+ assert(ShufMask.getOpcode() == ISD::BUILD_VECTOR && "Invalid shuf mask!");
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(ShufMask.getOperand(0))) {
+ if (C->getValue() == VecWidth) {
+ for (unsigned i = 1; i != VecWidth; ++i) {
+ if (ShufMask.getOperand(i).getOpcode() == ISD::UNDEF) {
+ // ok.
+ } else {
+ ConstantSDNode *C = cast<ConstantSDNode>(ShufMask.getOperand(i));
+ if (C->getValue() >= VecWidth) return false;
+ }
+ }
+ }
+
+ // Okay, this is a zero extending load. Fold it.
+ LoadSDNode *LD = cast<LoadSDNode>(N.getOperand(1).getOperand(0));
+ if (!SelectAddr(Op, LD->getBasePtr(), Base, Scale, Index, Disp))
+ return false;
+ OutChain = LD->getChain();
+ InChain = SDOperand(LD, 1);
+ return true;
+ }
+ }
+ return false;
+}
+
+
+/// SelectLEAAddr - This calls SelectAddr and determines if the maximal
+/// addressing mode it matches can be cost-effectively emitted as an LEA
+/// instruction.
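+/// For example (editorial note): an address like 4(%ebx,%esi,2) combines a
+/// base, a scaled index and a displacement, so the complexity heuristic below
+/// deems a three-address lea worthwhile, whereas a bare (,%esi,2) is cheaper
+/// as an add or a shift.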
+bool X86DAGToDAGISel::SelectLEAAddr(SDOperand Op, SDOperand N,
+ SDOperand &Base, SDOperand &Scale,
+ SDOperand &Index, SDOperand &Disp) {
+ X86ISelAddressMode AM;
+ if (MatchAddress(N, AM))
+ return false;
+
+ MVT::ValueType VT = N.getValueType();
+ unsigned Complexity = 0;
+ if (AM.BaseType == X86ISelAddressMode::RegBase)
+ if (AM.Base.Reg.Val)
+ Complexity = 1;
+ else
+ AM.Base.Reg = CurDAG->getRegister(0, VT);
+ else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
+ Complexity = 4;
+
+ if (AM.IndexReg.Val)
+ Complexity++;
+ else
+ AM.IndexReg = CurDAG->getRegister(0, VT);
+
+ // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or
+ // a simple shift.
+ if (AM.Scale > 1)
+ Complexity++;
+
+ // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
+ // to a LEA. This is determined with some experimentation but is by no means
+ // optimal (especially for code size considerations). LEA is nice because of
+ // its three-address nature. Tweak the cost function again when we can run
+ // convertToThreeAddress() at register allocation time.
+ if (AM.GV || AM.CP || AM.ES || AM.JT != -1) {
+ // For X86-64, we should always use lea to materialize RIP relative
+ // addresses.
+ if (Subtarget->is64Bit())
+ Complexity = 4;
+ else
+ Complexity += 2;
+ }
+
+ if (AM.Disp && (AM.Base.Reg.Val || AM.IndexReg.Val))
+ Complexity++;
+
+ if (Complexity > 2) {
+ getAddressOperands(AM, Base, Scale, Index, Disp);
+ return true;
+ }
+ return false;
+}
+
+bool X86DAGToDAGISel::TryFoldLoad(SDOperand P, SDOperand N,
+ SDOperand &Base, SDOperand &Scale,
+ SDOperand &Index, SDOperand &Disp) {
+ if (ISD::isNON_EXTLoad(N.Val) &&
+ N.hasOneUse() &&
+ CanBeFoldedBy(N.Val, P.Val, P.Val))
+ return SelectAddr(P, N.getOperand(1), Base, Scale, Index, Disp);
+ return false;
+}
+
+/// getGlobalBaseReg - Output the instructions required to put the base
+/// address used for accessing globals into a register.
+///
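+/// Editorial sketch of the materialized sequence (32-bit only; labels are
+/// illustrative):
+///   call  .Lpc                           ; MovePCtoStack pushes the pc
+/// .Lpc:
+///   popl  %reg                           ; POP32r: %reg now holds the pc
+///   addl  $_GLOBAL_OFFSET_TABLE_, %reg   ; only for GOT-style PIC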
+SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
+ assert(!Subtarget->is64Bit() && "X86-64 PIC uses RIP relative addressing");
+ if (!GlobalBaseReg) {
+ // Insert the set of GlobalBaseReg into the first MBB of the function
+ MachineBasicBlock &FirstMBB = BB->getParent()->front();
+ MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+ SSARegMap *RegMap = BB->getParent()->getSSARegMap();
+ unsigned PC = RegMap->createVirtualRegister(X86::GR32RegisterClass);
+
+ const TargetInstrInfo *TII = TM.getInstrInfo();
+ BuildMI(FirstMBB, MBBI, TII->get(X86::MovePCtoStack));
+ BuildMI(FirstMBB, MBBI, TII->get(X86::POP32r), PC);
+
+ // If we're using vanilla 'GOT' PIC style, we should use relative addressing
+ // not to the pc, but to the _GLOBAL_OFFSET_TABLE_ external symbol.
+ if (TM.getRelocationModel() == Reloc::PIC_ &&
+ Subtarget->isPICStyleGOT()) {
+ GlobalBaseReg = RegMap->createVirtualRegister(X86::GR32RegisterClass);
+ BuildMI(FirstMBB, MBBI, TII->get(X86::ADD32ri), GlobalBaseReg).
+ addReg(PC).
+ addExternalSymbol("_GLOBAL_OFFSET_TABLE_");
+ } else {
+ GlobalBaseReg = PC;
+ }
+
+ }
+ return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).Val;
+}
+
+static SDNode *FindCallStartFromCall(SDNode *Node) {
+ if (Node->getOpcode() == ISD::CALLSEQ_START) return Node;
+ assert(Node->getOperand(0).getValueType() == MVT::Other &&
+ "Node doesn't have a token chain argument!");
+ return FindCallStartFromCall(Node->getOperand(0).Val);
+}
+
+SDNode *X86DAGToDAGISel::Select(SDOperand N) {
+ SDNode *Node = N.Val;
+ MVT::ValueType NVT = Node->getValueType(0);
+ unsigned Opc, MOpc;
+ unsigned Opcode = Node->getOpcode();
+
+#ifndef NDEBUG
+ DOUT << std::string(Indent, ' ') << "Selecting: ";
+ DEBUG(Node->dump(CurDAG));
+ DOUT << "\n";
+ Indent += 2;
+#endif
+
+ if (Opcode >= ISD::BUILTIN_OP_END && Opcode < X86ISD::FIRST_NUMBER) {
+#ifndef NDEBUG
+ DOUT << std::string(Indent-2, ' ') << "== ";
+ DEBUG(Node->dump(CurDAG));
+ DOUT << "\n";
+ Indent -= 2;
+#endif
+ return NULL; // Already selected.
+ }
+
+ switch (Opcode) {
+ default: break;
+ case X86ISD::GlobalBaseReg:
+ return getGlobalBaseReg();
+
+ case ISD::ADD: {
+ // Turn ADD X, c to MOV32ri X+c. This cannot be done with tblgen'd
+ // code and is matched first so as to prevent it from being turned into
+ // LEA32r X+c.
+ // In 64-bit mode, use LEA to take advantage of RIP-relative addressing.
+ MVT::ValueType PtrVT = TLI.getPointerTy();
+ SDOperand N0 = N.getOperand(0);
+ SDOperand N1 = N.getOperand(1);
+ if (N.Val->getValueType(0) == PtrVT &&
+ N0.getOpcode() == X86ISD::Wrapper &&
+ N1.getOpcode() == ISD::Constant) {
+ unsigned Offset = (unsigned)cast<ConstantSDNode>(N1)->getValue();
+ SDOperand C(0, 0);
+ // TODO: handle ExternalSymbolSDNode.
+ if (GlobalAddressSDNode *G =
+ dyn_cast<GlobalAddressSDNode>(N0.getOperand(0))) {
+ C = CurDAG->getTargetGlobalAddress(G->getGlobal(), PtrVT,
+ G->getOffset() + Offset);
+ } else if (ConstantPoolSDNode *CP =
+ dyn_cast<ConstantPoolSDNode>(N0.getOperand(0))) {
+ C = CurDAG->getTargetConstantPool(CP->getConstVal(), PtrVT,
+ CP->getAlignment(),
+ CP->getOffset()+Offset);
+ }
+
+ if (C.Val) {
+ if (Subtarget->is64Bit()) {
+ SDOperand Ops[] = { CurDAG->getRegister(0, PtrVT), getI8Imm(1),
+ CurDAG->getRegister(0, PtrVT), C };
+ return CurDAG->SelectNodeTo(N.Val, X86::LEA64r, MVT::i64, Ops, 4);
+ } else
+ return CurDAG->SelectNodeTo(N.Val, X86::MOV32ri, PtrVT, C);
+ }
+ }
+
+ // Other cases are handled by auto-generated code.
+ break;
+ }
+
+ case ISD::MULHU:
+ case ISD::MULHS: {
+ if (Opcode == ISD::MULHU)
+ switch (NVT) {
+ default: assert(0 && "Unsupported VT!");
+ case MVT::i8: Opc = X86::MUL8r; MOpc = X86::MUL8m; break;
+ case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break;
+ case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break;
+ case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break;
+ }
+ else
+ switch (NVT) {
+ default: assert(0 && "Unsupported VT!");
+ case MVT::i8: Opc = X86::IMUL8r; MOpc = X86::IMUL8m; break;
+ case MVT::i16: Opc = X86::IMUL16r; MOpc = X86::IMUL16m; break;
+ case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
+ case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
+ }
+
+ unsigned LoReg, HiReg;
+ switch (NVT) {
+ default: assert(0 && "Unsupported VT!");
+ case MVT::i8: LoReg = X86::AL; HiReg = X86::AH; break;
+ case MVT::i16: LoReg = X86::AX; HiReg = X86::DX; break;
+ case MVT::i32: LoReg = X86::EAX; HiReg = X86::EDX; break;
+ case MVT::i64: LoReg = X86::RAX; HiReg = X86::RDX; break;
+ }
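+ // Editorial note: e.g. a 32-bit MULHU selects to "mull %src"; the full
+ // 64-bit product lands in EDX:EAX and the high half (HiReg = EDX) is what
+ // gets copied out as the result below.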
+
+ SDOperand N0 = Node->getOperand(0);
+ SDOperand N1 = Node->getOperand(1);
+
+ bool foldedLoad = false;
+ SDOperand Tmp0, Tmp1, Tmp2, Tmp3;
+ foldedLoad = TryFoldLoad(N, N1, Tmp0, Tmp1, Tmp2, Tmp3);
+ // MULHU and MULHS are commutative
+ if (!foldedLoad) {
+ foldedLoad = TryFoldLoad(N, N0, Tmp0, Tmp1, Tmp2, Tmp3);
+ if (foldedLoad) {
+ N0 = Node->getOperand(1);
+ N1 = Node->getOperand(0);
+ }
+ }
+
+ SDOperand Chain;
+ if (foldedLoad) {
+ Chain = N1.getOperand(0);
+ AddToISelQueue(Chain);
+ } else
+ Chain = CurDAG->getEntryNode();
+
+ SDOperand InFlag(0, 0);
+ AddToISelQueue(N0);
+ Chain = CurDAG->getCopyToReg(Chain, CurDAG->getRegister(LoReg, NVT),
+ N0, InFlag);
+ InFlag = Chain.getValue(1);
+
+ if (foldedLoad) {
+ AddToISelQueue(Tmp0);
+ AddToISelQueue(Tmp1);
+ AddToISelQueue(Tmp2);
+ AddToISelQueue(Tmp3);
+ SDOperand Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Chain, InFlag };
+ SDNode *CNode =
+ CurDAG->getTargetNode(MOpc, MVT::Other, MVT::Flag, Ops, 6);
+ Chain = SDOperand(CNode, 0);
+ InFlag = SDOperand(CNode, 1);
+ } else {
+ AddToISelQueue(N1);
+ InFlag =
+ SDOperand(CurDAG->getTargetNode(Opc, MVT::Flag, N1, InFlag), 0);
+ }
+
+ SDOperand Result = CurDAG->getCopyFromReg(Chain, HiReg, NVT, InFlag);
+ ReplaceUses(N.getValue(0), Result);
+ if (foldedLoad)
+ ReplaceUses(N1.getValue(1), Result.getValue(1));
+
+#ifndef NDEBUG
+ DOUT << std::string(Indent-2, ' ') << "=> ";
+ DEBUG(Result.Val->dump(CurDAG));
+ DOUT << "\n";
+ Indent -= 2;
+#endif
+ return NULL;
+ }
+
+ case ISD::SDIV:
+ case ISD::UDIV:
+ case ISD::SREM:
+ case ISD::UREM: {
+ bool isSigned = Opcode == ISD::SDIV || Opcode == ISD::SREM;
+ bool isDiv = Opcode == ISD::SDIV || Opcode == ISD::UDIV;
+ if (!isSigned)
+ switch (NVT) {
+ default: assert(0 && "Unsupported VT!");
+ case MVT::i8: Opc = X86::DIV8r; MOpc = X86::DIV8m; break;
+ case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
+ case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
+ case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
+ }
+ else
+ switch (NVT) {
+ default: assert(0 && "Unsupported VT!");
+ case MVT::i8: Opc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
+ case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
+ case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
+ case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
+ }
+
+ unsigned LoReg, HiReg;
+ unsigned ClrOpcode, SExtOpcode;
+ switch (NVT) {
+ default: assert(0 && "Unsupported VT!");
+ case MVT::i8:
+ LoReg = X86::AL; HiReg = X86::AH;
+ ClrOpcode = 0;
+ SExtOpcode = X86::CBW;
+ break;
+ case MVT::i16:
+ LoReg = X86::AX; HiReg = X86::DX;
+ ClrOpcode = X86::MOV16r0;
+ SExtOpcode = X86::CWD;
+ break;
+ case MVT::i32:
+ LoReg = X86::EAX; HiReg = X86::EDX;
+ ClrOpcode = X86::MOV32r0;
+ SExtOpcode = X86::CDQ;
+ break;
+ case MVT::i64:
+ LoReg = X86::RAX; HiReg = X86::RDX;
+ ClrOpcode = X86::MOV64r0;
+ SExtOpcode = X86::CQO;
+ break;
+ }
+
+ SDOperand N0 = Node->getOperand(0);
+ SDOperand N1 = Node->getOperand(1);
+ SDOperand InFlag(0, 0);
+ if (NVT == MVT::i8 && !isSigned) {
+ // Special case for div8, just use a move with zero extension to AX to
+ // clear the upper 8 bits (AH).
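+ // Editorial sketch: an unsigned i8 divide thus becomes roughly
+ //   movzbw %src8, %ax
+ //   divb   %divisor8     ; AL = quotient, AH = remainder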
+ SDOperand Tmp0, Tmp1, Tmp2, Tmp3, Move, Chain;
+ if (TryFoldLoad(N, N0, Tmp0, Tmp1, Tmp2, Tmp3)) {
+ SDOperand Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, N0.getOperand(0) };
+ AddToISelQueue(N0.getOperand(0));
+ AddToISelQueue(Tmp0);
+ AddToISelQueue(Tmp1);
+ AddToISelQueue(Tmp2);
+ AddToISelQueue(Tmp3);
+ Move =
+ SDOperand(CurDAG->getTargetNode(X86::MOVZX16rm8, MVT::i16, MVT::Other,
+ Ops, 5), 0);
+ Chain = Move.getValue(1);
+ ReplaceUses(N0.getValue(1), Chain);
+ } else {
+ AddToISelQueue(N0);
+ Move =
+ SDOperand(CurDAG->getTargetNode(X86::MOVZX16rr8, MVT::i16, N0), 0);
+ Chain = CurDAG->getEntryNode();
+ }
+ Chain = CurDAG->getCopyToReg(Chain, X86::AX, Move, InFlag);
+ InFlag = Chain.getValue(1);
+ } else {
+ AddToISelQueue(N0);
+ InFlag =
+ CurDAG->getCopyToReg(CurDAG->getEntryNode(), LoReg, N0,
+ InFlag).getValue(1);
+ if (isSigned) {
+ // Sign extend the low part into the high part.
+ InFlag =
+ SDOperand(CurDAG->getTargetNode(SExtOpcode, MVT::Flag, InFlag), 0);
+ } else {
+ // Zero out the high part, effectively zero extending the input.
+ SDOperand ClrNode = SDOperand(CurDAG->getTargetNode(ClrOpcode, NVT), 0);
+ InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), HiReg, ClrNode,
+ InFlag).getValue(1);
+ }
+ }
+
+ SDOperand Tmp0, Tmp1, Tmp2, Tmp3, Chain;
+ bool foldedLoad = TryFoldLoad(N, N1, Tmp0, Tmp1, Tmp2, Tmp3);
+ if (foldedLoad) {
+ AddToISelQueue(N1.getOperand(0));
+ AddToISelQueue(Tmp0);
+ AddToISelQueue(Tmp1);
+ AddToISelQueue(Tmp2);
+ AddToISelQueue(Tmp3);
+ SDOperand Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, N1.getOperand(0), InFlag };
+ SDNode *CNode =
+ CurDAG->getTargetNode(MOpc, MVT::Other, MVT::Flag, Ops, 6);
+ Chain = SDOperand(CNode, 0);
+ InFlag = SDOperand(CNode, 1);
+ } else {
+ AddToISelQueue(N1);
+ Chain = CurDAG->getEntryNode();
+ InFlag =
+ SDOperand(CurDAG->getTargetNode(Opc, MVT::Flag, N1, InFlag), 0);
+ }
+
+ SDOperand Result =
+ CurDAG->getCopyFromReg(Chain, isDiv ? LoReg : HiReg, NVT, InFlag);
+ ReplaceUses(N.getValue(0), Result);
+ if (foldedLoad)
+ ReplaceUses(N1.getValue(1), Result.getValue(1));
+
+#ifndef NDEBUG
+ DOUT << std::string(Indent-2, ' ') << "=> ";
+ DEBUG(Result.Val->dump(CurDAG));
+ DOUT << "\n";
+ Indent -= 2;
+#endif
+
+ return NULL;
+ }
+
+ case ISD::TRUNCATE: {
+ if (!Subtarget->is64Bit() && NVT == MVT::i8) {
+ unsigned Opc2;
+ MVT::ValueType VT;
+ switch (Node->getOperand(0).getValueType()) {
+ default: assert(0 && "Unknown truncate!");
+ case MVT::i16:
+ Opc = X86::MOV16to16_;
+ VT = MVT::i16;
+ Opc2 = X86::TRUNC_16_to8;
+ break;
+ case MVT::i32:
+ Opc = X86::MOV32to32_;
+ VT = MVT::i32;
+ Opc2 = X86::TRUNC_32_to8;
+ break;
+ }
+
+ AddToISelQueue(Node->getOperand(0));
+ SDOperand Tmp =
+ SDOperand(CurDAG->getTargetNode(Opc, VT, Node->getOperand(0)), 0);
+ SDNode *ResNode = CurDAG->getTargetNode(Opc2, NVT, Tmp);
+
+#ifndef NDEBUG
+ DOUT << std::string(Indent-2, ' ') << "=> ";
+ DEBUG(ResNode->dump(CurDAG));
+ DOUT << "\n";
+ Indent -= 2;
+#endif
+ return ResNode;
+ }
+
+ break;
+ }
+ }
+
+ SDNode *ResNode = SelectCode(N);
+
+#ifndef NDEBUG
+ DOUT << std::string(Indent-2, ' ') << "=> ";
+ if (ResNode == NULL || ResNode == N.Val)
+ DEBUG(N.Val->dump(CurDAG));
+ else
+ DEBUG(ResNode->dump(CurDAG));
+ DOUT << "\n";
+ Indent -= 2;
+#endif
+
+ return ResNode;
+}
+
+bool X86DAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDOperand &Op, char ConstraintCode,
+ std::vector<SDOperand> &OutOps, SelectionDAG &DAG){
+ SDOperand Op0, Op1, Op2, Op3;
+ switch (ConstraintCode) {
+ case 'o': // offsetable ??
+ case 'v': // not offsetable ??
+ default: return true;
+ case 'm': // memory
+ if (!SelectAddr(Op, Op, Op0, Op1, Op2, Op3))
+ return true;
+ break;
+ }
+
+ OutOps.push_back(Op0);
+ OutOps.push_back(Op1);
+ OutOps.push_back(Op2);
+ OutOps.push_back(Op3);
+ AddToISelQueue(Op0);
+ AddToISelQueue(Op1);
+ AddToISelQueue(Op2);
+ AddToISelQueue(Op3);
+ return false;
+}
+
+/// createX86ISelDag - This pass converts a legalized DAG into an
+/// X86-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM, bool Fast) {
+ return new X86DAGToDAGISel(TM, Fast);
+}
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
new file mode 100644
index 0000000..37dea79
--- /dev/null
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -0,0 +1,5094 @@
+//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that X86 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86ISelLowering.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86TargetMachine.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Function.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/ADT/VectorExtras.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SSARegMap.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/StringExtras.h"
+using namespace llvm;
+
+X86TargetLowering::X86TargetLowering(TargetMachine &TM)
+ : TargetLowering(TM) {
+ Subtarget = &TM.getSubtarget<X86Subtarget>();
+ X86ScalarSSE = Subtarget->hasSSE2();
+ X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
+
+ RegInfo = TM.getRegisterInfo();
+
+ // Set up the TargetLowering object.
+
+ // X86 is weird, it always uses i8 for shift amounts and setcc results.
+ setShiftAmountType(MVT::i8);
+ setSetCCResultType(MVT::i8);
+ setSetCCResultContents(ZeroOrOneSetCCResult);
+ setSchedulingPreference(SchedulingForRegPressure);
+ setShiftAmountFlavor(Mask); // shl X, 32 == shl X, 0
+ setStackPointerRegisterToSaveRestore(X86StackPtr);
+
+ if (Subtarget->isTargetDarwin()) {
+ // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
+ setUseUnderscoreSetJmp(false);
+ setUseUnderscoreLongJmp(false);
+ } else if (Subtarget->isTargetMingw()) {
+ // MS runtime is weird: it exports _setjmp, but only plain longjmp!
+ setUseUnderscoreSetJmp(true);
+ setUseUnderscoreLongJmp(false);
+ } else {
+ setUseUnderscoreSetJmp(true);
+ setUseUnderscoreLongJmp(true);
+ }
+
+ // Set up the register classes.
+ addRegisterClass(MVT::i8, X86::GR8RegisterClass);
+ addRegisterClass(MVT::i16, X86::GR16RegisterClass);
+ addRegisterClass(MVT::i32, X86::GR32RegisterClass);
+ if (Subtarget->is64Bit())
+ addRegisterClass(MVT::i64, X86::GR64RegisterClass);
+
+ setLoadXAction(ISD::SEXTLOAD, MVT::i1, Expand);
+
+ // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
+ // operation.
+ setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
+ setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
+ setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
+
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand);
+ setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
+ } else {
+ if (X86ScalarSSE)
+ // If SSE i64 SINT_TO_FP is not available, expand i32 UINT_TO_FP.
+ setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Expand);
+ else
+ setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
+ }
+
+ // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
+ // this operation.
+ setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
+ setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
+ // SSE has no i16 to fp conversion, only i32
+ if (X86ScalarSSE)
+ setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
+ else {
+ setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
+ setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
+ }
+
+ if (!Subtarget->is64Bit()) {
+ // Custom lower SINT_TO_FP and FP_TO_SINT from/to i64 in 32-bit mode.
+ setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);
+ setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
+ }
+
+ // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
+ // this operation.
+ setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
+ setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);
+
+ if (X86ScalarSSE) {
+ setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
+ } else {
+ setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
+ setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
+ }
+
+ // Handle FP_TO_UINT by promoting the destination to a larger signed
+ // conversion.
+ setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
+ setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
+ setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);
+
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
+ setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
+ } else {
+ if (X86ScalarSSE && !Subtarget->hasSSE3())
+ // Expand FP_TO_UINT into a select.
+ // FIXME: We would like to use a Custom expander here eventually to do
+ // the optimal thing for SSE vs. the default expansion in the legalizer.
+ setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
+ else
+ // With SSE3 we can use fisttpll to convert to a signed i64.
+ setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
+ }
+
+ // TODO: when we have SSE, these could be more efficient, by using movd/movq.
+ if (!X86ScalarSSE) {
+ setOperationAction(ISD::BIT_CONVERT , MVT::f32 , Expand);
+ setOperationAction(ISD::BIT_CONVERT , MVT::i32 , Expand);
+ }
+
+ setOperationAction(ISD::BR_JT , MVT::Other, Expand);
+ setOperationAction(ISD::BRCOND , MVT::Other, Custom);
+ setOperationAction(ISD::BR_CC , MVT::Other, Expand);
+ setOperationAction(ISD::SELECT_CC , MVT::Other, Expand);
+ setOperationAction(ISD::MEMMOVE , MVT::Other, Expand);
+ if (Subtarget->is64Bit())
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
+ setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
+ setOperationAction(ISD::FREM , MVT::f64 , Expand);
+
+ setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
+ setOperationAction(ISD::CTTZ , MVT::i8 , Expand);
+ setOperationAction(ISD::CTLZ , MVT::i8 , Expand);
+ setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
+ setOperationAction(ISD::CTTZ , MVT::i16 , Expand);
+ setOperationAction(ISD::CTLZ , MVT::i16 , Expand);
+ setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
+ setOperationAction(ISD::CTTZ , MVT::i32 , Expand);
+ setOperationAction(ISD::CTLZ , MVT::i32 , Expand);
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
+ setOperationAction(ISD::CTTZ , MVT::i64 , Expand);
+ setOperationAction(ISD::CTLZ , MVT::i64 , Expand);
+ }
+
+ setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
+ setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
+
+ // These should be promoted to a larger select which is supported.
+ setOperationAction(ISD::SELECT , MVT::i1 , Promote);
+ setOperationAction(ISD::SELECT , MVT::i8 , Promote);
+ // X86 wants to expand cmov itself.
+ setOperationAction(ISD::SELECT , MVT::i16 , Custom);
+ setOperationAction(ISD::SELECT , MVT::i32 , Custom);
+ setOperationAction(ISD::SELECT , MVT::f32 , Custom);
+ setOperationAction(ISD::SELECT , MVT::f64 , Custom);
+ setOperationAction(ISD::SETCC , MVT::i8 , Custom);
+ setOperationAction(ISD::SETCC , MVT::i16 , Custom);
+ setOperationAction(ISD::SETCC , MVT::i32 , Custom);
+ setOperationAction(ISD::SETCC , MVT::f32 , Custom);
+ setOperationAction(ISD::SETCC , MVT::f64 , Custom);
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::SELECT , MVT::i64 , Custom);
+ setOperationAction(ISD::SETCC , MVT::i64 , Custom);
+ }
+ // X86 ret instruction may pop stack.
+ setOperationAction(ISD::RET , MVT::Other, Custom);
+ if (!Subtarget->is64Bit())
+ setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
+
+ // Darwin ABI issue.
+ setOperationAction(ISD::ConstantPool , MVT::i32 , Custom);
+ setOperationAction(ISD::JumpTable , MVT::i32 , Custom);
+ setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom);
+ setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom);
+ setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom);
+ if (Subtarget->is64Bit()) {
+ setOperationAction(ISD::ConstantPool , MVT::i64 , Custom);
+ setOperationAction(ISD::JumpTable , MVT::i64 , Custom);
+ setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom);
+ setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom);
+ }
+ // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
+ setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom);
+ setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom);
+ setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom);
+ // X86 wants to expand memset / memcpy itself.
+ setOperationAction(ISD::MEMSET , MVT::Other, Custom);
+ setOperationAction(ISD::MEMCPY , MVT::Other, Custom);
+
+ // We don't have line number support yet.
+ setOperationAction(ISD::LOCATION, MVT::Other, Expand);
+ setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand);
+ // FIXME - use subtarget debug flags
+ if (!Subtarget->isTargetDarwin() &&
+ !Subtarget->isTargetELF() &&
+ !Subtarget->isTargetCygMing())
+ setOperationAction(ISD::LABEL, MVT::Other, Expand);
+
+ setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
+ setOperationAction(ISD::EHSELECTION, MVT::i64, Expand);
+ setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
+ setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
+ if (Subtarget->is64Bit()) {
+ // FIXME: Verify
+ setExceptionPointerRegister(X86::RAX);
+ setExceptionSelectorRegister(X86::RDX);
+ } else {
+ setExceptionPointerRegister(X86::EAX);
+ setExceptionSelectorRegister(X86::EDX);
+ }
+
+ // VASTART needs to be custom lowered to use the VarArgsFrameIndex
+ setOperationAction(ISD::VASTART , MVT::Other, Custom);
+ setOperationAction(ISD::VAARG , MVT::Other, Expand);
+ setOperationAction(ISD::VAEND , MVT::Other, Expand);
+ if (Subtarget->is64Bit())
+ setOperationAction(ISD::VACOPY , MVT::Other, Custom);
+ else
+ setOperationAction(ISD::VACOPY , MVT::Other, Expand);
+
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+ setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+ if (Subtarget->is64Bit())
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
+ if (Subtarget->isTargetCygMing())
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+ else
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
+
+ if (X86ScalarSSE) {
+ // Set up the FP register classes.
+ addRegisterClass(MVT::f32, X86::FR32RegisterClass);
+ addRegisterClass(MVT::f64, X86::FR64RegisterClass);
+
+ // Use ANDPD to simulate FABS.
+ setOperationAction(ISD::FABS , MVT::f64, Custom);
+ setOperationAction(ISD::FABS , MVT::f32, Custom);
+
+ // Use XORP to simulate FNEG.
+ setOperationAction(ISD::FNEG , MVT::f64, Custom);
+ setOperationAction(ISD::FNEG , MVT::f32, Custom);
+
+ // Use ANDPD and ORPD to simulate FCOPYSIGN.
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+
+ // We don't support sin/cos/fmod
+ setOperationAction(ISD::FSIN , MVT::f64, Expand);
+ setOperationAction(ISD::FCOS , MVT::f64, Expand);
+ setOperationAction(ISD::FREM , MVT::f64, Expand);
+ setOperationAction(ISD::FSIN , MVT::f32, Expand);
+ setOperationAction(ISD::FCOS , MVT::f32, Expand);
+ setOperationAction(ISD::FREM , MVT::f32, Expand);
+
+ // Expand FP immediates into loads from the stack, except for the special
+ // cases we handle.
+ setOperationAction(ISD::ConstantFP, MVT::f64, Expand);
+ setOperationAction(ISD::ConstantFP, MVT::f32, Expand);
+ addLegalFPImmediate(+0.0); // xorps / xorpd
+ } else {
+ // Set up the FP register classes.
+ addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
+ addRegisterClass(MVT::f32, X86::RFP32RegisterClass);
+
+ setOperationAction(ISD::UNDEF, MVT::f64, Expand);
+ setOperationAction(ISD::UNDEF, MVT::f32, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::f32, Expand);
+
+ if (!UnsafeFPMath) {
+ setOperationAction(ISD::FSIN , MVT::f64 , Expand);
+ setOperationAction(ISD::FCOS , MVT::f64 , Expand);
+ }
+
+ setOperationAction(ISD::ConstantFP, MVT::f64, Expand);
+ setOperationAction(ISD::ConstantFP, MVT::f32, Expand);
+ addLegalFPImmediate(+0.0); // FLD0
+ addLegalFPImmediate(+1.0); // FLD1
+ addLegalFPImmediate(-0.0); // FLD0/FCHS
+ addLegalFPImmediate(-1.0); // FLD1/FCHS
+ }
+
+ // First set operation action for all vector types to expand. Then we
+ // will selectively turn on ones that can be effectively codegen'd.
+ for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+ VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
+ setOperationAction(ISD::ADD , (MVT::ValueType)VT, Expand);
+ setOperationAction(ISD::SUB , (MVT::ValueType)VT, Expand);
+ setOperationAction(ISD::FADD, (MVT::ValueType)VT, Expand);
+ setOperationAction(ISD::FNEG, (MVT::ValueType)VT, Expand);
+ setOperationAction(ISD::FSUB, (MVT::ValueType)VT, Expand);
+ setOperationAction(ISD::MUL , (MVT::ValueType)VT, Expand);
+ setOperationAction(ISD::FMUL, (MVT::ValueType)VT, Expand);
+ setOperationAction(ISD::SDIV, (MVT::ValueType)VT, Expand);
+ setOperationAction(ISD::UDIV, (MVT::ValueType)VT, Expand);
+ setOperationAction(ISD::FDIV, (MVT::ValueType)VT, Expand);
+ setOperationAction(ISD::SREM, (MVT::ValueType)VT, Expand);
+ setOperationAction(ISD::UREM, (MVT::ValueType)VT, Expand);
+ setOperationAction(ISD::LOAD, (MVT::ValueType)VT, Expand);
+ setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::ValueType)VT, Expand);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, (MVT::ValueType)VT, Expand);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, (MVT::ValueType)VT, Expand);
+ setOperationAction(ISD::FABS, (MVT::ValueType)VT, Expand);
+ setOperationAction(ISD::FSIN, (MVT::ValueType)VT, Expand);
+ setOperationAction(ISD::FCOS, (MVT::ValueType)VT, Expand);
+ setOperationAction(ISD::FREM, (MVT::ValueType)VT, Expand);
+ setOperationAction(ISD::FPOWI, (MVT::ValueType)VT, Expand);
+ setOperationAction(ISD::FSQRT, (MVT::ValueType)VT, Expand);
+ setOperationAction(ISD::FCOPYSIGN, (MVT::ValueType)VT, Expand);
+ }
+
+ if (Subtarget->hasMMX()) {
+ addRegisterClass(MVT::v8i8, X86::VR64RegisterClass);
+ addRegisterClass(MVT::v4i16, X86::VR64RegisterClass);
+ addRegisterClass(MVT::v2i32, X86::VR64RegisterClass);
+ addRegisterClass(MVT::v1i64, X86::VR64RegisterClass);
+
+ // FIXME: add MMX packed arithmetics
+
+ setOperationAction(ISD::ADD, MVT::v8i8, Legal);
+ setOperationAction(ISD::ADD, MVT::v4i16, Legal);
+ setOperationAction(ISD::ADD, MVT::v2i32, Legal);
+ setOperationAction(ISD::ADD, MVT::v1i64, Legal);
+
+ setOperationAction(ISD::SUB, MVT::v8i8, Legal);
+ setOperationAction(ISD::SUB, MVT::v4i16, Legal);
+ setOperationAction(ISD::SUB, MVT::v2i32, Legal);
+
+ setOperationAction(ISD::MULHS, MVT::v4i16, Legal);
+ setOperationAction(ISD::MUL, MVT::v4i16, Legal);
+
+ setOperationAction(ISD::AND, MVT::v8i8, Promote);
+ AddPromotedToType (ISD::AND, MVT::v8i8, MVT::v1i64);
+ setOperationAction(ISD::AND, MVT::v4i16, Promote);
+ AddPromotedToType (ISD::AND, MVT::v4i16, MVT::v1i64);
+ setOperationAction(ISD::AND, MVT::v2i32, Promote);
+ AddPromotedToType (ISD::AND, MVT::v2i32, MVT::v1i64);
+ setOperationAction(ISD::AND, MVT::v1i64, Legal);
+
+ setOperationAction(ISD::OR, MVT::v8i8, Promote);
+ AddPromotedToType (ISD::OR, MVT::v8i8, MVT::v1i64);
+ setOperationAction(ISD::OR, MVT::v4i16, Promote);
+ AddPromotedToType (ISD::OR, MVT::v4i16, MVT::v1i64);
+ setOperationAction(ISD::OR, MVT::v2i32, Promote);
+ AddPromotedToType (ISD::OR, MVT::v2i32, MVT::v1i64);
+ setOperationAction(ISD::OR, MVT::v1i64, Legal);
+
+ setOperationAction(ISD::XOR, MVT::v8i8, Promote);
+ AddPromotedToType (ISD::XOR, MVT::v8i8, MVT::v1i64);
+ setOperationAction(ISD::XOR, MVT::v4i16, Promote);
+ AddPromotedToType (ISD::XOR, MVT::v4i16, MVT::v1i64);
+ setOperationAction(ISD::XOR, MVT::v2i32, Promote);
+ AddPromotedToType (ISD::XOR, MVT::v2i32, MVT::v1i64);
+ setOperationAction(ISD::XOR, MVT::v1i64, Legal);
+
+ setOperationAction(ISD::LOAD, MVT::v8i8, Promote);
+ AddPromotedToType (ISD::LOAD, MVT::v8i8, MVT::v1i64);
+ setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
+ AddPromotedToType (ISD::LOAD, MVT::v4i16, MVT::v1i64);
+ setOperationAction(ISD::LOAD, MVT::v2i32, Promote);
+ AddPromotedToType (ISD::LOAD, MVT::v2i32, MVT::v1i64);
+ setOperationAction(ISD::LOAD, MVT::v1i64, Legal);
+
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom);
+
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom);
+
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Custom);
+ }
+
+ if (Subtarget->hasSSE1()) {
+ addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);
+
+ setOperationAction(ISD::FADD, MVT::v4f32, Legal);
+ setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
+ setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
+ setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
+ setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
+ setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
+ setOperationAction(ISD::FABS, MVT::v4f32, Custom);
+ setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
+ setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
+ }
+
+ if (Subtarget->hasSSE2()) {
+ addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);
+ addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
+ addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
+ addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
+ addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);
+
+ setOperationAction(ISD::ADD, MVT::v16i8, Legal);
+ setOperationAction(ISD::ADD, MVT::v8i16, Legal);
+ setOperationAction(ISD::ADD, MVT::v4i32, Legal);
+ setOperationAction(ISD::ADD, MVT::v2i64, Legal);
+ setOperationAction(ISD::SUB, MVT::v16i8, Legal);
+ setOperationAction(ISD::SUB, MVT::v8i16, Legal);
+ setOperationAction(ISD::SUB, MVT::v4i32, Legal);
+ setOperationAction(ISD::SUB, MVT::v2i64, Legal);
+ setOperationAction(ISD::MUL, MVT::v8i16, Legal);
+ setOperationAction(ISD::FADD, MVT::v2f64, Legal);
+ setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
+ setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
+ setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
+ setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
+ setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
+ setOperationAction(ISD::FABS, MVT::v2f64, Custom);
+
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
+ // Implement v4f32 insert_vector_elt in terms of SSE2 v8i16 ones.
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
+
+ // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
+ for (unsigned VT = (unsigned)MVT::v16i8; VT != (unsigned)MVT::v2i64; VT++) {
+ setOperationAction(ISD::BUILD_VECTOR, (MVT::ValueType)VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::ValueType)VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, (MVT::ValueType)VT, Custom);
+ }
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
+
+ // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
+ for (unsigned VT = (unsigned)MVT::v16i8; VT != (unsigned)MVT::v2i64; VT++) {
+ setOperationAction(ISD::AND, (MVT::ValueType)VT, Promote);
+ AddPromotedToType (ISD::AND, (MVT::ValueType)VT, MVT::v2i64);
+ setOperationAction(ISD::OR, (MVT::ValueType)VT, Promote);
+ AddPromotedToType (ISD::OR, (MVT::ValueType)VT, MVT::v2i64);
+ setOperationAction(ISD::XOR, (MVT::ValueType)VT, Promote);
+ AddPromotedToType (ISD::XOR, (MVT::ValueType)VT, MVT::v2i64);
+ setOperationAction(ISD::LOAD, (MVT::ValueType)VT, Promote);
+ AddPromotedToType (ISD::LOAD, (MVT::ValueType)VT, MVT::v2i64);
+ setOperationAction(ISD::SELECT, (MVT::ValueType)VT, Promote);
+ AddPromotedToType (ISD::SELECT, (MVT::ValueType)VT, MVT::v2i64);
+ }
+
+ // Custom lower v2i64 and v2f64 selects.
+ setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
+ setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
+ setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
+ setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
+ }
+
+ // We want to custom lower some of our intrinsics.
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
+ // We have target-specific dag combine patterns for the following nodes:
+ setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+ setTargetDAGCombine(ISD::SELECT);
+
+ computeRegisterProperties();
+
+ // FIXME: These should be based on subtarget info. Plus, the values should
+ // be smaller when we are optimizing for size.
+ maxStoresPerMemset = 16; // For %llvm.memset -> sequence of stores
+ maxStoresPerMemcpy = 16; // For %llvm.memcpy -> sequence of stores
+ maxStoresPerMemmove = 16; // For %llvm.memmove -> sequence of stores
+ allowUnalignedMemoryAccesses = true; // x86 supports it!
+}
+
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "X86GenCallingConv.inc"
+
+/// LowerRET - Lower an ISD::RET node.
+SDOperand X86TargetLowering::LowerRET(SDOperand Op, SelectionDAG &DAG) {
+ assert((Op.getNumOperands() & 1) == 1 && "ISD::RET should have odd # args");
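+ // Note: operand 0 is the chain and the return values follow as
+ // (value, flag) pairs, which is why the operand count is odd and why
+ // getOperand(i*2+1) is used to fetch the values below.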
+
+ SmallVector<CCValAssign, 16> RVLocs;
+ unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv();
+ bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
+ CCState CCInfo(CC, isVarArg, getTargetMachine(), RVLocs);
+ CCInfo.AnalyzeReturn(Op.Val, RetCC_X86);
+
+
+ // If this is the first return lowered for this function, add the regs to the
+ // liveout set for the function.
+ if (DAG.getMachineFunction().liveout_empty()) {
+ for (unsigned i = 0; i != RVLocs.size(); ++i)
+ if (RVLocs[i].isRegLoc())
+ DAG.getMachineFunction().addLiveOut(RVLocs[i].getLocReg());
+ }
+
+ SDOperand Chain = Op.getOperand(0);
+ SDOperand Flag;
+
+ // Copy the result values into the output registers.
+ if (RVLocs.size() != 1 || !RVLocs[0].isRegLoc() ||
+ RVLocs[0].getLocReg() != X86::ST0) {
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign &VA = RVLocs[i];
+ assert(VA.isRegLoc() && "Can only return in registers!");
+ Chain = DAG.getCopyToReg(Chain, VA.getLocReg(), Op.getOperand(i*2+1),
+ Flag);
+ Flag = Chain.getValue(1);
+ }
+ } else {
+ // We need to handle a destination of ST0 specially, because it isn't really
+ // a register.
+ SDOperand Value = Op.getOperand(1);
+
+ // If this is an FP return with ScalarSSE, we need to move the value from
+ // an XMM register onto the fp-stack.
+ if (X86ScalarSSE) {
+ SDOperand MemLoc;
+
+ // If this is a load into a scalarsse value, don't store the loaded value
+ // back to the stack, only to reload it: just replace the scalar-sse load.
+ if (ISD::isNON_EXTLoad(Value.Val) &&
+ (Chain == Value.getValue(1) || Chain == Value.getOperand(0))) {
+ Chain = Value.getOperand(0);
+ MemLoc = Value.getOperand(1);
+ } else {
+ // Spill the value to memory and reload it into top of stack.
+ unsigned Size = MVT::getSizeInBits(RVLocs[0].getValVT())/8;
+ MachineFunction &MF = DAG.getMachineFunction();
+ int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size);
+ MemLoc = DAG.getFrameIndex(SSFI, getPointerTy());
+ Chain = DAG.getStore(Op.getOperand(0), Value, MemLoc, NULL, 0);
+ }
+ SDVTList Tys = DAG.getVTList(RVLocs[0].getValVT(), MVT::Other);
+ SDOperand Ops[] = {Chain, MemLoc, DAG.getValueType(RVLocs[0].getValVT())};
+ Value = DAG.getNode(X86ISD::FLD, Tys, Ops, 3);
+ Chain = Value.getValue(1);
+ }
+
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SDOperand Ops[] = { Chain, Value };
+ Chain = DAG.getNode(X86ISD::FP_SET_RESULT, Tys, Ops, 2);
+ Flag = Chain.getValue(1);
+ }
+
+ SDOperand BytesToPop = DAG.getConstant(getBytesToPopOnReturn(), MVT::i16);
+ if (Flag.Val)
+ return DAG.getNode(X86ISD::RET_FLAG, MVT::Other, Chain, BytesToPop, Flag);
+ else
+ return DAG.getNode(X86ISD::RET_FLAG, MVT::Other, Chain, BytesToPop);
+}
+
+
+/// LowerCallResult - Lower the result values of an ISD::CALL into the
+/// appropriate copies out of appropriate physical registers. This assumes that
+/// Chain/InFlag are the input chain/flag to use, and that TheCall is the call
+/// being lowered. This returns an SDNode with the same number of values as the
+/// ISD::CALL.
+SDNode *X86TargetLowering::
+LowerCallResult(SDOperand Chain, SDOperand InFlag, SDNode *TheCall,
+ unsigned CallingConv, SelectionDAG &DAG) {
+
+ // Assign locations to each value returned by this call.
+ SmallVector<CCValAssign, 16> RVLocs;
+ bool isVarArg = cast<ConstantSDNode>(TheCall->getOperand(2))->getValue() != 0;
+ CCState CCInfo(CallingConv, isVarArg, getTargetMachine(), RVLocs);
+ CCInfo.AnalyzeCallResult(TheCall, RetCC_X86);
+
+
+ SmallVector<SDOperand, 8> ResultVals;
+
+ // Copy all of the result registers out of their specified physreg.
+ if (RVLocs.size() != 1 || RVLocs[0].getLocReg() != X86::ST0) {
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ Chain = DAG.getCopyFromReg(Chain, RVLocs[i].getLocReg(),
+ RVLocs[i].getValVT(), InFlag).getValue(1);
+ InFlag = Chain.getValue(2);
+ ResultVals.push_back(Chain.getValue(0));
+ }
+ } else {
+ // Copies from the FP stack are special, as ST0 isn't a valid register
+ // before the fp stackifier runs.
+
+ // Copy ST0 into an RFP register with FP_GET_RESULT.
+ SDVTList Tys = DAG.getVTList(RVLocs[0].getValVT(), MVT::Other, MVT::Flag);
+ SDOperand GROps[] = { Chain, InFlag };
+ SDOperand RetVal = DAG.getNode(X86ISD::FP_GET_RESULT, Tys, GROps, 2);
+ Chain = RetVal.getValue(1);
+ InFlag = RetVal.getValue(2);
+
+ // If we are using ScalarSSE, store ST(0) to the stack and reload it into
+ // an XMM register.
+ if (X86ScalarSSE) {
+ // FIXME: Currently the FST is flagged to the FP_GET_RESULT. This
+ // shouldn't be necessary except that RFP cannot be live across
+ // multiple blocks. When stackifier is fixed, they can be uncoupled.
+ MachineFunction &MF = DAG.getMachineFunction();
+ int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8);
+ SDOperand StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+ SDOperand Ops[] = {
+ Chain, RetVal, StackSlot, DAG.getValueType(RVLocs[0].getValVT()), InFlag
+ };
+ Chain = DAG.getNode(X86ISD::FST, MVT::Other, Ops, 5);
+ RetVal = DAG.getLoad(RVLocs[0].getValVT(), Chain, StackSlot, NULL, 0);
+ Chain = RetVal.getValue(1);
+ }
+ ResultVals.push_back(RetVal);
+ }
+
+ // Merge everything together with a MERGE_VALUES node.
+ ResultVals.push_back(Chain);
+ return DAG.getNode(ISD::MERGE_VALUES, TheCall->getVTList(),
+ &ResultVals[0], ResultVals.size()).Val;
+}
+
+
+//===----------------------------------------------------------------------===//
+// C & StdCall Calling Convention implementation
+//===----------------------------------------------------------------------===//
+// The StdCall calling convention is used by many Windows API routines. It
+// differs from the C calling convention only slightly: the callee, not the
+// caller, cleans up the stack, and symbols are decorated with the size of the
+// argument list. It doesn't support any vector arguments.
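+// For illustration: a stdcall routine such as int __stdcall Foo(int, int, int)
+// pops its 12 bytes of arguments on return and is emitted under the decorated
+// symbol _Foo@12 on 32-bit Windows targets.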
+
+/// AddLiveIn - This helper function adds the specified physical register to the
+/// MachineFunction as a live in value. It also creates a corresponding virtual
+/// register for it.
+static unsigned AddLiveIn(MachineFunction &MF, unsigned PReg,
+ const TargetRegisterClass *RC) {
+ assert(RC->contains(PReg) && "Not the correct regclass!");
+ unsigned VReg = MF.getSSARegMap()->createVirtualRegister(RC);
+ MF.addLiveIn(PReg, VReg);
+ return VReg;
+}
+
+SDOperand X86TargetLowering::LowerCCCArguments(SDOperand Op, SelectionDAG &DAG,
+ bool isStdCall) {
+ unsigned NumArgs = Op.Val->getNumValues() - 1;
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ SDOperand Root = Op.getOperand(0);
+ bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+
+ // Assign locations to all of the incoming arguments.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(MF.getFunction()->getCallingConv(), isVarArg,
+ getTargetMachine(), ArgLocs);
+ CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_32_C);
+
+ SmallVector<SDOperand, 8> ArgValues;
+ unsigned LastVal = ~0U;
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
+ // places.
+ assert(VA.getValNo() != LastVal &&
+ "Don't support value assigned to multiple locs yet");
+ LastVal = VA.getValNo();
+
+ if (VA.isRegLoc()) {
+ MVT::ValueType RegVT = VA.getLocVT();
+ TargetRegisterClass *RC;
+ if (RegVT == MVT::i32)
+ RC = X86::GR32RegisterClass;
+ else {
+ assert(MVT::isVector(RegVT));
+ RC = X86::VR128RegisterClass;
+ }
+
+ unsigned Reg = AddLiveIn(DAG.getMachineFunction(), VA.getLocReg(), RC);
+ SDOperand ArgValue = DAG.getCopyFromReg(Root, Reg, RegVT);
+
+ // If this is an 8 or 16-bit value, it is really passed promoted to 32
+ // bits. Insert an assert[sz]ext to capture this, then truncate to the
+ // right size.
+ if (VA.getLocInfo() == CCValAssign::SExt)
+ ArgValue = DAG.getNode(ISD::AssertSext, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+ else if (VA.getLocInfo() == CCValAssign::ZExt)
+ ArgValue = DAG.getNode(ISD::AssertZext, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+
+ if (VA.getLocInfo() != CCValAssign::Full)
+ ArgValue = DAG.getNode(ISD::TRUNCATE, VA.getValVT(), ArgValue);
+
+ ArgValues.push_back(ArgValue);
+ } else {
+ assert(VA.isMemLoc());
+
+ // Create the nodes corresponding to a load from this parameter slot.
+ int FI = MFI->CreateFixedObject(MVT::getSizeInBits(VA.getValVT())/8,
+ VA.getLocMemOffset());
+ SDOperand FIN = DAG.getFrameIndex(FI, getPointerTy());
+ ArgValues.push_back(DAG.getLoad(VA.getValVT(), Root, FIN, NULL, 0));
+ }
+ }
+
+ unsigned StackSize = CCInfo.getNextStackOffset();
+
+ ArgValues.push_back(Root);
+
+ // If the function takes a variable number of arguments, make a frame index for
+ // the start of the first vararg value... for expansion of llvm.va_start.
+ if (isVarArg)
+ VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize);
+
+ if (isStdCall && !isVarArg) {
+ BytesToPopOnReturn = StackSize; // Callee pops everything.
+ BytesCallerReserves = 0;
+ } else {
+ BytesToPopOnReturn = 0; // Callee pops nothing.
+
+ // If this is an sret function, the return should pop the hidden pointer.
+ if (NumArgs &&
+ (cast<ConstantSDNode>(Op.getOperand(3))->getValue() &
+ ISD::ParamFlags::StructReturn))
+ BytesToPopOnReturn = 4;
+
+ BytesCallerReserves = StackSize;
+ }
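+ // For illustration: a non-vararg stdcall function taking two i32 arguments
+ // ends up with BytesToPopOnReturn == 8, while a C convention function that
+ // returns through a hidden sret pointer pops only those 4 bytes.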
+
+ RegSaveFrameIndex = 0xAAAAAAA; // X86-64 only.
+ ReturnAddrIndex = 0; // No return address slot generated yet.
+
+ MF.getInfo<X86MachineFunctionInfo>()
+ ->setBytesToPopOnReturn(BytesToPopOnReturn);
+
+ // Return the new list of results.
+ return DAG.getNode(ISD::MERGE_VALUES, Op.Val->getVTList(),
+ &ArgValues[0], ArgValues.size()).getValue(Op.ResNo);
+}
+
+SDOperand X86TargetLowering::LowerCCCCallTo(SDOperand Op, SelectionDAG &DAG,
+ unsigned CC) {
+ SDOperand Chain = Op.getOperand(0);
+ bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+ bool isTailCall = cast<ConstantSDNode>(Op.getOperand(3))->getValue() != 0;
+ SDOperand Callee = Op.getOperand(4);
+ unsigned NumOps = (Op.getNumOperands() - 5) / 2;
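+ // Note: operand 0 is the chain, operands 2-4 are the isVarArg flag, the
+ // isTailCall flag and the callee (read above); the call arguments follow as
+ // (value, flags) pairs, which is why NumOps is (getNumOperands() - 5) / 2
+ // and why getOperand(5+2*i) fetches the argument values below.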
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+ CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_32_C);
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getNextStackOffset();
+
+ Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes, getPointerTy()));
+
+ SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
+ SmallVector<SDOperand, 8> MemOpChains;
+
+ SDOperand StackPtr;
+
+ // Walk the register/memloc assignments, inserting copies/loads.
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ SDOperand Arg = Op.getOperand(5+2*VA.getValNo());
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default: assert(0 && "Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, VA.getLocVT(), Arg);
+ break;
+ }
+
+ if (VA.isRegLoc()) {
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ } else {
+ assert(VA.isMemLoc());
+ if (StackPtr.Val == 0)
+ StackPtr = DAG.getRegister(getStackPtrReg(), getPointerTy());
+ SDOperand PtrOff = DAG.getConstant(VA.getLocMemOffset(), getPointerTy());
+ PtrOff = DAG.getNode(ISD::ADD, getPointerTy(), StackPtr, PtrOff);
+ MemOpChains.push_back(DAG.getStore(Chain, Arg, PtrOff, NULL, 0));
+ }
+ }
+
+ // If the first argument is an sret pointer, remember it.
+ bool isSRet = NumOps &&
+ (cast<ConstantSDNode>(Op.getOperand(6))->getValue() &
+ ISD::ParamFlags::StructReturn);
+
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
+ &MemOpChains[0], MemOpChains.size());
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into registers.
+ SDOperand InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second,
+ InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // ELF / PIC requires the GOT pointer to be in the EBX register before making
+ // function calls via the PLT.
+ if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ Subtarget->isPICStyleGOT()) {
+ Chain = DAG.getCopyToReg(Chain, X86::EBX,
+ DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()),
+ InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // If the callee is a GlobalAddress node (quite common, every direct call is)
+ // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ // We should use an extra load for direct calls to dllimported functions in
+ // non-JIT mode.
+ if (!Subtarget->GVRequiresExtraLoad(G->getGlobal(),
+ getTargetMachine(), true))
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy());
+ } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
+ Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());
+
+ // Returns a chain & a flag for retval copy to use.
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SmallVector<SDOperand, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ // Add argument registers to the end of the list so that they are known live
+ // into the call.
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+ Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+ RegsToPass[i].second.getValueType()));
+
+ // Add an implicit use of the GOT pointer in EBX.
+ if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ Subtarget->isPICStyleGOT())
+ Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
+
+ if (InFlag.Val)
+ Ops.push_back(InFlag);
+
+ Chain = DAG.getNode(isTailCall ? X86ISD::TAILCALL : X86ISD::CALL,
+ NodeTys, &Ops[0], Ops.size());
+ InFlag = Chain.getValue(1);
+
+ // Create the CALLSEQ_END node.
+ unsigned NumBytesForCalleeToPush = 0;
+
+ if (CC == CallingConv::X86_StdCall) {
+ if (isVarArg)
+ NumBytesForCalleeToPush = isSRet ? 4 : 0;
+ else
+ NumBytesForCalleeToPush = NumBytes;
+ } else {
+ // If this is a call to a struct-return function, the callee
+ // pops the hidden struct pointer, so we have to push it back.
+ // This is common for Darwin/X86, Linux & Mingw32 targets.
+ NumBytesForCalleeToPush = isSRet ? 4 : 0;
+ }
+
+ NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+ Ops.clear();
+ Ops.push_back(Chain);
+ Ops.push_back(DAG.getConstant(NumBytes, getPointerTy()));
+ Ops.push_back(DAG.getConstant(NumBytesForCalleeToPush, getPointerTy()));
+ Ops.push_back(InFlag);
+ Chain = DAG.getNode(ISD::CALLSEQ_END, NodeTys, &Ops[0], Ops.size());
+ InFlag = Chain.getValue(1);
+
+ // Handle result values, copying them out of physregs into vregs that we
+ // return.
+ return SDOperand(LowerCallResult(Chain, InFlag, Op.Val, CC, DAG), Op.ResNo);
+}
+
+
+//===----------------------------------------------------------------------===//
+// FastCall Calling Convention implementation
+//===----------------------------------------------------------------------===//
+//
+// The X86 'fastcall' calling convention passes up to two integer arguments in
+// registers (an appropriate portion of ECX/EDX), passes arguments in C order,
+// requires the callee to pop its arguments off the stack (allowing proper tail
+// calls), and has the same return value conventions as the C calling
+// convention.
+//
+// This calling convention always arranges for the callee pop value to be 8n+4
+// bytes, which is needed for tail recursion elimination and stack alignment
+// reasons.
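+// For illustration: under this rule 4 bytes of stack arguments keep a 4-byte
+// callee pop, while 8 bytes are rounded up to 12, so the stack stays 8-byte
+// aligned once the 4-byte return address has been pushed.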
+SDOperand
+X86TargetLowering::LowerFastCCArguments(SDOperand Op, SelectionDAG &DAG) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ SDOperand Root = Op.getOperand(0);
+ bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+
+ // Assign locations to all of the incoming arguments.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(MF.getFunction()->getCallingConv(), isVarArg,
+ getTargetMachine(), ArgLocs);
+ CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_32_FastCall);
+
+ SmallVector<SDOperand, 8> ArgValues;
+ unsigned LastVal = ~0U;
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
+ // places.
+ assert(VA.getValNo() != LastVal &&
+ "Don't support value assigned to multiple locs yet");
+ LastVal = VA.getValNo();
+
+ if (VA.isRegLoc()) {
+ MVT::ValueType RegVT = VA.getLocVT();
+ TargetRegisterClass *RC;
+ if (RegVT == MVT::i32)
+ RC = X86::GR32RegisterClass;
+ else {
+ assert(MVT::isVector(RegVT));
+ RC = X86::VR128RegisterClass;
+ }
+
+ unsigned Reg = AddLiveIn(DAG.getMachineFunction(), VA.getLocReg(), RC);
+ SDOperand ArgValue = DAG.getCopyFromReg(Root, Reg, RegVT);
+
+ // If this is an 8 or 16-bit value, it is really passed promoted to 32
+ // bits. Insert an assert[sz]ext to capture this, then truncate to the
+ // right size.
+ if (VA.getLocInfo() == CCValAssign::SExt)
+ ArgValue = DAG.getNode(ISD::AssertSext, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+ else if (VA.getLocInfo() == CCValAssign::ZExt)
+ ArgValue = DAG.getNode(ISD::AssertZext, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+
+ if (VA.getLocInfo() != CCValAssign::Full)
+ ArgValue = DAG.getNode(ISD::TRUNCATE, VA.getValVT(), ArgValue);
+
+ ArgValues.push_back(ArgValue);
+ } else {
+ assert(VA.isMemLoc());
+
+ // Create the nodes corresponding to a load from this parameter slot.
+ int FI = MFI->CreateFixedObject(MVT::getSizeInBits(VA.getValVT())/8,
+ VA.getLocMemOffset());
+ SDOperand FIN = DAG.getFrameIndex(FI, getPointerTy());
+ ArgValues.push_back(DAG.getLoad(VA.getValVT(), Root, FIN, NULL, 0));
+ }
+ }
+
+ ArgValues.push_back(Root);
+
+ unsigned StackSize = CCInfo.getNextStackOffset();
+
+ if (!Subtarget->isTargetCygMing() && !Subtarget->isTargetWindows()) {
+ // Make sure the incoming argument area is 8n+4 bytes so that the start of
+ // the arguments remains aligned after the return address has been pushed.
+ if ((StackSize & 7) == 0)
+ StackSize += 4;
+ }
+
+ VarArgsFrameIndex = 0xAAAAAAA; // fastcc functions can't have varargs.
+ RegSaveFrameIndex = 0xAAAAAAA; // X86-64 only.
+ ReturnAddrIndex = 0; // No return address slot generated yet.
+ BytesToPopOnReturn = StackSize; // Callee pops all stack arguments.
+ BytesCallerReserves = 0;
+
+ MF.getInfo<X86MachineFunctionInfo>()
+ ->setBytesToPopOnReturn(BytesToPopOnReturn);
+
+ // Return the new list of results.
+ return DAG.getNode(ISD::MERGE_VALUES, Op.Val->getVTList(),
+ &ArgValues[0], ArgValues.size()).getValue(Op.ResNo);
+}
+
+SDOperand X86TargetLowering::LowerFastCCCallTo(SDOperand Op, SelectionDAG &DAG,
+ unsigned CC) {
+ SDOperand Chain = Op.getOperand(0);
+ bool isTailCall = cast<ConstantSDNode>(Op.getOperand(3))->getValue() != 0;
+ bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+ SDOperand Callee = Op.getOperand(4);
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+ CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_32_FastCall);
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getNextStackOffset();
+
+ if (!Subtarget->isTargetCygMing() && !Subtarget->isTargetWindows()) {
+ // Make sure the number of bytes pushed is 8n+4 so that the start of the
+ // arguments remains aligned after the return address has been pushed.
+ if ((NumBytes & 7) == 0)
+ NumBytes += 4;
+ }
+
+ Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes, getPointerTy()));
+
+ SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
+ SmallVector<SDOperand, 8> MemOpChains;
+
+ SDOperand StackPtr;
+
+ // Walk the register/memloc assignments, inserting copies/loads.
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ SDOperand Arg = Op.getOperand(5+2*VA.getValNo());
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default: assert(0 && "Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, VA.getLocVT(), Arg);
+ break;
+ }
+
+ if (VA.isRegLoc()) {
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ } else {
+ assert(VA.isMemLoc());
+ if (StackPtr.Val == 0)
+ StackPtr = DAG.getRegister(getStackPtrReg(), getPointerTy());
+ SDOperand PtrOff = DAG.getConstant(VA.getLocMemOffset(), getPointerTy());
+ PtrOff = DAG.getNode(ISD::ADD, getPointerTy(), StackPtr, PtrOff);
+ MemOpChains.push_back(DAG.getStore(Chain, Arg, PtrOff, NULL, 0));
+ }
+ }
+
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
+ &MemOpChains[0], MemOpChains.size());
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into registers.
+ SDOperand InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second,
+ InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // If the callee is a GlobalAddress node (quite common, every direct call is)
+ // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ // We should use an extra load for direct calls to dllimported functions in
+ // non-JIT mode.
+ if (!Subtarget->GVRequiresExtraLoad(G->getGlobal(),
+ getTargetMachine(), true))
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy());
+ } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
+ Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());
+
+ // ELF / PIC requires the GOT pointer to be in the EBX register before making
+ // function calls via the PLT.
+ if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ Subtarget->isPICStyleGOT()) {
+ Chain = DAG.getCopyToReg(Chain, X86::EBX,
+ DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()),
+ InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // Returns a chain & a flag for retval copy to use.
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SmallVector<SDOperand, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ // Add argument registers to the end of the list so that they are known live
+ // into the call.
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+ Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+ RegsToPass[i].second.getValueType()));
+
+ // Add an implicit use of the GOT pointer in EBX.
+ if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ Subtarget->isPICStyleGOT())
+ Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
+
+ if (InFlag.Val)
+ Ops.push_back(InFlag);
+
+ // FIXME: Do not generate X86ISD::TAILCALL for now.
+ Chain = DAG.getNode(isTailCall ? X86ISD::TAILCALL : X86ISD::CALL,
+ NodeTys, &Ops[0], Ops.size());
+ InFlag = Chain.getValue(1);
+
+ // Returns a flag for retval copy to use.
+ NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+ Ops.clear();
+ Ops.push_back(Chain);
+ Ops.push_back(DAG.getConstant(NumBytes, getPointerTy()));
+ Ops.push_back(DAG.getConstant(NumBytes, getPointerTy()));
+ Ops.push_back(InFlag);
+ Chain = DAG.getNode(ISD::CALLSEQ_END, NodeTys, &Ops[0], Ops.size());
+ InFlag = Chain.getValue(1);
+
+ // Handle result values, copying them out of physregs into vregs that we
+ // return.
+ return SDOperand(LowerCallResult(Chain, InFlag, Op.Val, CC, DAG), Op.ResNo);
+}
+
+
+//===----------------------------------------------------------------------===//
+// X86-64 C Calling Convention implementation
+//===----------------------------------------------------------------------===//
+
+SDOperand
+X86TargetLowering::LowerX86_64CCCArguments(SDOperand Op, SelectionDAG &DAG) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ SDOperand Root = Op.getOperand(0);
+ bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+
+ static const unsigned GPR64ArgRegs[] = {
+ X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
+ };
+ static const unsigned XMMArgRegs[] = {
+ X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+ X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+ };
+
+
+ // Assign locations to all of the incoming arguments.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(MF.getFunction()->getCallingConv(), isVarArg,
+ getTargetMachine(), ArgLocs);
+ CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_64_C);
+
+ SmallVector<SDOperand, 8> ArgValues;
+ unsigned LastVal = ~0U;
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
+ // places.
+ assert(VA.getValNo() != LastVal &&
+ "Don't support value assigned to multiple locs yet");
+ LastVal = VA.getValNo();
+
+ if (VA.isRegLoc()) {
+ MVT::ValueType RegVT = VA.getLocVT();
+ TargetRegisterClass *RC;
+ if (RegVT == MVT::i32)
+ RC = X86::GR32RegisterClass;
+ else if (RegVT == MVT::i64)
+ RC = X86::GR64RegisterClass;
+ else if (RegVT == MVT::f32)
+ RC = X86::FR32RegisterClass;
+ else if (RegVT == MVT::f64)
+ RC = X86::FR64RegisterClass;
+ else {
+ assert(MVT::isVector(RegVT));
+ if (MVT::getSizeInBits(RegVT) == 64) {
+ RC = X86::GR64RegisterClass; // MMX values are passed in GPRs.
+ RegVT = MVT::i64;
+ } else
+ RC = X86::VR128RegisterClass;
+ }
+
+ unsigned Reg = AddLiveIn(DAG.getMachineFunction(), VA.getLocReg(), RC);
+ SDOperand ArgValue = DAG.getCopyFromReg(Root, Reg, RegVT);
+
+ // If this is an 8 or 16-bit value, it is really passed promoted to 32
+ // bits. Insert an assert[sz]ext to capture this, then truncate to the
+ // right size.
+ if (VA.getLocInfo() == CCValAssign::SExt)
+ ArgValue = DAG.getNode(ISD::AssertSext, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+ else if (VA.getLocInfo() == CCValAssign::ZExt)
+ ArgValue = DAG.getNode(ISD::AssertZext, RegVT, ArgValue,
+ DAG.getValueType(VA.getValVT()));
+
+ if (VA.getLocInfo() != CCValAssign::Full)
+ ArgValue = DAG.getNode(ISD::TRUNCATE, VA.getValVT(), ArgValue);
+
+ // Handle MMX values passed in GPRs.
+ if (RegVT != VA.getLocVT() && RC == X86::GR64RegisterClass &&
+ MVT::getSizeInBits(RegVT) == 64)
+ ArgValue = DAG.getNode(ISD::BIT_CONVERT, VA.getLocVT(), ArgValue);
+
+ ArgValues.push_back(ArgValue);
+ } else {
+ assert(VA.isMemLoc());
+
+ // Create the nodes corresponding to a load from this parameter slot.
+ int FI = MFI->CreateFixedObject(MVT::getSizeInBits(VA.getValVT())/8,
+ VA.getLocMemOffset());
+ SDOperand FIN = DAG.getFrameIndex(FI, getPointerTy());
+ ArgValues.push_back(DAG.getLoad(VA.getValVT(), Root, FIN, NULL, 0));
+ }
+ }
+
+ unsigned StackSize = CCInfo.getNextStackOffset();
+
+ // If the function takes a variable number of arguments, make a frame index for
+ // the start of the first vararg value... for expansion of llvm.va_start.
+ if (isVarArg) {
+ unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 6);
+ unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
+
+ // For X86-64, if there are vararg parameters that are passed via
+ // registers, then we must store them to their spots on the stack so they
+ // may be read back later by va_arg.
+ VarArgsGPOffset = NumIntRegs * 8;
+ VarArgsFPOffset = 6 * 8 + NumXMMRegs * 16;
+ VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize);
+ RegSaveFrameIndex = MFI->CreateStackObject(6 * 8 + 8 * 16, 16);
+
+ // Store the integer parameter registers.
+ SmallVector<SDOperand, 8> MemOps;
+ SDOperand RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
+ SDOperand FIN = DAG.getNode(ISD::ADD, getPointerTy(), RSFIN,
+ DAG.getConstant(VarArgsGPOffset, getPointerTy()));
+ for (; NumIntRegs != 6; ++NumIntRegs) {
+ unsigned VReg = AddLiveIn(MF, GPR64ArgRegs[NumIntRegs],
+ X86::GR64RegisterClass);
+ SDOperand Val = DAG.getCopyFromReg(Root, VReg, MVT::i64);
+ SDOperand Store = DAG.getStore(Val.getValue(1), Val, FIN, NULL, 0);
+ MemOps.push_back(Store);
+ FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN,
+ DAG.getConstant(8, getPointerTy()));
+ }
+
+ // Now store the XMM (fp + vector) parameter registers.
+ FIN = DAG.getNode(ISD::ADD, getPointerTy(), RSFIN,
+ DAG.getConstant(VarArgsFPOffset, getPointerTy()));
+ for (; NumXMMRegs != 8; ++NumXMMRegs) {
+ unsigned VReg = AddLiveIn(MF, XMMArgRegs[NumXMMRegs],
+ X86::VR128RegisterClass);
+ SDOperand Val = DAG.getCopyFromReg(Root, VReg, MVT::v4f32);
+ SDOperand Store = DAG.getStore(Val.getValue(1), Val, FIN, NULL, 0);
+ MemOps.push_back(Store);
+ FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN,
+ DAG.getConstant(16, getPointerTy()));
+ }
+ if (!MemOps.empty())
+ Root = DAG.getNode(ISD::TokenFactor, MVT::Other,
+ &MemOps[0], MemOps.size());
+ }
+
+ ArgValues.push_back(Root);
+
+ ReturnAddrIndex = 0; // No return address slot generated yet.
+ BytesToPopOnReturn = 0; // Callee pops nothing.
+ BytesCallerReserves = StackSize;
+
+ // Return the new list of results.
+ return DAG.getNode(ISD::MERGE_VALUES, Op.Val->getVTList(),
+ &ArgValues[0], ArgValues.size()).getValue(Op.ResNo);
+}
+
+SDOperand
+X86TargetLowering::LowerX86_64CCCCallTo(SDOperand Op, SelectionDAG &DAG,
+ unsigned CC) {
+ SDOperand Chain = Op.getOperand(0);
+ bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+ bool isTailCall = cast<ConstantSDNode>(Op.getOperand(3))->getValue() != 0;
+ SDOperand Callee = Op.getOperand(4);
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+ CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_64_C);
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = CCInfo.getNextStackOffset();
+ Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes, getPointerTy()));
+
+ SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
+ SmallVector<SDOperand, 8> MemOpChains;
+
+ SDOperand StackPtr;
+
+ // Walk the register/memloc assignments, inserting copies/loads.
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ SDOperand Arg = Op.getOperand(5+2*VA.getValNo());
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default: assert(0 && "Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, VA.getLocVT(), Arg);
+ break;
+ }
+
+ if (VA.isRegLoc()) {
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ } else {
+ assert(VA.isMemLoc());
+ if (StackPtr.Val == 0)
+ StackPtr = DAG.getRegister(getStackPtrReg(), getPointerTy());
+ SDOperand PtrOff = DAG.getConstant(VA.getLocMemOffset(), getPointerTy());
+ PtrOff = DAG.getNode(ISD::ADD, getPointerTy(), StackPtr, PtrOff);
+ MemOpChains.push_back(DAG.getStore(Chain, Arg, PtrOff, NULL, 0));
+ }
+ }
+
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
+ &MemOpChains[0], MemOpChains.size());
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into registers.
+ SDOperand InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second,
+ InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ if (isVarArg) {
+ // From AMD64 ABI document:
+ // For calls that may call functions that use varargs or stdargs
+ // (prototype-less calls or calls to functions containing ellipsis (...) in
+ // the declaration) %al is used as a hidden argument to specify the number
+ // of SSE registers used. The contents of %al do not need to match exactly
+ // the number of registers, but must be an upper bound on the number of SSE
+ // registers used and must be in the range 0 - 8 inclusive.
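+ // For illustration: a varargs call that passes two doubles in XMM0 and XMM1
+ // has %al set to 2 below; the exact count is itself a valid upper bound.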
+
+ // Count the number of XMM registers allocated.
+ static const unsigned XMMArgRegs[] = {
+ X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+ X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+ };
+ unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
+
+ Chain = DAG.getCopyToReg(Chain, X86::AL,
+ DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // If the callee is a GlobalAddress node (quite common, every direct call is)
+ // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ // We should use an extra load for direct calls to dllimported functions in
+ // non-JIT mode.
+ if (getTargetMachine().getCodeModel() != CodeModel::Large
+ && !Subtarget->GVRequiresExtraLoad(G->getGlobal(),
+ getTargetMachine(), true))
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy());
+ } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
+ if (getTargetMachine().getCodeModel() != CodeModel::Large)
+ Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());
+
+ // Returns a chain & a flag for retval copy to use.
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SmallVector<SDOperand, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+
+ // Add argument registers to the end of the list so that they are known live
+ // into the call.
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+ Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+ RegsToPass[i].second.getValueType()));
+
+ if (InFlag.Val)
+ Ops.push_back(InFlag);
+
+ // FIXME: Do not generate X86ISD::TAILCALL for now.
+ Chain = DAG.getNode(isTailCall ? X86ISD::TAILCALL : X86ISD::CALL,
+ NodeTys, &Ops[0], Ops.size());
+ InFlag = Chain.getValue(1);
+
+ // Returns a flag for retval copy to use.
+ NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+ Ops.clear();
+ Ops.push_back(Chain);
+ Ops.push_back(DAG.getConstant(NumBytes, getPointerTy()));
+ Ops.push_back(DAG.getConstant(0, getPointerTy()));
+ Ops.push_back(InFlag);
+ Chain = DAG.getNode(ISD::CALLSEQ_END, NodeTys, &Ops[0], Ops.size());
+ InFlag = Chain.getValue(1);
+
+ // Handle result values, copying them out of physregs into vregs that we
+ // return.
+ return SDOperand(LowerCallResult(Chain, InFlag, Op.Val, CC, DAG), Op.ResNo);
+}
+
+
+//===----------------------------------------------------------------------===//
+// Other Lowering Hooks
+//===----------------------------------------------------------------------===//
+
+
+SDOperand X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) {
+ if (ReturnAddrIndex == 0) {
+ // Set up a frame object for the return address.
+ MachineFunction &MF = DAG.getMachineFunction();
+ if (Subtarget->is64Bit())
+ ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(8, -8);
+ else
+ ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(4, -4);
+ }
+
+ return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
+}
+
+
+
+/// translateX86CC - do a one-to-one translation of an ISD::CondCode to the X86
+/// specific condition code. It returns false if it cannot do a direct
+/// translation. X86CC is the translated CondCode. LHS/RHS are modified as
+/// needed.
+static bool translateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
+ unsigned &X86CC, SDOperand &LHS, SDOperand &RHS,
+ SelectionDAG &DAG) {
+ X86CC = X86::COND_INVALID;
+ if (!isFP) {
+ if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
+ if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
+ // X > -1 -> X == 0, jump !sign.
+ RHS = DAG.getConstant(0, RHS.getValueType());
+ X86CC = X86::COND_NS;
+ return true;
+ } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
+ // X < 0 -> X == 0, jump on sign.
+ X86CC = X86::COND_S;
+ return true;
+ }
+ }
+
+ switch (SetCCOpcode) {
+ default: break;
+ case ISD::SETEQ: X86CC = X86::COND_E; break;
+ case ISD::SETGT: X86CC = X86::COND_G; break;
+ case ISD::SETGE: X86CC = X86::COND_GE; break;
+ case ISD::SETLT: X86CC = X86::COND_L; break;
+ case ISD::SETLE: X86CC = X86::COND_LE; break;
+ case ISD::SETNE: X86CC = X86::COND_NE; break;
+ case ISD::SETULT: X86CC = X86::COND_B; break;
+ case ISD::SETUGT: X86CC = X86::COND_A; break;
+ case ISD::SETULE: X86CC = X86::COND_BE; break;
+ case ISD::SETUGE: X86CC = X86::COND_AE; break;
+ }
+ } else {
+ // On a floating point condition, the flags are set as follows:
+ // ZF PF CF op
+ // 0 | 0 | 0 | X > Y
+ // 0 | 0 | 1 | X < Y
+ // 1 | 0 | 0 | X == Y
+ // 1 | 1 | 1 | unordered
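+ // For illustration: SETOLT takes the Flip path below, so the operands are
+ // swapped and the comparison is treated as SETOGT of the swapped operands,
+ // which maps to COND_A under the flag table above.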
+ bool Flip = false;
+ switch (SetCCOpcode) {
+ default: break;
+ case ISD::SETUEQ:
+ case ISD::SETEQ: X86CC = X86::COND_E; break;
+ case ISD::SETOLT: Flip = true; // Fallthrough
+ case ISD::SETOGT:
+ case ISD::SETGT: X86CC = X86::COND_A; break;
+ case ISD::SETOLE: Flip = true; // Fallthrough
+ case ISD::SETOGE:
+ case ISD::SETGE: X86CC = X86::COND_AE; break;
+ case ISD::SETUGT: Flip = true; // Fallthrough
+ case ISD::SETULT:
+ case ISD::SETLT: X86CC = X86::COND_B; break;
+ case ISD::SETUGE: Flip = true; // Fallthrough
+ case ISD::SETULE:
+ case ISD::SETLE: X86CC = X86::COND_BE; break;
+ case ISD::SETONE:
+ case ISD::SETNE: X86CC = X86::COND_NE; break;
+ case ISD::SETUO: X86CC = X86::COND_P; break;
+ case ISD::SETO: X86CC = X86::COND_NP; break;
+ }
+ if (Flip)
+ std::swap(LHS, RHS);
+ }
+
+ return X86CC != X86::COND_INVALID;
+}
+
+/// hasFPCMov - is there a floating point cmov for the specific X86 condition
+/// code. The current x86 ISA includes the following FP cmov instructions:
+/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
+static bool hasFPCMov(unsigned X86CC) {
+ switch (X86CC) {
+ default:
+ return false;
+ case X86::COND_B:
+ case X86::COND_BE:
+ case X86::COND_E:
+ case X86::COND_P:
+ case X86::COND_A:
+ case X86::COND_AE:
+ case X86::COND_NE:
+ case X86::COND_NP:
+ return true;
+ }
+}
+
+/// isUndefOrInRange - Op is either an undef node or a ConstantSDNode. Return
+/// true if Op is undef or if its value falls within the specified range [Low, Hi).
+static bool isUndefOrInRange(SDOperand Op, unsigned Low, unsigned Hi) {
+ if (Op.getOpcode() == ISD::UNDEF)
+ return true;
+
+ unsigned Val = cast<ConstantSDNode>(Op)->getValue();
+ return (Val >= Low && Val < Hi);
+}
+
+/// isUndefOrEqual - Op is either an undef node or a ConstantSDNode. Return
+/// true if Op is undef or if its value equals the specified value.
+static bool isUndefOrEqual(SDOperand Op, unsigned Val) {
+ if (Op.getOpcode() == ISD::UNDEF)
+ return true;
+ return cast<ConstantSDNode>(Op)->getValue() == Val;
+}
+
+/// isPSHUFDMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to PSHUFD.
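+// For illustration: the 4-element mask <3, 1, 2, 0> is a valid PSHUFD mask,
+// while <4, 1, 2, 0> is not, because index 4 would select an element from the
+// second source vector.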
+bool X86::isPSHUFDMask(SDNode *N) {
+ assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+ if (N->getNumOperands() != 4)
+ return false;
+
+ // Check if the value doesn't reference the second vector.
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+ SDOperand Arg = N->getOperand(i);
+ if (Arg.getOpcode() == ISD::UNDEF) continue;
+ assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
+ if (cast<ConstantSDNode>(Arg)->getValue() >= 4)
+ return false;
+ }
+
+ return true;
+}
+
+/// isPSHUFHWMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to PSHUFHW.
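+// For illustration: <0, 1, 2, 3, 7, 6, 5, 4> is a valid PSHUFHW mask (low
+// quadword copied in order, high elements staying within 4-7), while
+// <1, 0, 2, 3, 4, 5, 6, 7> is not, since the low quadword is reordered.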
+bool X86::isPSHUFHWMask(SDNode *N) {
+ assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+ if (N->getNumOperands() != 8)
+ return false;
+
+ // Lower quadword copied in order.
+ for (unsigned i = 0; i != 4; ++i) {
+ SDOperand Arg = N->getOperand(i);
+ if (Arg.getOpcode() == ISD::UNDEF) continue;
+ assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
+ if (cast<ConstantSDNode>(Arg)->getValue() != i)
+ return false;
+ }
+
+ // Upper quadword shuffled.
+ for (unsigned i = 4; i != 8; ++i) {
+ SDOperand Arg = N->getOperand(i);
+ if (Arg.getOpcode() == ISD::UNDEF) continue;
+ assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
+ unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+ if (Val < 4 || Val > 7)
+ return false;
+ }
+
+ return true;
+}
+
+/// isPSHUFLWMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to PSHUFLW.
+bool X86::isPSHUFLWMask(SDNode *N) {
+ assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+ if (N->getNumOperands() != 8)
+ return false;
+
+ // Upper quadword copied in order.
+ for (unsigned i = 4; i != 8; ++i)
+ if (!isUndefOrEqual(N->getOperand(i), i))
+ return false;
+
+ // Lower quadword shuffled.
+ for (unsigned i = 0; i != 4; ++i)
+ if (!isUndefOrInRange(N->getOperand(i), 0, 4))
+ return false;
+
+ return true;
+}
+
+/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to SHUFP*.
+static bool isSHUFPMask(const SDOperand *Elems, unsigned NumElems) {
+ if (NumElems != 2 && NumElems != 4) return false;
+
+ unsigned Half = NumElems / 2;
+ for (unsigned i = 0; i < Half; ++i)
+ if (!isUndefOrInRange(Elems[i], 0, NumElems))
+ return false;
+ for (unsigned i = Half; i < NumElems; ++i)
+ if (!isUndefOrInRange(Elems[i], NumElems, NumElems*2))
+ return false;
+
+ return true;
+}
+
+bool X86::isSHUFPMask(SDNode *N) {
+ assert(N->getOpcode() == ISD::BUILD_VECTOR);
+ return ::isSHUFPMask(N->op_begin(), N->getNumOperands());
+}
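+
+// Illustrative example (not part of the original patch): for 4 elements a mask
+// such as <1, 3, 5, 7> is a SHUFP mask; the lower two entries select from V1
+// (values 0..3) and the upper two from V2 (values 4..7).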
+
+/// isCommutedSHUFP - Returns true if the shuffle mask is exactly
+/// the reverse of what x86 shuffles want. x86 shuffles require the lower
+/// half elements to come from vector 1 (which would equal the dest.) and
+/// the upper half to come from vector 2.
+static bool isCommutedSHUFP(const SDOperand *Ops, unsigned NumOps) {
+ if (NumOps != 2 && NumOps != 4) return false;
+
+ unsigned Half = NumOps / 2;
+ for (unsigned i = 0; i < Half; ++i)
+ if (!isUndefOrInRange(Ops[i], NumOps, NumOps*2))
+ return false;
+ for (unsigned i = Half; i < NumOps; ++i)
+ if (!isUndefOrInRange(Ops[i], 0, NumOps))
+ return false;
+ return true;
+}
+
+static bool isCommutedSHUFP(SDNode *N) {
+ assert(N->getOpcode() == ISD::BUILD_VECTOR);
+ return isCommutedSHUFP(N->op_begin(), N->getNumOperands());
+}
+
+/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
+bool X86::isMOVHLPSMask(SDNode *N) {
+ assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+ if (N->getNumOperands() != 4)
+ return false;
+
+  // Expect element 0 == 6, element 1 == 7, element 2 == 2, element 3 == 3.
+ return isUndefOrEqual(N->getOperand(0), 6) &&
+ isUndefOrEqual(N->getOperand(1), 7) &&
+ isUndefOrEqual(N->getOperand(2), 2) &&
+ isUndefOrEqual(N->getOperand(3), 3);
+}
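+
+// Illustrative note (not part of the original patch): "movhlps xmm1, xmm2"
+// copies the upper two floats of xmm2 into the lower half of xmm1 and leaves
+// the upper half of xmm1 unchanged, which is shuffle(V1, V2, <6, 7, 2, 3>)
+// with V1 playing the role of the destination register.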
+
+/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
+/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
+/// <2, 3, 2, 3>
+bool X86::isMOVHLPS_v_undef_Mask(SDNode *N) {
+ assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+ if (N->getNumOperands() != 4)
+ return false;
+
+  // Expect element 0 == 2, element 1 == 3, element 2 == 2, element 3 == 3.
+ return isUndefOrEqual(N->getOperand(0), 2) &&
+ isUndefOrEqual(N->getOperand(1), 3) &&
+ isUndefOrEqual(N->getOperand(2), 2) &&
+ isUndefOrEqual(N->getOperand(3), 3);
+}
+
+/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
+bool X86::isMOVLPMask(SDNode *N) {
+ assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+ unsigned NumElems = N->getNumOperands();
+ if (NumElems != 2 && NumElems != 4)
+ return false;
+
+ for (unsigned i = 0; i < NumElems/2; ++i)
+ if (!isUndefOrEqual(N->getOperand(i), i + NumElems))
+ return false;
+
+ for (unsigned i = NumElems/2; i < NumElems; ++i)
+ if (!isUndefOrEqual(N->getOperand(i), i))
+ return false;
+
+ return true;
+}
+
+/// isMOVHPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVHP{S|D}
+/// and MOVLHPS.
+bool X86::isMOVHPMask(SDNode *N) {
+ assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+ unsigned NumElems = N->getNumOperands();
+ if (NumElems != 2 && NumElems != 4)
+ return false;
+
+ for (unsigned i = 0; i < NumElems/2; ++i)
+ if (!isUndefOrEqual(N->getOperand(i), i))
+ return false;
+
+ for (unsigned i = 0; i < NumElems/2; ++i) {
+ SDOperand Arg = N->getOperand(i + NumElems/2);
+ if (!isUndefOrEqual(Arg, i + NumElems))
+ return false;
+ }
+
+ return true;
+}
+
+/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to UNPCKL.
+bool static isUNPCKLMask(const SDOperand *Elts, unsigned NumElts,
+ bool V2IsSplat = false) {
+ if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
+ return false;
+
+ for (unsigned i = 0, j = 0; i != NumElts; i += 2, ++j) {
+ SDOperand BitI = Elts[i];
+ SDOperand BitI1 = Elts[i+1];
+ if (!isUndefOrEqual(BitI, j))
+ return false;
+ if (V2IsSplat) {
+ if (isUndefOrEqual(BitI1, NumElts))
+ return false;
+ } else {
+ if (!isUndefOrEqual(BitI1, j + NumElts))
+ return false;
+ }
+ }
+
+ return true;
+}
+
+bool X86::isUNPCKLMask(SDNode *N, bool V2IsSplat) {
+ assert(N->getOpcode() == ISD::BUILD_VECTOR);
+ return ::isUNPCKLMask(N->op_begin(), N->getNumOperands(), V2IsSplat);
+}
+
+/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to UNPCKH.
+bool static isUNPCKHMask(const SDOperand *Elts, unsigned NumElts,
+ bool V2IsSplat = false) {
+ if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
+ return false;
+
+ for (unsigned i = 0, j = 0; i != NumElts; i += 2, ++j) {
+ SDOperand BitI = Elts[i];
+ SDOperand BitI1 = Elts[i+1];
+ if (!isUndefOrEqual(BitI, j + NumElts/2))
+ return false;
+ if (V2IsSplat) {
+ if (isUndefOrEqual(BitI1, NumElts))
+ return false;
+ } else {
+ if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts))
+ return false;
+ }
+ }
+
+ return true;
+}
+
+bool X86::isUNPCKHMask(SDNode *N, bool V2IsSplat) {
+ assert(N->getOpcode() == ISD::BUILD_VECTOR);
+ return ::isUNPCKHMask(N->op_begin(), N->getNumOperands(), V2IsSplat);
+}
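+
+// Illustrative examples (not part of the original patch): with 4 elements and
+// V2IsSplat == false, <0, 4, 1, 5> is an UNPCKL mask and <2, 6, 3, 7> is an
+// UNPCKH mask (the low and high halves of V1 and V2 interleaved, respectively).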
+
+/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
+/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
+/// <0, 0, 1, 1>
+bool X86::isUNPCKL_v_undef_Mask(SDNode *N) {
+ assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+ unsigned NumElems = N->getNumOperands();
+ if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
+ return false;
+
+ for (unsigned i = 0, j = 0; i != NumElems; i += 2, ++j) {
+ SDOperand BitI = N->getOperand(i);
+ SDOperand BitI1 = N->getOperand(i+1);
+
+ if (!isUndefOrEqual(BitI, j))
+ return false;
+ if (!isUndefOrEqual(BitI1, j))
+ return false;
+ }
+
+ return true;
+}
+
+/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
+/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
+/// <2, 2, 3, 3>
+bool X86::isUNPCKH_v_undef_Mask(SDNode *N) {
+ assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+ unsigned NumElems = N->getNumOperands();
+ if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
+ return false;
+
+ for (unsigned i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
+ SDOperand BitI = N->getOperand(i);
+ SDOperand BitI1 = N->getOperand(i + 1);
+
+ if (!isUndefOrEqual(BitI, j))
+ return false;
+ if (!isUndefOrEqual(BitI1, j))
+ return false;
+ }
+
+ return true;
+}
+
+/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVSS,
+/// MOVSD, and MOVD, i.e. setting the lowest element.
+static bool isMOVLMask(const SDOperand *Elts, unsigned NumElts) {
+ if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
+ return false;
+
+ if (!isUndefOrEqual(Elts[0], NumElts))
+ return false;
+
+ for (unsigned i = 1; i < NumElts; ++i) {
+ if (!isUndefOrEqual(Elts[i], i))
+ return false;
+ }
+
+ return true;
+}
+
+bool X86::isMOVLMask(SDNode *N) {
+ assert(N->getOpcode() == ISD::BUILD_VECTOR);
+ return ::isMOVLMask(N->op_begin(), N->getNumOperands());
+}
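+
+// Illustrative example (not part of the original patch): for 4 elements the
+// MOVL mask is <4, 1, 2, 3>, i.e. element 0 from V2 and the rest from V1,
+// which is what movss/movsd/movd leave in the destination register.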
+
+/// isCommutedMOVL - Returns true if the shuffle mask is exactly the reverse
+/// of what x86 movss wants. X86 movs requires the lowest element to be the
+/// lowest element of vector 2 and the other elements to come from vector 1
+/// in order.
+static bool isCommutedMOVL(const SDOperand *Ops, unsigned NumOps,
+ bool V2IsSplat = false,
+ bool V2IsUndef = false) {
+ if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
+ return false;
+
+ if (!isUndefOrEqual(Ops[0], 0))
+ return false;
+
+ for (unsigned i = 1; i < NumOps; ++i) {
+ SDOperand Arg = Ops[i];
+ if (!(isUndefOrEqual(Arg, i+NumOps) ||
+ (V2IsUndef && isUndefOrInRange(Arg, NumOps, NumOps*2)) ||
+ (V2IsSplat && isUndefOrEqual(Arg, NumOps))))
+ return false;
+ }
+
+ return true;
+}
+
+static bool isCommutedMOVL(SDNode *N, bool V2IsSplat = false,
+ bool V2IsUndef = false) {
+ assert(N->getOpcode() == ISD::BUILD_VECTOR);
+ return isCommutedMOVL(N->op_begin(), N->getNumOperands(),
+ V2IsSplat, V2IsUndef);
+}
+
+/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
+bool X86::isMOVSHDUPMask(SDNode *N) {
+ assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+ if (N->getNumOperands() != 4)
+ return false;
+
+ // Expect 1, 1, 3, 3
+ for (unsigned i = 0; i < 2; ++i) {
+ SDOperand Arg = N->getOperand(i);
+ if (Arg.getOpcode() == ISD::UNDEF) continue;
+ assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
+ unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+ if (Val != 1) return false;
+ }
+
+ bool HasHi = false;
+ for (unsigned i = 2; i < 4; ++i) {
+ SDOperand Arg = N->getOperand(i);
+ if (Arg.getOpcode() == ISD::UNDEF) continue;
+ assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
+ unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+ if (Val != 3) return false;
+ HasHi = true;
+ }
+
+ // Don't use movshdup if it can be done with a shufps.
+ return HasHi;
+}
+
+/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
+bool X86::isMOVSLDUPMask(SDNode *N) {
+ assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+ if (N->getNumOperands() != 4)
+ return false;
+
+ // Expect 0, 0, 2, 2
+ for (unsigned i = 0; i < 2; ++i) {
+ SDOperand Arg = N->getOperand(i);
+ if (Arg.getOpcode() == ISD::UNDEF) continue;
+ assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
+ unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+ if (Val != 0) return false;
+ }
+
+ bool HasHi = false;
+ for (unsigned i = 2; i < 4; ++i) {
+ SDOperand Arg = N->getOperand(i);
+ if (Arg.getOpcode() == ISD::UNDEF) continue;
+ assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
+ unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+ if (Val != 2) return false;
+ HasHi = true;
+ }
+
+  // Don't use movsldup if it can be done with a shufps.
+ return HasHi;
+}
+
+/// isIdentityMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies an identity operation on the LHS or RHS.
+static bool isIdentityMask(SDNode *N, bool RHS = false) {
+ unsigned NumElems = N->getNumOperands();
+ for (unsigned i = 0; i < NumElems; ++i)
+ if (!isUndefOrEqual(N->getOperand(i), i + (RHS ? NumElems : 0)))
+ return false;
+ return true;
+}
+
+/// isSplatMask - Return true if the specified VECTOR_SHUFFLE operand specifies
+/// a splat of a single element.
+static bool isSplatMask(SDNode *N) {
+ assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+ // This is a splat operation if each element of the permute is the same, and
+ // if the value doesn't reference the second vector.
+ unsigned NumElems = N->getNumOperands();
+ SDOperand ElementBase;
+ unsigned i = 0;
+ for (; i != NumElems; ++i) {
+ SDOperand Elt = N->getOperand(i);
+ if (isa<ConstantSDNode>(Elt)) {
+ ElementBase = Elt;
+ break;
+ }
+ }
+
+ if (!ElementBase.Val)
+ return false;
+
+ for (; i != NumElems; ++i) {
+ SDOperand Arg = N->getOperand(i);
+ if (Arg.getOpcode() == ISD::UNDEF) continue;
+ assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
+ if (Arg != ElementBase) return false;
+ }
+
+ // Make sure it is a splat of the first vector operand.
+ return cast<ConstantSDNode>(ElementBase)->getValue() < NumElems;
+}
+
+/// isSplatMask - Return true if the specified VECTOR_SHUFFLE operand specifies
+/// a splat of a single element and it's a 2 or 4 element mask.
+bool X86::isSplatMask(SDNode *N) {
+ assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+ // We can only splat 64-bit, and 32-bit quantities with a single instruction.
+ if (N->getNumOperands() != 4 && N->getNumOperands() != 2)
+ return false;
+ return ::isSplatMask(N);
+}
+
+/// isSplatLoMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a splat of element zero.
+bool X86::isSplatLoMask(SDNode *N) {
+ assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+ for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i)
+ if (!isUndefOrEqual(N->getOperand(i), 0))
+ return false;
+ return true;
+}
+
+/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
+/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUF* and SHUFP*
+/// instructions.
+unsigned X86::getShuffleSHUFImmediate(SDNode *N) {
+ unsigned NumOperands = N->getNumOperands();
+ unsigned Shift = (NumOperands == 4) ? 2 : 1;
+ unsigned Mask = 0;
+ for (unsigned i = 0; i < NumOperands; ++i) {
+ unsigned Val = 0;
+ SDOperand Arg = N->getOperand(NumOperands-i-1);
+ if (Arg.getOpcode() != ISD::UNDEF)
+ Val = cast<ConstantSDNode>(Arg)->getValue();
+ if (Val >= NumOperands) Val -= NumOperands;
+ Mask |= Val;
+ if (i != NumOperands - 1)
+ Mask <<= Shift;
+ }
+
+ return Mask;
+}
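+
+// Worked example (not part of the original patch): for the 4-element mask
+// <3, 1, 2, 0>, the loop above visits operands from last to first and packs
+// two bits per element, giving binary 00 10 01 11 = 0x27 (element 0's source
+// in the low two bits).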
+
+/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
+/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFHW
+/// instructions.
+unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) {
+ unsigned Mask = 0;
+ // 8 nodes, but we only care about the last 4.
+ for (unsigned i = 7; i >= 4; --i) {
+ unsigned Val = 0;
+ SDOperand Arg = N->getOperand(i);
+ if (Arg.getOpcode() != ISD::UNDEF)
+ Val = cast<ConstantSDNode>(Arg)->getValue();
+ Mask |= (Val - 4);
+ if (i != 4)
+ Mask <<= 2;
+ }
+
+ return Mask;
+}
+
+/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
+/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFLW
+/// instructions.
+unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) {
+ unsigned Mask = 0;
+ // 8 nodes, but we only care about the first 4.
+ for (int i = 3; i >= 0; --i) {
+ unsigned Val = 0;
+ SDOperand Arg = N->getOperand(i);
+ if (Arg.getOpcode() != ISD::UNDEF)
+ Val = cast<ConstantSDNode>(Arg)->getValue();
+ Mask |= Val;
+ if (i != 0)
+ Mask <<= 2;
+ }
+
+ return Mask;
+}
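+
+// Worked example (not part of the original patch): for the v8i16 mask
+// <3, 2, 1, 0, 4, 5, 6, 7>, the loop above packs the low four entries into
+// binary 00 01 10 11 = 0x1B, with element 0's source in the low two bits.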
+
+/// isPSHUFHW_PSHUFLWMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies an 8-element shuffle that can be broken into a pair of
+/// PSHUFHW and PSHUFLW.
+static bool isPSHUFHW_PSHUFLWMask(SDNode *N) {
+ assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+ if (N->getNumOperands() != 8)
+ return false;
+
+ // Lower quadword shuffled.
+ for (unsigned i = 0; i != 4; ++i) {
+ SDOperand Arg = N->getOperand(i);
+ if (Arg.getOpcode() == ISD::UNDEF) continue;
+ assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
+ unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+    if (Val >= 4)
+ return false;
+ }
+
+ // Upper quadword shuffled.
+ for (unsigned i = 4; i != 8; ++i) {
+ SDOperand Arg = N->getOperand(i);
+ if (Arg.getOpcode() == ISD::UNDEF) continue;
+ assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
+ unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+ if (Val < 4 || Val > 7)
+ return false;
+ }
+
+ return true;
+}
+
+/// CommuteVectorShuffle - Swap vector_shuffle operands as well as the
+/// values in their permute mask.
+static SDOperand CommuteVectorShuffle(SDOperand Op, SDOperand &V1,
+ SDOperand &V2, SDOperand &Mask,
+ SelectionDAG &DAG) {
+ MVT::ValueType VT = Op.getValueType();
+ MVT::ValueType MaskVT = Mask.getValueType();
+ MVT::ValueType EltVT = MVT::getVectorElementType(MaskVT);
+ unsigned NumElems = Mask.getNumOperands();
+ SmallVector<SDOperand, 8> MaskVec;
+
+ for (unsigned i = 0; i != NumElems; ++i) {
+ SDOperand Arg = Mask.getOperand(i);
+ if (Arg.getOpcode() == ISD::UNDEF) {
+ MaskVec.push_back(DAG.getNode(ISD::UNDEF, EltVT));
+ continue;
+ }
+ assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
+ unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+ if (Val < NumElems)
+ MaskVec.push_back(DAG.getConstant(Val + NumElems, EltVT));
+ else
+ MaskVec.push_back(DAG.getConstant(Val - NumElems, EltVT));
+ }
+
+ std::swap(V1, V2);
+ Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], MaskVec.size());
+ return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask);
+}
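+
+// Illustrative example (not part of the original patch): commuting
+// shuffle(V1, V2, <0, 1, 4, 5>) yields shuffle(V2, V1, <4, 5, 0, 1>); each
+// mask value is moved across the NumElems boundary.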
+
+/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
+/// match movhlps. The lower half elements should come from the upper half of
+/// V1 (and in order), and the upper half elements should come from the upper
+/// half of V2 (and in order).
+static bool ShouldXformToMOVHLPS(SDNode *Mask) {
+ unsigned NumElems = Mask->getNumOperands();
+ if (NumElems != 4)
+ return false;
+ for (unsigned i = 0, e = 2; i != e; ++i)
+ if (!isUndefOrEqual(Mask->getOperand(i), i+2))
+ return false;
+ for (unsigned i = 2; i != 4; ++i)
+ if (!isUndefOrEqual(Mask->getOperand(i), i+4))
+ return false;
+ return true;
+}
+
+/// isScalarLoadToVector - Returns true if the node is a scalar load that
+/// is promoted to a vector.
+static inline bool isScalarLoadToVector(SDNode *N) {
+ if (N->getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ N = N->getOperand(0).Val;
+ return ISD::isNON_EXTLoad(N);
+ }
+ return false;
+}
+
+/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
+/// match movlp{s|d}. The lower half elements should come from the lower half of
+/// V1 (and in order), and the upper half elements should come from the upper
+/// half of V2 (and in order). And since V1 will become the source of the
+/// MOVLP, it must be either a vector load or a scalar load to vector.
+static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, SDNode *Mask) {
+ if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
+ return false;
+  // If V2 is a vector load, don't do this transformation. We will try to use
+  // a load-folding shufps op instead.
+ if (ISD::isNON_EXTLoad(V2))
+ return false;
+
+ unsigned NumElems = Mask->getNumOperands();
+ if (NumElems != 2 && NumElems != 4)
+ return false;
+ for (unsigned i = 0, e = NumElems/2; i != e; ++i)
+ if (!isUndefOrEqual(Mask->getOperand(i), i))
+ return false;
+ for (unsigned i = NumElems/2; i != NumElems; ++i)
+ if (!isUndefOrEqual(Mask->getOperand(i), i+NumElems))
+ return false;
+ return true;
+}
+
+/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
+/// all the same.
+static bool isSplatVector(SDNode *N) {
+ if (N->getOpcode() != ISD::BUILD_VECTOR)
+ return false;
+
+ SDOperand SplatValue = N->getOperand(0);
+ for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
+ if (N->getOperand(i) != SplatValue)
+ return false;
+ return true;
+}
+
+/// isUndefShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
+/// to an undef.
+static bool isUndefShuffle(SDNode *N) {
+ if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
+ return false;
+
+ SDOperand V1 = N->getOperand(0);
+ SDOperand V2 = N->getOperand(1);
+ SDOperand Mask = N->getOperand(2);
+ unsigned NumElems = Mask.getNumOperands();
+ for (unsigned i = 0; i != NumElems; ++i) {
+ SDOperand Arg = Mask.getOperand(i);
+ if (Arg.getOpcode() != ISD::UNDEF) {
+ unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+ if (Val < NumElems && V1.getOpcode() != ISD::UNDEF)
+ return false;
+ else if (Val >= NumElems && V2.getOpcode() != ISD::UNDEF)
+ return false;
+ }
+ }
+ return true;
+}
+
+/// isZeroNode - Returns true if Elt is a constant zero or a floating point
+/// constant +0.0.
+static inline bool isZeroNode(SDOperand Elt) {
+ return ((isa<ConstantSDNode>(Elt) &&
+ cast<ConstantSDNode>(Elt)->getValue() == 0) ||
+ (isa<ConstantFPSDNode>(Elt) &&
+ cast<ConstantFPSDNode>(Elt)->isExactlyValue(0.0)));
+}
+
+/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
+/// to a zero vector.
+static bool isZeroShuffle(SDNode *N) {
+ if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
+ return false;
+
+ SDOperand V1 = N->getOperand(0);
+ SDOperand V2 = N->getOperand(1);
+ SDOperand Mask = N->getOperand(2);
+ unsigned NumElems = Mask.getNumOperands();
+ for (unsigned i = 0; i != NumElems; ++i) {
+ SDOperand Arg = Mask.getOperand(i);
+ if (Arg.getOpcode() != ISD::UNDEF) {
+ unsigned Idx = cast<ConstantSDNode>(Arg)->getValue();
+ if (Idx < NumElems) {
+ unsigned Opc = V1.Val->getOpcode();
+ if (Opc == ISD::UNDEF)
+ continue;
+ if (Opc != ISD::BUILD_VECTOR ||
+ !isZeroNode(V1.Val->getOperand(Idx)))
+ return false;
+ } else if (Idx >= NumElems) {
+ unsigned Opc = V2.Val->getOpcode();
+ if (Opc == ISD::UNDEF)
+ continue;
+ if (Opc != ISD::BUILD_VECTOR ||
+ !isZeroNode(V2.Val->getOperand(Idx - NumElems)))
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+/// getZeroVector - Returns a vector of specified type with all zero elements.
+///
+static SDOperand getZeroVector(MVT::ValueType VT, SelectionDAG &DAG) {
+ assert(MVT::isVector(VT) && "Expected a vector type");
+ unsigned NumElems = MVT::getVectorNumElements(VT);
+ MVT::ValueType EVT = MVT::getVectorElementType(VT);
+ bool isFP = MVT::isFloatingPoint(EVT);
+ SDOperand Zero = isFP ? DAG.getConstantFP(0.0, EVT) : DAG.getConstant(0, EVT);
+ SmallVector<SDOperand, 8> ZeroVec(NumElems, Zero);
+ return DAG.getNode(ISD::BUILD_VECTOR, VT, &ZeroVec[0], ZeroVec.size());
+}
+
+/// NormalizeMask - V2 is a splat; modify the mask (if needed) so all elements
+/// that point to V2 point to its first element.
+static SDOperand NormalizeMask(SDOperand Mask, SelectionDAG &DAG) {
+ assert(Mask.getOpcode() == ISD::BUILD_VECTOR);
+
+ bool Changed = false;
+ SmallVector<SDOperand, 8> MaskVec;
+ unsigned NumElems = Mask.getNumOperands();
+ for (unsigned i = 0; i != NumElems; ++i) {
+ SDOperand Arg = Mask.getOperand(i);
+ if (Arg.getOpcode() != ISD::UNDEF) {
+ unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+ if (Val > NumElems) {
+ Arg = DAG.getConstant(NumElems, Arg.getValueType());
+ Changed = true;
+ }
+ }
+ MaskVec.push_back(Arg);
+ }
+
+ if (Changed)
+ Mask = DAG.getNode(ISD::BUILD_VECTOR, Mask.getValueType(),
+ &MaskVec[0], MaskVec.size());
+ return Mask;
+}
+
+/// getMOVLMask - Returns a vector_shuffle mask for a movs{s|d} / movd
+/// operation of specified width.
+static SDOperand getMOVLMask(unsigned NumElems, SelectionDAG &DAG) {
+ MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
+ MVT::ValueType BaseVT = MVT::getVectorElementType(MaskVT);
+
+ SmallVector<SDOperand, 8> MaskVec;
+ MaskVec.push_back(DAG.getConstant(NumElems, BaseVT));
+ for (unsigned i = 1; i != NumElems; ++i)
+ MaskVec.push_back(DAG.getConstant(i, BaseVT));
+ return DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], MaskVec.size());
+}
+
+/// getUnpacklMask - Returns a vector_shuffle mask for an unpackl operation
+/// of specified width.
+static SDOperand getUnpacklMask(unsigned NumElems, SelectionDAG &DAG) {
+ MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
+ MVT::ValueType BaseVT = MVT::getVectorElementType(MaskVT);
+ SmallVector<SDOperand, 8> MaskVec;
+ for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
+ MaskVec.push_back(DAG.getConstant(i, BaseVT));
+ MaskVec.push_back(DAG.getConstant(i + NumElems, BaseVT));
+ }
+ return DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], MaskVec.size());
+}
+
+/// getUnpackhMask - Returns a vector_shuffle mask for an unpackh operation
+/// of specified width.
+static SDOperand getUnpackhMask(unsigned NumElems, SelectionDAG &DAG) {
+ MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
+ MVT::ValueType BaseVT = MVT::getVectorElementType(MaskVT);
+ unsigned Half = NumElems/2;
+ SmallVector<SDOperand, 8> MaskVec;
+ for (unsigned i = 0; i != Half; ++i) {
+ MaskVec.push_back(DAG.getConstant(i + Half, BaseVT));
+ MaskVec.push_back(DAG.getConstant(i + NumElems + Half, BaseVT));
+ }
+ return DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], MaskVec.size());
+}
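+
+// Illustrative examples (not part of the original patch): getUnpacklMask(4, DAG)
+// builds <0, 4, 1, 5> and getUnpackhMask(4, DAG) builds <2, 6, 3, 7>, matching
+// the unpcklps/unpckhps interleave patterns checked by the predicates above.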
+
+/// PromoteSplat - Promote a splat of v8i16 or v16i8 to v4i32.
+///
+static SDOperand PromoteSplat(SDOperand Op, SelectionDAG &DAG) {
+ SDOperand V1 = Op.getOperand(0);
+ SDOperand Mask = Op.getOperand(2);
+ MVT::ValueType VT = Op.getValueType();
+ unsigned NumElems = Mask.getNumOperands();
+ Mask = getUnpacklMask(NumElems, DAG);
+ while (NumElems != 4) {
+ V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V1, Mask);
+ NumElems >>= 1;
+ }
+ V1 = DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, V1);
+
+ MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(4);
+ Mask = getZeroVector(MaskVT, DAG);
+ SDOperand Shuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v4i32, V1,
+ DAG.getNode(ISD::UNDEF, MVT::v4i32), Mask);
+ return DAG.getNode(ISD::BIT_CONVERT, VT, Shuffle);
+}
+
+/// getShuffleVectorZeroOrUndef - Return a vector_shuffle that places element 0
+/// of the specified vector at index Idx and fills the remaining elements from
+/// a zero or undef vector.
+static SDOperand getShuffleVectorZeroOrUndef(SDOperand V2, MVT::ValueType VT,
+ unsigned NumElems, unsigned Idx,
+ bool isZero, SelectionDAG &DAG) {
+ SDOperand V1 = isZero ? getZeroVector(VT, DAG) : DAG.getNode(ISD::UNDEF, VT);
+ MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
+ MVT::ValueType EVT = MVT::getVectorElementType(MaskVT);
+ SDOperand Zero = DAG.getConstant(0, EVT);
+ SmallVector<SDOperand, 8> MaskVec(NumElems, Zero);
+ MaskVec[Idx] = DAG.getConstant(NumElems, EVT);
+ SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+ &MaskVec[0], MaskVec.size());
+ return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask);
+}
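+
+// Illustrative example (not part of the original patch): with NumElems == 4,
+// Idx == 0 and isZero == true, the result is shuffle(zerovector, V2,
+// <4, 0, 0, 0>), i.e. the first element of V2 with all other lanes zeroed.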
+
+/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
+///
+static SDOperand LowerBuildVectorv16i8(SDOperand Op, unsigned NonZeros,
+ unsigned NumNonZero, unsigned NumZero,
+ SelectionDAG &DAG, TargetLowering &TLI) {
+ if (NumNonZero > 8)
+ return SDOperand();
+
+ SDOperand V(0, 0);
+ bool First = true;
+ for (unsigned i = 0; i < 16; ++i) {
+ bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
+ if (ThisIsNonZero && First) {
+ if (NumZero)
+ V = getZeroVector(MVT::v8i16, DAG);
+ else
+ V = DAG.getNode(ISD::UNDEF, MVT::v8i16);
+ First = false;
+ }
+
+ if ((i & 1) != 0) {
+ SDOperand ThisElt(0, 0), LastElt(0, 0);
+ bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
+ if (LastIsNonZero) {
+ LastElt = DAG.getNode(ISD::ZERO_EXTEND, MVT::i16, Op.getOperand(i-1));
+ }
+ if (ThisIsNonZero) {
+ ThisElt = DAG.getNode(ISD::ZERO_EXTEND, MVT::i16, Op.getOperand(i));
+ ThisElt = DAG.getNode(ISD::SHL, MVT::i16,
+ ThisElt, DAG.getConstant(8, MVT::i8));
+ if (LastIsNonZero)
+ ThisElt = DAG.getNode(ISD::OR, MVT::i16, ThisElt, LastElt);
+ } else
+ ThisElt = LastElt;
+
+ if (ThisElt.Val)
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, V, ThisElt,
+ DAG.getConstant(i/2, TLI.getPointerTy()));
+ }
+ }
+
+ return DAG.getNode(ISD::BIT_CONVERT, MVT::v16i8, V);
+}
+
+/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
+///
+static SDOperand LowerBuildVectorv8i16(SDOperand Op, unsigned NonZeros,
+ unsigned NumNonZero, unsigned NumZero,
+ SelectionDAG &DAG, TargetLowering &TLI) {
+ if (NumNonZero > 4)
+ return SDOperand();
+
+ SDOperand V(0, 0);
+ bool First = true;
+ for (unsigned i = 0; i < 8; ++i) {
+ bool isNonZero = (NonZeros & (1 << i)) != 0;
+ if (isNonZero) {
+ if (First) {
+ if (NumZero)
+ V = getZeroVector(MVT::v8i16, DAG);
+ else
+ V = DAG.getNode(ISD::UNDEF, MVT::v8i16);
+ First = false;
+ }
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, V, Op.getOperand(i),
+ DAG.getConstant(i, TLI.getPointerTy()));
+ }
+ }
+
+ return V;
+}
+
+SDOperand
+X86TargetLowering::LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG) {
+  // All zeros are handled with pxor.
+ if (ISD::isBuildVectorAllZeros(Op.Val))
+ return Op;
+
+  // All ones are handled with pcmpeqd.
+ if (ISD::isBuildVectorAllOnes(Op.Val))
+ return Op;
+
+ MVT::ValueType VT = Op.getValueType();
+ MVT::ValueType EVT = MVT::getVectorElementType(VT);
+ unsigned EVTBits = MVT::getSizeInBits(EVT);
+
+ unsigned NumElems = Op.getNumOperands();
+ unsigned NumZero = 0;
+ unsigned NumNonZero = 0;
+ unsigned NonZeros = 0;
+ std::set<SDOperand> Values;
+ for (unsigned i = 0; i < NumElems; ++i) {
+ SDOperand Elt = Op.getOperand(i);
+ if (Elt.getOpcode() != ISD::UNDEF) {
+ Values.insert(Elt);
+ if (isZeroNode(Elt))
+ NumZero++;
+ else {
+ NonZeros |= (1 << i);
+ NumNonZero++;
+ }
+ }
+ }
+
+ if (NumNonZero == 0) {
+ if (NumZero == 0)
+ // All undef vector. Return an UNDEF.
+ return DAG.getNode(ISD::UNDEF, VT);
+ else
+ // A mix of zero and undef. Return a zero vector.
+ return getZeroVector(VT, DAG);
+ }
+
+ // Splat is obviously ok. Let legalizer expand it to a shuffle.
+ if (Values.size() == 1)
+ return SDOperand();
+
+ // Special case for single non-zero element.
+ if (NumNonZero == 1) {
+ unsigned Idx = CountTrailingZeros_32(NonZeros);
+ SDOperand Item = Op.getOperand(Idx);
+ Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Item);
+ if (Idx == 0)
+ // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
+ return getShuffleVectorZeroOrUndef(Item, VT, NumElems, Idx,
+ NumZero > 0, DAG);
+
+ if (EVTBits == 32) {
+ // Turn it into a shuffle of zero and zero-extended scalar to vector.
+ Item = getShuffleVectorZeroOrUndef(Item, VT, NumElems, 0, NumZero > 0,
+ DAG);
+ MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
+ MVT::ValueType MaskEVT = MVT::getVectorElementType(MaskVT);
+ SmallVector<SDOperand, 8> MaskVec;
+ for (unsigned i = 0; i < NumElems; i++)
+ MaskVec.push_back(DAG.getConstant((i == Idx) ? 0 : 1, MaskEVT));
+ SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+ &MaskVec[0], MaskVec.size());
+ return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, Item,
+ DAG.getNode(ISD::UNDEF, VT), Mask);
+ }
+ }
+
+ // Let legalizer expand 2-wide build_vectors.
+ if (EVTBits == 64)
+ return SDOperand();
+
+ // If element VT is < 32 bits, convert it to inserts into a zero vector.
+ if (EVTBits == 8 && NumElems == 16) {
+ SDOperand V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
+ *this);
+ if (V.Val) return V;
+ }
+
+ if (EVTBits == 16 && NumElems == 8) {
+ SDOperand V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
+ *this);
+ if (V.Val) return V;
+ }
+
+ // If element VT is == 32 bits, turn it into a number of shuffles.
+ SmallVector<SDOperand, 8> V;
+ V.resize(NumElems);
+ if (NumElems == 4 && NumZero > 0) {
+ for (unsigned i = 0; i < 4; ++i) {
+ bool isZero = !(NonZeros & (1 << i));
+ if (isZero)
+ V[i] = getZeroVector(VT, DAG);
+ else
+ V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Op.getOperand(i));
+ }
+
+ for (unsigned i = 0; i < 2; ++i) {
+ switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
+ default: break;
+ case 0:
+ V[i] = V[i*2]; // Must be a zero vector.
+ break;
+ case 1:
+ V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i*2+1], V[i*2],
+ getMOVLMask(NumElems, DAG));
+ break;
+ case 2:
+ V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i*2], V[i*2+1],
+ getMOVLMask(NumElems, DAG));
+ break;
+ case 3:
+ V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i*2], V[i*2+1],
+ getUnpacklMask(NumElems, DAG));
+ break;
+ }
+ }
+
+    // Take advantage of the fact that GR32 to VR128 scalar_to_vector
+    // (i.e. movd) clears the upper bits.
+ // FIXME: we can do the same for v4f32 case when we know both parts of
+ // the lower half come from scalar_to_vector (loadf32). We should do
+ // that in post legalizer dag combiner with target specific hooks.
+ if (MVT::isInteger(EVT) && (NonZeros & (0x3 << 2)) == 0)
+ return V[0];
+ MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
+ MVT::ValueType EVT = MVT::getVectorElementType(MaskVT);
+ SmallVector<SDOperand, 8> MaskVec;
+ bool Reverse = (NonZeros & 0x3) == 2;
+ for (unsigned i = 0; i < 2; ++i)
+ if (Reverse)
+ MaskVec.push_back(DAG.getConstant(1-i, EVT));
+ else
+ MaskVec.push_back(DAG.getConstant(i, EVT));
+ Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2;
+ for (unsigned i = 0; i < 2; ++i)
+ if (Reverse)
+ MaskVec.push_back(DAG.getConstant(1-i+NumElems, EVT));
+ else
+ MaskVec.push_back(DAG.getConstant(i+NumElems, EVT));
+ SDOperand ShufMask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+ &MaskVec[0], MaskVec.size());
+ return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[0], V[1], ShufMask);
+ }
+
+ if (Values.size() > 2) {
+ // Expand into a number of unpckl*.
+ // e.g. for v4f32
+ // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
+ // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
+ // Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
+ SDOperand UnpckMask = getUnpacklMask(NumElems, DAG);
+ for (unsigned i = 0; i < NumElems; ++i)
+ V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Op.getOperand(i));
+ NumElems >>= 1;
+ while (NumElems != 0) {
+ for (unsigned i = 0; i < NumElems; ++i)
+ V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i], V[i + NumElems],
+ UnpckMask);
+ NumElems >>= 1;
+ }
+ return V[0];
+ }
+
+ return SDOperand();
+}
+
+SDOperand
+X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) {
+ SDOperand V1 = Op.getOperand(0);
+ SDOperand V2 = Op.getOperand(1);
+ SDOperand PermMask = Op.getOperand(2);
+ MVT::ValueType VT = Op.getValueType();
+ unsigned NumElems = PermMask.getNumOperands();
+ bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
+ bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
+ bool V1IsSplat = false;
+ bool V2IsSplat = false;
+
+ if (isUndefShuffle(Op.Val))
+ return DAG.getNode(ISD::UNDEF, VT);
+
+ if (isZeroShuffle(Op.Val))
+ return getZeroVector(VT, DAG);
+
+ if (isIdentityMask(PermMask.Val))
+ return V1;
+ else if (isIdentityMask(PermMask.Val, true))
+ return V2;
+
+ if (isSplatMask(PermMask.Val)) {
+ if (NumElems <= 4) return Op;
+ // Promote it to a v4i32 splat.
+ return PromoteSplat(Op, DAG);
+ }
+
+ if (X86::isMOVLMask(PermMask.Val))
+ return (V1IsUndef) ? V2 : Op;
+
+ if (X86::isMOVSHDUPMask(PermMask.Val) ||
+ X86::isMOVSLDUPMask(PermMask.Val) ||
+ X86::isMOVHLPSMask(PermMask.Val) ||
+ X86::isMOVHPMask(PermMask.Val) ||
+ X86::isMOVLPMask(PermMask.Val))
+ return Op;
+
+ if (ShouldXformToMOVHLPS(PermMask.Val) ||
+ ShouldXformToMOVLP(V1.Val, V2.Val, PermMask.Val))
+ return CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
+
+ bool Commuted = false;
+ V1IsSplat = isSplatVector(V1.Val);
+ V2IsSplat = isSplatVector(V2.Val);
+ if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
+ Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
+ std::swap(V1IsSplat, V2IsSplat);
+ std::swap(V1IsUndef, V2IsUndef);
+ Commuted = true;
+ }
+
+ if (isCommutedMOVL(PermMask.Val, V2IsSplat, V2IsUndef)) {
+ if (V2IsUndef) return V1;
+ Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
+ if (V2IsSplat) {
+      // V2 is a splat, so the mask may be malformed. That is, it may point
+      // to any V2 element. The instruction selector won't like this. Get
+      // a corrected mask and commute to form a proper MOVS{S|D}.
+ SDOperand NewMask = getMOVLMask(NumElems, DAG);
+ if (NewMask.Val != PermMask.Val)
+ Op = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, NewMask);
+ }
+ return Op;
+ }
+
+ if (X86::isUNPCKL_v_undef_Mask(PermMask.Val) ||
+ X86::isUNPCKH_v_undef_Mask(PermMask.Val) ||
+ X86::isUNPCKLMask(PermMask.Val) ||
+ X86::isUNPCKHMask(PermMask.Val))
+ return Op;
+
+ if (V2IsSplat) {
+    // Normalize the mask so all entries that point to V2 point to its first
+    // element, then try to match unpck{h|l} again. If there is a match, return
+    // a new vector_shuffle with the corrected mask.
+ SDOperand NewMask = NormalizeMask(PermMask, DAG);
+ if (NewMask.Val != PermMask.Val) {
+ if (X86::isUNPCKLMask(PermMask.Val, true)) {
+ SDOperand NewMask = getUnpacklMask(NumElems, DAG);
+ return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, NewMask);
+ } else if (X86::isUNPCKHMask(PermMask.Val, true)) {
+ SDOperand NewMask = getUnpackhMask(NumElems, DAG);
+ return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, NewMask);
+ }
+ }
+ }
+
+ // Normalize the node to match x86 shuffle ops if needed
+ if (V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(PermMask.Val))
+ Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
+
+ if (Commuted) {
+    // Commute it back and try unpck* again.
+ Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
+ if (X86::isUNPCKL_v_undef_Mask(PermMask.Val) ||
+ X86::isUNPCKH_v_undef_Mask(PermMask.Val) ||
+ X86::isUNPCKLMask(PermMask.Val) ||
+ X86::isUNPCKHMask(PermMask.Val))
+ return Op;
+ }
+
+ // If VT is integer, try PSHUF* first, then SHUFP*.
+ if (MVT::isInteger(VT)) {
+ if (X86::isPSHUFDMask(PermMask.Val) ||
+ X86::isPSHUFHWMask(PermMask.Val) ||
+ X86::isPSHUFLWMask(PermMask.Val)) {
+ if (V2.getOpcode() != ISD::UNDEF)
+ return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1,
+ DAG.getNode(ISD::UNDEF, V1.getValueType()),PermMask);
+ return Op;
+ }
+
+ if (X86::isSHUFPMask(PermMask.Val) &&
+ MVT::getSizeInBits(VT) != 64) // Don't do this for MMX.
+ return Op;
+
+ // Handle v8i16 shuffle high / low shuffle node pair.
+ if (VT == MVT::v8i16 && isPSHUFHW_PSHUFLWMask(PermMask.Val)) {
+ MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
+ MVT::ValueType BaseVT = MVT::getVectorElementType(MaskVT);
+ SmallVector<SDOperand, 8> MaskVec;
+ for (unsigned i = 0; i != 4; ++i)
+ MaskVec.push_back(PermMask.getOperand(i));
+ for (unsigned i = 4; i != 8; ++i)
+ MaskVec.push_back(DAG.getConstant(i, BaseVT));
+ SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+ &MaskVec[0], MaskVec.size());
+ V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask);
+ MaskVec.clear();
+ for (unsigned i = 0; i != 4; ++i)
+ MaskVec.push_back(DAG.getConstant(i, BaseVT));
+ for (unsigned i = 4; i != 8; ++i)
+ MaskVec.push_back(PermMask.getOperand(i));
+ Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0],MaskVec.size());
+ return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask);
+ }
+ } else {
+ // Floating point cases in the other order.
+ if (X86::isSHUFPMask(PermMask.Val))
+ return Op;
+ if (X86::isPSHUFDMask(PermMask.Val) ||
+ X86::isPSHUFHWMask(PermMask.Val) ||
+ X86::isPSHUFLWMask(PermMask.Val)) {
+ if (V2.getOpcode() != ISD::UNDEF)
+ return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1,
+ DAG.getNode(ISD::UNDEF, V1.getValueType()),PermMask);
+ return Op;
+ }
+ }
+
+ if (NumElems == 4 &&
+ // Don't do this for MMX.
+ MVT::getSizeInBits(VT) != 64) {
+ MVT::ValueType MaskVT = PermMask.getValueType();
+ MVT::ValueType MaskEVT = MVT::getVectorElementType(MaskVT);
+ SmallVector<std::pair<int, int>, 8> Locs;
+ Locs.reserve(NumElems);
+ SmallVector<SDOperand, 8> Mask1(NumElems, DAG.getNode(ISD::UNDEF, MaskEVT));
+ SmallVector<SDOperand, 8> Mask2(NumElems, DAG.getNode(ISD::UNDEF, MaskEVT));
+ unsigned NumHi = 0;
+ unsigned NumLo = 0;
+    // If no more than two elements come from either vector, this can be
+    // implemented with two shuffles. The first shuffle gathers the elements.
+    // The second shuffle, which takes the first shuffle as both of its
+    // vector operands, puts the elements into the right order.
+ for (unsigned i = 0; i != NumElems; ++i) {
+ SDOperand Elt = PermMask.getOperand(i);
+ if (Elt.getOpcode() == ISD::UNDEF) {
+ Locs[i] = std::make_pair(-1, -1);
+ } else {
+ unsigned Val = cast<ConstantSDNode>(Elt)->getValue();
+ if (Val < NumElems) {
+ Locs[i] = std::make_pair(0, NumLo);
+ Mask1[NumLo] = Elt;
+ NumLo++;
+ } else {
+ Locs[i] = std::make_pair(1, NumHi);
+ if (2+NumHi < NumElems)
+ Mask1[2+NumHi] = Elt;
+ NumHi++;
+ }
+ }
+ }
+ if (NumLo <= 2 && NumHi <= 2) {
+ V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2,
+ DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+ &Mask1[0], Mask1.size()));
+ for (unsigned i = 0; i != NumElems; ++i) {
+ if (Locs[i].first == -1)
+ continue;
+ else {
+ unsigned Idx = (i < NumElems/2) ? 0 : NumElems;
+ Idx += Locs[i].first * (NumElems/2) + Locs[i].second;
+ Mask2[i] = DAG.getConstant(Idx, MaskEVT);
+ }
+ }
+
+ return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V1,
+ DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+ &Mask2[0], Mask2.size()));
+ }
+
+ // Break it into (shuffle shuffle_hi, shuffle_lo).
+ Locs.clear();
+ SmallVector<SDOperand,8> LoMask(NumElems, DAG.getNode(ISD::UNDEF, MaskEVT));
+ SmallVector<SDOperand,8> HiMask(NumElems, DAG.getNode(ISD::UNDEF, MaskEVT));
+ SmallVector<SDOperand,8> *MaskPtr = &LoMask;
+ unsigned MaskIdx = 0;
+ unsigned LoIdx = 0;
+ unsigned HiIdx = NumElems/2;
+ for (unsigned i = 0; i != NumElems; ++i) {
+ if (i == NumElems/2) {
+ MaskPtr = &HiMask;
+ MaskIdx = 1;
+ LoIdx = 0;
+ HiIdx = NumElems/2;
+ }
+ SDOperand Elt = PermMask.getOperand(i);
+ if (Elt.getOpcode() == ISD::UNDEF) {
+ Locs[i] = std::make_pair(-1, -1);
+ } else if (cast<ConstantSDNode>(Elt)->getValue() < NumElems) {
+ Locs[i] = std::make_pair(MaskIdx, LoIdx);
+ (*MaskPtr)[LoIdx] = Elt;
+ LoIdx++;
+ } else {
+ Locs[i] = std::make_pair(MaskIdx, HiIdx);
+ (*MaskPtr)[HiIdx] = Elt;
+ HiIdx++;
+ }
+ }
+
+ SDOperand LoShuffle =
+ DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2,
+ DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+ &LoMask[0], LoMask.size()));
+ SDOperand HiShuffle =
+ DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2,
+ DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+ &HiMask[0], HiMask.size()));
+ SmallVector<SDOperand, 8> MaskOps;
+ for (unsigned i = 0; i != NumElems; ++i) {
+ if (Locs[i].first == -1) {
+ MaskOps.push_back(DAG.getNode(ISD::UNDEF, MaskEVT));
+ } else {
+ unsigned Idx = Locs[i].first * NumElems + Locs[i].second;
+ MaskOps.push_back(DAG.getConstant(Idx, MaskEVT));
+ }
+ }
+ return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, LoShuffle, HiShuffle,
+ DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+ &MaskOps[0], MaskOps.size()));
+ }
+
+ return SDOperand();
+}
+
+SDOperand
+X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) {
+ if (!isa<ConstantSDNode>(Op.getOperand(1)))
+ return SDOperand();
+
+ MVT::ValueType VT = Op.getValueType();
+ // TODO: handle v16i8.
+ if (MVT::getSizeInBits(VT) == 16) {
+    // Transform it so it matches pextrw, which produces a 32-bit result.
+ MVT::ValueType EVT = (MVT::ValueType)(VT+1);
+ SDOperand Extract = DAG.getNode(X86ISD::PEXTRW, EVT,
+ Op.getOperand(0), Op.getOperand(1));
+ SDOperand Assert = DAG.getNode(ISD::AssertZext, EVT, Extract,
+ DAG.getValueType(VT));
+ return DAG.getNode(ISD::TRUNCATE, VT, Assert);
+ } else if (MVT::getSizeInBits(VT) == 32) {
+ SDOperand Vec = Op.getOperand(0);
+ unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
+ if (Idx == 0)
+ return Op;
+ // SHUFPS the element to the lowest double word, then movss.
+ MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(4);
+ SmallVector<SDOperand, 8> IdxVec;
+ IdxVec.push_back(DAG.getConstant(Idx, MVT::getVectorElementType(MaskVT)));
+ IdxVec.push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
+ IdxVec.push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
+ IdxVec.push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
+ SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+ &IdxVec[0], IdxVec.size());
+ Vec = DAG.getNode(ISD::VECTOR_SHUFFLE, Vec.getValueType(),
+ Vec, DAG.getNode(ISD::UNDEF, Vec.getValueType()), Mask);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, VT, Vec,
+ DAG.getConstant(0, getPointerTy()));
+ } else if (MVT::getSizeInBits(VT) == 64) {
+ SDOperand Vec = Op.getOperand(0);
+ unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
+ if (Idx == 0)
+ return Op;
+
+    // UNPCKHPD the element to the lowest double word, then movsd.
+    // Note: if the lower 64 bits of the result of the UNPCKHPD are then stored
+    // to an f64mem, the whole operation is folded into a single MOVHPDmr.
+ MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(4);
+ SmallVector<SDOperand, 8> IdxVec;
+ IdxVec.push_back(DAG.getConstant(1, MVT::getVectorElementType(MaskVT)));
+ IdxVec.push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
+ SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+ &IdxVec[0], IdxVec.size());
+ Vec = DAG.getNode(ISD::VECTOR_SHUFFLE, Vec.getValueType(),
+ Vec, DAG.getNode(ISD::UNDEF, Vec.getValueType()), Mask);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, VT, Vec,
+ DAG.getConstant(0, getPointerTy()));
+ }
+
+ return SDOperand();
+}
+
+SDOperand
+X86TargetLowering::LowerINSERT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) {
+  // Transform it so it matches pinsrw, which expects a 16-bit value in a GR32
+  // as its second argument.
+ MVT::ValueType VT = Op.getValueType();
+ MVT::ValueType BaseVT = MVT::getVectorElementType(VT);
+ SDOperand N0 = Op.getOperand(0);
+ SDOperand N1 = Op.getOperand(1);
+ SDOperand N2 = Op.getOperand(2);
+ if (MVT::getSizeInBits(BaseVT) == 16) {
+ if (N1.getValueType() != MVT::i32)
+ N1 = DAG.getNode(ISD::ANY_EXTEND, MVT::i32, N1);
+ if (N2.getValueType() != MVT::i32)
+ N2 = DAG.getConstant(cast<ConstantSDNode>(N2)->getValue(),getPointerTy());
+ return DAG.getNode(X86ISD::PINSRW, VT, N0, N1, N2);
+ } else if (MVT::getSizeInBits(BaseVT) == 32) {
+ unsigned Idx = cast<ConstantSDNode>(N2)->getValue();
+ if (Idx == 0) {
+ // Use a movss.
+ N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, N1);
+ MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(4);
+ MVT::ValueType BaseVT = MVT::getVectorElementType(MaskVT);
+ SmallVector<SDOperand, 8> MaskVec;
+ MaskVec.push_back(DAG.getConstant(4, BaseVT));
+ for (unsigned i = 1; i <= 3; ++i)
+ MaskVec.push_back(DAG.getConstant(i, BaseVT));
+ return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, N0, N1,
+ DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+ &MaskVec[0], MaskVec.size()));
+ } else {
+ // Use two pinsrw instructions to insert a 32 bit value.
+ Idx <<= 1;
+ if (MVT::isFloatingPoint(N1.getValueType())) {
+ if (ISD::isNON_EXTLoad(N1.Val)) {
+ // Just load directly from f32mem to GR32.
+ LoadSDNode *LD = cast<LoadSDNode>(N1);
+ N1 = DAG.getLoad(MVT::i32, LD->getChain(), LD->getBasePtr(),
+ LD->getSrcValue(), LD->getSrcValueOffset());
+ } else {
+ N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, MVT::v4f32, N1);
+ N1 = DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, N1);
+ N1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i32, N1,
+ DAG.getConstant(0, getPointerTy()));
+ }
+ }
+ N0 = DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, N0);
+ N0 = DAG.getNode(X86ISD::PINSRW, MVT::v8i16, N0, N1,
+ DAG.getConstant(Idx, getPointerTy()));
+ N1 = DAG.getNode(ISD::SRL, MVT::i32, N1, DAG.getConstant(16, MVT::i8));
+ N0 = DAG.getNode(X86ISD::PINSRW, MVT::v8i16, N0, N1,
+ DAG.getConstant(Idx+1, getPointerTy()));
+ return DAG.getNode(ISD::BIT_CONVERT, VT, N0);
+ }
+ }
+
+ return SDOperand();
+}
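+
+// Illustrative note (not part of the original patch): in the 32-bit insert
+// path above, the value going into vector index Idx is written as two 16-bit
+// halves with pinsrw at word indices 2*Idx and 2*Idx+1 (the low half first,
+// then the value shifted right by 16).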
+
+SDOperand
+X86TargetLowering::LowerSCALAR_TO_VECTOR(SDOperand Op, SelectionDAG &DAG) {
+ SDOperand AnyExt = DAG.getNode(ISD::ANY_EXTEND, MVT::i32, Op.getOperand(0));
+ return DAG.getNode(X86ISD::S2VEC, Op.getValueType(), AnyExt);
+}
+
+// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
+// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
+// one of the above-mentioned nodes. It has to be wrapped because otherwise
+// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
+// be used to form an addressing mode. These wrapped nodes will be selected
+// into MOV32ri.
+SDOperand
+X86TargetLowering::LowerConstantPool(SDOperand Op, SelectionDAG &DAG) {
+ ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+ SDOperand Result = DAG.getTargetConstantPool(CP->getConstVal(),
+ getPointerTy(),
+ CP->getAlignment());
+ Result = DAG.getNode(X86ISD::Wrapper, getPointerTy(), Result);
+ // With PIC, the address is actually $g + Offset.
+ if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ !Subtarget->isPICStyleRIPRel()) {
+ Result = DAG.getNode(ISD::ADD, getPointerTy(),
+ DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()),
+ Result);
+ }
+
+ return Result;
+}
+
+SDOperand
+X86TargetLowering::LowerGlobalAddress(SDOperand Op, SelectionDAG &DAG) {
+ GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+ SDOperand Result = DAG.getTargetGlobalAddress(GV, getPointerTy());
+ Result = DAG.getNode(X86ISD::Wrapper, getPointerTy(), Result);
+ // With PIC, the address is actually $g + Offset.
+ if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ !Subtarget->isPICStyleRIPRel()) {
+ Result = DAG.getNode(ISD::ADD, getPointerTy(),
+ DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()),
+ Result);
+ }
+
+  // For Darwin & Mingw32, external and weak symbols are indirect, so we want to
+  // load the value at address GV, not the value of GV itself. This means that
+  // the GlobalAddress must be in the base or index register of the address, not
+  // the GV offset field. The platform check is inside the GVRequiresExtraLoad()
+  // call. The same applies to external symbols during PIC codegen.
+ if (Subtarget->GVRequiresExtraLoad(GV, getTargetMachine(), false))
+ Result = DAG.getLoad(getPointerTy(), DAG.getEntryNode(), Result, NULL, 0);
+
+ return Result;
+}
+
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model
+static SDOperand
+LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+ const MVT::ValueType PtrVT) {
+ SDOperand InFlag;
+ SDOperand Chain = DAG.getCopyToReg(DAG.getEntryNode(), X86::EBX,
+ DAG.getNode(X86ISD::GlobalBaseReg,
+ PtrVT), InFlag);
+ InFlag = Chain.getValue(1);
+
+ // emit leal symbol@TLSGD(,%ebx,1), %eax
+ SDVTList NodeTys = DAG.getVTList(PtrVT, MVT::Other, MVT::Flag);
+ SDOperand TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
+ GA->getValueType(0),
+ GA->getOffset());
+ SDOperand Ops[] = { Chain, TGA, InFlag };
+ SDOperand Result = DAG.getNode(X86ISD::TLSADDR, NodeTys, Ops, 3);
+ InFlag = Result.getValue(2);
+ Chain = Result.getValue(1);
+
+ // call ___tls_get_addr. This function receives its argument in
+ // the register EAX.
+ Chain = DAG.getCopyToReg(Chain, X86::EAX, Result, InFlag);
+ InFlag = Chain.getValue(1);
+
+ NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SDOperand Ops1[] = { Chain,
+ DAG.getTargetExternalSymbol("___tls_get_addr",
+ PtrVT),
+ DAG.getRegister(X86::EAX, PtrVT),
+ DAG.getRegister(X86::EBX, PtrVT),
+ InFlag };
+ Chain = DAG.getNode(X86ISD::CALL, NodeTys, Ops1, 5);
+ InFlag = Chain.getValue(1);
+
+ return DAG.getCopyFromReg(Chain, X86::EAX, PtrVT, InFlag);
+}
+
+// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
+// "local exec" model.
+static SDOperand
+LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+ const MVT::ValueType PtrVT) {
+ // Get the Thread Pointer
+ SDOperand ThreadPointer = DAG.getNode(X86ISD::THREAD_POINTER, PtrVT);
+ // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial
+ // exec)
+ SDOperand TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
+ GA->getValueType(0),
+ GA->getOffset());
+ SDOperand Offset = DAG.getNode(X86ISD::Wrapper, PtrVT, TGA);
+
+ if (GA->getGlobal()->isDeclaration()) // initial exec TLS model
+ Offset = DAG.getLoad(PtrVT, DAG.getEntryNode(), Offset, NULL, 0);
+
+ // The address of the thread local variable is the add of the thread
+ // pointer with the offset of the variable.
+ return DAG.getNode(ISD::ADD, PtrVT, ThreadPointer, Offset);
+}
+
+SDOperand
+X86TargetLowering::LowerGlobalTLSAddress(SDOperand Op, SelectionDAG &DAG) {
+ // TODO: implement the "local dynamic" model
+  // TODO: implement the "initial exec" model for PIC executables
+ assert(!Subtarget->is64Bit() && Subtarget->isTargetELF() &&
+ "TLS not implemented for non-ELF and 64-bit targets");
+ GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+  // If the relocation model is PIC, use the "General Dynamic" TLS model;
+  // otherwise use the "Local Exec" TLS model.
+ if (getTargetMachine().getRelocationModel() == Reloc::PIC_)
+ return LowerToTLSGeneralDynamicModel(GA, DAG, getPointerTy());
+ else
+ return LowerToTLSExecModel(GA, DAG, getPointerTy());
+}
+
+SDOperand
+X86TargetLowering::LowerExternalSymbol(SDOperand Op, SelectionDAG &DAG) {
+ const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
+ SDOperand Result = DAG.getTargetExternalSymbol(Sym, getPointerTy());
+ Result = DAG.getNode(X86ISD::Wrapper, getPointerTy(), Result);
+ // With PIC, the address is actually $g + Offset.
+ if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ !Subtarget->isPICStyleRIPRel()) {
+ Result = DAG.getNode(ISD::ADD, getPointerTy(),
+ DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()),
+ Result);
+ }
+
+ return Result;
+}
+
+SDOperand X86TargetLowering::LowerJumpTable(SDOperand Op, SelectionDAG &DAG) {
+ JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+ SDOperand Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy());
+ Result = DAG.getNode(X86ISD::Wrapper, getPointerTy(), Result);
+ // With PIC, the address is actually $g + Offset.
+ if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ !Subtarget->isPICStyleRIPRel()) {
+ Result = DAG.getNode(ISD::ADD, getPointerTy(),
+ DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()),
+ Result);
+ }
+
+ return Result;
+}
+
+SDOperand X86TargetLowering::LowerShift(SDOperand Op, SelectionDAG &DAG) {
+ assert(Op.getNumOperands() == 3 && Op.getValueType() == MVT::i32 &&
+ "Not an i64 shift!");
+ bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
+ SDOperand ShOpLo = Op.getOperand(0);
+ SDOperand ShOpHi = Op.getOperand(1);
+ SDOperand ShAmt = Op.getOperand(2);
+ SDOperand Tmp1 = isSRA ?
+ DAG.getNode(ISD::SRA, MVT::i32, ShOpHi, DAG.getConstant(31, MVT::i8)) :
+ DAG.getConstant(0, MVT::i32);
+
+ SDOperand Tmp2, Tmp3;
+ if (Op.getOpcode() == ISD::SHL_PARTS) {
+ Tmp2 = DAG.getNode(X86ISD::SHLD, MVT::i32, ShOpHi, ShOpLo, ShAmt);
+ Tmp3 = DAG.getNode(ISD::SHL, MVT::i32, ShOpLo, ShAmt);
+ } else {
+ Tmp2 = DAG.getNode(X86ISD::SHRD, MVT::i32, ShOpLo, ShOpHi, ShAmt);
+ Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, MVT::i32, ShOpHi, ShAmt);
+ }
+
+ const MVT::ValueType *VTs = DAG.getNodeValueTypes(MVT::Other, MVT::Flag);
+ SDOperand AndNode = DAG.getNode(ISD::AND, MVT::i8, ShAmt,
+ DAG.getConstant(32, MVT::i8));
+ SDOperand COps[]={DAG.getEntryNode(), AndNode, DAG.getConstant(0, MVT::i8)};
+ SDOperand InFlag = DAG.getNode(X86ISD::CMP, VTs, 2, COps, 3).getValue(1);
+
+ SDOperand Hi, Lo;
+ SDOperand CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+
+ VTs = DAG.getNodeValueTypes(MVT::i32, MVT::Flag);
+ SmallVector<SDOperand, 4> Ops;
+ if (Op.getOpcode() == ISD::SHL_PARTS) {
+ Ops.push_back(Tmp2);
+ Ops.push_back(Tmp3);
+ Ops.push_back(CC);
+ Ops.push_back(InFlag);
+ Hi = DAG.getNode(X86ISD::CMOV, VTs, 2, &Ops[0], Ops.size());
+ InFlag = Hi.getValue(1);
+
+ Ops.clear();
+ Ops.push_back(Tmp3);
+ Ops.push_back(Tmp1);
+ Ops.push_back(CC);
+ Ops.push_back(InFlag);
+ Lo = DAG.getNode(X86ISD::CMOV, VTs, 2, &Ops[0], Ops.size());
+ } else {
+ Ops.push_back(Tmp2);
+ Ops.push_back(Tmp3);
+ Ops.push_back(CC);
+ Ops.push_back(InFlag);
+ Lo = DAG.getNode(X86ISD::CMOV, VTs, 2, &Ops[0], Ops.size());
+ InFlag = Lo.getValue(1);
+
+ Ops.clear();
+ Ops.push_back(Tmp3);
+ Ops.push_back(Tmp1);
+ Ops.push_back(CC);
+ Ops.push_back(InFlag);
+ Hi = DAG.getNode(X86ISD::CMOV, VTs, 2, &Ops[0], Ops.size());
+ }
+
+ VTs = DAG.getNodeValueTypes(MVT::i32, MVT::i32);
+ Ops.clear();
+ Ops.push_back(Lo);
+ Ops.push_back(Hi);
+ return DAG.getNode(ISD::MERGE_VALUES, VTs, 2, &Ops[0], Ops.size());
+}
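+
+// Illustrative summary of the lowering above (not part of the original patch),
+// assuming X86ISD::CMOV selects its second value operand when the condition
+// holds; for SHL_PARTS:
+//   Hi = (Amt & 32) ? (Lo << Amt) : shld(Hi, Lo, Amt)
+//   Lo = (Amt & 32) ? 0           : (Lo << Amt)
+// The SRL/SRA_PARTS cases mirror this with shrd and srl/sra.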
+
+SDOperand X86TargetLowering::LowerSINT_TO_FP(SDOperand Op, SelectionDAG &DAG) {
+ assert(Op.getOperand(0).getValueType() <= MVT::i64 &&
+ Op.getOperand(0).getValueType() >= MVT::i16 &&
+ "Unknown SINT_TO_FP to lower!");
+
+ SDOperand Result;
+ MVT::ValueType SrcVT = Op.getOperand(0).getValueType();
+ unsigned Size = MVT::getSizeInBits(SrcVT)/8;
+ MachineFunction &MF = DAG.getMachineFunction();
+ int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size);
+ SDOperand StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+ SDOperand Chain = DAG.getStore(DAG.getEntryNode(), Op.getOperand(0),
+ StackSlot, NULL, 0);
+
+ // Build the FILD
+ SDVTList Tys;
+ if (X86ScalarSSE)
+ Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag);
+ else
+ Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
+ SmallVector<SDOperand, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(StackSlot);
+ Ops.push_back(DAG.getValueType(SrcVT));
+ Result = DAG.getNode(X86ScalarSSE ? X86ISD::FILD_FLAG :X86ISD::FILD,
+ Tys, &Ops[0], Ops.size());
+
+ if (X86ScalarSSE) {
+ Chain = Result.getValue(1);
+ SDOperand InFlag = Result.getValue(2);
+
+ // FIXME: Currently the FST is flagged to the FILD_FLAG. This
+ // shouldn't be necessary except that RFP cannot be live across
+ // multiple blocks. When stackifier is fixed, they can be uncoupled.
+ MachineFunction &MF = DAG.getMachineFunction();
+ int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8);
+ SDOperand StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+ Tys = DAG.getVTList(MVT::Other);
+ SmallVector<SDOperand, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(Result);
+ Ops.push_back(StackSlot);
+ Ops.push_back(DAG.getValueType(Op.getValueType()));
+ Ops.push_back(InFlag);
+ Chain = DAG.getNode(X86ISD::FST, Tys, &Ops[0], Ops.size());
+ Result = DAG.getLoad(Op.getValueType(), Chain, StackSlot, NULL, 0);
+ }
+
+ return Result;
+}
+
+SDOperand X86TargetLowering::LowerFP_TO_SINT(SDOperand Op, SelectionDAG &DAG) {
+ assert(Op.getValueType() <= MVT::i64 && Op.getValueType() >= MVT::i16 &&
+ "Unknown FP_TO_SINT to lower!");
+ // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary
+ // stack slot.
+ MachineFunction &MF = DAG.getMachineFunction();
+ unsigned MemSize = MVT::getSizeInBits(Op.getValueType())/8;
+ int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize);
+ SDOperand StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+
+ unsigned Opc;
+ switch (Op.getValueType()) {
+ default: assert(0 && "Invalid FP_TO_SINT to lower!");
+ case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
+ case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
+ case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
+ }
+
+ SDOperand Chain = DAG.getEntryNode();
+ SDOperand Value = Op.getOperand(0);
+ if (X86ScalarSSE) {
+ assert(Op.getValueType() == MVT::i64 && "Invalid FP_TO_SINT to lower!");
+ Chain = DAG.getStore(Chain, Value, StackSlot, NULL, 0);
+ SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
+ SDOperand Ops[] = {
+ Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType())
+ };
+ Value = DAG.getNode(X86ISD::FLD, Tys, Ops, 3);
+ Chain = Value.getValue(1);
+ SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize);
+ StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+ }
+
+ // Build the FP_TO_INT*_IN_MEM
+ SDOperand Ops[] = { Chain, Value, StackSlot };
+ SDOperand FIST = DAG.getNode(Opc, MVT::Other, Ops, 3);
+
+ // Load the result.
+ return DAG.getLoad(Op.getValueType(), FIST, StackSlot, NULL, 0);
+}
+
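+// Lower FABS by ANDing the value with a constant-pool mask that clears the
+// sign bit; the mask is built as a full 128-bit constant so a packed load and
+// FAND work for both scalar and vector operands.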
+SDOperand X86TargetLowering::LowerFABS(SDOperand Op, SelectionDAG &DAG) {
+ MVT::ValueType VT = Op.getValueType();
+ MVT::ValueType EltVT = VT;
+ if (MVT::isVector(VT))
+ EltVT = MVT::getVectorElementType(VT);
+ const Type *OpNTy = MVT::getTypeForValueType(EltVT);
+ std::vector<Constant*> CV;
+ if (EltVT == MVT::f64) {
+ Constant *C = ConstantFP::get(OpNTy, BitsToDouble(~(1ULL << 63)));
+ CV.push_back(C);
+ CV.push_back(C);
+ } else {
+ Constant *C = ConstantFP::get(OpNTy, BitsToFloat(~(1U << 31)));
+ CV.push_back(C);
+ CV.push_back(C);
+ CV.push_back(C);
+ CV.push_back(C);
+ }
+ Constant *CS = ConstantStruct::get(CV);
+ SDOperand CPIdx = DAG.getConstantPool(CS, getPointerTy(), 4);
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SmallVector<SDOperand, 3> Ops;
+ Ops.push_back(DAG.getEntryNode());
+ Ops.push_back(CPIdx);
+ Ops.push_back(DAG.getSrcValue(NULL));
+ SDOperand Mask = DAG.getNode(X86ISD::LOAD_PACK, Tys, &Ops[0], Ops.size());
+ return DAG.getNode(X86ISD::FAND, VT, Op.getOperand(0), Mask);
+}
+
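+// Lower FNEG by XORing the value with a constant-pool mask that has only the
+// sign bit set, built the same way as the FABS mask above.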
+SDOperand X86TargetLowering::LowerFNEG(SDOperand Op, SelectionDAG &DAG) {
+ MVT::ValueType VT = Op.getValueType();
+ MVT::ValueType EltVT = VT;
+ if (MVT::isVector(VT))
+ EltVT = MVT::getVectorElementType(VT);
+ const Type *OpNTy = MVT::getTypeForValueType(EltVT);
+ std::vector<Constant*> CV;
+ if (EltVT == MVT::f64) {
+ Constant *C = ConstantFP::get(OpNTy, BitsToDouble(1ULL << 63));
+ CV.push_back(C);
+ CV.push_back(C);
+ } else {
+ Constant *C = ConstantFP::get(OpNTy, BitsToFloat(1U << 31));
+ CV.push_back(C);
+ CV.push_back(C);
+ CV.push_back(C);
+ CV.push_back(C);
+ }
+ Constant *CS = ConstantStruct::get(CV);
+ SDOperand CPIdx = DAG.getConstantPool(CS, getPointerTy(), 4);
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SmallVector<SDOperand, 3> Ops;
+ Ops.push_back(DAG.getEntryNode());
+ Ops.push_back(CPIdx);
+ Ops.push_back(DAG.getSrcValue(NULL));
+ SDOperand Mask = DAG.getNode(X86ISD::LOAD_PACK, Tys, &Ops[0], Ops.size());
+ return DAG.getNode(X86ISD::FXOR, VT, Op.getOperand(0), Mask);
+}
+
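+// Lower FCOPYSIGN by masking out everything but the sign bit of the second
+// operand, clearing the sign bit of the first, and ORing the two together.
+// Mismatched f32/f64 operands are handled by extending the source or shifting
+// the extracted sign bit into position.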
+SDOperand X86TargetLowering::LowerFCOPYSIGN(SDOperand Op, SelectionDAG &DAG) {
+ SDOperand Op0 = Op.getOperand(0);
+ SDOperand Op1 = Op.getOperand(1);
+ MVT::ValueType VT = Op.getValueType();
+ MVT::ValueType SrcVT = Op1.getValueType();
+ const Type *SrcTy = MVT::getTypeForValueType(SrcVT);
+
+ // If second operand is smaller, extend it first.
+ if (MVT::getSizeInBits(SrcVT) < MVT::getSizeInBits(VT)) {
+ Op1 = DAG.getNode(ISD::FP_EXTEND, VT, Op1);
+ SrcVT = VT;
+ }
+
+ // First get the sign bit of second operand.
+ std::vector<Constant*> CV;
+ if (SrcVT == MVT::f64) {
+ CV.push_back(ConstantFP::get(SrcTy, BitsToDouble(1ULL << 63)));
+ CV.push_back(ConstantFP::get(SrcTy, 0.0));
+ } else {
+ CV.push_back(ConstantFP::get(SrcTy, BitsToFloat(1U << 31)));
+ CV.push_back(ConstantFP::get(SrcTy, 0.0));
+ CV.push_back(ConstantFP::get(SrcTy, 0.0));
+ CV.push_back(ConstantFP::get(SrcTy, 0.0));
+ }
+ Constant *CS = ConstantStruct::get(CV);
+ SDOperand CPIdx = DAG.getConstantPool(CS, getPointerTy(), 4);
+ SDVTList Tys = DAG.getVTList(SrcVT, MVT::Other);
+ SmallVector<SDOperand, 3> Ops;
+ Ops.push_back(DAG.getEntryNode());
+ Ops.push_back(CPIdx);
+ Ops.push_back(DAG.getSrcValue(NULL));
+ SDOperand Mask1 = DAG.getNode(X86ISD::LOAD_PACK, Tys, &Ops[0], Ops.size());
+ SDOperand SignBit = DAG.getNode(X86ISD::FAND, SrcVT, Op1, Mask1);
+
+ // Shift sign bit right or left if the two operands have different types.
+ if (MVT::getSizeInBits(SrcVT) > MVT::getSizeInBits(VT)) {
+ // Op0 is MVT::f32, Op1 is MVT::f64.
+ SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, MVT::v2f64, SignBit);
+ SignBit = DAG.getNode(X86ISD::FSRL, MVT::v2f64, SignBit,
+ DAG.getConstant(32, MVT::i32));
+ SignBit = DAG.getNode(ISD::BIT_CONVERT, MVT::v4f32, SignBit);
+ SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::f32, SignBit,
+ DAG.getConstant(0, getPointerTy()));
+ }
+
+ // Clear first operand sign bit.
+ CV.clear();
+ if (VT == MVT::f64) {
+ CV.push_back(ConstantFP::get(SrcTy, BitsToDouble(~(1ULL << 63))));
+ CV.push_back(ConstantFP::get(SrcTy, 0.0));
+ } else {
+ CV.push_back(ConstantFP::get(SrcTy, BitsToFloat(~(1U << 31))));
+ CV.push_back(ConstantFP::get(SrcTy, 0.0));
+ CV.push_back(ConstantFP::get(SrcTy, 0.0));
+ CV.push_back(ConstantFP::get(SrcTy, 0.0));
+ }
+ CS = ConstantStruct::get(CV);
+ CPIdx = DAG.getConstantPool(CS, getPointerTy(), 4);
+ Tys = DAG.getVTList(VT, MVT::Other);
+ Ops.clear();
+ Ops.push_back(DAG.getEntryNode());
+ Ops.push_back(CPIdx);
+ Ops.push_back(DAG.getSrcValue(NULL));
+ SDOperand Mask2 = DAG.getNode(X86ISD::LOAD_PACK, Tys, &Ops[0], Ops.size());
+ SDOperand Val = DAG.getNode(X86ISD::FAND, VT, Op0, Mask2);
+
+ // Or the value with the sign bit.
+ return DAG.getNode(X86ISD::FOR, VT, Val, SignBit);
+}
+
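+// Lower SETCC to an X86ISD::CMP followed by an X86ISD::SETCC on the resulting
+// flags. Ordered equality and unordered inequality have no single condition
+// code, so they are built from two SETCCs (on PF and ZF) combined with AND/OR.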
+SDOperand X86TargetLowering::LowerSETCC(SDOperand Op, SelectionDAG &DAG,
+ SDOperand Chain) {
+ assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
+ SDOperand Cond;
+ SDOperand Op0 = Op.getOperand(0);
+ SDOperand Op1 = Op.getOperand(1);
+ SDOperand CC = Op.getOperand(2);
+ ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+ const MVT::ValueType *VTs1 = DAG.getNodeValueTypes(MVT::Other, MVT::Flag);
+ const MVT::ValueType *VTs2 = DAG.getNodeValueTypes(MVT::i8, MVT::Flag);
+ bool isFP = MVT::isFloatingPoint(Op.getOperand(1).getValueType());
+ unsigned X86CC;
+
+ if (translateX86CC(cast<CondCodeSDNode>(CC)->get(), isFP, X86CC,
+ Op0, Op1, DAG)) {
+ SDOperand Ops1[] = { Chain, Op0, Op1 };
+ Cond = DAG.getNode(X86ISD::CMP, VTs1, 2, Ops1, 3).getValue(1);
+ SDOperand Ops2[] = { DAG.getConstant(X86CC, MVT::i8), Cond };
+ return DAG.getNode(X86ISD::SETCC, VTs2, 2, Ops2, 2);
+ }
+
+ assert(isFP && "Illegal integer SetCC!");
+
+ SDOperand COps[] = { Chain, Op0, Op1 };
+ Cond = DAG.getNode(X86ISD::CMP, VTs1, 2, COps, 3).getValue(1);
+
+ switch (SetCCOpcode) {
+ default: assert(false && "Illegal floating point SetCC!");
+ case ISD::SETOEQ: { // !PF & ZF
+ SDOperand Ops1[] = { DAG.getConstant(X86::COND_NP, MVT::i8), Cond };
+ SDOperand Tmp1 = DAG.getNode(X86ISD::SETCC, VTs2, 2, Ops1, 2);
+ SDOperand Ops2[] = { DAG.getConstant(X86::COND_E, MVT::i8),
+ Tmp1.getValue(1) };
+ SDOperand Tmp2 = DAG.getNode(X86ISD::SETCC, VTs2, 2, Ops2, 2);
+ return DAG.getNode(ISD::AND, MVT::i8, Tmp1, Tmp2);
+ }
+ case ISD::SETUNE: { // PF | !ZF
+ SDOperand Ops1[] = { DAG.getConstant(X86::COND_P, MVT::i8), Cond };
+ SDOperand Tmp1 = DAG.getNode(X86ISD::SETCC, VTs2, 2, Ops1, 2);
+ SDOperand Ops2[] = { DAG.getConstant(X86::COND_NE, MVT::i8),
+ Tmp1.getValue(1) };
+ SDOperand Tmp2 = DAG.getNode(X86ISD::SETCC, VTs2, 2, Ops2, 2);
+ return DAG.getNode(ISD::OR, MVT::i8, Tmp1, Tmp2);
+ }
+ }
+}
+
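+// Lower SELECT to X86ISD::CMOV. If the condition comes from an existing
+// X86ISD::CMP/COMI/UCOMI, that compare is re-issued as the flag producer
+// (flags cannot be shared); otherwise the condition is tested against zero.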
+SDOperand X86TargetLowering::LowerSELECT(SDOperand Op, SelectionDAG &DAG) {
+ bool addTest = true;
+ SDOperand Chain = DAG.getEntryNode();
+ SDOperand Cond = Op.getOperand(0);
+ SDOperand CC;
+ const MVT::ValueType *VTs = DAG.getNodeValueTypes(MVT::Other, MVT::Flag);
+
+ if (Cond.getOpcode() == ISD::SETCC)
+ Cond = LowerSETCC(Cond, DAG, Chain);
+
+ if (Cond.getOpcode() == X86ISD::SETCC) {
+ CC = Cond.getOperand(0);
+
+ // If condition flag is set by a X86ISD::CMP, then make a copy of it
+ // (since flag operand cannot be shared). Use it as the condition setting
+ // operand in place of the X86ISD::SETCC.
+ // If the X86ISD::SETCC has more than one use, then perhaps it's better
+ // to use a test instead of duplicating the X86ISD::CMP (for register
+    // pressure reasons)?
+ SDOperand Cmp = Cond.getOperand(1);
+ unsigned Opc = Cmp.getOpcode();
+ bool IllegalFPCMov = !X86ScalarSSE &&
+ MVT::isFloatingPoint(Op.getValueType()) &&
+ !hasFPCMov(cast<ConstantSDNode>(CC)->getSignExtended());
+ if ((Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) &&
+ !IllegalFPCMov) {
+ SDOperand Ops[] = { Chain, Cmp.getOperand(1), Cmp.getOperand(2) };
+ Cond = DAG.getNode(Opc, VTs, 2, Ops, 3);
+ addTest = false;
+ }
+ }
+
+ if (addTest) {
+ CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+ SDOperand Ops[] = { Chain, Cond, DAG.getConstant(0, MVT::i8) };
+ Cond = DAG.getNode(X86ISD::CMP, VTs, 2, Ops, 3);
+ }
+
+ VTs = DAG.getNodeValueTypes(Op.getValueType(), MVT::Flag);
+ SmallVector<SDOperand, 4> Ops;
+ // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
+ // condition is true.
+ Ops.push_back(Op.getOperand(2));
+ Ops.push_back(Op.getOperand(1));
+ Ops.push_back(CC);
+ Ops.push_back(Cond.getValue(1));
+ return DAG.getNode(X86ISD::CMOV, VTs, 2, &Ops[0], Ops.size());
+}
+
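+// Lower BRCOND the same way as SELECT: reuse or rebuild the flag-producing
+// compare, then branch on the X86 condition code.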
+SDOperand X86TargetLowering::LowerBRCOND(SDOperand Op, SelectionDAG &DAG) {
+ bool addTest = true;
+ SDOperand Chain = Op.getOperand(0);
+ SDOperand Cond = Op.getOperand(1);
+ SDOperand Dest = Op.getOperand(2);
+ SDOperand CC;
+ const MVT::ValueType *VTs = DAG.getNodeValueTypes(MVT::Other, MVT::Flag);
+
+ if (Cond.getOpcode() == ISD::SETCC)
+ Cond = LowerSETCC(Cond, DAG, Chain);
+
+ if (Cond.getOpcode() == X86ISD::SETCC) {
+ CC = Cond.getOperand(0);
+
+ // If condition flag is set by a X86ISD::CMP, then make a copy of it
+ // (since flag operand cannot be shared). Use it as the condition setting
+ // operand in place of the X86ISD::SETCC.
+ // If the X86ISD::SETCC has more than one use, then perhaps it's better
+ // to use a test instead of duplicating the X86ISD::CMP (for register
+    // pressure reasons)?
+ SDOperand Cmp = Cond.getOperand(1);
+ unsigned Opc = Cmp.getOpcode();
+ if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) {
+ SDOperand Ops[] = { Chain, Cmp.getOperand(1), Cmp.getOperand(2) };
+ Cond = DAG.getNode(Opc, VTs, 2, Ops, 3);
+ addTest = false;
+ }
+ }
+
+ if (addTest) {
+ CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+ SDOperand Ops[] = { Chain, Cond, DAG.getConstant(0, MVT::i8) };
+ Cond = DAG.getNode(X86ISD::CMP, VTs, 2, Ops, 3);
+ }
+  return DAG.getNode(X86ISD::BRCOND, Op.getValueType(),
+                     Cond, Dest, CC, Cond.getValue(1));
+}
+
+SDOperand X86TargetLowering::LowerCALL(SDOperand Op, SelectionDAG &DAG) {
+  unsigned CallingConv = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
+
+ if (Subtarget->is64Bit())
+ return LowerX86_64CCCCallTo(Op, DAG, CallingConv);
+ else
+ switch (CallingConv) {
+ default:
+ assert(0 && "Unsupported calling convention");
+ case CallingConv::Fast:
+ // TODO: Implement fastcc
+ // Falls through
+ case CallingConv::C:
+ case CallingConv::X86_StdCall:
+ return LowerCCCCallTo(Op, DAG, CallingConv);
+ case CallingConv::X86_FastCall:
+ return LowerFastCCCallTo(Op, DAG, CallingConv);
+ }
+}
+
+
+// Lower dynamic stack allocation to an _alloca call for Cygwin/Mingw targets.
+// Calls to _alloca are needed to probe the stack when allocating more than 4k
+// bytes in one go. Touching the stack at 4K increments is necessary to ensure
+// that the guard pages used by the OS virtual memory manager are allocated in
+// the correct sequence.
+SDOperand
+X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDOperand Op,
+ SelectionDAG &DAG) {
+ assert(Subtarget->isTargetCygMing() &&
+ "This should be used only on Cygwin/Mingw targets");
+
+ // Get the inputs.
+ SDOperand Chain = Op.getOperand(0);
+ SDOperand Size = Op.getOperand(1);
+ // FIXME: Ensure alignment here
+
+ SDOperand Flag;
+
+ MVT::ValueType IntPtr = getPointerTy();
+ MVT::ValueType SPTy = (Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
+
+ Chain = DAG.getCopyToReg(Chain, X86::EAX, Size, Flag);
+ Flag = Chain.getValue(1);
+
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SDOperand Ops[] = { Chain,
+ DAG.getTargetExternalSymbol("_alloca", IntPtr),
+ DAG.getRegister(X86::EAX, IntPtr),
+ Flag };
+ Chain = DAG.getNode(X86ISD::CALL, NodeTys, Ops, 4);
+ Flag = Chain.getValue(1);
+
+ Chain = DAG.getCopyFromReg(Chain, X86StackPtr, SPTy).getValue(1);
+
+ std::vector<MVT::ValueType> Tys;
+ Tys.push_back(SPTy);
+ Tys.push_back(MVT::Other);
+ SDOperand Ops1[2] = { Chain.getValue(0), Chain };
+ return DAG.getNode(ISD::MERGE_VALUES, Tys, Ops1, 2);
+}
+
+SDOperand
+X86TargetLowering::LowerFORMAL_ARGUMENTS(SDOperand Op, SelectionDAG &DAG) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ const Function* Fn = MF.getFunction();
+ if (Fn->hasExternalLinkage() &&
+ Subtarget->isTargetCygMing() &&
+ Fn->getName() == "main")
+ MF.getInfo<X86MachineFunctionInfo>()->setForceFramePointer(true);
+
+ unsigned CC = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
+ if (Subtarget->is64Bit())
+ return LowerX86_64CCCArguments(Op, DAG);
+ else
+ switch(CC) {
+ default:
+ assert(0 && "Unsupported calling convention");
+ case CallingConv::Fast:
+ // TODO: implement fastcc.
+
+ // Falls through
+ case CallingConv::C:
+ return LowerCCCArguments(Op, DAG);
+ case CallingConv::X86_StdCall:
+ MF.getInfo<X86MachineFunctionInfo>()->setDecorationStyle(StdCall);
+ return LowerCCCArguments(Op, DAG, true);
+ case CallingConv::X86_FastCall:
+ MF.getInfo<X86MachineFunctionInfo>()->setDecorationStyle(FastCall);
+ return LowerFastCCArguments(Op, DAG);
+ }
+}
+
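+// Lower MEMSET to rep;stos. Unaligned or small requests are forwarded to the
+// memset library call. The element width is chosen from the alignment, and
+// any remaining 1-7 bytes are written with explicit stores (or with a second
+// byte-wise rep;stos when the count is not a compile-time constant).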
+SDOperand X86TargetLowering::LowerMEMSET(SDOperand Op, SelectionDAG &DAG) {
+ SDOperand InFlag(0, 0);
+ SDOperand Chain = Op.getOperand(0);
+ unsigned Align =
+ (unsigned)cast<ConstantSDNode>(Op.getOperand(4))->getValue();
+ if (Align == 0) Align = 1;
+
+ ConstantSDNode *I = dyn_cast<ConstantSDNode>(Op.getOperand(3));
+  // If the destination is not DWORD aligned, or the size is below the
+  // rep-string threshold, just call memset; it knows how to align to the
+  // right boundary first.
+ if ((Align & 3) != 0 ||
+ (I && I->getValue() < Subtarget->getMinRepStrSizeThreshold())) {
+ MVT::ValueType IntPtr = getPointerTy();
+ const Type *IntPtrTy = getTargetData()->getIntPtrType();
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Node = Op.getOperand(1);
+ Entry.Ty = IntPtrTy;
+ Args.push_back(Entry);
+ // Extend the unsigned i8 argument to be an int value for the call.
+ Entry.Node = DAG.getNode(ISD::ZERO_EXTEND, MVT::i32, Op.getOperand(2));
+ Entry.Ty = IntPtrTy;
+ Args.push_back(Entry);
+ Entry.Node = Op.getOperand(3);
+ Args.push_back(Entry);
+ std::pair<SDOperand,SDOperand> CallResult =
+ LowerCallTo(Chain, Type::VoidTy, false, false, CallingConv::C, false,
+ DAG.getExternalSymbol("memset", IntPtr), Args, DAG);
+ return CallResult.second;
+ }
+
+ MVT::ValueType AVT;
+ SDOperand Count;
+ ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+ unsigned BytesLeft = 0;
+ bool TwoRepStos = false;
+ if (ValC) {
+ unsigned ValReg;
+ uint64_t Val = ValC->getValue() & 255;
+
+    // If the value is a constant, then we can potentially use wider stores.
+ switch (Align & 3) {
+ case 2: // WORD aligned
+ AVT = MVT::i16;
+ ValReg = X86::AX;
+ Val = (Val << 8) | Val;
+ break;
+ case 0: // DWORD aligned
+ AVT = MVT::i32;
+ ValReg = X86::EAX;
+ Val = (Val << 8) | Val;
+ Val = (Val << 16) | Val;
+ if (Subtarget->is64Bit() && ((Align & 0xF) == 0)) { // QWORD aligned
+ AVT = MVT::i64;
+ ValReg = X86::RAX;
+ Val = (Val << 32) | Val;
+ }
+ break;
+ default: // Byte aligned
+ AVT = MVT::i8;
+ ValReg = X86::AL;
+ Count = Op.getOperand(3);
+ break;
+ }
+
+ if (AVT > MVT::i8) {
+ if (I) {
+ unsigned UBytes = MVT::getSizeInBits(AVT) / 8;
+ Count = DAG.getConstant(I->getValue() / UBytes, getPointerTy());
+ BytesLeft = I->getValue() % UBytes;
+ } else {
+ assert(AVT >= MVT::i32 &&
+ "Do not use rep;stos if not at least DWORD aligned");
+ Count = DAG.getNode(ISD::SRL, Op.getOperand(3).getValueType(),
+ Op.getOperand(3), DAG.getConstant(2, MVT::i8));
+ TwoRepStos = true;
+ }
+ }
+
+ Chain = DAG.getCopyToReg(Chain, ValReg, DAG.getConstant(Val, AVT),
+ InFlag);
+ InFlag = Chain.getValue(1);
+ } else {
+ AVT = MVT::i8;
+ Count = Op.getOperand(3);
+ Chain = DAG.getCopyToReg(Chain, X86::AL, Op.getOperand(2), InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ Chain = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RCX : X86::ECX,
+ Count, InFlag);
+ InFlag = Chain.getValue(1);
+ Chain = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RDI : X86::EDI,
+ Op.getOperand(1), InFlag);
+ InFlag = Chain.getValue(1);
+
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SmallVector<SDOperand, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(DAG.getValueType(AVT));
+ Ops.push_back(InFlag);
+ Chain = DAG.getNode(X86ISD::REP_STOS, Tys, &Ops[0], Ops.size());
+
+ if (TwoRepStos) {
+ InFlag = Chain.getValue(1);
+ Count = Op.getOperand(3);
+ MVT::ValueType CVT = Count.getValueType();
+ SDOperand Left = DAG.getNode(ISD::AND, CVT, Count,
+ DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT));
+ Chain = DAG.getCopyToReg(Chain, (CVT == MVT::i64) ? X86::RCX : X86::ECX,
+ Left, InFlag);
+ InFlag = Chain.getValue(1);
+ Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+ Ops.clear();
+ Ops.push_back(Chain);
+ Ops.push_back(DAG.getValueType(MVT::i8));
+ Ops.push_back(InFlag);
+ Chain = DAG.getNode(X86ISD::REP_STOS, Tys, &Ops[0], Ops.size());
+ } else if (BytesLeft) {
+ // Issue stores for the last 1 - 7 bytes.
+ SDOperand Value;
+ unsigned Val = ValC->getValue() & 255;
+ unsigned Offset = I->getValue() - BytesLeft;
+ SDOperand DstAddr = Op.getOperand(1);
+ MVT::ValueType AddrVT = DstAddr.getValueType();
+ if (BytesLeft >= 4) {
+ Val = (Val << 8) | Val;
+ Val = (Val << 16) | Val;
+ Value = DAG.getConstant(Val, MVT::i32);
+ Chain = DAG.getStore(Chain, Value,
+ DAG.getNode(ISD::ADD, AddrVT, DstAddr,
+ DAG.getConstant(Offset, AddrVT)),
+ NULL, 0);
+ BytesLeft -= 4;
+ Offset += 4;
+ }
+ if (BytesLeft >= 2) {
+ Value = DAG.getConstant((Val << 8) | Val, MVT::i16);
+ Chain = DAG.getStore(Chain, Value,
+ DAG.getNode(ISD::ADD, AddrVT, DstAddr,
+ DAG.getConstant(Offset, AddrVT)),
+ NULL, 0);
+ BytesLeft -= 2;
+ Offset += 2;
+ }
+ if (BytesLeft == 1) {
+ Value = DAG.getConstant(Val, MVT::i8);
+ Chain = DAG.getStore(Chain, Value,
+ DAG.getNode(ISD::ADD, AddrVT, DstAddr,
+ DAG.getConstant(Offset, AddrVT)),
+ NULL, 0);
+ }
+ }
+
+ return Chain;
+}
+
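+// Lower MEMCPY to rep;movs using the same strategy as LowerMEMSET: fall back
+// to the memcpy library call for unaligned or small copies, and mop up any
+// leftover bytes with explicit loads and stores.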
+SDOperand X86TargetLowering::LowerMEMCPY(SDOperand Op, SelectionDAG &DAG) {
+ SDOperand Chain = Op.getOperand(0);
+ unsigned Align =
+ (unsigned)cast<ConstantSDNode>(Op.getOperand(4))->getValue();
+ if (Align == 0) Align = 1;
+
+ ConstantSDNode *I = dyn_cast<ConstantSDNode>(Op.getOperand(3));
+  // If the destination is not DWORD aligned, or the size is below the
+  // rep-string threshold, just call memcpy; it knows how to align to the
+  // right boundary first.
+ if ((Align & 3) != 0 ||
+ (I && I->getValue() < Subtarget->getMinRepStrSizeThreshold())) {
+ MVT::ValueType IntPtr = getPointerTy();
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Ty = getTargetData()->getIntPtrType();
+ Entry.Node = Op.getOperand(1); Args.push_back(Entry);
+ Entry.Node = Op.getOperand(2); Args.push_back(Entry);
+ Entry.Node = Op.getOperand(3); Args.push_back(Entry);
+ std::pair<SDOperand,SDOperand> CallResult =
+ LowerCallTo(Chain, Type::VoidTy, false, false, CallingConv::C, false,
+ DAG.getExternalSymbol("memcpy", IntPtr), Args, DAG);
+ return CallResult.second;
+ }
+
+ MVT::ValueType AVT;
+ SDOperand Count;
+ unsigned BytesLeft = 0;
+ bool TwoRepMovs = false;
+ switch (Align & 3) {
+ case 2: // WORD aligned
+ AVT = MVT::i16;
+ break;
+ case 0: // DWORD aligned
+ AVT = MVT::i32;
+ if (Subtarget->is64Bit() && ((Align & 0xF) == 0)) // QWORD aligned
+ AVT = MVT::i64;
+ break;
+ default: // Byte aligned
+ AVT = MVT::i8;
+ Count = Op.getOperand(3);
+ break;
+ }
+
+ if (AVT > MVT::i8) {
+ if (I) {
+ unsigned UBytes = MVT::getSizeInBits(AVT) / 8;
+ Count = DAG.getConstant(I->getValue() / UBytes, getPointerTy());
+ BytesLeft = I->getValue() % UBytes;
+ } else {
+ assert(AVT >= MVT::i32 &&
+ "Do not use rep;movs if not at least DWORD aligned");
+ Count = DAG.getNode(ISD::SRL, Op.getOperand(3).getValueType(),
+ Op.getOperand(3), DAG.getConstant(2, MVT::i8));
+ TwoRepMovs = true;
+ }
+ }
+
+ SDOperand InFlag(0, 0);
+ Chain = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RCX : X86::ECX,
+ Count, InFlag);
+ InFlag = Chain.getValue(1);
+ Chain = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RDI : X86::EDI,
+ Op.getOperand(1), InFlag);
+ InFlag = Chain.getValue(1);
+ Chain = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RSI : X86::ESI,
+ Op.getOperand(2), InFlag);
+ InFlag = Chain.getValue(1);
+
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SmallVector<SDOperand, 8> Ops;
+ Ops.push_back(Chain);
+ Ops.push_back(DAG.getValueType(AVT));
+ Ops.push_back(InFlag);
+ Chain = DAG.getNode(X86ISD::REP_MOVS, Tys, &Ops[0], Ops.size());
+
+ if (TwoRepMovs) {
+ InFlag = Chain.getValue(1);
+ Count = Op.getOperand(3);
+ MVT::ValueType CVT = Count.getValueType();
+ SDOperand Left = DAG.getNode(ISD::AND, CVT, Count,
+ DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT));
+ Chain = DAG.getCopyToReg(Chain, (CVT == MVT::i64) ? X86::RCX : X86::ECX,
+ Left, InFlag);
+ InFlag = Chain.getValue(1);
+ Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+ Ops.clear();
+ Ops.push_back(Chain);
+ Ops.push_back(DAG.getValueType(MVT::i8));
+ Ops.push_back(InFlag);
+ Chain = DAG.getNode(X86ISD::REP_MOVS, Tys, &Ops[0], Ops.size());
+ } else if (BytesLeft) {
+ // Issue loads and stores for the last 1 - 7 bytes.
+ unsigned Offset = I->getValue() - BytesLeft;
+ SDOperand DstAddr = Op.getOperand(1);
+ MVT::ValueType DstVT = DstAddr.getValueType();
+ SDOperand SrcAddr = Op.getOperand(2);
+ MVT::ValueType SrcVT = SrcAddr.getValueType();
+ SDOperand Value;
+ if (BytesLeft >= 4) {
+ Value = DAG.getLoad(MVT::i32, Chain,
+ DAG.getNode(ISD::ADD, SrcVT, SrcAddr,
+ DAG.getConstant(Offset, SrcVT)),
+ NULL, 0);
+ Chain = Value.getValue(1);
+ Chain = DAG.getStore(Chain, Value,
+ DAG.getNode(ISD::ADD, DstVT, DstAddr,
+ DAG.getConstant(Offset, DstVT)),
+ NULL, 0);
+ BytesLeft -= 4;
+ Offset += 4;
+ }
+ if (BytesLeft >= 2) {
+ Value = DAG.getLoad(MVT::i16, Chain,
+ DAG.getNode(ISD::ADD, SrcVT, SrcAddr,
+ DAG.getConstant(Offset, SrcVT)),
+ NULL, 0);
+ Chain = Value.getValue(1);
+ Chain = DAG.getStore(Chain, Value,
+ DAG.getNode(ISD::ADD, DstVT, DstAddr,
+ DAG.getConstant(Offset, DstVT)),
+ NULL, 0);
+ BytesLeft -= 2;
+ Offset += 2;
+ }
+
+ if (BytesLeft == 1) {
+ Value = DAG.getLoad(MVT::i8, Chain,
+ DAG.getNode(ISD::ADD, SrcVT, SrcAddr,
+ DAG.getConstant(Offset, SrcVT)),
+ NULL, 0);
+ Chain = Value.getValue(1);
+ Chain = DAG.getStore(Chain, Value,
+ DAG.getNode(ISD::ADD, DstVT, DstAddr,
+ DAG.getConstant(Offset, DstVT)),
+ NULL, 0);
+ }
+ }
+
+ return Chain;
+}
+
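+// Lower READCYCLECOUNTER to RDTSC. On x86-64 the two halves are read from
+// RAX/RDX, shifted, and ORed into a single i64; on x86-32 the EDX:EAX pair is
+// returned as two i32 values.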
+SDOperand
+X86TargetLowering::LowerREADCYCLCECOUNTER(SDOperand Op, SelectionDAG &DAG) {
+ SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SDOperand TheOp = Op.getOperand(0);
+ SDOperand rd = DAG.getNode(X86ISD::RDTSC_DAG, Tys, &TheOp, 1);
+ if (Subtarget->is64Bit()) {
+ SDOperand Copy1 = DAG.getCopyFromReg(rd, X86::RAX, MVT::i64, rd.getValue(1));
+ SDOperand Copy2 = DAG.getCopyFromReg(Copy1.getValue(1), X86::RDX,
+ MVT::i64, Copy1.getValue(2));
+ SDOperand Tmp = DAG.getNode(ISD::SHL, MVT::i64, Copy2,
+ DAG.getConstant(32, MVT::i8));
+ SDOperand Ops[] = {
+ DAG.getNode(ISD::OR, MVT::i64, Copy1, Tmp), Copy2.getValue(1)
+ };
+
+ Tys = DAG.getVTList(MVT::i64, MVT::Other);
+ return DAG.getNode(ISD::MERGE_VALUES, Tys, Ops, 2);
+ }
+
+ SDOperand Copy1 = DAG.getCopyFromReg(rd, X86::EAX, MVT::i32, rd.getValue(1));
+ SDOperand Copy2 = DAG.getCopyFromReg(Copy1.getValue(1), X86::EDX,
+ MVT::i32, Copy1.getValue(2));
+ SDOperand Ops[] = { Copy1, Copy2, Copy2.getValue(1) };
+ Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
+ return DAG.getNode(ISD::MERGE_VALUES, Tys, Ops, 3);
+}
+
+SDOperand X86TargetLowering::LowerVASTART(SDOperand Op, SelectionDAG &DAG) {
+ SrcValueSDNode *SV = cast<SrcValueSDNode>(Op.getOperand(2));
+
+ if (!Subtarget->is64Bit()) {
+ // vastart just stores the address of the VarArgsFrameIndex slot into the
+ // memory location argument.
+ SDOperand FR = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
+ return DAG.getStore(Op.getOperand(0), FR,Op.getOperand(1), SV->getValue(),
+ SV->getOffset());
+ }
+
+ // __va_list_tag:
+ // gp_offset (0 - 6 * 8)
+ // fp_offset (48 - 48 + 8 * 16)
+ // overflow_arg_area (point to parameters coming in memory).
+ // reg_save_area
+ SmallVector<SDOperand, 8> MemOps;
+ SDOperand FIN = Op.getOperand(1);
+ // Store gp_offset
+ SDOperand Store = DAG.getStore(Op.getOperand(0),
+ DAG.getConstant(VarArgsGPOffset, MVT::i32),
+ FIN, SV->getValue(), SV->getOffset());
+ MemOps.push_back(Store);
+
+ // Store fp_offset
+ FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN,
+ DAG.getConstant(4, getPointerTy()));
+ Store = DAG.getStore(Op.getOperand(0),
+ DAG.getConstant(VarArgsFPOffset, MVT::i32),
+ FIN, SV->getValue(), SV->getOffset());
+ MemOps.push_back(Store);
+
+ // Store ptr to overflow_arg_area
+ FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN,
+ DAG.getConstant(4, getPointerTy()));
+ SDOperand OVFIN = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
+ Store = DAG.getStore(Op.getOperand(0), OVFIN, FIN, SV->getValue(),
+ SV->getOffset());
+ MemOps.push_back(Store);
+
+ // Store ptr to reg_save_area.
+ FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN,
+ DAG.getConstant(8, getPointerTy()));
+ SDOperand RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
+ Store = DAG.getStore(Op.getOperand(0), RSFIN, FIN, SV->getValue(),
+ SV->getOffset());
+ MemOps.push_back(Store);
+ return DAG.getNode(ISD::TokenFactor, MVT::Other, &MemOps[0], MemOps.size());
+}
+
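+// Lower VACOPY for x86-64 by copying the 24-byte va_list structure as three
+// i64 load/store pairs.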
+SDOperand X86TargetLowering::LowerVACOPY(SDOperand Op, SelectionDAG &DAG) {
+ // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
+ SDOperand Chain = Op.getOperand(0);
+ SDOperand DstPtr = Op.getOperand(1);
+ SDOperand SrcPtr = Op.getOperand(2);
+ SrcValueSDNode *DstSV = cast<SrcValueSDNode>(Op.getOperand(3));
+ SrcValueSDNode *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4));
+
+ SrcPtr = DAG.getLoad(getPointerTy(), Chain, SrcPtr,
+ SrcSV->getValue(), SrcSV->getOffset());
+ Chain = SrcPtr.getValue(1);
+ for (unsigned i = 0; i < 3; ++i) {
+ SDOperand Val = DAG.getLoad(MVT::i64, Chain, SrcPtr,
+ SrcSV->getValue(), SrcSV->getOffset());
+ Chain = Val.getValue(1);
+ Chain = DAG.getStore(Chain, Val, DstPtr,
+ DstSV->getValue(), DstSV->getOffset());
+ if (i == 2)
+ break;
+ SrcPtr = DAG.getNode(ISD::ADD, getPointerTy(), SrcPtr,
+ DAG.getConstant(8, getPointerTy()));
+ DstPtr = DAG.getNode(ISD::ADD, getPointerTy(), DstPtr,
+ DAG.getConstant(8, getPointerTy()));
+ }
+ return Chain;
+}
+
+SDOperand
+X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDOperand Op, SelectionDAG &DAG) {
+ unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getValue();
+ switch (IntNo) {
+ default: return SDOperand(); // Don't custom lower most intrinsics.
+ // Comparison intrinsics.
+ case Intrinsic::x86_sse_comieq_ss:
+ case Intrinsic::x86_sse_comilt_ss:
+ case Intrinsic::x86_sse_comile_ss:
+ case Intrinsic::x86_sse_comigt_ss:
+ case Intrinsic::x86_sse_comige_ss:
+ case Intrinsic::x86_sse_comineq_ss:
+ case Intrinsic::x86_sse_ucomieq_ss:
+ case Intrinsic::x86_sse_ucomilt_ss:
+ case Intrinsic::x86_sse_ucomile_ss:
+ case Intrinsic::x86_sse_ucomigt_ss:
+ case Intrinsic::x86_sse_ucomige_ss:
+ case Intrinsic::x86_sse_ucomineq_ss:
+ case Intrinsic::x86_sse2_comieq_sd:
+ case Intrinsic::x86_sse2_comilt_sd:
+ case Intrinsic::x86_sse2_comile_sd:
+ case Intrinsic::x86_sse2_comigt_sd:
+ case Intrinsic::x86_sse2_comige_sd:
+ case Intrinsic::x86_sse2_comineq_sd:
+ case Intrinsic::x86_sse2_ucomieq_sd:
+ case Intrinsic::x86_sse2_ucomilt_sd:
+ case Intrinsic::x86_sse2_ucomile_sd:
+ case Intrinsic::x86_sse2_ucomigt_sd:
+ case Intrinsic::x86_sse2_ucomige_sd:
+ case Intrinsic::x86_sse2_ucomineq_sd: {
+ unsigned Opc = 0;
+ ISD::CondCode CC = ISD::SETCC_INVALID;
+ switch (IntNo) {
+ default: break;
+ case Intrinsic::x86_sse_comieq_ss:
+ case Intrinsic::x86_sse2_comieq_sd:
+ Opc = X86ISD::COMI;
+ CC = ISD::SETEQ;
+ break;
+ case Intrinsic::x86_sse_comilt_ss:
+ case Intrinsic::x86_sse2_comilt_sd:
+ Opc = X86ISD::COMI;
+ CC = ISD::SETLT;
+ break;
+ case Intrinsic::x86_sse_comile_ss:
+ case Intrinsic::x86_sse2_comile_sd:
+ Opc = X86ISD::COMI;
+ CC = ISD::SETLE;
+ break;
+ case Intrinsic::x86_sse_comigt_ss:
+ case Intrinsic::x86_sse2_comigt_sd:
+ Opc = X86ISD::COMI;
+ CC = ISD::SETGT;
+ break;
+ case Intrinsic::x86_sse_comige_ss:
+ case Intrinsic::x86_sse2_comige_sd:
+ Opc = X86ISD::COMI;
+ CC = ISD::SETGE;
+ break;
+ case Intrinsic::x86_sse_comineq_ss:
+ case Intrinsic::x86_sse2_comineq_sd:
+ Opc = X86ISD::COMI;
+ CC = ISD::SETNE;
+ break;
+ case Intrinsic::x86_sse_ucomieq_ss:
+ case Intrinsic::x86_sse2_ucomieq_sd:
+ Opc = X86ISD::UCOMI;
+ CC = ISD::SETEQ;
+ break;
+ case Intrinsic::x86_sse_ucomilt_ss:
+ case Intrinsic::x86_sse2_ucomilt_sd:
+ Opc = X86ISD::UCOMI;
+ CC = ISD::SETLT;
+ break;
+ case Intrinsic::x86_sse_ucomile_ss:
+ case Intrinsic::x86_sse2_ucomile_sd:
+ Opc = X86ISD::UCOMI;
+ CC = ISD::SETLE;
+ break;
+ case Intrinsic::x86_sse_ucomigt_ss:
+ case Intrinsic::x86_sse2_ucomigt_sd:
+ Opc = X86ISD::UCOMI;
+ CC = ISD::SETGT;
+ break;
+ case Intrinsic::x86_sse_ucomige_ss:
+ case Intrinsic::x86_sse2_ucomige_sd:
+ Opc = X86ISD::UCOMI;
+ CC = ISD::SETGE;
+ break;
+ case Intrinsic::x86_sse_ucomineq_ss:
+ case Intrinsic::x86_sse2_ucomineq_sd:
+ Opc = X86ISD::UCOMI;
+ CC = ISD::SETNE;
+ break;
+ }
+
+ unsigned X86CC;
+ SDOperand LHS = Op.getOperand(1);
+ SDOperand RHS = Op.getOperand(2);
+ translateX86CC(CC, true, X86CC, LHS, RHS, DAG);
+
+ const MVT::ValueType *VTs = DAG.getNodeValueTypes(MVT::Other, MVT::Flag);
+ SDOperand Ops1[] = { DAG.getEntryNode(), LHS, RHS };
+ SDOperand Cond = DAG.getNode(Opc, VTs, 2, Ops1, 3);
+ VTs = DAG.getNodeValueTypes(MVT::i8, MVT::Flag);
+ SDOperand Ops2[] = { DAG.getConstant(X86CC, MVT::i8), Cond };
+ SDOperand SetCC = DAG.getNode(X86ISD::SETCC, VTs, 2, Ops2, 2);
+ return DAG.getNode(ISD::ANY_EXTEND, MVT::i32, SetCC);
+ }
+ }
+}
+
+SDOperand X86TargetLowering::LowerRETURNADDR(SDOperand Op, SelectionDAG &DAG) {
+ // Depths > 0 not supported yet!
+ if (cast<ConstantSDNode>(Op.getOperand(0))->getValue() > 0)
+ return SDOperand();
+
+ // Just load the return address
+ SDOperand RetAddrFI = getReturnAddressFrameIndex(DAG);
+ return DAG.getLoad(getPointerTy(), DAG.getEntryNode(), RetAddrFI, NULL, 0);
+}
+
+SDOperand X86TargetLowering::LowerFRAMEADDR(SDOperand Op, SelectionDAG &DAG) {
+ // Depths > 0 not supported yet!
+ if (cast<ConstantSDNode>(Op.getOperand(0))->getValue() > 0)
+ return SDOperand();
+
+ SDOperand RetAddrFI = getReturnAddressFrameIndex(DAG);
+ return DAG.getNode(ISD::SUB, getPointerTy(), RetAddrFI,
+ DAG.getConstant(4, getPointerTy()));
+}
+
+SDOperand X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDOperand Op,
+ SelectionDAG &DAG) {
+  // Not yet supported on x86-64.
+ if (Subtarget->is64Bit())
+ return SDOperand();
+
+ return DAG.getConstant(8, getPointerTy());
+}
+
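+// Lower EH_RETURN by storing the handler address just below the frame pointer
+// (adjusted by the given offset), passing that slot's address in ECX, and
+// emitting X86ISD::EH_RETURN.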
+SDOperand X86TargetLowering::LowerEH_RETURN(SDOperand Op, SelectionDAG &DAG)
+{
+ assert(!Subtarget->is64Bit() &&
+ "Lowering of eh_return builtin is not supported yet on x86-64");
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ SDOperand Chain = Op.getOperand(0);
+ SDOperand Offset = Op.getOperand(1);
+ SDOperand Handler = Op.getOperand(2);
+
+ SDOperand Frame = DAG.getRegister(RegInfo->getFrameRegister(MF),
+ getPointerTy());
+
+ SDOperand StoreAddr = DAG.getNode(ISD::SUB, getPointerTy(), Frame,
+ DAG.getConstant(-4UL, getPointerTy()));
+ StoreAddr = DAG.getNode(ISD::ADD, getPointerTy(), StoreAddr, Offset);
+ Chain = DAG.getStore(Chain, Handler, StoreAddr, NULL, 0);
+ Chain = DAG.getCopyToReg(Chain, X86::ECX, StoreAddr);
+ MF.addLiveOut(X86::ECX);
+
+ return DAG.getNode(X86ISD::EH_RETURN, MVT::Other,
+ Chain, DAG.getRegister(X86::ECX, getPointerTy()));
+}
+
+/// LowerOperation - Provide custom lowering hooks for some operations.
+///
+SDOperand X86TargetLowering::LowerOperation(SDOperand Op, SelectionDAG &DAG) {
+ switch (Op.getOpcode()) {
+ default: assert(0 && "Should not custom lower this!");
+ case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
+ case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
+ case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+ case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
+ case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
+ case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
+ case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
+ case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
+ case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
+ case ISD::SHL_PARTS:
+ case ISD::SRA_PARTS:
+ case ISD::SRL_PARTS: return LowerShift(Op, DAG);
+ case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
+ case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
+ case ISD::FABS: return LowerFABS(Op, DAG);
+ case ISD::FNEG: return LowerFNEG(Op, DAG);
+ case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
+ case ISD::SETCC: return LowerSETCC(Op, DAG, DAG.getEntryNode());
+ case ISD::SELECT: return LowerSELECT(Op, DAG);
+ case ISD::BRCOND: return LowerBRCOND(Op, DAG);
+ case ISD::JumpTable: return LowerJumpTable(Op, DAG);
+ case ISD::CALL: return LowerCALL(Op, DAG);
+ case ISD::RET: return LowerRET(Op, DAG);
+ case ISD::FORMAL_ARGUMENTS: return LowerFORMAL_ARGUMENTS(Op, DAG);
+ case ISD::MEMSET: return LowerMEMSET(Op, DAG);
+ case ISD::MEMCPY: return LowerMEMCPY(Op, DAG);
+ case ISD::READCYCLECOUNTER: return LowerREADCYCLCECOUNTER(Op, DAG);
+ case ISD::VASTART: return LowerVASTART(Op, DAG);
+ case ISD::VACOPY: return LowerVACOPY(Op, DAG);
+ case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+ case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
+ case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
+ case ISD::FRAME_TO_ARGS_OFFSET:
+ return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
+ case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
+ case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
+ }
+ return SDOperand();
+}
+
+const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
+ switch (Opcode) {
+ default: return NULL;
+ case X86ISD::SHLD: return "X86ISD::SHLD";
+ case X86ISD::SHRD: return "X86ISD::SHRD";
+ case X86ISD::FAND: return "X86ISD::FAND";
+ case X86ISD::FOR: return "X86ISD::FOR";
+ case X86ISD::FXOR: return "X86ISD::FXOR";
+ case X86ISD::FSRL: return "X86ISD::FSRL";
+ case X86ISD::FILD: return "X86ISD::FILD";
+ case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
+ case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
+ case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
+ case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
+ case X86ISD::FLD: return "X86ISD::FLD";
+ case X86ISD::FST: return "X86ISD::FST";
+ case X86ISD::FP_GET_RESULT: return "X86ISD::FP_GET_RESULT";
+ case X86ISD::FP_SET_RESULT: return "X86ISD::FP_SET_RESULT";
+ case X86ISD::CALL: return "X86ISD::CALL";
+ case X86ISD::TAILCALL: return "X86ISD::TAILCALL";
+ case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
+ case X86ISD::CMP: return "X86ISD::CMP";
+ case X86ISD::COMI: return "X86ISD::COMI";
+ case X86ISD::UCOMI: return "X86ISD::UCOMI";
+ case X86ISD::SETCC: return "X86ISD::SETCC";
+ case X86ISD::CMOV: return "X86ISD::CMOV";
+ case X86ISD::BRCOND: return "X86ISD::BRCOND";
+ case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
+ case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
+ case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
+ case X86ISD::LOAD_PACK: return "X86ISD::LOAD_PACK";
+ case X86ISD::LOAD_UA: return "X86ISD::LOAD_UA";
+ case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
+ case X86ISD::Wrapper: return "X86ISD::Wrapper";
+ case X86ISD::S2VEC: return "X86ISD::S2VEC";
+ case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
+ case X86ISD::PINSRW: return "X86ISD::PINSRW";
+ case X86ISD::FMAX: return "X86ISD::FMAX";
+ case X86ISD::FMIN: return "X86ISD::FMIN";
+ case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
+ case X86ISD::FRCP: return "X86ISD::FRCP";
+ case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
+ case X86ISD::THREAD_POINTER: return "X86ISD::THREAD_POINTER";
+ case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
+ }
+}
+
+// isLegalAddressingMode - Return true if the addressing mode represented
+// by AM is legal for this target, for a load/store of the specified type.
+bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
+ const Type *Ty) const {
+ // X86 supports extremely general addressing modes.
+
+ // X86 allows a sign-extended 32-bit immediate field as a displacement.
+ if (AM.BaseOffs <= -(1LL << 32) || AM.BaseOffs >= (1LL << 32)-1)
+ return false;
+
+ if (AM.BaseGV) {
+ // X86-64 only supports addr of globals in small code model.
+ if (Subtarget->is64Bit() &&
+ getTargetMachine().getCodeModel() != CodeModel::Small)
+ return false;
+
+ // We can only fold this if we don't need a load either.
+ if (Subtarget->GVRequiresExtraLoad(AM.BaseGV, getTargetMachine(), false))
+ return false;
+ }
+
+ switch (AM.Scale) {
+ case 0:
+ case 1:
+ case 2:
+ case 4:
+ case 8:
+ // These scales always work.
+ break;
+ case 3:
+ case 5:
+ case 9:
+ // These scales are formed with basereg+scalereg. Only accept if there is
+ // no basereg yet.
+ if (AM.HasBaseReg)
+ return false;
+ break;
+ default: // Other stuff never works.
+ return false;
+ }
+
+ return true;
+}
+
+
+/// isShuffleMaskLegal - Targets can use this to indicate that they only
+/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
+/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
+/// are assumed to be legal.
+bool
+X86TargetLowering::isShuffleMaskLegal(SDOperand Mask, MVT::ValueType VT) const {
+ // Only do shuffles on 128-bit vector types for now.
+ if (MVT::getSizeInBits(VT) == 64) return false;
+ return (Mask.Val->getNumOperands() <= 4 ||
+ isIdentityMask(Mask.Val) ||
+ isIdentityMask(Mask.Val, true) ||
+ isSplatMask(Mask.Val) ||
+ isPSHUFHW_PSHUFLWMask(Mask.Val) ||
+ X86::isUNPCKLMask(Mask.Val) ||
+ X86::isUNPCKHMask(Mask.Val) ||
+ X86::isUNPCKL_v_undef_Mask(Mask.Val) ||
+ X86::isUNPCKH_v_undef_Mask(Mask.Val));
+}
+
+bool X86TargetLowering::isVectorClearMaskLegal(std::vector<SDOperand> &BVOps,
+ MVT::ValueType EVT,
+ SelectionDAG &DAG) const {
+ unsigned NumElts = BVOps.size();
+ // Only do shuffles on 128-bit vector types for now.
+ if (MVT::getSizeInBits(EVT) * NumElts == 64) return false;
+ if (NumElts == 2) return true;
+ if (NumElts == 4) {
+ return (isMOVLMask(&BVOps[0], 4) ||
+ isCommutedMOVL(&BVOps[0], 4, true) ||
+ isSHUFPMask(&BVOps[0], 4) ||
+ isCommutedSHUFP(&BVOps[0], 4));
+ }
+ return false;
+}
+
+//===----------------------------------------------------------------------===//
+// X86 Scheduler Hooks
+//===----------------------------------------------------------------------===//
+
+MachineBasicBlock *
+X86TargetLowering::InsertAtEndOfBasicBlock(MachineInstr *MI,
+ MachineBasicBlock *BB) {
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ switch (MI->getOpcode()) {
+ default: assert(false && "Unexpected instr type to insert");
+ case X86::CMOV_FR32:
+ case X86::CMOV_FR64:
+ case X86::CMOV_V4F32:
+ case X86::CMOV_V2F64:
+ case X86::CMOV_V2I64: {
+ // To "insert" a SELECT_CC instruction, we actually have to insert the
+ // diamond control-flow pattern. The incoming instruction knows the
+ // destination vreg to set, the condition code register to branch on, the
+ // true/false values to select between, and a branch opcode to use.
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ ilist<MachineBasicBlock>::iterator It = BB;
+ ++It;
+
+ // thisMBB:
+ // ...
+ // TrueVal = ...
+ // cmpTY ccX, r1, r2
+ // bCC copy1MBB
+ // fallthrough --> copy0MBB
+ MachineBasicBlock *thisMBB = BB;
+ MachineBasicBlock *copy0MBB = new MachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *sinkMBB = new MachineBasicBlock(LLVM_BB);
+ unsigned Opc =
+ X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
+ BuildMI(BB, TII->get(Opc)).addMBB(sinkMBB);
+ MachineFunction *F = BB->getParent();
+ F->getBasicBlockList().insert(It, copy0MBB);
+ F->getBasicBlockList().insert(It, sinkMBB);
+ // Update machine-CFG edges by first adding all successors of the current
+ // block to the new block which will contain the Phi node for the select.
+ for(MachineBasicBlock::succ_iterator i = BB->succ_begin(),
+ e = BB->succ_end(); i != e; ++i)
+ sinkMBB->addSuccessor(*i);
+ // Next, remove all successors of the current block, and add the true
+ // and fallthrough blocks as its successors.
+ while(!BB->succ_empty())
+ BB->removeSuccessor(BB->succ_begin());
+ BB->addSuccessor(copy0MBB);
+ BB->addSuccessor(sinkMBB);
+
+ // copy0MBB:
+ // %FalseValue = ...
+ // # fallthrough to sinkMBB
+ BB = copy0MBB;
+
+ // Update machine-CFG edges
+ BB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+ // ...
+ BB = sinkMBB;
+ BuildMI(BB, TII->get(X86::PHI), MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
+ .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+
+ delete MI; // The pseudo instruction is gone now.
+ return BB;
+ }
+
+ case X86::FP32_TO_INT16_IN_MEM:
+ case X86::FP32_TO_INT32_IN_MEM:
+ case X86::FP32_TO_INT64_IN_MEM:
+ case X86::FP64_TO_INT16_IN_MEM:
+ case X86::FP64_TO_INT32_IN_MEM:
+ case X86::FP64_TO_INT64_IN_MEM: {
+ // Change the floating point control register to use "round towards zero"
+ // mode when truncating to an integer value.
+ MachineFunction *F = BB->getParent();
+ int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2);
+ addFrameReference(BuildMI(BB, TII->get(X86::FNSTCW16m)), CWFrameIdx);
+
+ // Load the old value of the high byte of the control word...
+ unsigned OldCW =
+ F->getSSARegMap()->createVirtualRegister(X86::GR16RegisterClass);
+ addFrameReference(BuildMI(BB, TII->get(X86::MOV16rm), OldCW), CWFrameIdx);
+
+ // Set the high part to be round to zero...
+ addFrameReference(BuildMI(BB, TII->get(X86::MOV16mi)), CWFrameIdx)
+ .addImm(0xC7F);
+
+ // Reload the modified control word now...
+ addFrameReference(BuildMI(BB, TII->get(X86::FLDCW16m)), CWFrameIdx);
+
+ // Restore the memory image of control word to original value
+ addFrameReference(BuildMI(BB, TII->get(X86::MOV16mr)), CWFrameIdx)
+ .addReg(OldCW);
+
+ // Get the X86 opcode to use.
+ unsigned Opc;
+ switch (MI->getOpcode()) {
+ default: assert(0 && "illegal opcode!");
+ case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
+ case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
+ case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
+ case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
+ case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
+ case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
+ }
+
+ X86AddressMode AM;
+ MachineOperand &Op = MI->getOperand(0);
+ if (Op.isRegister()) {
+ AM.BaseType = X86AddressMode::RegBase;
+ AM.Base.Reg = Op.getReg();
+ } else {
+ AM.BaseType = X86AddressMode::FrameIndexBase;
+ AM.Base.FrameIndex = Op.getFrameIndex();
+ }
+ Op = MI->getOperand(1);
+ if (Op.isImmediate())
+ AM.Scale = Op.getImm();
+ Op = MI->getOperand(2);
+ if (Op.isImmediate())
+ AM.IndexReg = Op.getImm();
+ Op = MI->getOperand(3);
+ if (Op.isGlobalAddress()) {
+ AM.GV = Op.getGlobal();
+ } else {
+ AM.Disp = Op.getImm();
+ }
+ addFullAddress(BuildMI(BB, TII->get(Opc)), AM)
+ .addReg(MI->getOperand(4).getReg());
+
+ // Reload the original control word now.
+ addFrameReference(BuildMI(BB, TII->get(X86::FLDCW16m)), CWFrameIdx);
+
+ delete MI; // The pseudo instruction is gone now.
+ return BB;
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// X86 Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+void X86TargetLowering::computeMaskedBitsForTargetNode(const SDOperand Op,
+ uint64_t Mask,
+ uint64_t &KnownZero,
+ uint64_t &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth) const {
+ unsigned Opc = Op.getOpcode();
+ assert((Opc >= ISD::BUILTIN_OP_END ||
+ Opc == ISD::INTRINSIC_WO_CHAIN ||
+ Opc == ISD::INTRINSIC_W_CHAIN ||
+ Opc == ISD::INTRINSIC_VOID) &&
+ "Should use MaskedValueIsZero if you don't know whether Op"
+ " is a target node!");
+
+ KnownZero = KnownOne = 0; // Don't know anything.
+ switch (Opc) {
+ default: break;
+ case X86ISD::SETCC:
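+    // SETCC produces only 0 or 1, so every bit above bit 0 is known zero.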
+ KnownZero |= (MVT::getIntVTBitMask(Op.getValueType()) ^ 1ULL);
+ break;
+ }
+}
+
+/// getShuffleScalarElt - Returns the scalar element that will make up the ith
+/// element of the result of the vector shuffle.
+static SDOperand getShuffleScalarElt(SDNode *N, unsigned i, SelectionDAG &DAG) {
+ MVT::ValueType VT = N->getValueType(0);
+ SDOperand PermMask = N->getOperand(2);
+ unsigned NumElems = PermMask.getNumOperands();
+ SDOperand V = (i < NumElems) ? N->getOperand(0) : N->getOperand(1);
+ i %= NumElems;
+ if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+    return (i == 0) ? V.getOperand(0)
+                    : DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(VT));
+ } else if (V.getOpcode() == ISD::VECTOR_SHUFFLE) {
+ SDOperand Idx = PermMask.getOperand(i);
+ if (Idx.getOpcode() == ISD::UNDEF)
+ return DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(VT));
+ return getShuffleScalarElt(V.Val,cast<ConstantSDNode>(Idx)->getValue(),DAG);
+ }
+ return SDOperand();
+}
+
+/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
+/// node is a GlobalAddress + an offset.
+static bool isGAPlusOffset(SDNode *N, GlobalValue* &GA, int64_t &Offset) {
+ unsigned Opc = N->getOpcode();
+ if (Opc == X86ISD::Wrapper) {
+ if (dyn_cast<GlobalAddressSDNode>(N->getOperand(0))) {
+ GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
+ return true;
+ }
+ } else if (Opc == ISD::ADD) {
+ SDOperand N1 = N->getOperand(0);
+ SDOperand N2 = N->getOperand(1);
+ if (isGAPlusOffset(N1.Val, GA, Offset)) {
+ ConstantSDNode *V = dyn_cast<ConstantSDNode>(N2);
+ if (V) {
+ Offset += V->getSignExtended();
+ return true;
+ }
+ } else if (isGAPlusOffset(N2.Val, GA, Offset)) {
+ ConstantSDNode *V = dyn_cast<ConstantSDNode>(N1);
+ if (V) {
+ Offset += V->getSignExtended();
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+/// isConsecutiveLoad - Returns true if N is loading from an address of Base
+/// + Dist * Size.
+static bool isConsecutiveLoad(SDNode *N, SDNode *Base, int Dist, int Size,
+ MachineFrameInfo *MFI) {
+ if (N->getOperand(0).Val != Base->getOperand(0).Val)
+ return false;
+
+ SDOperand Loc = N->getOperand(1);
+ SDOperand BaseLoc = Base->getOperand(1);
+ if (Loc.getOpcode() == ISD::FrameIndex) {
+ if (BaseLoc.getOpcode() != ISD::FrameIndex)
+ return false;
+ int FI = dyn_cast<FrameIndexSDNode>(Loc)->getIndex();
+ int BFI = dyn_cast<FrameIndexSDNode>(BaseLoc)->getIndex();
+ int FS = MFI->getObjectSize(FI);
+ int BFS = MFI->getObjectSize(BFI);
+ if (FS != BFS || FS != Size) return false;
+ return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Size);
+ } else {
+ GlobalValue *GV1 = NULL;
+ GlobalValue *GV2 = NULL;
+ int64_t Offset1 = 0;
+ int64_t Offset2 = 0;
+ bool isGA1 = isGAPlusOffset(Loc.Val, GV1, Offset1);
+ bool isGA2 = isGAPlusOffset(BaseLoc.Val, GV2, Offset2);
+ if (isGA1 && isGA2 && GV1 == GV2)
+ return Offset1 == (Offset2 + Dist*Size);
+ }
+
+ return false;
+}
+
+static bool isBaseAlignment16(SDNode *Base, MachineFrameInfo *MFI,
+ const X86Subtarget *Subtarget) {
+ GlobalValue *GV;
+  int64_t Offset = 0;
+ if (isGAPlusOffset(Base, GV, Offset))
+ return (GV->getAlignment() >= 16 && (Offset % 16) == 0);
+ else {
+ assert(Base->getOpcode() == ISD::FrameIndex && "Unexpected base node!");
+ int BFI = dyn_cast<FrameIndexSDNode>(Base)->getIndex();
+ if (BFI < 0)
+      // Fixed objects do not specify alignment, but their offsets are known.
+ return ((Subtarget->getStackAlignment() % 16) == 0 &&
+ (MFI->getObjectOffset(BFI) % 16) == 0);
+ else
+ return MFI->getObjectAlignment(BFI) >= 16;
+ }
+ return false;
+}
+
+
+/// PerformShuffleCombine - Combine a vector_shuffle that is equal to
+/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
+/// if the load addresses are consecutive, non-overlapping, and in the right
+/// order.
+static SDOperand PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MVT::ValueType VT = N->getValueType(0);
+ MVT::ValueType EVT = MVT::getVectorElementType(VT);
+ SDOperand PermMask = N->getOperand(2);
+ int NumElems = (int)PermMask.getNumOperands();
+ SDNode *Base = NULL;
+ for (int i = 0; i < NumElems; ++i) {
+ SDOperand Idx = PermMask.getOperand(i);
+ if (Idx.getOpcode() == ISD::UNDEF) {
+ if (!Base) return SDOperand();
+ } else {
+ SDOperand Arg =
+ getShuffleScalarElt(N, cast<ConstantSDNode>(Idx)->getValue(), DAG);
+ if (!Arg.Val || !ISD::isNON_EXTLoad(Arg.Val))
+ return SDOperand();
+ if (!Base)
+ Base = Arg.Val;
+ else if (!isConsecutiveLoad(Arg.Val, Base,
+ i, MVT::getSizeInBits(EVT)/8,MFI))
+ return SDOperand();
+ }
+ }
+
+ bool isAlign16 = isBaseAlignment16(Base->getOperand(1).Val, MFI, Subtarget);
+ if (isAlign16) {
+ LoadSDNode *LD = cast<LoadSDNode>(Base);
+ return DAG.getLoad(VT, LD->getChain(), LD->getBasePtr(), LD->getSrcValue(),
+ LD->getSrcValueOffset());
+ } else {
+ // Just use movups, it's shorter.
+ SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
+ SmallVector<SDOperand, 3> Ops;
+ Ops.push_back(Base->getOperand(0));
+ Ops.push_back(Base->getOperand(1));
+ Ops.push_back(Base->getOperand(2));
+ return DAG.getNode(ISD::BIT_CONVERT, VT,
+ DAG.getNode(X86ISD::LOAD_UA, Tys, &Ops[0], Ops.size()));
+ }
+}
+
+/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
+static SDOperand PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ SDOperand Cond = N->getOperand(0);
+
+  // If we have SSE2 support, try to form min/max nodes for f32/f64 selects.
+ if (Subtarget->hasSSE2() &&
+ (N->getValueType(0) == MVT::f32 || N->getValueType(0) == MVT::f64)) {
+ if (Cond.getOpcode() == ISD::SETCC) {
+ // Get the LHS/RHS of the select.
+ SDOperand LHS = N->getOperand(1);
+ SDOperand RHS = N->getOperand(2);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+ unsigned Opcode = 0;
+ if (LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) {
+ switch (CC) {
+ default: break;
+ case ISD::SETOLE: // (X <= Y) ? X : Y -> min
+ case ISD::SETULE:
+ case ISD::SETLE:
+ if (!UnsafeFPMath) break;
+ // FALL THROUGH.
+ case ISD::SETOLT: // (X olt/lt Y) ? X : Y -> min
+ case ISD::SETLT:
+ Opcode = X86ISD::FMIN;
+ break;
+
+ case ISD::SETOGT: // (X > Y) ? X : Y -> max
+ case ISD::SETUGT:
+ case ISD::SETGT:
+ if (!UnsafeFPMath) break;
+ // FALL THROUGH.
+ case ISD::SETUGE: // (X uge/ge Y) ? X : Y -> max
+ case ISD::SETGE:
+ Opcode = X86ISD::FMAX;
+ break;
+ }
+ } else if (LHS == Cond.getOperand(1) && RHS == Cond.getOperand(0)) {
+ switch (CC) {
+ default: break;
+ case ISD::SETOGT: // (X > Y) ? Y : X -> min
+ case ISD::SETUGT:
+ case ISD::SETGT:
+ if (!UnsafeFPMath) break;
+ // FALL THROUGH.
+ case ISD::SETUGE: // (X uge/ge Y) ? Y : X -> min
+ case ISD::SETGE:
+ Opcode = X86ISD::FMIN;
+ break;
+
+ case ISD::SETOLE: // (X <= Y) ? Y : X -> max
+ case ISD::SETULE:
+ case ISD::SETLE:
+ if (!UnsafeFPMath) break;
+ // FALL THROUGH.
+ case ISD::SETOLT: // (X olt/lt Y) ? Y : X -> max
+ case ISD::SETLT:
+ Opcode = X86ISD::FMAX;
+ break;
+ }
+ }
+
+ if (Opcode)
+ return DAG.getNode(Opcode, N->getValueType(0), LHS, RHS);
+ }
+
+ }
+
+ return SDOperand();
+}
+
+
+SDOperand X86TargetLowering::PerformDAGCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ switch (N->getOpcode()) {
+ default: break;
+ case ISD::VECTOR_SHUFFLE:
+ return PerformShuffleCombine(N, DAG, Subtarget);
+ case ISD::SELECT:
+ return PerformSELECTCombine(N, DAG, Subtarget);
+ }
+
+ return SDOperand();
+}
+
+//===----------------------------------------------------------------------===//
+// X86 Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+X86TargetLowering::ConstraintType
+X86TargetLowering::getConstraintType(const std::string &Constraint) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ case 'A':
+ case 'r':
+ case 'R':
+ case 'l':
+ case 'q':
+ case 'Q':
+ case 'x':
+ case 'Y':
+ return C_RegisterClass;
+ default:
+ break;
+ }
+ }
+ return TargetLowering::getConstraintType(Constraint);
+}
+
+/// isOperandValidForConstraint - Return the specified operand (possibly
+/// modified) if the specified SDOperand is valid for the specified target
+/// constraint letter, otherwise return null.
+SDOperand X86TargetLowering::
+isOperandValidForConstraint(SDOperand Op, char Constraint, SelectionDAG &DAG) {
+ switch (Constraint) {
+ default: break;
+ case 'I':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getValue() <= 31)
+ return DAG.getTargetConstant(C->getValue(), Op.getValueType());
+ }
+ return SDOperand(0,0);
+ case 'N':
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (C->getValue() <= 255)
+ return DAG.getTargetConstant(C->getValue(), Op.getValueType());
+ }
+ return SDOperand(0,0);
+ case 'i': {
+ // Literal immediates are always ok.
+ if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op))
+ return DAG.getTargetConstant(CST->getValue(), Op.getValueType());
+
+ // If we are in non-pic codegen mode, we allow the address of a global (with
+ // an optional displacement) to be used with 'i'.
+ GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op);
+ int64_t Offset = 0;
+
+    // Match (GA), (GA+C), or (C+GA).
+ if (GA) {
+ Offset = GA->getOffset();
+ } else if (Op.getOpcode() == ISD::ADD) {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ GA = dyn_cast<GlobalAddressSDNode>(Op.getOperand(0));
+ if (C && GA) {
+ Offset = GA->getOffset()+C->getValue();
+ } else {
+        C = dyn_cast<ConstantSDNode>(Op.getOperand(0));
+        GA = dyn_cast<GlobalAddressSDNode>(Op.getOperand(1));
+ if (C && GA)
+ Offset = GA->getOffset()+C->getValue();
+ else
+ C = 0, GA = 0;
+ }
+ }
+
+ if (GA) {
+ // If addressing this global requires a load (e.g. in PIC mode), we can't
+ // match.
+ if (Subtarget->GVRequiresExtraLoad(GA->getGlobal(), getTargetMachine(),
+ false))
+ return SDOperand(0, 0);
+
+ Op = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0),
+ Offset);
+ return Op;
+ }
+
+ // Otherwise, not valid for this mode.
+ return SDOperand(0, 0);
+ }
+ }
+ return TargetLowering::isOperandValidForConstraint(Op, Constraint, DAG);
+}
+
+std::vector<unsigned> X86TargetLowering::
+getRegClassForInlineAsmConstraint(const std::string &Constraint,
+ MVT::ValueType VT) const {
+ if (Constraint.size() == 1) {
+ // FIXME: not handling fp-stack yet!
+ switch (Constraint[0]) { // GCC X86 Constraint Letters
+ default: break; // Unknown constraint letter
+ case 'A': // EAX/EDX
+ if (VT == MVT::i32 || VT == MVT::i64)
+ return make_vector<unsigned>(X86::EAX, X86::EDX, 0);
+ break;
+ case 'q': // Q_REGS (GENERAL_REGS in 64-bit mode)
+ case 'Q': // Q_REGS
+ if (VT == MVT::i32)
+ return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0);
+ else if (VT == MVT::i16)
+ return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0);
+ else if (VT == MVT::i8)
+      return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0);
+ break;
+ }
+ }
+
+ return std::vector<unsigned>();
+}
+
+std::pair<unsigned, const TargetRegisterClass*>
+X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+ MVT::ValueType VT) const {
+ // First, see if this is a constraint that directly corresponds to an LLVM
+ // register class.
+ if (Constraint.size() == 1) {
+ // GCC Constraint Letters
+ switch (Constraint[0]) {
+ default: break;
+ case 'r': // GENERAL_REGS
+ case 'R': // LEGACY_REGS
+ case 'l': // INDEX_REGS
+ if (VT == MVT::i64 && Subtarget->is64Bit())
+ return std::make_pair(0U, X86::GR64RegisterClass);
+ if (VT == MVT::i32)
+ return std::make_pair(0U, X86::GR32RegisterClass);
+ else if (VT == MVT::i16)
+ return std::make_pair(0U, X86::GR16RegisterClass);
+ else if (VT == MVT::i8)
+ return std::make_pair(0U, X86::GR8RegisterClass);
+ break;
+ case 'y': // MMX_REGS if MMX allowed.
+ if (!Subtarget->hasMMX()) break;
+ return std::make_pair(0U, X86::VR64RegisterClass);
+ break;
+ case 'Y': // SSE_REGS if SSE2 allowed
+ if (!Subtarget->hasSSE2()) break;
+ // FALL THROUGH.
+ case 'x': // SSE_REGS if SSE1 allowed
+ if (!Subtarget->hasSSE1()) break;
+
+ switch (VT) {
+ default: break;
+ // Scalar SSE types.
+ case MVT::f32:
+ case MVT::i32:
+ return std::make_pair(0U, X86::FR32RegisterClass);
+ case MVT::f64:
+ case MVT::i64:
+ return std::make_pair(0U, X86::FR64RegisterClass);
+ // Vector types.
+ case MVT::v16i8:
+ case MVT::v8i16:
+ case MVT::v4i32:
+ case MVT::v2i64:
+ case MVT::v4f32:
+ case MVT::v2f64:
+ return std::make_pair(0U, X86::VR128RegisterClass);
+ }
+ break;
+ }
+ }
+
+ // Use the default implementation in TargetLowering to convert the register
+ // constraint into a member of a register class.
+ std::pair<unsigned, const TargetRegisterClass*> Res;
+ Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+
+ // Not found as a standard register?
+ if (Res.second == 0) {
+ // GCC calls "st(0)" just plain "st".
+ if (StringsEqualNoCase("{st}", Constraint)) {
+ Res.first = X86::ST0;
+ Res.second = X86::RSTRegisterClass;
+ }
+
+ return Res;
+ }
+
+ // Otherwise, check to see if this is a register class of the wrong value
+ // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
+ // turn into {ax},{dx}.
+ if (Res.second->hasType(VT))
+ return Res; // Correct type already, nothing to do.
+
+  // All of the single-register GCC register classes map their values onto
+  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we
+  // really want an 8-bit, 32-bit, or 64-bit register, map to the appropriate
+  // register class and return the appropriate register.
+ if (Res.second != X86::GR16RegisterClass)
+ return Res;
+
+ if (VT == MVT::i8) {
+ unsigned DestReg = 0;
+ switch (Res.first) {
+ default: break;
+ case X86::AX: DestReg = X86::AL; break;
+ case X86::DX: DestReg = X86::DL; break;
+ case X86::CX: DestReg = X86::CL; break;
+ case X86::BX: DestReg = X86::BL; break;
+ }
+ if (DestReg) {
+ Res.first = DestReg;
+      Res.second = X86::GR8RegisterClass;
+ }
+ } else if (VT == MVT::i32) {
+ unsigned DestReg = 0;
+ switch (Res.first) {
+ default: break;
+ case X86::AX: DestReg = X86::EAX; break;
+ case X86::DX: DestReg = X86::EDX; break;
+ case X86::CX: DestReg = X86::ECX; break;
+ case X86::BX: DestReg = X86::EBX; break;
+ case X86::SI: DestReg = X86::ESI; break;
+ case X86::DI: DestReg = X86::EDI; break;
+ case X86::BP: DestReg = X86::EBP; break;
+ case X86::SP: DestReg = X86::ESP; break;
+ }
+ if (DestReg) {
+ Res.first = DestReg;
+      Res.second = X86::GR32RegisterClass;
+ }
+ } else if (VT == MVT::i64) {
+ unsigned DestReg = 0;
+ switch (Res.first) {
+ default: break;
+ case X86::AX: DestReg = X86::RAX; break;
+ case X86::DX: DestReg = X86::RDX; break;
+ case X86::CX: DestReg = X86::RCX; break;
+ case X86::BX: DestReg = X86::RBX; break;
+ case X86::SI: DestReg = X86::RSI; break;
+ case X86::DI: DestReg = X86::RDI; break;
+ case X86::BP: DestReg = X86::RBP; break;
+ case X86::SP: DestReg = X86::RSP; break;
+ }
+ if (DestReg) {
+ Res.first = DestReg;
+      Res.second = X86::GR64RegisterClass;
+ }
+ }
+
+ return Res;
+}
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
new file mode 100644
index 0000000..07a96d3
--- /dev/null
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -0,0 +1,437 @@
+//===-- X86ISelLowering.h - X86 DAG Lowering Interface ----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that X86 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86ISELLOWERING_H
+#define X86ISELLOWERING_H
+
+#include "X86Subtarget.h"
+#include "X86RegisterInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+
+namespace llvm {
+ namespace X86ISD {
+ // X86 Specific DAG Nodes
+ enum NodeType {
+ // Start the numbering where the builtin ops leave off.
+ FIRST_NUMBER = ISD::BUILTIN_OP_END+X86::INSTRUCTION_LIST_END,
+
+ /// SHLD, SHRD - Double shift instructions. These correspond to
+ /// X86::SHLDxx and X86::SHRDxx instructions.
+ SHLD,
+ SHRD,
+
+ /// FAND - Bitwise logical AND of floating point values. This corresponds
+ /// to X86::ANDPS or X86::ANDPD.
+ FAND,
+
+ /// FOR - Bitwise logical OR of floating point values. This corresponds
+ /// to X86::ORPS or X86::ORPD.
+ FOR,
+
+ /// FXOR - Bitwise logical XOR of floating point values. This corresponds
+ /// to X86::XORPS or X86::XORPD.
+ FXOR,
+
+      /// FSRL - Bitwise logical right shift of floating point values. This
+      /// corresponds to X86::PSRLDQ.
+ FSRL,
+
+ /// FILD, FILD_FLAG - This instruction implements SINT_TO_FP with the
+ /// integer source in memory and FP reg result. This corresponds to the
+ /// X86::FILD*m instructions. It has three inputs (token chain, address,
+ /// and source type) and two outputs (FP value and token chain). FILD_FLAG
+      /// also produces a flag.
+ FILD,
+ FILD_FLAG,
+
+ /// FP_TO_INT*_IN_MEM - This instruction implements FP_TO_SINT with the
+ /// integer destination in memory and a FP reg source. This corresponds
+ /// to the X86::FIST*m instructions and the rounding mode change stuff. It
+ /// has two inputs (token chain and address) and two outputs (int value
+ /// and token chain).
+ FP_TO_INT16_IN_MEM,
+ FP_TO_INT32_IN_MEM,
+ FP_TO_INT64_IN_MEM,
+
+ /// FLD - This instruction implements an extending load to FP stack slots.
+ /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
+ /// operand, ptr to load from, and a ValueType node indicating the type
+ /// to load to.
+ FLD,
+
+ /// FST - This instruction implements a truncating store to FP stack
+ /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
+ /// chain operand, value to store, address, and a ValueType to store it
+ /// as.
+ FST,
+
+ /// FP_GET_RESULT - This corresponds to FpGETRESULT pseudo instruction
+ /// which copies from ST(0) to the destination. It takes a chain and
+      /// writes an RFP result and a chain.
+ FP_GET_RESULT,
+
+ /// FP_SET_RESULT - This corresponds to FpSETRESULT pseudo instruction
+ /// which copies the source operand to ST(0). It takes a chain+value and
+ /// returns a chain and a flag.
+ FP_SET_RESULT,
+
+ /// CALL/TAILCALL - These operations represent an abstract X86 call
+ /// instruction, which includes a bunch of information. In particular the
+ /// operands of these node are:
+ ///
+ /// #0 - The incoming token chain
+ /// #1 - The callee
+ /// #2 - The number of arg bytes the caller pushes on the stack.
+ /// #3 - The number of arg bytes the callee pops off the stack.
+ /// #4 - The value to pass in AL/AX/EAX (optional)
+ /// #5 - The value to pass in DL/DX/EDX (optional)
+ ///
+ /// The result values of these nodes are:
+ ///
+ /// #0 - The outgoing token chain
+ /// #1 - The first register result value (optional)
+ /// #2 - The second register result value (optional)
+ ///
+ /// The CALL vs TAILCALL distinction boils down to whether the callee is
+ /// known not to modify the caller's stack frame, as is standard with
+ /// LLVM.
+ CALL,
+ TAILCALL,
+
+ /// RDTSC_DAG - This operation implements the lowering for
+ /// readcyclecounter
+ RDTSC_DAG,
+
+ /// X86 compare and logical compare instructions.
+ CMP, TEST, COMI, UCOMI,
+
+ /// X86 SetCC. Operand 1 is condition code, and operand 2 is the flag
+ /// operand produced by a CMP instruction.
+ SETCC,
+
+ /// X86 conditional moves. Operand 1 and operand 2 are the two values
+ /// to select from (operand 1 is a R/W operand). Operand 3 is the
+ /// condition code, and operand 4 is the flag operand produced by a CMP
+ /// or TEST instruction. It also writes a flag result.
+ CMOV,
+
+ /// X86 conditional branches. Operand 1 is the chain operand, operand 2
+ /// is the block to branch if condition is true, operand 3 is the
+ /// condition code, and operand 4 is the flag operand produced by a CMP
+ /// or TEST instruction.
+ BRCOND,
+
+ /// Return with a flag operand. Operand 1 is the chain operand, operand
+ /// 2 is the number of bytes of stack to pop.
+ RET_FLAG,
+
+ /// REP_STOS - Repeat fill, corresponds to X86::REP_STOSx.
+ REP_STOS,
+
+ /// REP_MOVS - Repeat move, corresponds to X86::REP_MOVSx.
+ REP_MOVS,
+
+      /// LOAD_PACK - Load a 128-bit packed float / double value. It has the
+      /// same operands as a normal load.
+ LOAD_PACK,
+
+      /// LOAD_UA - Load an unaligned 128-bit value. It has the same operands
+      /// as a normal load.
+ LOAD_UA,
+
+ /// GlobalBaseReg - On Darwin, this node represents the result of the popl
+ /// at function entry, used for PIC code.
+ GlobalBaseReg,
+
+ /// Wrapper - A wrapper node for TargetConstantPool,
+ /// TargetExternalSymbol, and TargetGlobalAddress.
+ Wrapper,
+
+ /// WrapperRIP - Special wrapper used under X86-64 PIC mode for RIP
+ /// relative displacements.
+ WrapperRIP,
+
+      /// S2VEC - X86 version of SCALAR_TO_VECTOR. The destination base type
+      /// does not have to match the operand type.
+ S2VEC,
+
+ /// PEXTRW - Extract a 16-bit value from a vector and zero extend it to
+ /// i32, corresponds to X86::PEXTRW.
+ PEXTRW,
+
+ /// PINSRW - Insert the lower 16-bits of a 32-bit value to a vector,
+ /// corresponds to X86::PINSRW.
+ PINSRW,
+
+ /// FMAX, FMIN - Floating point max and min.
+ ///
+ FMAX, FMIN,
+
+ /// FRSQRT, FRCP - Floating point reciprocal-sqrt and reciprocal
+ /// approximation. Note that these typically require refinement
+ /// in order to obtain suitable precision.
+ FRSQRT, FRCP,
+
+ // Thread Local Storage
+ TLSADDR, THREAD_POINTER,
+
+ // Exception Handling helpers
+ EH_RETURN
+ };
+ }
+
+ /// Define some predicates that are used for node matching.
+ namespace X86 {
+ /// isPSHUFDMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to PSHUFD.
+ bool isPSHUFDMask(SDNode *N);
+
+ /// isPSHUFHWMask - Return true if the specified VECTOR_SHUFFLE operand
+    /// specifies a shuffle of elements that is suitable for input to PSHUFHW.
+ bool isPSHUFHWMask(SDNode *N);
+
+ /// isPSHUFLWMask - Return true if the specified VECTOR_SHUFFLE operand
+    /// specifies a shuffle of elements that is suitable for input to PSHUFLW.
+ bool isPSHUFLWMask(SDNode *N);
+
+ /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to SHUFP*.
+ bool isSHUFPMask(SDNode *N);
+
+ /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
+ bool isMOVHLPSMask(SDNode *N);
+
+ /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
+ /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
+ /// <2, 3, 2, 3>
+ bool isMOVHLPS_v_undef_Mask(SDNode *N);
+
+ /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
+ bool isMOVLPMask(SDNode *N);
+
+ /// isMOVHPMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to MOVHP{S|D}
+ /// as well as MOVLHPS.
+ bool isMOVHPMask(SDNode *N);
+
+ /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to UNPCKL.
+ bool isUNPCKLMask(SDNode *N, bool V2IsSplat = false);
+
+ /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to UNPCKH.
+ bool isUNPCKHMask(SDNode *N, bool V2IsSplat = false);
+
+ /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
+ /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
+ /// <0, 0, 1, 1>
+ bool isUNPCKL_v_undef_Mask(SDNode *N);
+
+ /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
+ /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
+ /// <2, 2, 3, 3>
+ bool isUNPCKH_v_undef_Mask(SDNode *N);
+
+ /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to MOVSS,
+ /// MOVSD, and MOVD, i.e. setting the lowest element.
+ bool isMOVLMask(SDNode *N);
+
+ /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
+ bool isMOVSHDUPMask(SDNode *N);
+
+ /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
+ bool isMOVSLDUPMask(SDNode *N);
+
+ /// isSplatMask - Return true if the specified VECTOR_SHUFFLE operand
+ /// specifies a splat of a single element.
+ bool isSplatMask(SDNode *N);
+
+ /// isSplatLoMask - Return true if the specified VECTOR_SHUFFLE operand
+    /// specifies a splat of element zero.
+ bool isSplatLoMask(SDNode *N);
+
+ /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
+ /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUF* and SHUFP*
+ /// instructions.
+ unsigned getShuffleSHUFImmediate(SDNode *N);
+
+ /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
+ /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFHW
+ /// instructions.
+ unsigned getShufflePSHUFHWImmediate(SDNode *N);
+
+    /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
+ /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFLW
+ /// instructions.
+ unsigned getShufflePSHUFLWImmediate(SDNode *N);
+ }
+
+ //===--------------------------------------------------------------------===//
+ // X86TargetLowering - X86 Implementation of the TargetLowering interface
+ class X86TargetLowering : public TargetLowering {
+ int VarArgsFrameIndex; // FrameIndex for start of varargs area.
+ int RegSaveFrameIndex; // X86-64 vararg func register save area.
+ unsigned VarArgsGPOffset; // X86-64 vararg func int reg offset.
+ unsigned VarArgsFPOffset; // X86-64 vararg func fp reg offset.
+ int ReturnAddrIndex; // FrameIndex for return slot.
+ int BytesToPopOnReturn; // Number of arg bytes ret should pop.
+    int BytesCallerReserves;          // Number of arg bytes the caller reserves.
+ public:
+ X86TargetLowering(TargetMachine &TM);
+
+ // Return the number of bytes that a function should pop when it returns (in
+ // addition to the space used by the return address).
+ //
+ unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; }
+
+ // Return the number of bytes that the caller reserves for arguments passed
+ // to this function.
+ unsigned getBytesCallerReserves() const { return BytesCallerReserves; }
+
+ /// getStackPtrReg - Return the stack pointer register we are using: either
+ /// ESP or RSP.
+ unsigned getStackPtrReg() const { return X86StackPtr; }
+
+ /// LowerOperation - Provide custom lowering hooks for some operations.
+ ///
+ virtual SDOperand LowerOperation(SDOperand Op, SelectionDAG &DAG);
+
+ virtual SDOperand PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+ virtual MachineBasicBlock *InsertAtEndOfBasicBlock(MachineInstr *MI,
+ MachineBasicBlock *MBB);
+
+ /// getTargetNodeName - This method returns the name of a target specific
+ /// DAG node.
+ virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+ /// computeMaskedBitsForTargetNode - Determine which of the bits specified
+ /// in Mask are known to be either zero or one and return them in the
+ /// KnownZero/KnownOne bitsets.
+ virtual void computeMaskedBitsForTargetNode(const SDOperand Op,
+ uint64_t Mask,
+ uint64_t &KnownZero,
+ uint64_t &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth = 0) const;
+
+ SDOperand getReturnAddressFrameIndex(SelectionDAG &DAG);
+
+ ConstraintType getConstraintType(const std::string &Constraint) const;
+
+ std::vector<unsigned>
+ getRegClassForInlineAsmConstraint(const std::string &Constraint,
+ MVT::ValueType VT) const;
+ /// isOperandValidForConstraint - Return the specified operand (possibly
+ /// modified) if the specified SDOperand is valid for the specified target
+ /// constraint letter, otherwise return null.
+ SDOperand isOperandValidForConstraint(SDOperand Op, char ConstraintLetter,
+ SelectionDAG &DAG);
+
+ /// getRegForInlineAsmConstraint - Given a physical register constraint
+ /// (e.g. {edx}), return the register number and the register class for the
+ /// register. This should only be used for C_Register constraints. On
+ /// error, this returns a register number of 0.
+ std::pair<unsigned, const TargetRegisterClass*>
+ getRegForInlineAsmConstraint(const std::string &Constraint,
+ MVT::ValueType VT) const;
+
+ /// isLegalAddressingMode - Return true if the addressing mode represented
+ /// by AM is legal for this target, for a load/store of the specified type.
+ virtual bool isLegalAddressingMode(const AddrMode &AM, const Type *Ty)const;
+
+ /// isShuffleMaskLegal - Targets can use this to indicate that they only
+ /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
+ /// By default, if a target supports the VECTOR_SHUFFLE node, all mask
+ /// values are assumed to be legal.
+ virtual bool isShuffleMaskLegal(SDOperand Mask, MVT::ValueType VT) const;
+
+    /// isVectorClearMaskLegal - Similar to isShuffleMaskLegal. Targets can
+    /// use this to indicate if there is a suitable VECTOR_SHUFFLE that can
+    /// be used to replace a VAND with a constant pool entry.
+ virtual bool isVectorClearMaskLegal(std::vector<SDOperand> &BVOps,
+ MVT::ValueType EVT,
+ SelectionDAG &DAG) const;
+ private:
+ /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+ /// make the right decision when generating code for different targets.
+ const X86Subtarget *Subtarget;
+ const MRegisterInfo *RegInfo;
+
+ /// X86StackPtr - X86 physical register used as stack ptr.
+ unsigned X86StackPtr;
+
+    /// X86ScalarSSE - Select between SSE2 and x87 floating point ops.
+ bool X86ScalarSSE;
+
+ SDNode *LowerCallResult(SDOperand Chain, SDOperand InFlag, SDNode*TheCall,
+ unsigned CallingConv, SelectionDAG &DAG);
+
+ // C and StdCall Calling Convention implementation.
+ SDOperand LowerCCCArguments(SDOperand Op, SelectionDAG &DAG,
+ bool isStdCall = false);
+ SDOperand LowerCCCCallTo(SDOperand Op, SelectionDAG &DAG, unsigned CC);
+
+ // X86-64 C Calling Convention implementation.
+ SDOperand LowerX86_64CCCArguments(SDOperand Op, SelectionDAG &DAG);
+ SDOperand LowerX86_64CCCCallTo(SDOperand Op, SelectionDAG &DAG,unsigned CC);
+
+ // Fast and FastCall Calling Convention implementation.
+ SDOperand LowerFastCCArguments(SDOperand Op, SelectionDAG &DAG);
+ SDOperand LowerFastCCCallTo(SDOperand Op, SelectionDAG &DAG, unsigned CC);
+
+ SDOperand LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG);
+ SDOperand LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG);
+ SDOperand LowerEXTRACT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG);
+ SDOperand LowerINSERT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG);
+ SDOperand LowerSCALAR_TO_VECTOR(SDOperand Op, SelectionDAG &DAG);
+ SDOperand LowerConstantPool(SDOperand Op, SelectionDAG &DAG);
+ SDOperand LowerGlobalAddress(SDOperand Op, SelectionDAG &DAG);
+ SDOperand LowerGlobalTLSAddress(SDOperand Op, SelectionDAG &DAG);
+ SDOperand LowerExternalSymbol(SDOperand Op, SelectionDAG &DAG);
+ SDOperand LowerShift(SDOperand Op, SelectionDAG &DAG);
+ SDOperand LowerSINT_TO_FP(SDOperand Op, SelectionDAG &DAG);
+ SDOperand LowerFP_TO_SINT(SDOperand Op, SelectionDAG &DAG);
+ SDOperand LowerFABS(SDOperand Op, SelectionDAG &DAG);
+ SDOperand LowerFNEG(SDOperand Op, SelectionDAG &DAG);
+ SDOperand LowerFCOPYSIGN(SDOperand Op, SelectionDAG &DAG);
+ SDOperand LowerSETCC(SDOperand Op, SelectionDAG &DAG, SDOperand Chain);
+ SDOperand LowerSELECT(SDOperand Op, SelectionDAG &DAG);
+ SDOperand LowerBRCOND(SDOperand Op, SelectionDAG &DAG);
+ SDOperand LowerMEMSET(SDOperand Op, SelectionDAG &DAG);
+ SDOperand LowerMEMCPY(SDOperand Op, SelectionDAG &DAG);
+ SDOperand LowerJumpTable(SDOperand Op, SelectionDAG &DAG);
+ SDOperand LowerCALL(SDOperand Op, SelectionDAG &DAG);
+ SDOperand LowerRET(SDOperand Op, SelectionDAG &DAG);
+ SDOperand LowerDYNAMIC_STACKALLOC(SDOperand Op, SelectionDAG &DAG);
+ SDOperand LowerFORMAL_ARGUMENTS(SDOperand Op, SelectionDAG &DAG);
+ SDOperand LowerREADCYCLCECOUNTER(SDOperand Op, SelectionDAG &DAG);
+ SDOperand LowerVASTART(SDOperand Op, SelectionDAG &DAG);
+ SDOperand LowerVACOPY(SDOperand Op, SelectionDAG &DAG);
+ SDOperand LowerINTRINSIC_WO_CHAIN(SDOperand Op, SelectionDAG &DAG);
+ SDOperand LowerRETURNADDR(SDOperand Op, SelectionDAG &DAG);
+ SDOperand LowerFRAMEADDR(SDOperand Op, SelectionDAG &DAG);
+ SDOperand LowerFRAME_TO_ARGS_OFFSET(SDOperand Op, SelectionDAG &DAG);
+ SDOperand LowerEH_RETURN(SDOperand Op, SelectionDAG &DAG);
+ };
+}
+
+#endif // X86ISELLOWERING_H
diff --git a/lib/Target/X86/X86InstrBuilder.h b/lib/Target/X86/X86InstrBuilder.h
new file mode 100644
index 0000000..c0fa58d
--- /dev/null
+++ b/lib/Target/X86/X86InstrBuilder.h
@@ -0,0 +1,125 @@
+//===-- X86InstrBuilder.h - Functions to aid building x86 insts -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes functions that may be used with BuildMI from the
+// MachineInstrBuilder.h file to handle X86'isms in a clean way.
+//
+// The BuildMem function may be used with the BuildMI function to add entire
+// memory references in a single, typed, function call. X86 memory references
+// can be very complex expressions (described in the README), so wrapping them
+// up behind an easier to use interface makes sense. Descriptions of the
+// functions are included below.
+//
+// For reference, the order of operands for memory references is:
+// (Operand), Base, Scale, Index, Displacement.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86INSTRBUILDER_H
+#define X86INSTRBUILDER_H
+
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+namespace llvm {
+
+/// X86AddressMode - This struct holds a generalized full x86 address mode.
+/// The base register can be a frame index, which will eventually be replaced
+/// with BP or SP, with Disp adjusted accordingly. The displacement may
+/// also include the offset of a global value.
+struct X86AddressMode {
+ enum {
+ RegBase,
+ FrameIndexBase
+ } BaseType;
+
+ union {
+ unsigned Reg;
+ int FrameIndex;
+ } Base;
+
+ unsigned Scale;
+ unsigned IndexReg;
+ unsigned Disp;
+ GlobalValue *GV;
+
+ X86AddressMode() : BaseType(RegBase), Scale(1), IndexReg(0), Disp(0), GV(0) {
+ Base.Reg = 0;
+ }
+};
+
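+// For example (illustrative), the address "DWORD PTR [EBX + ECX*4 + 12]" would
+// be described as: BaseType = RegBase, Base.Reg = X86::EBX, Scale = 4,
+// IndexReg = X86::ECX, Disp = 12, GV = 0.
+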
+/// addDirectMem - This function is used to add a direct memory reference to the
+/// current instruction -- that is, a dereference of an address in a register,
+/// with no scale, index or displacement. An example is: DWORD PTR [EAX].
+///
+inline const MachineInstrBuilder &addDirectMem(const MachineInstrBuilder &MIB,
+ unsigned Reg) {
+ // Because memory references are always represented with four
+ // values, this adds: Reg, [1, NoReg, 0] to the instruction.
+ return MIB.addReg(Reg).addImm(1).addReg(0).addImm(0);
+}
+
+
+/// addRegOffset - This function is used to add a memory reference of the form
+/// [Reg + Offset], i.e., one with no scale or index, but with a
+/// displacement. An example is: DWORD PTR [EAX + 4].
+///
+inline const MachineInstrBuilder &addRegOffset(const MachineInstrBuilder &MIB,
+ unsigned Reg, int Offset) {
+ return MIB.addReg(Reg).addImm(1).addReg(0).addImm(Offset);
+}
+
+/// addRegReg - This function is used to add a memory reference of the form:
+/// [Reg + Reg].
+inline const MachineInstrBuilder &addRegReg(const MachineInstrBuilder &MIB,
+ unsigned Reg1, unsigned Reg2) {
+ return MIB.addReg(Reg1).addImm(1).addReg(Reg2).addImm(0);
+}
+
+inline const MachineInstrBuilder &addFullAddress(const MachineInstrBuilder &MIB,
+ const X86AddressMode &AM) {
+ assert (AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8);
+
+ if (AM.BaseType == X86AddressMode::RegBase)
+ MIB.addReg(AM.Base.Reg);
+ else if (AM.BaseType == X86AddressMode::FrameIndexBase)
+ MIB.addFrameIndex(AM.Base.FrameIndex);
+ else
+ assert (0);
+ MIB.addImm(AM.Scale).addReg(AM.IndexReg);
+ if (AM.GV)
+ return MIB.addGlobalAddress(AM.GV, AM.Disp);
+ else
+ return MIB.addImm(AM.Disp);
+}
+
+/// addFrameReference - This function is used to add a reference to the base of
+/// an abstract object on the stack frame of the current function. The
+/// reference keeps the FrameIndex as its base register until it is resolved.
+/// This allows a constant offset to be specified as well.
+///
+inline const MachineInstrBuilder &
+addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) {
+ return MIB.addFrameIndex(FI).addImm(1).addReg(0).addImm(Offset);
+}
+
+/// addConstantPoolReference - This function is used to add a reference to the
+/// base of a constant value spilled to the per-function constant pool. The
+/// reference keeps the ConstantPoolIndex as its base operand until either
+/// machine code emission or assembly output. This allows an optional
+/// offset to be added as well.
+///
+inline const MachineInstrBuilder &
+addConstantPoolReference(const MachineInstrBuilder &MIB, unsigned CPI,
+ int Offset = 0) {
+ return MIB.addConstantPoolIndex(CPI).addImm(1).addReg(0).addImm(Offset);
+}
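+
+// A typical way to combine these helpers (an illustrative sketch; the opcode,
+// register, and frame index are placeholders) is to chain them with BuildMI,
+// e.g. to reload a value from a stack slot:
+//
+//   addFrameReference(BuildMI(MBB, MI, TII.get(X86::MOV32rm), DestReg), FI);
+//
+// which appends the four memory operands (FI, 1, NoReg, 0) to the MOV32rm.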
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td
new file mode 100644
index 0000000..11aeb07
--- /dev/null
+++ b/lib/Target/X86/X86InstrFPStack.td
@@ -0,0 +1,456 @@
+//==- X86InstrFPStack.td - Describe the X86 Instruction Set -------*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by Evan Cheng and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 x87 FPU instruction set, defining the
+// instructions, and properties of the instructions which are needed for code
+// generation, machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// FPStack specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDTX86FpGet : SDTypeProfile<1, 0, [SDTCisFP<0>]>;
+def SDTX86FpSet : SDTypeProfile<0, 1, [SDTCisFP<0>]>;
+def SDTX86Fld : SDTypeProfile<1, 2, [SDTCisFP<0>,
+ SDTCisPtrTy<1>,
+ SDTCisVT<2, OtherVT>]>;
+def SDTX86Fst : SDTypeProfile<0, 3, [SDTCisFP<0>,
+ SDTCisPtrTy<1>,
+ SDTCisVT<2, OtherVT>]>;
+def SDTX86Fild : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisPtrTy<1>,
+ SDTCisVT<2, OtherVT>]>;
+def SDTX86FpToIMem : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
+
+def X86fpget : SDNode<"X86ISD::FP_GET_RESULT", SDTX86FpGet,
+ [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;
+def X86fpset : SDNode<"X86ISD::FP_SET_RESULT", SDTX86FpSet,
+ [SDNPHasChain, SDNPOutFlag]>;
+def X86fld : SDNode<"X86ISD::FLD", SDTX86Fld,
+ [SDNPHasChain]>;
+def X86fst : SDNode<"X86ISD::FST", SDTX86Fst,
+ [SDNPHasChain, SDNPInFlag]>;
+def X86fild : SDNode<"X86ISD::FILD", SDTX86Fild,
+ [SDNPHasChain]>;
+def X86fildflag : SDNode<"X86ISD::FILD_FLAG",SDTX86Fild,
+ [SDNPHasChain, SDNPOutFlag]>;
+def X86fp_to_i16mem : SDNode<"X86ISD::FP_TO_INT16_IN_MEM", SDTX86FpToIMem,
+ [SDNPHasChain]>;
+def X86fp_to_i32mem : SDNode<"X86ISD::FP_TO_INT32_IN_MEM", SDTX86FpToIMem,
+ [SDNPHasChain]>;
+def X86fp_to_i64mem : SDNode<"X86ISD::FP_TO_INT64_IN_MEM", SDTX86FpToIMem,
+ [SDNPHasChain]>;
+
+//===----------------------------------------------------------------------===//
+// FPStack pattern fragments
+//===----------------------------------------------------------------------===//
+
+def fpimm0 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(+0.0);
+}]>;
+
+def fpimmneg0 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(-0.0);
+}]>;
+
+def fpimm1 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(+1.0);
+}]>;
+
+def fpimmneg1 : PatLeaf<(fpimm), [{
+ return N->isExactlyValue(-1.0);
+}]>;
+
+// Some 'special' instructions
+let usesCustomDAGSchedInserter = 1 in { // Expanded by the scheduler.
+ def FP32_TO_INT16_IN_MEM : I<0, Pseudo,
+ (ops i16mem:$dst, RFP32:$src),
+ "#FP32_TO_INT16_IN_MEM PSEUDO!",
+ [(X86fp_to_i16mem RFP32:$src, addr:$dst)]>;
+ def FP32_TO_INT32_IN_MEM : I<0, Pseudo,
+ (ops i32mem:$dst, RFP32:$src),
+ "#FP32_TO_INT32_IN_MEM PSEUDO!",
+ [(X86fp_to_i32mem RFP32:$src, addr:$dst)]>;
+ def FP32_TO_INT64_IN_MEM : I<0, Pseudo,
+ (ops i64mem:$dst, RFP32:$src),
+ "#FP32_TO_INT64_IN_MEM PSEUDO!",
+ [(X86fp_to_i64mem RFP32:$src, addr:$dst)]>;
+ def FP64_TO_INT16_IN_MEM : I<0, Pseudo,
+ (ops i16mem:$dst, RFP64:$src),
+ "#FP64_TO_INT16_IN_MEM PSEUDO!",
+ [(X86fp_to_i16mem RFP64:$src, addr:$dst)]>;
+ def FP64_TO_INT32_IN_MEM : I<0, Pseudo,
+ (ops i32mem:$dst, RFP64:$src),
+ "#FP64_TO_INT32_IN_MEM PSEUDO!",
+ [(X86fp_to_i32mem RFP64:$src, addr:$dst)]>;
+ def FP64_TO_INT64_IN_MEM : I<0, Pseudo,
+ (ops i64mem:$dst, RFP64:$src),
+ "#FP64_TO_INT64_IN_MEM PSEUDO!",
+ [(X86fp_to_i64mem RFP64:$src, addr:$dst)]>;
+}
+
+let isTerminator = 1 in
+ let Defs = [FP0, FP1, FP2, FP3, FP4, FP5, FP6] in
+ def FP_REG_KILL : I<0, Pseudo, (ops), "#FP_REG_KILL", []>;
+
+// All FP Stack operations are represented with three instructions here. The
+// first two instructions, generated by the instruction selector, use "RFP32"
+// or "RFP64" registers: traditional register files to reference 32-bit or
+// 64-bit floating point values. These sizes apply to the values, not the
+// registers, which are always 64 bits; RFP32 and RFP64 can be copied to
+// each other without losing information. These instructions are all pseudo
+// instructions and use the "_Fp" suffix.
+// In some cases there are additional variants with a mixture of 32-bit and
+// 64-bit registers.
+// The last instruction is defined with FPI, which is the actual instruction
+// emitted by the assembler. These use "RST" registers, although frequently
+// the actual register(s) used are implicit. These are always 64-bits.
+// The FP stackifier pass converts one to the other after register allocation
+// occurs.
+//
+// Note that the FpI instruction should have instruction selection info (e.g.
+// a pattern) and the FPI instruction should have emission info (e.g. opcode
+// encoding and asm printing info).
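+//
+// For example (illustrative), an fadd of two 32-bit values is selected to the
+// ADD_Fp32 pseudo defined below; after register allocation the FP stackifier
+// rewrites it into a real x87 instruction such as ADD_FST0r or ADD_FrST0.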
+
+// FPI - Floating Point Instruction template.
+class FPI<bits<8> o, Format F, dag ops, string asm> : I<o, F, ops, asm, []> {}
+
+// FpI_ - Floating Point Pseudo Instruction template. Not Predicated.
+class FpI_<dag ops, FPFormat fp, list<dag> pattern>
+ : X86Inst<0, Pseudo, NoImm, ops, ""> {
+ let FPForm = fp; let FPFormBits = FPForm.Value;
+ let Pattern = pattern;
+}
+
+// Random Pseudo Instructions.
+def FpGETRESULT32 : FpI_<(ops RFP32:$dst), SpecialFP,
+ [(set RFP32:$dst, X86fpget)]>; // FPR = ST(0)
+
+def FpGETRESULT64 : FpI_<(ops RFP64:$dst), SpecialFP,
+ [(set RFP64:$dst, X86fpget)]>; // FPR = ST(0)
+
+let noResults = 1 in {
+ def FpSETRESULT32 : FpI_<(ops RFP32:$src), SpecialFP,
+ [(X86fpset RFP32:$src)]>, Imp<[], [ST0]>;// ST(0) = FPR
+
+ def FpSETRESULT64 : FpI_<(ops RFP64:$src), SpecialFP,
+ [(X86fpset RFP64:$src)]>, Imp<[], [ST0]>;// ST(0) = FPR
+}
+// FpI - Floating Point Pseudo Instruction template. Predicated on FPStack.
+class FpI<dag ops, FPFormat fp, list<dag> pattern> :
+ FpI_<ops, fp, pattern>, Requires<[FPStack]>;
+
+// Register copies. Just copies, the 64->32 version does not truncate.
+def MOV_Fp3232 : FpI<(ops RFP32:$dst, RFP32:$src), SpecialFP, []>;
+def MOV_Fp3264 : FpI<(ops RFP64:$dst, RFP32:$src), SpecialFP, []>;
+def MOV_Fp6432 : FpI<(ops RFP32:$dst, RFP64:$src), SpecialFP, []>;
+def MOV_Fp6464 : FpI<(ops RFP64:$dst, RFP64:$src), SpecialFP, []>;
+
+// Factoring for arithmetic.
+multiclass FPBinary_rr<SDNode OpNode> {
+// Register op register -> register
+// These are separated out because they have no reversed form.
+def _Fp32 : FpI<(ops RFP32:$dst, RFP32:$src1, RFP32:$src2), TwoArgFP,
+ [(set RFP32:$dst, (OpNode RFP32:$src1, RFP32:$src2))]>;
+def _Fp64 : FpI<(ops RFP64:$dst, RFP64:$src1, RFP64:$src2), TwoArgFP,
+ [(set RFP64:$dst, (OpNode RFP64:$src1, RFP64:$src2))]>;
+}
+// The FopST0 series are not included here because of the irregularities
+// in where the 'r' goes in assembly output.
+multiclass FPBinary<SDNode OpNode, Format fp, string asmstring> {
+// ST(0) = ST(0) + [mem]
+def _Fp32m : FpI<(ops RFP32:$dst, RFP32:$src1, f32mem:$src2), OneArgFPRW,
+ [(set RFP32:$dst,
+ (OpNode RFP32:$src1, (loadf32 addr:$src2)))]>;
+def _Fp64m : FpI<(ops RFP64:$dst, RFP64:$src1, f64mem:$src2), OneArgFPRW,
+ [(set RFP64:$dst,
+ (OpNode RFP64:$src1, (loadf64 addr:$src2)))]>;
+def _Fp64m32: FpI<(ops RFP64:$dst, RFP64:$src1, f32mem:$src2), OneArgFPRW,
+ [(set RFP64:$dst,
+ (OpNode RFP64:$src1, (extloadf32 addr:$src2)))]>;
+def _F32m : FPI<0xD8, fp, (ops f32mem:$src),
+ !strconcat("f", !strconcat(asmstring, "{s} $src"))>;
+def _F64m : FPI<0xDC, fp, (ops f64mem:$src),
+ !strconcat("f", !strconcat(asmstring, "{l} $src"))>;
+// ST(0) = ST(0) + [memint]
+def _FpI16m32 : FpI<(ops RFP32:$dst, RFP32:$src1, i16mem:$src2), OneArgFPRW,
+ [(set RFP32:$dst, (OpNode RFP32:$src1,
+ (X86fild addr:$src2, i16)))]>;
+def _FpI32m32 : FpI<(ops RFP32:$dst, RFP32:$src1, i32mem:$src2), OneArgFPRW,
+ [(set RFP32:$dst, (OpNode RFP32:$src1,
+ (X86fild addr:$src2, i32)))]>;
+def _FpI16m64 : FpI<(ops RFP64:$dst, RFP64:$src1, i16mem:$src2), OneArgFPRW,
+ [(set RFP64:$dst, (OpNode RFP64:$src1,
+ (X86fild addr:$src2, i16)))]>;
+def _FpI32m64 : FpI<(ops RFP64:$dst, RFP64:$src1, i32mem:$src2), OneArgFPRW,
+ [(set RFP64:$dst, (OpNode RFP64:$src1,
+ (X86fild addr:$src2, i32)))]>;
+def _FI16m : FPI<0xDE, fp, (ops i16mem:$src),
+ !strconcat("fi", !strconcat(asmstring, "{s} $src"))>;
+def _FI32m : FPI<0xDA, fp, (ops i32mem:$src),
+ !strconcat("fi", !strconcat(asmstring, "{l} $src"))>;
+}
+
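+// The defm's below instantiate the multiclasses above; for example
+// (illustrative), "defm ADD : FPBinary<fadd, MRM0m, "add">" expands into
+// ADD_Fp32m, ADD_Fp64m, ADD_Fp64m32, ADD_F32m, ADD_F64m and the
+// integer-memory forms.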
+defm ADD : FPBinary_rr<fadd>;
+defm SUB : FPBinary_rr<fsub>;
+defm MUL : FPBinary_rr<fmul>;
+defm DIV : FPBinary_rr<fdiv>;
+defm ADD : FPBinary<fadd, MRM0m, "add">;
+defm SUB : FPBinary<fsub, MRM4m, "sub">;
+defm SUBR: FPBinary<fsub, MRM5m, "subr">;
+defm MUL : FPBinary<fmul, MRM1m, "mul">;
+defm DIV : FPBinary<fdiv, MRM6m, "div">;
+defm DIVR: FPBinary<fdiv, MRM7m, "divr">;
+
+class FPST0rInst<bits<8> o, string asm>
+ : FPI<o, AddRegFrm, (ops RST:$op), asm>, D8;
+class FPrST0Inst<bits<8> o, string asm>
+ : FPI<o, AddRegFrm, (ops RST:$op), asm>, DC;
+class FPrST0PInst<bits<8> o, string asm>
+ : FPI<o, AddRegFrm, (ops RST:$op), asm>, DE;
+
+// NOTE: GAS and apparently all other AT&T style assemblers have a broken notion
+// of some of the 'reverse' forms of the fsub and fdiv instructions. As such,
+// we have to put some 'r's in and take them out of weird places.
+def ADD_FST0r : FPST0rInst <0xC0, "fadd $op">;
+def ADD_FrST0 : FPrST0Inst <0xC0, "fadd {%st(0), $op|$op, %ST(0)}">;
+def ADD_FPrST0 : FPrST0PInst<0xC0, "faddp $op">;
+def SUBR_FST0r : FPST0rInst <0xE8, "fsubr $op">;
+def SUB_FrST0 : FPrST0Inst <0xE8, "fsub{r} {%st(0), $op|$op, %ST(0)}">;
+def SUB_FPrST0 : FPrST0PInst<0xE8, "fsub{r}p $op">;
+def SUB_FST0r : FPST0rInst <0xE0, "fsub $op">;
+def SUBR_FrST0 : FPrST0Inst <0xE0, "fsub{|r} {%st(0), $op|$op, %ST(0)}">;
+def SUBR_FPrST0 : FPrST0PInst<0xE0, "fsub{|r}p $op">;
+def MUL_FST0r : FPST0rInst <0xC8, "fmul $op">;
+def MUL_FrST0 : FPrST0Inst <0xC8, "fmul {%st(0), $op|$op, %ST(0)}">;
+def MUL_FPrST0 : FPrST0PInst<0xC8, "fmulp $op">;
+def DIVR_FST0r : FPST0rInst <0xF8, "fdivr $op">;
+def DIV_FrST0 : FPrST0Inst <0xF8, "fdiv{r} {%st(0), $op|$op, %ST(0)}">;
+def DIV_FPrST0 : FPrST0PInst<0xF8, "fdiv{r}p $op">;
+def DIV_FST0r : FPST0rInst <0xF0, "fdiv $op">;
+def DIVR_FrST0 : FPrST0Inst <0xF0, "fdiv{|r} {%st(0), $op|$op, %ST(0)}">;
+def DIVR_FPrST0 : FPrST0PInst<0xF0, "fdiv{|r}p $op">;
+
+// Unary operations.
+multiclass FPUnary<SDNode OpNode, bits<8> opcode, string asmstring> {
+def _Fp32 : FpI<(ops RFP32:$dst, RFP32:$src), OneArgFPRW,
+ [(set RFP32:$dst, (OpNode RFP32:$src))]>;
+def _Fp64 : FpI<(ops RFP64:$dst, RFP64:$src), OneArgFPRW,
+ [(set RFP64:$dst, (OpNode RFP64:$src))]>;
+def _F : FPI<opcode, RawFrm, (ops), asmstring>, D9;
+}
+
+defm CHS : FPUnary<fneg, 0xE0, "fchs">;
+defm ABS : FPUnary<fabs, 0xE1, "fabs">;
+defm SQRT: FPUnary<fsqrt,0xFA, "fsqrt">;
+defm SIN : FPUnary<fsin, 0xFE, "fsin">;
+defm COS : FPUnary<fcos, 0xFF, "fcos">;
+
+def TST_Fp32 : FpI<(ops RFP32:$src), OneArgFP,
+ []>;
+def TST_Fp64 : FpI<(ops RFP64:$src), OneArgFP,
+ []>;
+def TST_F : FPI<0xE4, RawFrm, (ops), "ftst">, D9;
+
+// Floating point cmovs.
+multiclass FPCMov<PatLeaf cc> {
+ def _Fp32 : FpI<(ops RFP32:$dst, RFP32:$src1, RFP32:$src2), CondMovFP,
+ [(set RFP32:$dst, (X86cmov RFP32:$src1, RFP32:$src2,
+ cc))]>;
+ def _Fp64 : FpI<(ops RFP64:$dst, RFP64:$src1, RFP64:$src2), CondMovFP,
+ [(set RFP64:$dst, (X86cmov RFP64:$src1, RFP64:$src2,
+ cc))]>;
+}
+let isTwoAddress = 1 in {
+defm CMOVB : FPCMov<X86_COND_B>;
+defm CMOVBE : FPCMov<X86_COND_BE>;
+defm CMOVE : FPCMov<X86_COND_E>;
+defm CMOVP : FPCMov<X86_COND_P>;
+defm CMOVNB : FPCMov<X86_COND_AE>;
+defm CMOVNBE: FPCMov<X86_COND_A>;
+defm CMOVNE : FPCMov<X86_COND_NE>;
+defm CMOVNP : FPCMov<X86_COND_NP>;
+}
+
+// These are not factored because there's no clean way to pass DA/DB.
+def CMOVB_F : FPI<0xC0, AddRegFrm, (ops RST:$op),
+ "fcmovb {$op, %st(0)|%ST(0), $op}">, DA;
+def CMOVBE_F : FPI<0xD0, AddRegFrm, (ops RST:$op),
+ "fcmovbe {$op, %st(0)|%ST(0), $op}">, DA;
+def CMOVE_F : FPI<0xC8, AddRegFrm, (ops RST:$op),
+ "fcmove {$op, %st(0)|%ST(0), $op}">, DA;
+def CMOVP_F : FPI<0xD8, AddRegFrm, (ops RST:$op),
+ "fcmovu {$op, %st(0)|%ST(0), $op}">, DA;
+def CMOVNB_F : FPI<0xC0, AddRegFrm, (ops RST:$op),
+ "fcmovnb {$op, %st(0)|%ST(0), $op}">, DB;
+def CMOVNBE_F: FPI<0xD0, AddRegFrm, (ops RST:$op),
+ "fcmovnbe {$op, %st(0)|%ST(0), $op}">, DB;
+def CMOVNE_F : FPI<0xC8, AddRegFrm, (ops RST:$op),
+ "fcmovne {$op, %st(0)|%ST(0), $op}">, DB;
+def CMOVNP_F : FPI<0xD8, AddRegFrm, (ops RST:$op),
+ "fcmovnu {$op, %st(0)|%ST(0), $op}">, DB;
+
+// Floating point loads & stores.
+def LD_Fp32m : FpI<(ops RFP32:$dst, f32mem:$src), ZeroArgFP,
+ [(set RFP32:$dst, (loadf32 addr:$src))]>;
+def LD_Fp64m : FpI<(ops RFP64:$dst, f64mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (loadf64 addr:$src))]>;
+def ILD_Fp16m32: FpI<(ops RFP32:$dst, i16mem:$src), ZeroArgFP,
+ [(set RFP32:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m32: FpI<(ops RFP32:$dst, i32mem:$src), ZeroArgFP,
+ [(set RFP32:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m32: FpI<(ops RFP32:$dst, i64mem:$src), ZeroArgFP,
+ [(set RFP32:$dst, (X86fild addr:$src, i64))]>;
+def ILD_Fp16m64: FpI<(ops RFP64:$dst, i16mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m64: FpI<(ops RFP64:$dst, i32mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m64: FpI<(ops RFP64:$dst, i64mem:$src), ZeroArgFP,
+ [(set RFP64:$dst, (X86fild addr:$src, i64))]>;
+
+def ST_Fp32m : FpI<(ops f32mem:$op, RFP32:$src), OneArgFP,
+ [(store RFP32:$src, addr:$op)]>;
+def ST_Fp64m32 : FpI<(ops f32mem:$op, RFP64:$src), OneArgFP,
+ [(truncstoref32 RFP64:$src, addr:$op)]>;
+def ST_Fp64m : FpI<(ops f64mem:$op, RFP64:$src), OneArgFP,
+ [(store RFP64:$src, addr:$op)]>;
+
+def ST_FpP32m : FpI<(ops f32mem:$op, RFP32:$src), OneArgFP, []>;
+def ST_FpP64m32 : FpI<(ops f32mem:$op, RFP64:$src), OneArgFP, []>;
+def ST_FpP64m : FpI<(ops f64mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp16m32 : FpI<(ops i16mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp32m32 : FpI<(ops i32mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp64m32 : FpI<(ops i64mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp16m64 : FpI<(ops i16mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp32m64 : FpI<(ops i32mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp64m64 : FpI<(ops i64mem:$op, RFP64:$src), OneArgFP, []>;
+
+def LD_F32m : FPI<0xD9, MRM0m, (ops f32mem:$src), "fld{s} $src">;
+def LD_F64m : FPI<0xDD, MRM0m, (ops f64mem:$src), "fld{l} $src">;
+def ILD_F16m : FPI<0xDF, MRM0m, (ops i16mem:$src), "fild{s} $src">;
+def ILD_F32m : FPI<0xDB, MRM0m, (ops i32mem:$src), "fild{l} $src">;
+def ILD_F64m : FPI<0xDF, MRM5m, (ops i64mem:$src), "fild{ll} $src">;
+def ST_F32m : FPI<0xD9, MRM2m, (ops f32mem:$dst), "fst{s} $dst">;
+def ST_F64m : FPI<0xDD, MRM2m, (ops f64mem:$dst), "fst{l} $dst">;
+def ST_FP32m : FPI<0xD9, MRM3m, (ops f32mem:$dst), "fstp{s} $dst">;
+def ST_FP64m : FPI<0xDD, MRM3m, (ops f64mem:$dst), "fstp{l} $dst">;
+def IST_F16m : FPI<0xDF, MRM2m, (ops i16mem:$dst), "fist{s} $dst">;
+def IST_F32m : FPI<0xDB, MRM2m, (ops i32mem:$dst), "fist{l} $dst">;
+def IST_FP16m : FPI<0xDF, MRM3m, (ops i16mem:$dst), "fistp{s} $dst">;
+def IST_FP32m : FPI<0xDB, MRM3m, (ops i32mem:$dst), "fistp{l} $dst">;
+def IST_FP64m : FPI<0xDF, MRM7m, (ops i64mem:$dst), "fistp{ll} $dst">;
+
+// FISTTP requires SSE3 even though it's a FPStack op.
+def ISTT_Fp16m32 : FpI_<(ops i16mem:$op, RFP32:$src), OneArgFP,
+ [(X86fp_to_i16mem RFP32:$src, addr:$op)]>,
+ Requires<[HasSSE3]>;
+def ISTT_Fp32m32 : FpI_<(ops i32mem:$op, RFP32:$src), OneArgFP,
+ [(X86fp_to_i32mem RFP32:$src, addr:$op)]>,
+ Requires<[HasSSE3]>;
+def ISTT_Fp64m32 : FpI_<(ops i64mem:$op, RFP32:$src), OneArgFP,
+ [(X86fp_to_i64mem RFP32:$src, addr:$op)]>,
+ Requires<[HasSSE3]>;
+def ISTT_Fp16m64 : FpI_<(ops i16mem:$op, RFP64:$src), OneArgFP,
+ [(X86fp_to_i16mem RFP64:$src, addr:$op)]>,
+ Requires<[HasSSE3]>;
+def ISTT_Fp32m64 : FpI_<(ops i32mem:$op, RFP64:$src), OneArgFP,
+ [(X86fp_to_i32mem RFP64:$src, addr:$op)]>,
+ Requires<[HasSSE3]>;
+def ISTT_Fp64m64 : FpI_<(ops i64mem:$op, RFP64:$src), OneArgFP,
+ [(X86fp_to_i64mem RFP64:$src, addr:$op)]>,
+ Requires<[HasSSE3]>;
+
+def ISTT_FP16m : FPI<0xDF, MRM1m, (ops i16mem:$dst), "fisttp{s} $dst">;
+def ISTT_FP32m : FPI<0xDB, MRM1m, (ops i32mem:$dst), "fisttp{l} $dst">;
+def ISTT_FP64m : FPI<0xDD, MRM1m, (ops i64mem:$dst), "fisttp{ll} $dst">;
+
+// FP Stack manipulation instructions.
+def LD_Frr : FPI<0xC0, AddRegFrm, (ops RST:$op), "fld $op">, D9;
+def ST_Frr : FPI<0xD0, AddRegFrm, (ops RST:$op), "fst $op">, DD;
+def ST_FPrr : FPI<0xD8, AddRegFrm, (ops RST:$op), "fstp $op">, DD;
+def XCH_F : FPI<0xC8, AddRegFrm, (ops RST:$op), "fxch $op">, D9;
+
+// Floating point constant loads.
+let isReMaterializable = 1 in {
+def LD_Fp032 : FpI<(ops RFP32:$dst), ZeroArgFP,
+ [(set RFP32:$dst, fpimm0)]>;
+def LD_Fp132 : FpI<(ops RFP32:$dst), ZeroArgFP,
+ [(set RFP32:$dst, fpimm1)]>;
+def LD_Fp064 : FpI<(ops RFP64:$dst), ZeroArgFP,
+ [(set RFP64:$dst, fpimm0)]>;
+def LD_Fp164 : FpI<(ops RFP64:$dst), ZeroArgFP,
+ [(set RFP64:$dst, fpimm1)]>;
+}
+
+def LD_F0 : FPI<0xEE, RawFrm, (ops), "fldz">, D9;
+def LD_F1 : FPI<0xE8, RawFrm, (ops), "fld1">, D9;
+
+
+// Floating point compares.
+def UCOM_Fpr32 : FpI<(ops RFP32:$lhs, RFP32:$rhs), CompareFP,
+ []>; // FPSW = cmp ST(0) with ST(i)
+def UCOM_FpIr32: FpI<(ops RFP32:$lhs, RFP32:$rhs), CompareFP,
+ [(X86cmp RFP32:$lhs, RFP32:$rhs)]>; // CC = ST(0) cmp ST(i)
+def UCOM_Fpr64 : FpI<(ops RFP64:$lhs, RFP64:$rhs), CompareFP,
+ []>; // FPSW = cmp ST(0) with ST(i)
+def UCOM_FpIr64: FpI<(ops RFP64:$lhs, RFP64:$rhs), CompareFP,
+ [(X86cmp RFP64:$lhs, RFP64:$rhs)]>; // CC = ST(0) cmp ST(i)
+
+def UCOM_Fr : FPI<0xE0, AddRegFrm, // FPSW = cmp ST(0) with ST(i)
+ (ops RST:$reg),
+ "fucom $reg">, DD, Imp<[ST0],[]>;
+def UCOM_FPr : FPI<0xE8, AddRegFrm, // FPSW = cmp ST(0) with ST(i), pop
+ (ops RST:$reg),
+ "fucomp $reg">, DD, Imp<[ST0],[]>;
+def UCOM_FPPr : FPI<0xE9, RawFrm, // cmp ST(0) with ST(1), pop, pop
+ (ops),
+ "fucompp">, DA, Imp<[ST0],[]>;
+
+def UCOM_FIr : FPI<0xE8, AddRegFrm, // CC = cmp ST(0) with ST(i)
+ (ops RST:$reg),
+ "fucomi {$reg, %st(0)|%ST(0), $reg}">, DB, Imp<[ST0],[]>;
+def UCOM_FIPr : FPI<0xE8, AddRegFrm, // CC = cmp ST(0) with ST(i), pop
+ (ops RST:$reg),
+ "fucomip {$reg, %st(0)|%ST(0), $reg}">, DF, Imp<[ST0],[]>;
+
+// Floating point flag ops.
+def FNSTSW8r : I<0xE0, RawFrm, // AX = fp flags
+ (ops), "fnstsw", []>, DF, Imp<[],[AX]>;
+
+def FNSTCW16m : I<0xD9, MRM7m,                   // [mem16] = X87 control word
+ (ops i16mem:$dst), "fnstcw $dst", []>;
+def FLDCW16m : I<0xD9, MRM5m,                    // X87 control word = [mem16]
+ (ops i16mem:$dst), "fldcw $dst", []>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// Required for RET of f32 / f64 values.
+def : Pat<(X86fld addr:$src, f32), (LD_Fp32m addr:$src)>;
+def : Pat<(X86fld addr:$src, f64), (LD_Fp64m addr:$src)>;
+
+// Required for CALL which return f32 / f64 values.
+def : Pat<(X86fst RFP32:$src, addr:$op, f32), (ST_Fp32m addr:$op, RFP32:$src)>;
+def : Pat<(X86fst RFP64:$src, addr:$op, f32), (ST_Fp64m32 addr:$op, RFP64:$src)>;
+def : Pat<(X86fst RFP64:$src, addr:$op, f64), (ST_Fp64m addr:$op, RFP64:$src)>;
+
+// Floating point constant -0.0 and -1.0
+def : Pat<(f32 fpimmneg0), (CHS_Fp32 (LD_Fp032))>, Requires<[FPStack]>;
+def : Pat<(f32 fpimmneg1), (CHS_Fp32 (LD_Fp132))>, Requires<[FPStack]>;
+def : Pat<(f64 fpimmneg0), (CHS_Fp64 (LD_Fp064))>, Requires<[FPStack]>;
+def : Pat<(f64 fpimmneg1), (CHS_Fp64 (LD_Fp164))>, Requires<[FPStack]>;
+
+// Used to convert i64 to f64 since there isn't an SSE version.
+def : Pat<(X86fildflag addr:$src, i64), (ILD_Fp64m64 addr:$src)>;
+
+def : Pat<(extloadf32 addr:$src),
+ (MOV_Fp3264 (LD_Fp32m addr:$src))>, Requires<[FPStack]>;
+def : Pat<(fextend RFP32:$src), (MOV_Fp3264 RFP32:$src)>, Requires<[FPStack]>;
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
new file mode 100644
index 0000000..06b14fe
--- /dev/null
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -0,0 +1,567 @@
+//===- X86InstrInfo.cpp - X86 Instruction Information -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrInfo.h"
+#include "X86.h"
+#include "X86GenInstrInfo.inc"
+#include "X86InstrBuilder.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/LiveVariables.h"
+using namespace llvm;
+
+X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
+ : TargetInstrInfo(X86Insts, sizeof(X86Insts)/sizeof(X86Insts[0])),
+ TM(tm), RI(tm, *this) {
+}
+
+bool X86InstrInfo::isMoveInstr(const MachineInstr& MI,
+ unsigned& sourceReg,
+ unsigned& destReg) const {
+ MachineOpCode oc = MI.getOpcode();
+ if (oc == X86::MOV8rr || oc == X86::MOV16rr ||
+ oc == X86::MOV32rr || oc == X86::MOV64rr ||
+ oc == X86::MOV16to16_ || oc == X86::MOV32to32_ ||
+ oc == X86::MOV_Fp3232 || oc == X86::MOVSSrr || oc == X86::MOVSDrr ||
+ oc == X86::MOV_Fp3264 || oc == X86::MOV_Fp6432 || oc == X86::MOV_Fp6464 ||
+ oc == X86::FsMOVAPSrr || oc == X86::FsMOVAPDrr ||
+ oc == X86::MOVAPSrr || oc == X86::MOVAPDrr ||
+ oc == X86::MOVSS2PSrr || oc == X86::MOVSD2PDrr ||
+ oc == X86::MOVPS2SSrr || oc == X86::MOVPD2SDrr ||
+ oc == X86::MMX_MOVD64rr || oc == X86::MMX_MOVQ64rr) {
+ assert(MI.getNumOperands() >= 2 &&
+ MI.getOperand(0).isRegister() &&
+ MI.getOperand(1).isRegister() &&
+ "invalid register-register move instruction");
+ sourceReg = MI.getOperand(1).getReg();
+ destReg = MI.getOperand(0).getReg();
+ return true;
+ }
+ return false;
+}
+
+unsigned X86InstrInfo::isLoadFromStackSlot(MachineInstr *MI,
+ int &FrameIndex) const {
+ switch (MI->getOpcode()) {
+ default: break;
+ case X86::MOV8rm:
+ case X86::MOV16rm:
+ case X86::MOV16_rm:
+ case X86::MOV32rm:
+ case X86::MOV32_rm:
+ case X86::MOV64rm:
+ case X86::LD_Fp64m:
+ case X86::MOVSSrm:
+ case X86::MOVSDrm:
+ case X86::MOVAPSrm:
+ case X86::MOVAPDrm:
+ case X86::MMX_MOVD64rm:
+ case X86::MMX_MOVQ64rm:
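+    // A load from a stack slot has the canonical memory reference form:
+    // base = FrameIndex, scale = 1, no index register, displacement = 0.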
+ if (MI->getOperand(1).isFrameIndex() && MI->getOperand(2).isImmediate() &&
+ MI->getOperand(3).isRegister() && MI->getOperand(4).isImmediate() &&
+ MI->getOperand(2).getImmedValue() == 1 &&
+ MI->getOperand(3).getReg() == 0 &&
+ MI->getOperand(4).getImmedValue() == 0) {
+ FrameIndex = MI->getOperand(1).getFrameIndex();
+ return MI->getOperand(0).getReg();
+ }
+ break;
+ }
+ return 0;
+}
+
+unsigned X86InstrInfo::isStoreToStackSlot(MachineInstr *MI,
+ int &FrameIndex) const {
+ switch (MI->getOpcode()) {
+ default: break;
+ case X86::MOV8mr:
+ case X86::MOV16mr:
+ case X86::MOV16_mr:
+ case X86::MOV32mr:
+ case X86::MOV32_mr:
+ case X86::MOV64mr:
+ case X86::ST_FpP64m:
+ case X86::MOVSSmr:
+ case X86::MOVSDmr:
+ case X86::MOVAPSmr:
+ case X86::MOVAPDmr:
+ case X86::MMX_MOVD64mr:
+ case X86::MMX_MOVQ64mr:
+ case X86::MMX_MOVNTQmr:
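+    // A store to a stack slot has the canonical memory reference form:
+    // base = FrameIndex, scale = 1, no index register, displacement = 0.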
+ if (MI->getOperand(0).isFrameIndex() && MI->getOperand(1).isImmediate() &&
+ MI->getOperand(2).isRegister() && MI->getOperand(3).isImmediate() &&
+ MI->getOperand(1).getImmedValue() == 1 &&
+ MI->getOperand(2).getReg() == 0 &&
+ MI->getOperand(3).getImmedValue() == 0) {
+ FrameIndex = MI->getOperand(0).getFrameIndex();
+ return MI->getOperand(4).getReg();
+ }
+ break;
+ }
+ return 0;
+}
+
+
+bool X86InstrInfo::isReallyTriviallyReMaterializable(MachineInstr *MI) const {
+ switch (MI->getOpcode()) {
+ default: break;
+ case X86::MOV8rm:
+ case X86::MOV16rm:
+ case X86::MOV16_rm:
+ case X86::MOV32rm:
+ case X86::MOV32_rm:
+ case X86::MOV64rm:
+ case X86::LD_Fp64m:
+ case X86::MOVSSrm:
+ case X86::MOVSDrm:
+ case X86::MOVAPSrm:
+ case X86::MOVAPDrm:
+ case X86::MMX_MOVD64rm:
+ case X86::MMX_MOVQ64rm:
+ // Loads from constant pools are trivially rematerializable.
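+    // That is: no base register, unit scale, no index register, and a
+    // constant pool index as the displacement.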
+ return MI->getOperand(1).isRegister() && MI->getOperand(2).isImmediate() &&
+ MI->getOperand(3).isRegister() && MI->getOperand(4).isConstantPoolIndex() &&
+ MI->getOperand(1).getReg() == 0 &&
+ MI->getOperand(2).getImmedValue() == 1 &&
+ MI->getOperand(3).getReg() == 0;
+ }
+ // All other instructions marked M_REMATERIALIZABLE are always trivially
+ // rematerializable.
+ return true;
+}
+
+/// convertToThreeAddress - This method must be implemented by targets that
+/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
+/// may be able to convert a two-address instruction into a true
+/// three-address instruction on demand. This allows the X86 target (for
+/// example) to convert ADD and SHL instructions into LEA instructions if they
+/// would require register copies due to two-addressness.
+///
+/// This method returns a null pointer if the transformation cannot be
+/// performed, otherwise it returns the new instruction.
+///
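+/// For example (illustrative), a two-address "shll $2, %reg" can be rewritten
+/// as "leal (,%reg,4), %newreg", which leaves the source register intact and
+/// avoids the copy, at the cost of not setting EFLAGS.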
+MachineInstr *
+X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI,
+ LiveVariables &LV) const {
+ MachineInstr *MI = MBBI;
+  // All instructions handled here are two-address. Get the known operands.
+ unsigned Dest = MI->getOperand(0).getReg();
+ unsigned Src = MI->getOperand(1).getReg();
+
+ MachineInstr *NewMI = NULL;
+ // FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's. When
+ // we have better subtarget support, enable the 16-bit LEA generation here.
+ bool DisableLEA16 = true;
+
+ switch (MI->getOpcode()) {
+ default: return 0;
+ case X86::SHUFPSrri: {
+ assert(MI->getNumOperands() == 4 && "Unknown shufps instruction!");
+ if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return 0;
+
+ unsigned A = MI->getOperand(0).getReg();
+ unsigned B = MI->getOperand(1).getReg();
+ unsigned C = MI->getOperand(2).getReg();
+ unsigned M = MI->getOperand(3).getImm();
+ if (B != C) return 0;
+ NewMI = BuildMI(get(X86::PSHUFDri), A).addReg(B).addImm(M);
+ break;
+ }
+ case X86::SHL64ri: {
+ assert(MI->getNumOperands() == 3 && "Unknown shift instruction!");
+ // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
+ // the flags produced by a shift yet, so this is safe.
+ unsigned Dest = MI->getOperand(0).getReg();
+ unsigned Src = MI->getOperand(1).getReg();
+ unsigned ShAmt = MI->getOperand(2).getImm();
+ if (ShAmt == 0 || ShAmt >= 4) return 0;
+
+ NewMI = BuildMI(get(X86::LEA64r), Dest)
+ .addReg(0).addImm(1 << ShAmt).addReg(Src).addImm(0);
+ break;
+ }
+ case X86::SHL32ri: {
+ assert(MI->getNumOperands() == 3 && "Unknown shift instruction!");
+ // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
+ // the flags produced by a shift yet, so this is safe.
+ unsigned Dest = MI->getOperand(0).getReg();
+ unsigned Src = MI->getOperand(1).getReg();
+ unsigned ShAmt = MI->getOperand(2).getImm();
+ if (ShAmt == 0 || ShAmt >= 4) return 0;
+
+ unsigned Opc = TM.getSubtarget<X86Subtarget>().is64Bit() ?
+ X86::LEA64_32r : X86::LEA32r;
+ NewMI = BuildMI(get(Opc), Dest)
+ .addReg(0).addImm(1 << ShAmt).addReg(Src).addImm(0);
+ break;
+ }
+ case X86::SHL16ri: {
+ assert(MI->getNumOperands() == 3 && "Unknown shift instruction!");
+ if (DisableLEA16) return 0;
+
+ // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
+ // the flags produced by a shift yet, so this is safe.
+ unsigned Dest = MI->getOperand(0).getReg();
+ unsigned Src = MI->getOperand(1).getReg();
+ unsigned ShAmt = MI->getOperand(2).getImm();
+ if (ShAmt == 0 || ShAmt >= 4) return 0;
+
+ NewMI = BuildMI(get(X86::LEA16r), Dest)
+ .addReg(0).addImm(1 << ShAmt).addReg(Src).addImm(0);
+ break;
+ }
+ }
+
+ // FIXME: None of these instructions are promotable to LEAs without
+ // additional information. In particular, LEA doesn't set the flags that
+ // add and inc do. :(
+ if (0)
+ switch (MI->getOpcode()) {
+ case X86::INC32r:
+ case X86::INC64_32r:
+ assert(MI->getNumOperands() == 2 && "Unknown inc instruction!");
+ NewMI = addRegOffset(BuildMI(get(X86::LEA32r), Dest), Src, 1);
+ break;
+ case X86::INC16r:
+ case X86::INC64_16r:
+ if (DisableLEA16) return 0;
+ assert(MI->getNumOperands() == 2 && "Unknown inc instruction!");
+ NewMI = addRegOffset(BuildMI(get(X86::LEA16r), Dest), Src, 1);
+ break;
+ case X86::DEC32r:
+ case X86::DEC64_32r:
+ assert(MI->getNumOperands() == 2 && "Unknown dec instruction!");
+ NewMI = addRegOffset(BuildMI(get(X86::LEA32r), Dest), Src, -1);
+ break;
+ case X86::DEC16r:
+ case X86::DEC64_16r:
+ if (DisableLEA16) return 0;
+ assert(MI->getNumOperands() == 2 && "Unknown dec instruction!");
+ NewMI = addRegOffset(BuildMI(get(X86::LEA16r), Dest), Src, -1);
+ break;
+ case X86::ADD32rr:
+ assert(MI->getNumOperands() == 3 && "Unknown add instruction!");
+ NewMI = addRegReg(BuildMI(get(X86::LEA32r), Dest), Src,
+ MI->getOperand(2).getReg());
+ break;
+ case X86::ADD16rr:
+ if (DisableLEA16) return 0;
+ assert(MI->getNumOperands() == 3 && "Unknown add instruction!");
+ NewMI = addRegReg(BuildMI(get(X86::LEA16r), Dest), Src,
+ MI->getOperand(2).getReg());
+ break;
+ case X86::ADD32ri:
+ case X86::ADD32ri8:
+ assert(MI->getNumOperands() == 3 && "Unknown add instruction!");
+ if (MI->getOperand(2).isImmediate())
+ NewMI = addRegOffset(BuildMI(get(X86::LEA32r), Dest), Src,
+ MI->getOperand(2).getImmedValue());
+ break;
+ case X86::ADD16ri:
+ case X86::ADD16ri8:
+ if (DisableLEA16) return 0;
+ assert(MI->getNumOperands() == 3 && "Unknown add instruction!");
+ if (MI->getOperand(2).isImmediate())
+ NewMI = addRegOffset(BuildMI(get(X86::LEA16r), Dest), Src,
+ MI->getOperand(2).getImmedValue());
+ break;
+ case X86::SHL16ri:
+ if (DisableLEA16) return 0;
+ case X86::SHL32ri:
+ assert(MI->getNumOperands() == 3 && MI->getOperand(2).isImmediate() &&
+ "Unknown shl instruction!");
+ unsigned ShAmt = MI->getOperand(2).getImmedValue();
+ if (ShAmt == 1 || ShAmt == 2 || ShAmt == 3) {
+ X86AddressMode AM;
+ AM.Scale = 1 << ShAmt;
+ AM.IndexReg = Src;
+ unsigned Opc = MI->getOpcode() == X86::SHL32ri ? X86::LEA32r :X86::LEA16r;
+ NewMI = addFullAddress(BuildMI(get(Opc), Dest), AM);
+ }
+ break;
+ }
+
+ if (NewMI) {
+ NewMI->copyKillDeadInfo(MI);
+ LV.instructionChanged(MI, NewMI); // Update live variables
+ MFI->insert(MBBI, NewMI); // Insert the new inst
+ }
+ return NewMI;
+}
+
+/// commuteInstruction - We have a few instructions that must be hacked on to
+/// commute them.
+///
+MachineInstr *X86InstrInfo::commuteInstruction(MachineInstr *MI) const {
+ // FIXME: Can commute cmoves by changing the condition!
+ switch (MI->getOpcode()) {
+ case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)
+ case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)
+ case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I)
+ case X86::SHLD32rri8:{// A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I)
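+    // For 0 < I < Size these double-shift pairs compute the same value:
+    //   SHRD(B, C, I)      == (B >> I) | (C << (Size-I))
+    //   SHLD(C, B, Size-I) == (C << (Size-I)) | (B >> I)
+    // so swapping the two register operands and using Size-I as the shift
+    // amount commutes the instruction under the opposite opcode.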
+ unsigned Opc;
+ unsigned Size;
+ switch (MI->getOpcode()) {
+ default: assert(0 && "Unreachable!");
+ case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break;
+ case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break;
+ case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break;
+ case X86::SHLD32rri8: Size = 32; Opc = X86::SHRD32rri8; break;
+ }
+ unsigned Amt = MI->getOperand(3).getImmedValue();
+ unsigned A = MI->getOperand(0).getReg();
+ unsigned B = MI->getOperand(1).getReg();
+ unsigned C = MI->getOperand(2).getReg();
+ bool BisKill = MI->getOperand(1).isKill();
+ bool CisKill = MI->getOperand(2).isKill();
+ return BuildMI(get(Opc), A).addReg(C, false, false, CisKill)
+ .addReg(B, false, false, BisKill).addImm(Size-Amt);
+ }
+ default:
+ return TargetInstrInfo::commuteInstruction(MI);
+ }
+}
+
+static X86::CondCode GetCondFromBranchOpc(unsigned BrOpc) {
+ switch (BrOpc) {
+ default: return X86::COND_INVALID;
+ case X86::JE: return X86::COND_E;
+ case X86::JNE: return X86::COND_NE;
+ case X86::JL: return X86::COND_L;
+ case X86::JLE: return X86::COND_LE;
+ case X86::JG: return X86::COND_G;
+ case X86::JGE: return X86::COND_GE;
+ case X86::JB: return X86::COND_B;
+ case X86::JBE: return X86::COND_BE;
+ case X86::JA: return X86::COND_A;
+ case X86::JAE: return X86::COND_AE;
+ case X86::JS: return X86::COND_S;
+ case X86::JNS: return X86::COND_NS;
+ case X86::JP: return X86::COND_P;
+ case X86::JNP: return X86::COND_NP;
+ case X86::JO: return X86::COND_O;
+ case X86::JNO: return X86::COND_NO;
+ }
+}
+
+unsigned X86::GetCondBranchFromCond(X86::CondCode CC) {
+ switch (CC) {
+ default: assert(0 && "Illegal condition code!");
+ case X86::COND_E: return X86::JE;
+ case X86::COND_NE: return X86::JNE;
+ case X86::COND_L: return X86::JL;
+ case X86::COND_LE: return X86::JLE;
+ case X86::COND_G: return X86::JG;
+ case X86::COND_GE: return X86::JGE;
+ case X86::COND_B: return X86::JB;
+ case X86::COND_BE: return X86::JBE;
+ case X86::COND_A: return X86::JA;
+ case X86::COND_AE: return X86::JAE;
+ case X86::COND_S: return X86::JS;
+ case X86::COND_NS: return X86::JNS;
+ case X86::COND_P: return X86::JP;
+ case X86::COND_NP: return X86::JNP;
+ case X86::COND_O: return X86::JO;
+ case X86::COND_NO: return X86::JNO;
+ }
+}
+
+/// GetOppositeBranchCondition - Return the inverse of the specified condition,
+/// e.g. turning COND_E to COND_NE.
+X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
+ switch (CC) {
+ default: assert(0 && "Illegal condition code!");
+ case X86::COND_E: return X86::COND_NE;
+ case X86::COND_NE: return X86::COND_E;
+ case X86::COND_L: return X86::COND_GE;
+ case X86::COND_LE: return X86::COND_G;
+ case X86::COND_G: return X86::COND_LE;
+ case X86::COND_GE: return X86::COND_L;
+ case X86::COND_B: return X86::COND_AE;
+ case X86::COND_BE: return X86::COND_A;
+ case X86::COND_A: return X86::COND_BE;
+ case X86::COND_AE: return X86::COND_B;
+ case X86::COND_S: return X86::COND_NS;
+ case X86::COND_NS: return X86::COND_S;
+ case X86::COND_P: return X86::COND_NP;
+ case X86::COND_NP: return X86::COND_P;
+ case X86::COND_O: return X86::COND_NO;
+ case X86::COND_NO: return X86::COND_O;
+ }
+}
+
+// For purposes of branch analysis do not count FP_REG_KILL as a terminator.
+bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
+ if (MI->getOpcode() == X86::FP_REG_KILL)
+ return false;
+
+ const TargetInstrDescriptor *TID = MI->getInstrDescriptor();
+ if (TID->Flags & M_TERMINATOR_FLAG) {
+ // Conditional branch is a special case.
+ if ((TID->Flags & M_BRANCH_FLAG) != 0 && (TID->Flags & M_BARRIER_FLAG) == 0)
+ return true;
+ if ((TID->Flags & M_PREDICABLE) == 0)
+ return true;
+ return !isPredicated(MI);
+ }
+ return false;
+}
+
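+// AnalyzeBranch recognizes the following block endings; anything else is
+// reported as unanalyzable by returning true:
+//   (no terminators)             -> fall through, TBB/FBB left null
+//   JMP tbb                      -> TBB = tbb
+//   Jcc tbb   (falls through)    -> TBB = tbb, Cond = {cc}
+//   Jcc tbb ; JMP fbb            -> TBB = tbb, FBB = fbb, Cond = {cc}
+//   JMP tbb ; JMP dead           -> TBB = tbb, the trailing dead JMP is erased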
+bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ std::vector<MachineOperand> &Cond) const {
+ // If the block has no terminators, it just falls into the block after it.
+ MachineBasicBlock::iterator I = MBB.end();
+ if (I == MBB.begin() || !isUnpredicatedTerminator(--I))
+ return false;
+
+ // Get the last instruction in the block.
+ MachineInstr *LastInst = I;
+
+ // If there is only one terminator instruction, process it.
+ if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+ if (!isBranch(LastInst->getOpcode()))
+ return true;
+
+ // If the block ends with a branch there are 3 possibilities:
+ // it's an unconditional, conditional, or indirect branch.
+
+ if (LastInst->getOpcode() == X86::JMP) {
+ TBB = LastInst->getOperand(0).getMachineBasicBlock();
+ return false;
+ }
+ X86::CondCode BranchCode = GetCondFromBranchOpc(LastInst->getOpcode());
+ if (BranchCode == X86::COND_INVALID)
+ return true; // Can't handle indirect branch.
+
+ // Otherwise, block ends with fall-through condbranch.
+ TBB = LastInst->getOperand(0).getMachineBasicBlock();
+ Cond.push_back(MachineOperand::CreateImm(BranchCode));
+ return false;
+ }
+
+ // Get the instruction before it if it's a terminator.
+ MachineInstr *SecondLastInst = I;
+
+ // If there are three terminators, we don't know what sort of block this is.
+ if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I))
+ return true;
+
+ // If the block ends with X86::JMP and a conditional branch, handle it.
+ X86::CondCode BranchCode = GetCondFromBranchOpc(SecondLastInst->getOpcode());
+ if (BranchCode != X86::COND_INVALID && LastInst->getOpcode() == X86::JMP) {
+ TBB = SecondLastInst->getOperand(0).getMachineBasicBlock();
+ Cond.push_back(MachineOperand::CreateImm(BranchCode));
+ FBB = LastInst->getOperand(0).getMachineBasicBlock();
+ return false;
+ }
+
+ // If the block ends with two X86::JMPs, handle it. The second one is not
+ // executed, so remove it.
+ if (SecondLastInst->getOpcode() == X86::JMP &&
+ LastInst->getOpcode() == X86::JMP) {
+ TBB = SecondLastInst->getOperand(0).getMachineBasicBlock();
+ I = LastInst;
+ I->eraseFromParent();
+ return false;
+ }
+
+ // Otherwise, can't handle this.
+ return true;
+}
+
+unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+ MachineBasicBlock::iterator I = MBB.end();
+ if (I == MBB.begin()) return 0;
+ --I;
+ if (I->getOpcode() != X86::JMP &&
+ GetCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
+ return 0;
+
+ // Remove the branch.
+ I->eraseFromParent();
+
+ I = MBB.end();
+
+ if (I == MBB.begin()) return 1;
+ --I;
+ if (GetCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
+ return 1;
+
+ // Remove the branch.
+ I->eraseFromParent();
+ return 2;
+}
+
+unsigned
+X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const std::vector<MachineOperand> &Cond) const {
+ // Shouldn't be a fall through.
+ assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+ assert((Cond.size() == 1 || Cond.size() == 0) &&
+ "X86 branch conditions have one component!");
+
+ if (FBB == 0) { // One way branch.
+ if (Cond.empty()) {
+      // Unconditional branch.
+ BuildMI(&MBB, get(X86::JMP)).addMBB(TBB);
+ } else {
+ // Conditional branch.
+ unsigned Opc = GetCondBranchFromCond((X86::CondCode)Cond[0].getImm());
+ BuildMI(&MBB, get(Opc)).addMBB(TBB);
+ }
+ return 1;
+ }
+
+ // Two-way Conditional branch.
+ unsigned Opc = GetCondBranchFromCond((X86::CondCode)Cond[0].getImm());
+ BuildMI(&MBB, get(Opc)).addMBB(TBB);
+ BuildMI(&MBB, get(X86::JMP)).addMBB(FBB);
+ return 2;
+}
+
+bool X86InstrInfo::BlockHasNoFallThrough(MachineBasicBlock &MBB) const {
+ if (MBB.empty()) return false;
+
+ switch (MBB.back().getOpcode()) {
+ case X86::RET: // Return.
+ case X86::RETI:
+ case X86::TAILJMPd:
+ case X86::TAILJMPr:
+ case X86::TAILJMPm:
+ case X86::JMP: // Uncond branch.
+ case X86::JMP32r: // Indirect branch.
+ case X86::JMP32m: // Indirect branch through mem.
+ return true;
+ default: return false;
+ }
+}
+
+bool X86InstrInfo::
+ReverseBranchCondition(std::vector<MachineOperand> &Cond) const {
+ assert(Cond.size() == 1 && "Invalid X86 branch condition!");
+ Cond[0].setImm(GetOppositeBranchCondition((X86::CondCode)Cond[0].getImm()));
+ return false;
+}
+
+const TargetRegisterClass *X86InstrInfo::getPointerRegClass() const {
+ const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
+ if (Subtarget->is64Bit())
+ return &X86::GR64RegClass;
+ else
+ return &X86::GR32RegClass;
+}
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
new file mode 100644
index 0000000..ec30cc7
--- /dev/null
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -0,0 +1,287 @@
+//===- X86InstrInfo.h - X86 Instruction Information ------------*- C++ -*- ===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86INSTRUCTIONINFO_H
+#define X86INSTRUCTIONINFO_H
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "X86RegisterInfo.h"
+
+namespace llvm {
+ class X86RegisterInfo;
+ class X86TargetMachine;
+
+namespace X86 {
+  // X86 specific condition code. These correspond to the X86_COND_* pattern
+  // leaves in X86InstrInfo.td. They must be kept in synch.
+ enum CondCode {
+ COND_A = 0,
+ COND_AE = 1,
+ COND_B = 2,
+ COND_BE = 3,
+ COND_E = 4,
+ COND_G = 5,
+ COND_GE = 6,
+ COND_L = 7,
+ COND_LE = 8,
+ COND_NE = 9,
+ COND_NO = 10,
+ COND_NP = 11,
+ COND_NS = 12,
+ COND_O = 13,
+ COND_P = 14,
+ COND_S = 15,
+ COND_INVALID
+ };
+
+ // Turn condition code into conditional branch opcode.
+ unsigned GetCondBranchFromCond(CondCode CC);
+
+ /// GetOppositeBranchCondition - Return the inverse of the specified cond,
+ /// e.g. turning COND_E to COND_NE.
+ CondCode GetOppositeBranchCondition(X86::CondCode CC);
+
+}
+
+/// X86II - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace X86II {
+ enum {
+ //===------------------------------------------------------------------===//
+ // Instruction types. These are the standard/most common forms for X86
+ // instructions.
+ //
+
+ // PseudoFrm - This represents an instruction that is a pseudo instruction
+ // or one that has not been implemented yet. It is illegal to code generate
+ // it, but tolerated for intermediate implementation stages.
+ Pseudo = 0,
+
+ /// Raw - This form is for instructions that don't have any operands, so
+ /// they are just a fixed opcode value, like 'leave'.
+ RawFrm = 1,
+
+ /// AddRegFrm - This form is used for instructions like 'push r32' that have
+ /// their one register operand added to their opcode.
+ AddRegFrm = 2,
+
+ /// MRMDestReg - This form is used for instructions that use the Mod/RM byte
+ /// to specify a destination, which in this case is a register.
+ ///
+ MRMDestReg = 3,
+
+ /// MRMDestMem - This form is used for instructions that use the Mod/RM byte
+ /// to specify a destination, which in this case is memory.
+ ///
+ MRMDestMem = 4,
+
+ /// MRMSrcReg - This form is used for instructions that use the Mod/RM byte
+ /// to specify a source, which in this case is a register.
+ ///
+ MRMSrcReg = 5,
+
+ /// MRMSrcMem - This form is used for instructions that use the Mod/RM byte
+ /// to specify a source, which in this case is memory.
+ ///
+ MRMSrcMem = 6,
+
+ /// MRM[0-7][rm] - These forms are used to represent instructions that use
+ /// a Mod/RM byte, and use the middle field to hold extended opcode
+    /// information. In the Intel manual these are represented as /0, /1, ...
+ ///
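+    /// For example, "jmp r/m32" is encoded as FF /4, so JMP32r in
+    /// X86InstrInfo.td uses the MRM4r form.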
+
+ // First, instructions that operate on a register r/m operand...
+ MRM0r = 16, MRM1r = 17, MRM2r = 18, MRM3r = 19, // Format /0 /1 /2 /3
+ MRM4r = 20, MRM5r = 21, MRM6r = 22, MRM7r = 23, // Format /4 /5 /6 /7
+
+ // Next, instructions that operate on a memory r/m operand...
+ MRM0m = 24, MRM1m = 25, MRM2m = 26, MRM3m = 27, // Format /0 /1 /2 /3
+ MRM4m = 28, MRM5m = 29, MRM6m = 30, MRM7m = 31, // Format /4 /5 /6 /7
+
+ // MRMInitReg - This form is used for instructions whose source and
+    // destination are the same register.
+ MRMInitReg = 32,
+
+ FormMask = 63,
+
+ //===------------------------------------------------------------------===//
+ // Actual flags...
+
+ // OpSize - Set if this instruction requires an operand size prefix (0x66),
+ // which most often indicates that the instruction operates on 16 bit data
+ // instead of 32 bit data.
+ OpSize = 1 << 6,
+
+    // AdSize - Set if this instruction requires an address size prefix (0x67),
+    // which most often indicates that the instruction uses 16 bit addresses
+    // instead of 32 bit addresses (or 32 bit addresses in 64 bit mode).
+ AdSize = 1 << 7,
+
+ //===------------------------------------------------------------------===//
+ // Op0Mask - There are several prefix bytes that are used to form two byte
+ // opcodes. These are currently 0x0F, 0xF3, and 0xD8-0xDF. This mask is
+    // used to obtain the setting of this field. If no bits in this field are
+ // set, there is no prefix byte for obtaining a multibyte opcode.
+ //
+ Op0Shift = 8,
+ Op0Mask = 0xF << Op0Shift,
+
+ // TB - TwoByte - Set if this instruction has a two byte opcode, which
+ // starts with a 0x0F byte before the real opcode.
+ TB = 1 << Op0Shift,
+
+ // REP - The 0xF3 prefix byte indicating repetition of the following
+ // instruction.
+ REP = 2 << Op0Shift,
+
+ // D8-DF - These escape opcodes are used by the floating point unit. These
+ // values must remain sequential.
+ D8 = 3 << Op0Shift, D9 = 4 << Op0Shift,
+ DA = 5 << Op0Shift, DB = 6 << Op0Shift,
+ DC = 7 << Op0Shift, DD = 8 << Op0Shift,
+ DE = 9 << Op0Shift, DF = 10 << Op0Shift,
+
+ // XS, XD - These prefix codes are for single and double precision scalar
+ // floating point operations performed in the SSE registers.
+ XD = 11 << Op0Shift, XS = 12 << Op0Shift,
+
+ // T8, TA - Prefix after the 0x0F prefix.
+ T8 = 13 << Op0Shift, TA = 14 << Op0Shift,
+
+ //===------------------------------------------------------------------===//
+ // REX_W - REX prefixes are instruction prefixes used in 64-bit mode.
+ // They are used to specify GPRs and SSE registers, 64-bit operand size,
+    // etc. We only care about the REX.W and REX.R bits, and only the former is
+ // statically determined.
+ //
+ REXShift = 12,
+ REX_W = 1 << REXShift,
+
+ //===------------------------------------------------------------------===//
+ // This three-bit field describes the size of an immediate operand. Zero is
+ // unused so that we can tell if we forgot to set a value.
+ ImmShift = 13,
+ ImmMask = 7 << ImmShift,
+ Imm8 = 1 << ImmShift,
+ Imm16 = 2 << ImmShift,
+ Imm32 = 3 << ImmShift,
+ Imm64 = 4 << ImmShift,
+
+ //===------------------------------------------------------------------===//
+ // FP Instruction Classification... Zero is non-fp instruction.
+
+ // FPTypeMask - Mask for all of the FP types...
+ FPTypeShift = 16,
+ FPTypeMask = 7 << FPTypeShift,
+
+ // NotFP - The default, set for instructions that do not use FP registers.
+ NotFP = 0 << FPTypeShift,
+
+    // ZeroArgFP - 0 arg FP instruction which implicitly pushes ST(0), e.g. fld0
+ ZeroArgFP = 1 << FPTypeShift,
+
+ // OneArgFP - 1 arg FP instructions which implicitly read ST(0), such as fst
+ OneArgFP = 2 << FPTypeShift,
+
+    // OneArgFPRW - 1 arg FP instruction which implicitly reads ST(0) and writes
+    // a result back to ST(0). For example, fcos, fsqrt, etc.
+ //
+ OneArgFPRW = 3 << FPTypeShift,
+
+ // TwoArgFP - 2 arg FP instructions which implicitly read ST(0), and an
+ // explicit argument, storing the result to either ST(0) or the implicit
+ // argument. For example: fadd, fsub, fmul, etc...
+ TwoArgFP = 4 << FPTypeShift,
+
+ // CompareFP - 2 arg FP instructions which implicitly read ST(0) and an
+ // explicit argument, but have no destination. Example: fucom, fucomi, ...
+ CompareFP = 5 << FPTypeShift,
+
+ // CondMovFP - "2 operand" floating point conditional move instructions.
+ CondMovFP = 6 << FPTypeShift,
+
+ // SpecialFP - Special instruction forms. Dispatch by opcode explicitly.
+ SpecialFP = 7 << FPTypeShift,
+
+ // Bits 19 -> 23 are unused
+ OpcodeShift = 24,
+ OpcodeMask = 0xFF << OpcodeShift
+ };
+}
+
+class X86InstrInfo : public TargetInstrInfo {
+ X86TargetMachine &TM;
+ const X86RegisterInfo RI;
+public:
+ X86InstrInfo(X86TargetMachine &tm);
+
+ /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
+ /// such, whenever a client has an instance of instruction info, it should
+ /// always be able to get register info as well (through this method).
+ ///
+ virtual const MRegisterInfo &getRegisterInfo() const { return RI; }
+
+ // Return true if the instruction is a register to register move and
+ // leave the source and dest operands in the passed parameters.
+ //
+ bool isMoveInstr(const MachineInstr& MI, unsigned& sourceReg,
+ unsigned& destReg) const;
+ unsigned isLoadFromStackSlot(MachineInstr *MI, int &FrameIndex) const;
+ unsigned isStoreToStackSlot(MachineInstr *MI, int &FrameIndex) const;
+ bool isReallyTriviallyReMaterializable(MachineInstr *MI) const;
+
+ /// convertToThreeAddress - This method must be implemented by targets that
+ /// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
+ /// may be able to convert a two-address instruction into a true
+ /// three-address instruction on demand. This allows the X86 target (for
+ /// example) to convert ADD and SHL instructions into LEA instructions if they
+ /// would require register copies due to two-addressness.
+ ///
+ /// This method returns a null pointer if the transformation cannot be
+ /// performed, otherwise it returns the new instruction.
+ ///
+ virtual MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
+ MachineBasicBlock::iterator &MBBI,
+ LiveVariables &LV) const;
+
+ /// commuteInstruction - We have a few instructions that must be hacked on to
+ /// commute them.
+ ///
+ virtual MachineInstr *commuteInstruction(MachineInstr *MI) const;
+
+ // Branch analysis.
+ virtual bool isUnpredicatedTerminator(const MachineInstr* MI) const;
+ virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ std::vector<MachineOperand> &Cond) const;
+ virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+ virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ const std::vector<MachineOperand> &Cond) const;
+ virtual bool BlockHasNoFallThrough(MachineBasicBlock &MBB) const;
+ virtual bool ReverseBranchCondition(std::vector<MachineOperand> &Cond) const;
+
+ const TargetRegisterClass *getPointerRegClass() const;
+
+  // getBaseOpcodeFor - This function returns the "base" X86 opcode for the
+  // specified instruction descriptor.
+ //
+ unsigned char getBaseOpcodeFor(const TargetInstrDescriptor *TID) const {
+ return TID->TSFlags >> X86II::OpcodeShift;
+ }
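+
+  // The remaining TSFlags fields decode the same way; for example, a client
+  // holding a TargetInstrDescriptor *TID could write:
+  //   unsigned Form    = TID->TSFlags & X86II::FormMask;
+  //   unsigned ImmKind = TID->TSFlags & X86II::ImmMask;
+  //   bool     HasREXW = (TID->TSFlags & X86II::REX_W) != 0;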
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
new file mode 100644
index 0000000..b24f644
--- /dev/null
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -0,0 +1,2674 @@
+//===- X86InstrInfo.td - Describe the X86 Instruction Set -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 instruction set, defining the instructions, and
+// properties of the instructions which are needed for code generation, machine
+// code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// X86 specific DAG Nodes.
+//
+
+def SDTIntShiftDOp: SDTypeProfile<1, 3,
+ [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+ SDTCisInt<0>, SDTCisInt<3>]>;
+
+def SDTX86CmpTest : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
+
+def SDTX86Cmov : SDTypeProfile<1, 3,
+ [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
+ SDTCisVT<3, i8>]>;
+
+def SDTX86BrCond : SDTypeProfile<0, 2,
+ [SDTCisVT<0, OtherVT>, SDTCisVT<1, i8>]>;
+
+def SDTX86SetCC : SDTypeProfile<1, 1,
+ [SDTCisVT<0, i8>, SDTCisVT<1, i8>]>;
+
+def SDTX86Ret : SDTypeProfile<0, 1, [SDTCisVT<0, i16>]>;
+
+def SDT_X86CallSeqStart : SDTypeProfile<0, 1, [ SDTCisVT<0, i32> ]>;
+def SDT_X86CallSeqEnd : SDTypeProfile<0, 2, [ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>;
+
+def SDT_X86Call : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>;
+
+def SDTX86RepStr : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>;
+
+def SDTX86RdTsc : SDTypeProfile<0, 0, []>;
+
+def SDTX86Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+
+def SDT_X86TLSADDR : SDTypeProfile<1, 1, [SDTCisPtrTy<0>, SDTCisInt<1>]>;
+
+def SDT_X86TLSTP : SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>;
+
+def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def X86shld : SDNode<"X86ISD::SHLD", SDTIntShiftDOp>;
+def X86shrd : SDNode<"X86ISD::SHRD", SDTIntShiftDOp>;
+
+def X86cmp : SDNode<"X86ISD::CMP" , SDTX86CmpTest,
+ [SDNPHasChain, SDNPOutFlag]>;
+
+def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov,
+ [SDNPInFlag, SDNPOutFlag]>;
+def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond,
+ [SDNPHasChain, SDNPInFlag]>;
+def X86setcc : SDNode<"X86ISD::SETCC", SDTX86SetCC,
+ [SDNPInFlag, SDNPOutFlag]>;
+
+def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret,
+ [SDNPHasChain, SDNPOptInFlag]>;
+
+def X86callseq_start :
+ SDNode<"ISD::CALLSEQ_START", SDT_X86CallSeqStart,
+ [SDNPHasChain, SDNPOutFlag]>;
+def X86callseq_end :
+ SDNode<"ISD::CALLSEQ_END", SDT_X86CallSeqEnd,
+ [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;
+
+def X86call : SDNode<"X86ISD::CALL", SDT_X86Call,
+ [SDNPHasChain, SDNPOutFlag, SDNPOptInFlag]>;
+
+def X86tailcall: SDNode<"X86ISD::TAILCALL", SDT_X86Call,
+ [SDNPHasChain, SDNPOutFlag, SDNPOptInFlag]>;
+
+def X86rep_stos: SDNode<"X86ISD::REP_STOS", SDTX86RepStr,
+ [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;
+def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr,
+ [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;
+
+def X86rdtsc : SDNode<"X86ISD::RDTSC_DAG",SDTX86RdTsc,
+ [SDNPHasChain, SDNPOutFlag]>;
+
+def X86Wrapper : SDNode<"X86ISD::Wrapper", SDTX86Wrapper>;
+def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP", SDTX86Wrapper>;
+
+def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR,
+ [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;
+def X86TLStp : SDNode<"X86ISD::THREAD_POINTER", SDT_X86TLSTP, []>;
+
+def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET,
+ [SDNPHasChain]>;
+
+
+//===----------------------------------------------------------------------===//
+// X86 Operand Definitions.
+//
+
+// *mem - Operand definitions for the funky X86 addressing mode operands.
+//
+class X86MemOperand<string printMethod> : Operand<iPTR> {
+ let PrintMethod = printMethod;
+ let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc, i32imm);
+}
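+
+// The four MIOperandInfo sub-operands are the components of the general X86
+// address [base + index*scale + disp]: base register, scale immediate, index
+// register, and displacement, in that order.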
+
+def i8mem : X86MemOperand<"printi8mem">;
+def i16mem : X86MemOperand<"printi16mem">;
+def i32mem : X86MemOperand<"printi32mem">;
+def i64mem : X86MemOperand<"printi64mem">;
+def i128mem : X86MemOperand<"printi128mem">;
+def f32mem : X86MemOperand<"printf32mem">;
+def f64mem : X86MemOperand<"printf64mem">;
+def f128mem : X86MemOperand<"printf128mem">;
+
+def lea32mem : Operand<i32> {
+ let PrintMethod = "printi32mem";
+ let MIOperandInfo = (ops GR32, i8imm, GR32, i32imm);
+}
+
+def SSECC : Operand<i8> {
+ let PrintMethod = "printSSECC";
+}
+
+def piclabel: Operand<i32> {
+ let PrintMethod = "printPICLabel";
+}
+
+// A couple of more descriptive operand definitions.
+// 16-bits but only 8 bits are significant.
+def i16i8imm : Operand<i16>;
+// 32-bits but only 8 bits are significant.
+def i32i8imm : Operand<i32>;
+
+// Branch targets have OtherVT type.
+def brtarget : Operand<OtherVT>;
+
+//===----------------------------------------------------------------------===//
+// X86 Complex Pattern Definitions.
+//
+
+// Define X86 specific addressing mode.
+def addr : ComplexPattern<iPTR, 4, "SelectAddr", [], []>;
+def lea32addr : ComplexPattern<i32, 4, "SelectLEAAddr",
+ [add, mul, shl, or, frameindex], []>;
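+
+// For example, an address computation such as (add GR32:$base,
+// (shl GR32:$idx, 2)) plus a constant offset can be folded by SelectLEAAddr
+// into the four lea32mem operands (base, scale, index, displacement) and then
+// matched by a single LEA32r.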
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Format Definitions.
+//
+
+// Format specifies the encoding used by the instruction. This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<6> val> {
+ bits<6> Value = val;
+}
+
+def Pseudo : Format<0>; def RawFrm : Format<1>;
+def AddRegFrm : Format<2>; def MRMDestReg : Format<3>;
+def MRMDestMem : Format<4>; def MRMSrcReg : Format<5>;
+def MRMSrcMem : Format<6>;
+def MRM0r : Format<16>; def MRM1r : Format<17>; def MRM2r : Format<18>;
+def MRM3r : Format<19>; def MRM4r : Format<20>; def MRM5r : Format<21>;
+def MRM6r : Format<22>; def MRM7r : Format<23>;
+def MRM0m : Format<24>; def MRM1m : Format<25>; def MRM2m : Format<26>;
+def MRM3m : Format<27>; def MRM4m : Format<28>; def MRM5m : Format<29>;
+def MRM6m : Format<30>; def MRM7m : Format<31>;
+def MRMInitReg : Format<32>;
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Predicate Definitions.
+def HasMMX : Predicate<"Subtarget->hasMMX()">;
+def HasSSE1 : Predicate<"Subtarget->hasSSE1()">;
+def HasSSE2 : Predicate<"Subtarget->hasSSE2()">;
+def HasSSE3 : Predicate<"Subtarget->hasSSE3()">;
+def HasSSSE3 : Predicate<"Subtarget->hasSSSE3()">;
+def FPStack : Predicate<"!Subtarget->hasSSE2()">;
+def In32BitMode : Predicate<"!Subtarget->is64Bit()">;
+def In64BitMode : Predicate<"Subtarget->is64Bit()">;
+def SmallCode : Predicate<"TM.getCodeModel() == CodeModel::Small">;
+def NotSmallCode : Predicate<"TM.getCodeModel() != CodeModel::Small">;
+def IsStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">;
+
+//===----------------------------------------------------------------------===//
+// X86 specific pattern fragments.
+//
+
+// ImmType - This specifies the immediate type used by an instruction. This is
+// part of the ad-hoc solution used to emit machine instruction encodings by our
+// machine code emitter.
+class ImmType<bits<3> val> {
+ bits<3> Value = val;
+}
+def NoImm : ImmType<0>;
+def Imm8 : ImmType<1>;
+def Imm16 : ImmType<2>;
+def Imm32 : ImmType<3>;
+def Imm64 : ImmType<4>;
+
+// FPFormat - This specifies what form this FP instruction has. This is used by
+// the Floating-Point stackifier pass.
+class FPFormat<bits<3> val> {
+ bits<3> Value = val;
+}
+def NotFP : FPFormat<0>;
+def ZeroArgFP : FPFormat<1>;
+def OneArgFP : FPFormat<2>;
+def OneArgFPRW : FPFormat<3>;
+def TwoArgFP : FPFormat<4>;
+def CompareFP : FPFormat<5>;
+def CondMovFP : FPFormat<6>;
+def SpecialFP : FPFormat<7>;
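+
+// Note: these ImmType and FPFormat encodings mirror the X86II::Imm* and
+// X86II FP classification values in X86InstrInfo.h and must be kept in sync
+// with them.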
+
+
+class X86Inst<bits<8> opcod, Format f, ImmType i, dag ops, string AsmStr>
+ : Instruction {
+ let Namespace = "X86";
+
+ bits<8> Opcode = opcod;
+ Format Form = f;
+ bits<6> FormBits = Form.Value;
+ ImmType ImmT = i;
+ bits<3> ImmTypeBits = ImmT.Value;
+
+ dag OperandList = ops;
+ string AsmString = AsmStr;
+
+ //
+ // Attributes specific to X86 instructions...
+ //
+ bit hasOpSizePrefix = 0; // Does this inst have a 0x66 prefix?
+ bit hasAdSizePrefix = 0; // Does this inst have a 0x67 prefix?
+
+ bits<4> Prefix = 0; // Which prefix byte does this inst have?
+  bit hasREX_WPrefix = 0;  // Does this inst require the REX.W prefix?
+ FPFormat FPForm; // What flavor of FP instruction is this?
+ bits<3> FPFormBits = 0;
+}
+
+
+// Prefix byte classes which are used to indicate to the ad-hoc machine code
+// emitter that various prefix bytes are required.
+class OpSize { bit hasOpSizePrefix = 1; }
+class AdSize { bit hasAdSizePrefix = 1; }
+class REX_W { bit hasREX_WPrefix = 1; }
+class TB { bits<4> Prefix = 1; }
+class REP { bits<4> Prefix = 2; }
+class D8 { bits<4> Prefix = 3; }
+class D9 { bits<4> Prefix = 4; }
+class DA { bits<4> Prefix = 5; }
+class DB { bits<4> Prefix = 6; }
+class DC { bits<4> Prefix = 7; }
+class DD { bits<4> Prefix = 8; }
+class DE { bits<4> Prefix = 9; }
+class DF { bits<4> Prefix = 10; }
+class XD { bits<4> Prefix = 11; }
+class XS { bits<4> Prefix = 12; }
+class T8 { bits<4> Prefix = 13; }
+class TA { bits<4> Prefix = 14; }
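+
+// For example, the conditional branches defined below append TB because they
+// are two-byte opcodes prefixed by 0x0F (JE rel32 is 0F 84, JNE rel32 is
+// 0F 85, and so on).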
+
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments...
+//
+
+// X86 specific condition code. These correspond to CondCode in
+// X86InstrInfo.h. They must be kept in synch.
+def X86_COND_A : PatLeaf<(i8 0)>;
+def X86_COND_AE : PatLeaf<(i8 1)>;
+def X86_COND_B : PatLeaf<(i8 2)>;
+def X86_COND_BE : PatLeaf<(i8 3)>;
+def X86_COND_E : PatLeaf<(i8 4)>;
+def X86_COND_G : PatLeaf<(i8 5)>;
+def X86_COND_GE : PatLeaf<(i8 6)>;
+def X86_COND_L : PatLeaf<(i8 7)>;
+def X86_COND_LE : PatLeaf<(i8 8)>;
+def X86_COND_NE : PatLeaf<(i8 9)>;
+def X86_COND_NO : PatLeaf<(i8 10)>;
+def X86_COND_NP : PatLeaf<(i8 11)>;
+def X86_COND_NS : PatLeaf<(i8 12)>;
+def X86_COND_O : PatLeaf<(i8 13)>;
+def X86_COND_P : PatLeaf<(i8 14)>;
+def X86_COND_S : PatLeaf<(i8 15)>;
+
+def i16immSExt8 : PatLeaf<(i16 imm), [{
+  // i16immSExt8 predicate - True if the 16-bit immediate fits in an 8-bit
+  // sign-extended field.
+ return (int16_t)N->getValue() == (int8_t)N->getValue();
+}]>;
+
+def i32immSExt8 : PatLeaf<(i32 imm), [{
+  // i32immSExt8 predicate - True if the 32-bit immediate fits in an 8-bit
+  // sign-extended field.
+ return (int32_t)N->getValue() == (int8_t)N->getValue();
+}]>;
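+
+// For example, -5 (0xFFFB as an i16, 0xFFFFFFFB as an i32) satisfies these
+// predicates, while 300 (0x012C) does not, since (int8_t)300 == 44.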
+
+// Helper fragments for loads.
+def loadi8 : PatFrag<(ops node:$ptr), (i8 (load node:$ptr))>;
+def loadi16 : PatFrag<(ops node:$ptr), (i16 (load node:$ptr))>;
+def loadi32 : PatFrag<(ops node:$ptr), (i32 (load node:$ptr))>;
+def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>;
+
+def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>;
+def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>;
+
+def sextloadi16i1 : PatFrag<(ops node:$ptr), (i16 (sextloadi1 node:$ptr))>;
+def sextloadi32i1 : PatFrag<(ops node:$ptr), (i32 (sextloadi1 node:$ptr))>;
+def sextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>;
+def sextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>;
+def sextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (sextloadi16 node:$ptr))>;
+
+def zextloadi8i1 : PatFrag<(ops node:$ptr), (i8 (zextloadi1 node:$ptr))>;
+def zextloadi16i1 : PatFrag<(ops node:$ptr), (i16 (zextloadi1 node:$ptr))>;
+def zextloadi32i1 : PatFrag<(ops node:$ptr), (i32 (zextloadi1 node:$ptr))>;
+def zextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (zextloadi8 node:$ptr))>;
+def zextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (zextloadi8 node:$ptr))>;
+def zextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (zextloadi16 node:$ptr))>;
+
+def extloadi8i1 : PatFrag<(ops node:$ptr), (i8 (extloadi1 node:$ptr))>;
+def extloadi16i1 : PatFrag<(ops node:$ptr), (i16 (extloadi1 node:$ptr))>;
+def extloadi32i1 : PatFrag<(ops node:$ptr), (i32 (extloadi1 node:$ptr))>;
+def extloadi16i8 : PatFrag<(ops node:$ptr), (i16 (extloadi8 node:$ptr))>;
+def extloadi32i8 : PatFrag<(ops node:$ptr), (i32 (extloadi8 node:$ptr))>;
+def extloadi32i16 : PatFrag<(ops node:$ptr), (i32 (extloadi16 node:$ptr))>;
+
+//===----------------------------------------------------------------------===//
+// Instruction templates...
+//
+
+class I<bits<8> o, Format f, dag ops, string asm, list<dag> pattern>
+ : X86Inst<o, f, NoImm, ops, asm> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+class Ii8 <bits<8> o, Format f, dag ops, string asm, list<dag> pattern>
+ : X86Inst<o, f, Imm8 , ops, asm> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+class Ii16<bits<8> o, Format f, dag ops, string asm, list<dag> pattern>
+ : X86Inst<o, f, Imm16, ops, asm> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+class Ii32<bits<8> o, Format f, dag ops, string asm, list<dag> pattern>
+ : X86Inst<o, f, Imm32, ops, asm> {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction list...
+//
+
+// ADJCALLSTACKDOWN/UP implicitly use/def ESP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+def ADJCALLSTACKDOWN : I<0, Pseudo, (ops i32imm:$amt), "#ADJCALLSTACKDOWN",
+ [(X86callseq_start imm:$amt)]>, Imp<[ESP],[ESP]>;
+def ADJCALLSTACKUP : I<0, Pseudo, (ops i32imm:$amt1, i32imm:$amt2),
+ "#ADJCALLSTACKUP",
+ [(X86callseq_end imm:$amt1, imm:$amt2)]>,
+ Imp<[ESP],[ESP]>;
+def IMPLICIT_USE : I<0, Pseudo, (ops variable_ops), "#IMPLICIT_USE", []>;
+def IMPLICIT_DEF : I<0, Pseudo, (ops variable_ops), "#IMPLICIT_DEF", []>;
+def IMPLICIT_DEF_GR8 : I<0, Pseudo, (ops GR8:$dst),
+ "#IMPLICIT_DEF $dst",
+ [(set GR8:$dst, (undef))]>;
+def IMPLICIT_DEF_GR16 : I<0, Pseudo, (ops GR16:$dst),
+ "#IMPLICIT_DEF $dst",
+ [(set GR16:$dst, (undef))]>;
+def IMPLICIT_DEF_GR32 : I<0, Pseudo, (ops GR32:$dst),
+ "#IMPLICIT_DEF $dst",
+ [(set GR32:$dst, (undef))]>;
+
+// Nop
+def NOOP : I<0x90, RawFrm, (ops), "nop", []>;
+
+// Truncate
+def TRUNC_32_to8 : I<0x88, MRMDestReg, (ops GR8:$dst, GR32_:$src),
+                     "mov{b} {${src:subreg8}, $dst|$dst, ${src:subreg8}}", []>;
+def TRUNC_16_to8 : I<0x88, MRMDestReg, (ops GR8:$dst, GR16_:$src),
+ "mov{b} {${src:subreg8}, $dst|$dst, ${src:subreg8}}", []>;
+def TRUNC_32to16 : I<0x89, MRMDestReg, (ops GR16:$dst, GR32:$src),
+ "mov{w} {${src:subreg16}, $dst|$dst, ${src:subreg16}}",
+ [(set GR16:$dst, (trunc GR32:$src))]>;
+
+//===----------------------------------------------------------------------===//
+// Control Flow Instructions...
+//
+
+// Return instructions.
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+ hasCtrlDep = 1, noResults = 1 in {
+ def RET : I<0xC3, RawFrm, (ops), "ret", [(X86retflag 0)]>;
+ def RETI : Ii16<0xC2, RawFrm, (ops i16imm:$amt), "ret $amt",
+ [(X86retflag imm:$amt)]>;
+}
+
+// All branches are RawFrm, Void, Branch, and Terminators
+let isBranch = 1, isTerminator = 1, noResults = 1 in
+ class IBr<bits<8> opcode, dag ops, string asm, list<dag> pattern> :
+ I<opcode, RawFrm, ops, asm, pattern>;
+
+// Unconditional branch
+let isBranch = 1, isBarrier = 1 in
+  def JMP : IBr<0xE9, (ops brtarget:$dst), "jmp $dst", [(br bb:$dst)]>;
+
+// Indirect branches
+let isBranch = 1, isTerminator = 1, noResults = 1, isBarrier = 1 in {
+ def JMP32r : I<0xFF, MRM4r, (ops GR32:$dst), "jmp{l} {*}$dst",
+ [(brind GR32:$dst)]>;
+ def JMP32m : I<0xFF, MRM4m, (ops i32mem:$dst), "jmp{l} {*}$dst",
+ [(brind (loadi32 addr:$dst))]>;
+}
+
+// Conditional branches
+def JE : IBr<0x84, (ops brtarget:$dst), "je $dst",
+ [(X86brcond bb:$dst, X86_COND_E)]>, TB;
+def JNE : IBr<0x85, (ops brtarget:$dst), "jne $dst",
+ [(X86brcond bb:$dst, X86_COND_NE)]>, TB;
+def JL : IBr<0x8C, (ops brtarget:$dst), "jl $dst",
+ [(X86brcond bb:$dst, X86_COND_L)]>, TB;
+def JLE : IBr<0x8E, (ops brtarget:$dst), "jle $dst",
+ [(X86brcond bb:$dst, X86_COND_LE)]>, TB;
+def JG : IBr<0x8F, (ops brtarget:$dst), "jg $dst",
+ [(X86brcond bb:$dst, X86_COND_G)]>, TB;
+def JGE : IBr<0x8D, (ops brtarget:$dst), "jge $dst",
+ [(X86brcond bb:$dst, X86_COND_GE)]>, TB;
+
+def JB : IBr<0x82, (ops brtarget:$dst), "jb $dst",
+ [(X86brcond bb:$dst, X86_COND_B)]>, TB;
+def JBE : IBr<0x86, (ops brtarget:$dst), "jbe $dst",
+ [(X86brcond bb:$dst, X86_COND_BE)]>, TB;
+def JA : IBr<0x87, (ops brtarget:$dst), "ja $dst",
+ [(X86brcond bb:$dst, X86_COND_A)]>, TB;
+def JAE : IBr<0x83, (ops brtarget:$dst), "jae $dst",
+ [(X86brcond bb:$dst, X86_COND_AE)]>, TB;
+
+def JS : IBr<0x88, (ops brtarget:$dst), "js $dst",
+ [(X86brcond bb:$dst, X86_COND_S)]>, TB;
+def JNS : IBr<0x89, (ops brtarget:$dst), "jns $dst",
+ [(X86brcond bb:$dst, X86_COND_NS)]>, TB;
+def JP : IBr<0x8A, (ops brtarget:$dst), "jp $dst",
+ [(X86brcond bb:$dst, X86_COND_P)]>, TB;
+def JNP : IBr<0x8B, (ops brtarget:$dst), "jnp $dst",
+ [(X86brcond bb:$dst, X86_COND_NP)]>, TB;
+def JO : IBr<0x80, (ops brtarget:$dst), "jo $dst",
+ [(X86brcond bb:$dst, X86_COND_O)]>, TB;
+def JNO : IBr<0x81, (ops brtarget:$dst), "jno $dst",
+ [(X86brcond bb:$dst, X86_COND_NO)]>, TB;
+
+//===----------------------------------------------------------------------===//
+// Call Instructions...
+//
+let isCall = 1, noResults = 1 in
+ // All calls clobber the non-callee saved registers...
+ let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
+ MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+ XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7] in {
+ def CALLpcrel32 : I<0xE8, RawFrm, (ops i32imm:$dst, variable_ops),
+ "call ${dst:call}", []>;
+ def CALL32r : I<0xFF, MRM2r, (ops GR32:$dst, variable_ops),
+ "call {*}$dst", [(X86call GR32:$dst)]>;
+ def CALL32m : I<0xFF, MRM2m, (ops i32mem:$dst, variable_ops),
+ "call {*}$dst", []>;
+ }
+
+// Tail call stuff.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, noResults = 1 in
+ def TAILJMPd : IBr<0xE9, (ops i32imm:$dst), "jmp ${dst:call} # TAIL CALL",
+ []>;
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, noResults = 1 in
+ def TAILJMPr : I<0xFF, MRM4r, (ops GR32:$dst), "jmp {*}$dst # TAIL CALL",
+ []>;
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, noResults = 1 in
+ def TAILJMPm : I<0xFF, MRM4m, (ops i32mem:$dst),
+ "jmp {*}$dst # TAIL CALL", []>;
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions...
+//
+def LEAVE : I<0xC9, RawFrm,
+ (ops), "leave", []>, Imp<[EBP,ESP],[EBP,ESP]>;
+def POP32r : I<0x58, AddRegFrm,
+ (ops GR32:$reg), "pop{l} $reg", []>, Imp<[ESP],[ESP]>;
+
+def PUSH32r : I<0x50, AddRegFrm,
+ (ops GR32:$reg), "push{l} $reg", []>, Imp<[ESP],[ESP]>;
+
+def MovePCtoStack : I<0, Pseudo, (ops piclabel:$label),
+ "call $label", []>;
+
+let isTwoAddress = 1 in // GR32 = bswap GR32
+ def BSWAP32r : I<0xC8, AddRegFrm,
+ (ops GR32:$dst, GR32:$src),
+ "bswap{l} $dst",
+ [(set GR32:$dst, (bswap GR32:$src))]>, TB;
+
+def XCHG8rr : I<0x86, MRMDestReg, // xchg GR8, GR8
+ (ops GR8:$src1, GR8:$src2),
+ "xchg{b} {$src2|$src1}, {$src1|$src2}", []>;
+def XCHG16rr : I<0x87, MRMDestReg, // xchg GR16, GR16
+ (ops GR16:$src1, GR16:$src2),
+ "xchg{w} {$src2|$src1}, {$src1|$src2}", []>, OpSize;
+def XCHG32rr : I<0x87, MRMDestReg, // xchg GR32, GR32
+ (ops GR32:$src1, GR32:$src2),
+ "xchg{l} {$src2|$src1}, {$src1|$src2}", []>;
+
+def XCHG8mr : I<0x86, MRMDestMem,
+ (ops i8mem:$src1, GR8:$src2),
+ "xchg{b} {$src2|$src1}, {$src1|$src2}", []>;
+def XCHG16mr : I<0x87, MRMDestMem,
+ (ops i16mem:$src1, GR16:$src2),
+ "xchg{w} {$src2|$src1}, {$src1|$src2}", []>, OpSize;
+def XCHG32mr : I<0x87, MRMDestMem,
+ (ops i32mem:$src1, GR32:$src2),
+ "xchg{l} {$src2|$src1}, {$src1|$src2}", []>;
+def XCHG8rm : I<0x86, MRMSrcMem,
+ (ops GR8:$src1, i8mem:$src2),
+ "xchg{b} {$src2|$src1}, {$src1|$src2}", []>;
+def XCHG16rm : I<0x87, MRMSrcMem,
+ (ops GR16:$src1, i16mem:$src2),
+ "xchg{w} {$src2|$src1}, {$src1|$src2}", []>, OpSize;
+def XCHG32rm : I<0x87, MRMSrcMem,
+ (ops GR32:$src1, i32mem:$src2),
+ "xchg{l} {$src2|$src1}, {$src1|$src2}", []>;
+
+def LEA16r : I<0x8D, MRMSrcMem,
+ (ops GR16:$dst, i32mem:$src),
+ "lea{w} {$src|$dst}, {$dst|$src}", []>, OpSize;
+def LEA32r : I<0x8D, MRMSrcMem,
+ (ops GR32:$dst, lea32mem:$src),
+ "lea{l} {$src|$dst}, {$dst|$src}",
+ [(set GR32:$dst, lea32addr:$src)]>, Requires<[In32BitMode]>;
+
+def REP_MOVSB : I<0xA4, RawFrm, (ops), "{rep;movsb|rep movsb}",
+ [(X86rep_movs i8)]>,
+ Imp<[ECX,EDI,ESI], [ECX,EDI,ESI]>, REP;
+def REP_MOVSW : I<0xA5, RawFrm, (ops), "{rep;movsw|rep movsw}",
+ [(X86rep_movs i16)]>,
+ Imp<[ECX,EDI,ESI], [ECX,EDI,ESI]>, REP, OpSize;
+def REP_MOVSD : I<0xA5, RawFrm, (ops), "{rep;movsl|rep movsd}",
+ [(X86rep_movs i32)]>,
+ Imp<[ECX,EDI,ESI], [ECX,EDI,ESI]>, REP;
+
+def REP_STOSB : I<0xAA, RawFrm, (ops), "{rep;stosb|rep stosb}",
+ [(X86rep_stos i8)]>,
+ Imp<[AL,ECX,EDI], [ECX,EDI]>, REP;
+def REP_STOSW : I<0xAB, RawFrm, (ops), "{rep;stosw|rep stosw}",
+ [(X86rep_stos i16)]>,
+ Imp<[AX,ECX,EDI], [ECX,EDI]>, REP, OpSize;
+def REP_STOSD : I<0xAB, RawFrm, (ops), "{rep;stosl|rep stosd}",
+ [(X86rep_stos i32)]>,
+ Imp<[EAX,ECX,EDI], [ECX,EDI]>, REP;
+
+def RDTSC : I<0x31, RawFrm, (ops), "rdtsc", [(X86rdtsc)]>,
+ TB, Imp<[],[RAX,RDX]>;
+
+//===----------------------------------------------------------------------===//
+// Input/Output Instructions...
+//
+def IN8rr : I<0xEC, RawFrm, (ops),
+ "in{b} {%dx, %al|%AL, %DX}",
+ []>, Imp<[DX], [AL]>;
+def IN16rr : I<0xED, RawFrm, (ops),
+ "in{w} {%dx, %ax|%AX, %DX}",
+ []>, Imp<[DX], [AX]>, OpSize;
+def IN32rr : I<0xED, RawFrm, (ops),
+ "in{l} {%dx, %eax|%EAX, %DX}",
+ []>, Imp<[DX],[EAX]>;
+
+def IN8ri : Ii8<0xE4, RawFrm, (ops i16i8imm:$port),
+ "in{b} {$port, %al|%AL, $port}",
+ []>,
+ Imp<[], [AL]>;
+def IN16ri : Ii8<0xE5, RawFrm, (ops i16i8imm:$port),
+ "in{w} {$port, %ax|%AX, $port}",
+ []>,
+ Imp<[], [AX]>, OpSize;
+def IN32ri : Ii8<0xE5, RawFrm, (ops i16i8imm:$port),
+ "in{l} {$port, %eax|%EAX, $port}",
+ []>,
+ Imp<[],[EAX]>;
+
+def OUT8rr : I<0xEE, RawFrm, (ops),
+ "out{b} {%al, %dx|%DX, %AL}",
+ []>, Imp<[DX, AL], []>;
+def OUT16rr : I<0xEF, RawFrm, (ops),
+ "out{w} {%ax, %dx|%DX, %AX}",
+ []>, Imp<[DX, AX], []>, OpSize;
+def OUT32rr : I<0xEF, RawFrm, (ops),
+ "out{l} {%eax, %dx|%DX, %EAX}",
+ []>, Imp<[DX, EAX], []>;
+
+def OUT8ir : Ii8<0xE6, RawFrm, (ops i16i8imm:$port),
+ "out{b} {%al, $port|$port, %AL}",
+ []>,
+ Imp<[AL], []>;
+def OUT16ir : Ii8<0xE7, RawFrm, (ops i16i8imm:$port),
+ "out{w} {%ax, $port|$port, %AX}",
+ []>,
+ Imp<[AX], []>, OpSize;
+def OUT32ir : Ii8<0xE7, RawFrm, (ops i16i8imm:$port),
+ "out{l} {%eax, $port|$port, %EAX}",
+ []>,
+ Imp<[EAX], []>;
+
+//===----------------------------------------------------------------------===//
+// Move Instructions...
+//
+def MOV8rr : I<0x88, MRMDestReg, (ops GR8 :$dst, GR8 :$src),
+ "mov{b} {$src, $dst|$dst, $src}", []>;
+def MOV16rr : I<0x89, MRMDestReg, (ops GR16:$dst, GR16:$src),
+ "mov{w} {$src, $dst|$dst, $src}", []>, OpSize;
+def MOV32rr : I<0x89, MRMDestReg, (ops GR32:$dst, GR32:$src),
+ "mov{l} {$src, $dst|$dst, $src}", []>;
+let isReMaterializable = 1 in {
+def MOV8ri : Ii8 <0xB0, AddRegFrm, (ops GR8 :$dst, i8imm :$src),
+ "mov{b} {$src, $dst|$dst, $src}",
+ [(set GR8:$dst, imm:$src)]>;
+def MOV16ri : Ii16<0xB8, AddRegFrm, (ops GR16:$dst, i16imm:$src),
+ "mov{w} {$src, $dst|$dst, $src}",
+ [(set GR16:$dst, imm:$src)]>, OpSize;
+def MOV32ri : Ii32<0xB8, AddRegFrm, (ops GR32:$dst, i32imm:$src),
+ "mov{l} {$src, $dst|$dst, $src}",
+ [(set GR32:$dst, imm:$src)]>;
+}
+def MOV8mi : Ii8 <0xC6, MRM0m, (ops i8mem :$dst, i8imm :$src),
+ "mov{b} {$src, $dst|$dst, $src}",
+ [(store (i8 imm:$src), addr:$dst)]>;
+def MOV16mi : Ii16<0xC7, MRM0m, (ops i16mem:$dst, i16imm:$src),
+ "mov{w} {$src, $dst|$dst, $src}",
+ [(store (i16 imm:$src), addr:$dst)]>, OpSize;
+def MOV32mi : Ii32<0xC7, MRM0m, (ops i32mem:$dst, i32imm:$src),
+ "mov{l} {$src, $dst|$dst, $src}",
+ [(store (i32 imm:$src), addr:$dst)]>;
+
+def MOV8rm : I<0x8A, MRMSrcMem, (ops GR8 :$dst, i8mem :$src),
+ "mov{b} {$src, $dst|$dst, $src}",
+ [(set GR8:$dst, (load addr:$src))]>;
+def MOV16rm : I<0x8B, MRMSrcMem, (ops GR16:$dst, i16mem:$src),
+ "mov{w} {$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (load addr:$src))]>, OpSize;
+def MOV32rm : I<0x8B, MRMSrcMem, (ops GR32:$dst, i32mem:$src),
+ "mov{l} {$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (load addr:$src))]>;
+
+def MOV8mr : I<0x88, MRMDestMem, (ops i8mem :$dst, GR8 :$src),
+ "mov{b} {$src, $dst|$dst, $src}",
+ [(store GR8:$src, addr:$dst)]>;
+def MOV16mr : I<0x89, MRMDestMem, (ops i16mem:$dst, GR16:$src),
+ "mov{w} {$src, $dst|$dst, $src}",
+ [(store GR16:$src, addr:$dst)]>, OpSize;
+def MOV32mr : I<0x89, MRMDestMem, (ops i32mem:$dst, GR32:$src),
+ "mov{l} {$src, $dst|$dst, $src}",
+ [(store GR32:$src, addr:$dst)]>;
+
+//===----------------------------------------------------------------------===//
+// Fixed-Register Multiplication and Division Instructions...
+//
+
+// Extra precision multiplication
+def MUL8r : I<0xF6, MRM4r, (ops GR8:$src), "mul{b} $src",
+ // FIXME: Used for 8-bit mul, ignore result upper 8 bits.
+ // This probably ought to be moved to a def : Pat<> if the
+ // syntax can be accepted.
+ [(set AL, (mul AL, GR8:$src))]>,
+ Imp<[AL],[AX]>; // AL,AH = AL*GR8
+def MUL16r : I<0xF7, MRM4r, (ops GR16:$src), "mul{w} $src", []>,
+ Imp<[AX],[AX,DX]>, OpSize; // AX,DX = AX*GR16
+def MUL32r : I<0xF7, MRM4r, (ops GR32:$src), "mul{l} $src", []>,
+ Imp<[EAX],[EAX,EDX]>; // EAX,EDX = EAX*GR32
+def MUL8m : I<0xF6, MRM4m, (ops i8mem :$src),
+ "mul{b} $src",
+ // FIXME: Used for 8-bit mul, ignore result upper 8 bits.
+ // This probably ought to be moved to a def : Pat<> if the
+ // syntax can be accepted.
+ [(set AL, (mul AL, (loadi8 addr:$src)))]>,
+ Imp<[AL],[AX]>; // AL,AH = AL*[mem8]
+def MUL16m : I<0xF7, MRM4m, (ops i16mem:$src),
+ "mul{w} $src", []>, Imp<[AX],[AX,DX]>,
+ OpSize; // AX,DX = AX*[mem16]
+def MUL32m : I<0xF7, MRM4m, (ops i32mem:$src),
+ "mul{l} $src", []>, Imp<[EAX],[EAX,EDX]>;// EAX,EDX = EAX*[mem32]
+
+def IMUL8r : I<0xF6, MRM5r, (ops GR8:$src), "imul{b} $src", []>,
+ Imp<[AL],[AX]>; // AL,AH = AL*GR8
+def IMUL16r : I<0xF7, MRM5r, (ops GR16:$src), "imul{w} $src", []>,
+ Imp<[AX],[AX,DX]>, OpSize; // AX,DX = AX*GR16
+def IMUL32r : I<0xF7, MRM5r, (ops GR32:$src), "imul{l} $src", []>,
+ Imp<[EAX],[EAX,EDX]>; // EAX,EDX = EAX*GR32
+def IMUL8m : I<0xF6, MRM5m, (ops i8mem :$src),
+ "imul{b} $src", []>, Imp<[AL],[AX]>; // AL,AH = AL*[mem8]
+def IMUL16m : I<0xF7, MRM5m, (ops i16mem:$src),
+ "imul{w} $src", []>, Imp<[AX],[AX,DX]>,
+ OpSize; // AX,DX = AX*[mem16]
+def IMUL32m : I<0xF7, MRM5m, (ops i32mem:$src),
+ "imul{l} $src", []>,
+ Imp<[EAX],[EAX,EDX]>; // EAX,EDX = EAX*[mem32]
+
+// Unsigned division/remainder.
+def DIV8r : I<0xF6, MRM6r, (ops GR8:$src), // AX/r8 = AL,AH
+ "div{b} $src", []>, Imp<[AX],[AX]>;
+def DIV16r : I<0xF7, MRM6r, (ops GR16:$src), // DX:AX/r16 = AX,DX
+ "div{w} $src", []>, Imp<[AX,DX],[AX,DX]>, OpSize;
+def DIV32r : I<0xF7, MRM6r, (ops GR32:$src), // EDX:EAX/r32 = EAX,EDX
+ "div{l} $src", []>, Imp<[EAX,EDX],[EAX,EDX]>;
+def DIV8m : I<0xF6, MRM6m, (ops i8mem:$src), // AX/[mem8] = AL,AH
+ "div{b} $src", []>, Imp<[AX],[AX]>;
+def DIV16m : I<0xF7, MRM6m, (ops i16mem:$src), // DX:AX/[mem16] = AX,DX
+ "div{w} $src", []>, Imp<[AX,DX],[AX,DX]>, OpSize;
+def DIV32m : I<0xF7, MRM6m, (ops i32mem:$src), // EDX:EAX/[mem32] = EAX,EDX
+ "div{l} $src", []>, Imp<[EAX,EDX],[EAX,EDX]>;
+
+// Signed division/remainder.
+def IDIV8r : I<0xF6, MRM7r, (ops GR8:$src), // AX/r8 = AL,AH
+ "idiv{b} $src", []>, Imp<[AX],[AX]>;
+def IDIV16r: I<0xF7, MRM7r, (ops GR16:$src), // DX:AX/r16 = AX,DX
+ "idiv{w} $src", []>, Imp<[AX,DX],[AX,DX]>, OpSize;
+def IDIV32r: I<0xF7, MRM7r, (ops GR32:$src), // EDX:EAX/r32 = EAX,EDX
+ "idiv{l} $src", []>, Imp<[EAX,EDX],[EAX,EDX]>;
+def IDIV8m : I<0xF6, MRM7m, (ops i8mem:$src), // AX/[mem8] = AL,AH
+ "idiv{b} $src", []>, Imp<[AX],[AX]>;
+def IDIV16m: I<0xF7, MRM7m, (ops i16mem:$src), // DX:AX/[mem16] = AX,DX
+ "idiv{w} $src", []>, Imp<[AX,DX],[AX,DX]>, OpSize;
+def IDIV32m: I<0xF7, MRM7m, (ops i32mem:$src), // EDX:EAX/[mem32] = EAX,EDX
+ "idiv{l} $src", []>, Imp<[EAX,EDX],[EAX,EDX]>;
+
+
+//===----------------------------------------------------------------------===//
+// Two-Address Instructions...
+//
+let isTwoAddress = 1 in {
+
+// Conditional moves
+def CMOVB16rr : I<0x42, MRMSrcReg, // if <u, GR16 = GR16
+ (ops GR16:$dst, GR16:$src1, GR16:$src2),
+ "cmovb {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_B))]>,
+ TB, OpSize;
+def CMOVB16rm : I<0x42, MRMSrcMem, // if <u, GR16 = [mem16]
+ (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+ "cmovb {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_B))]>,
+ TB, OpSize;
+def CMOVB32rr : I<0x42, MRMSrcReg, // if <u, GR32 = GR32
+ (ops GR32:$dst, GR32:$src1, GR32:$src2),
+ "cmovb {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_B))]>,
+ TB;
+def CMOVB32rm : I<0x42, MRMSrcMem, // if <u, GR32 = [mem32]
+ (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+ "cmovb {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_B))]>,
+ TB;
+
+def CMOVAE16rr: I<0x43, MRMSrcReg, // if >=u, GR16 = GR16
+ (ops GR16:$dst, GR16:$src1, GR16:$src2),
+ "cmovae {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_AE))]>,
+ TB, OpSize;
+def CMOVAE16rm: I<0x43, MRMSrcMem, // if >=u, GR16 = [mem16]
+ (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+ "cmovae {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_AE))]>,
+ TB, OpSize;
+def CMOVAE32rr: I<0x43, MRMSrcReg, // if >=u, GR32 = GR32
+ (ops GR32:$dst, GR32:$src1, GR32:$src2),
+ "cmovae {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_AE))]>,
+ TB;
+def CMOVAE32rm: I<0x43, MRMSrcMem, // if >=u, GR32 = [mem32]
+ (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+ "cmovae {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_AE))]>,
+ TB;
+
+def CMOVE16rr : I<0x44, MRMSrcReg, // if ==, GR16 = GR16
+ (ops GR16:$dst, GR16:$src1, GR16:$src2),
+ "cmove {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_E))]>,
+ TB, OpSize;
+def CMOVE16rm : I<0x44, MRMSrcMem, // if ==, GR16 = [mem16]
+ (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+ "cmove {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_E))]>,
+ TB, OpSize;
+def CMOVE32rr : I<0x44, MRMSrcReg, // if ==, GR32 = GR32
+ (ops GR32:$dst, GR32:$src1, GR32:$src2),
+ "cmove {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_E))]>,
+ TB;
+def CMOVE32rm : I<0x44, MRMSrcMem, // if ==, GR32 = [mem32]
+ (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+ "cmove {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_E))]>,
+ TB;
+
+def CMOVNE16rr: I<0x45, MRMSrcReg, // if !=, GR16 = GR16
+ (ops GR16:$dst, GR16:$src1, GR16:$src2),
+ "cmovne {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_NE))]>,
+ TB, OpSize;
+def CMOVNE16rm: I<0x45, MRMSrcMem, // if !=, GR16 = [mem16]
+ (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+ "cmovne {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_NE))]>,
+ TB, OpSize;
+def CMOVNE32rr: I<0x45, MRMSrcReg, // if !=, GR32 = GR32
+ (ops GR32:$dst, GR32:$src1, GR32:$src2),
+ "cmovne {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_NE))]>,
+ TB;
+def CMOVNE32rm: I<0x45, MRMSrcMem, // if !=, GR32 = [mem32]
+ (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+ "cmovne {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_NE))]>,
+ TB;
+
+def CMOVBE16rr: I<0x46, MRMSrcReg, // if <=u, GR16 = GR16
+ (ops GR16:$dst, GR16:$src1, GR16:$src2),
+ "cmovbe {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_BE))]>,
+ TB, OpSize;
+def CMOVBE16rm: I<0x46, MRMSrcMem, // if <=u, GR16 = [mem16]
+ (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+ "cmovbe {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_BE))]>,
+ TB, OpSize;
+def CMOVBE32rr: I<0x46, MRMSrcReg, // if <=u, GR32 = GR32
+ (ops GR32:$dst, GR32:$src1, GR32:$src2),
+ "cmovbe {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_BE))]>,
+ TB;
+def CMOVBE32rm: I<0x46, MRMSrcMem, // if <=u, GR32 = [mem32]
+ (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+ "cmovbe {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_BE))]>,
+ TB;
+
+def CMOVA16rr : I<0x47, MRMSrcReg, // if >u, GR16 = GR16
+ (ops GR16:$dst, GR16:$src1, GR16:$src2),
+ "cmova {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_A))]>,
+ TB, OpSize;
+def CMOVA16rm : I<0x47, MRMSrcMem, // if >u, GR16 = [mem16]
+ (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+ "cmova {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_A))]>,
+ TB, OpSize;
+def CMOVA32rr : I<0x47, MRMSrcReg, // if >u, GR32 = GR32
+ (ops GR32:$dst, GR32:$src1, GR32:$src2),
+ "cmova {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_A))]>,
+ TB;
+def CMOVA32rm : I<0x47, MRMSrcMem, // if >u, GR32 = [mem32]
+ (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+ "cmova {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_A))]>,
+ TB;
+
+def CMOVL16rr : I<0x4C, MRMSrcReg, // if <s, GR16 = GR16
+ (ops GR16:$dst, GR16:$src1, GR16:$src2),
+ "cmovl {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_L))]>,
+ TB, OpSize;
+def CMOVL16rm : I<0x4C, MRMSrcMem, // if <s, GR16 = [mem16]
+ (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+ "cmovl {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_L))]>,
+ TB, OpSize;
+def CMOVL32rr : I<0x4C, MRMSrcReg, // if <s, GR32 = GR32
+ (ops GR32:$dst, GR32:$src1, GR32:$src2),
+ "cmovl {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_L))]>,
+ TB;
+def CMOVL32rm : I<0x4C, MRMSrcMem, // if <s, GR32 = [mem32]
+ (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+ "cmovl {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_L))]>,
+ TB;
+
+def CMOVGE16rr: I<0x4D, MRMSrcReg, // if >=s, GR16 = GR16
+ (ops GR16:$dst, GR16:$src1, GR16:$src2),
+ "cmovge {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_GE))]>,
+ TB, OpSize;
+def CMOVGE16rm: I<0x4D, MRMSrcMem, // if >=s, GR16 = [mem16]
+ (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+ "cmovge {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_GE))]>,
+ TB, OpSize;
+def CMOVGE32rr: I<0x4D, MRMSrcReg, // if >=s, GR32 = GR32
+ (ops GR32:$dst, GR32:$src1, GR32:$src2),
+ "cmovge {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_GE))]>,
+ TB;
+def CMOVGE32rm: I<0x4D, MRMSrcMem, // if >=s, GR32 = [mem32]
+ (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+ "cmovge {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_GE))]>,
+ TB;
+
+def CMOVLE16rr: I<0x4E, MRMSrcReg, // if <=s, GR16 = GR16
+ (ops GR16:$dst, GR16:$src1, GR16:$src2),
+ "cmovle {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_LE))]>,
+ TB, OpSize;
+def CMOVLE16rm: I<0x4E, MRMSrcMem, // if <=s, GR16 = [mem16]
+ (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+ "cmovle {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_LE))]>,
+ TB, OpSize;
+def CMOVLE32rr: I<0x4E, MRMSrcReg, // if <=s, GR32 = GR32
+ (ops GR32:$dst, GR32:$src1, GR32:$src2),
+ "cmovle {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_LE))]>,
+ TB;
+def CMOVLE32rm: I<0x4E, MRMSrcMem, // if <=s, GR32 = [mem32]
+ (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+ "cmovle {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_LE))]>,
+ TB;
+
+def CMOVG16rr : I<0x4F, MRMSrcReg, // if >s, GR16 = GR16
+ (ops GR16:$dst, GR16:$src1, GR16:$src2),
+ "cmovg {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_G))]>,
+ TB, OpSize;
+def CMOVG16rm : I<0x4F, MRMSrcMem, // if >s, GR16 = [mem16]
+ (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+ "cmovg {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_G))]>,
+ TB, OpSize;
+def CMOVG32rr : I<0x4F, MRMSrcReg, // if >s, GR32 = GR32
+ (ops GR32:$dst, GR32:$src1, GR32:$src2),
+ "cmovg {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_G))]>,
+ TB;
+def CMOVG32rm : I<0x4F, MRMSrcMem, // if >s, GR32 = [mem32]
+ (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+ "cmovg {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_G))]>,
+ TB;
+
+def CMOVS16rr : I<0x48, MRMSrcReg, // if signed, GR16 = GR16
+ (ops GR16:$dst, GR16:$src1, GR16:$src2),
+ "cmovs {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_S))]>,
+ TB, OpSize;
+def CMOVS16rm : I<0x48, MRMSrcMem, // if signed, GR16 = [mem16]
+ (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+ "cmovs {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_S))]>,
+ TB, OpSize;
+def CMOVS32rr : I<0x48, MRMSrcReg, // if signed, GR32 = GR32
+ (ops GR32:$dst, GR32:$src1, GR32:$src2),
+ "cmovs {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_S))]>,
+ TB;
+def CMOVS32rm : I<0x48, MRMSrcMem, // if signed, GR32 = [mem32]
+ (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+ "cmovs {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_S))]>,
+ TB;
+
+def CMOVNS16rr: I<0x49, MRMSrcReg, // if !signed, GR16 = GR16
+ (ops GR16:$dst, GR16:$src1, GR16:$src2),
+ "cmovns {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_NS))]>,
+ TB, OpSize;
+def CMOVNS16rm: I<0x49, MRMSrcMem, // if !signed, GR16 = [mem16]
+ (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+ "cmovns {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_NS))]>,
+ TB, OpSize;
+def CMOVNS32rr: I<0x49, MRMSrcReg, // if !signed, GR32 = GR32
+ (ops GR32:$dst, GR32:$src1, GR32:$src2),
+ "cmovns {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_NS))]>,
+ TB;
+def CMOVNS32rm: I<0x49, MRMSrcMem, // if !signed, GR32 = [mem32]
+ (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+ "cmovns {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_NS))]>,
+ TB;
+
+def CMOVP16rr : I<0x4A, MRMSrcReg, // if parity, GR16 = GR16
+ (ops GR16:$dst, GR16:$src1, GR16:$src2),
+ "cmovp {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_P))]>,
+ TB, OpSize;
+def CMOVP16rm : I<0x4A, MRMSrcMem, // if parity, GR16 = [mem16]
+ (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+ "cmovp {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_P))]>,
+ TB, OpSize;
+def CMOVP32rr : I<0x4A, MRMSrcReg, // if parity, GR32 = GR32
+ (ops GR32:$dst, GR32:$src1, GR32:$src2),
+ "cmovp {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_P))]>,
+ TB;
+def CMOVP32rm : I<0x4A, MRMSrcMem, // if parity, GR32 = [mem32]
+ (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+ "cmovp {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_P))]>,
+ TB;
+
+def CMOVNP16rr : I<0x4B, MRMSrcReg, // if !parity, GR16 = GR16
+ (ops GR16:$dst, GR16:$src1, GR16:$src2),
+ "cmovnp {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+ X86_COND_NP))]>,
+ TB, OpSize;
+def CMOVNP16rm : I<0x4B, MRMSrcMem, // if !parity, GR16 = [mem16]
+ (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+ "cmovnp {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+ X86_COND_NP))]>,
+ TB, OpSize;
+def CMOVNP32rr : I<0x4B, MRMSrcReg, // if !parity, GR32 = GR32
+ (ops GR32:$dst, GR32:$src1, GR32:$src2),
+ "cmovnp {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+ X86_COND_NP))]>,
+ TB;
+def CMOVNP32rm : I<0x4B, MRMSrcMem, // if !parity, GR32 = [mem32]
+ (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+ "cmovnp {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+ X86_COND_NP))]>,
+ TB;
+
+
+// Unary instructions
+let CodeSize = 2 in {
+def NEG8r : I<0xF6, MRM3r, (ops GR8 :$dst, GR8 :$src), "neg{b} $dst",
+ [(set GR8:$dst, (ineg GR8:$src))]>;
+def NEG16r : I<0xF7, MRM3r, (ops GR16:$dst, GR16:$src), "neg{w} $dst",
+ [(set GR16:$dst, (ineg GR16:$src))]>, OpSize;
+def NEG32r : I<0xF7, MRM3r, (ops GR32:$dst, GR32:$src), "neg{l} $dst",
+ [(set GR32:$dst, (ineg GR32:$src))]>;
+let isTwoAddress = 0 in {
+ def NEG8m : I<0xF6, MRM3m, (ops i8mem :$dst), "neg{b} $dst",
+ [(store (ineg (loadi8 addr:$dst)), addr:$dst)]>;
+ def NEG16m : I<0xF7, MRM3m, (ops i16mem:$dst), "neg{w} $dst",
+ [(store (ineg (loadi16 addr:$dst)), addr:$dst)]>, OpSize;
+ def NEG32m : I<0xF7, MRM3m, (ops i32mem:$dst), "neg{l} $dst",
+ [(store (ineg (loadi32 addr:$dst)), addr:$dst)]>;
+}
+
+def NOT8r : I<0xF6, MRM2r, (ops GR8 :$dst, GR8 :$src), "not{b} $dst",
+ [(set GR8:$dst, (not GR8:$src))]>;
+def NOT16r : I<0xF7, MRM2r, (ops GR16:$dst, GR16:$src), "not{w} $dst",
+ [(set GR16:$dst, (not GR16:$src))]>, OpSize;
+def NOT32r : I<0xF7, MRM2r, (ops GR32:$dst, GR32:$src), "not{l} $dst",
+ [(set GR32:$dst, (not GR32:$src))]>;
+let isTwoAddress = 0 in {
+ def NOT8m : I<0xF6, MRM2m, (ops i8mem :$dst), "not{b} $dst",
+ [(store (not (loadi8 addr:$dst)), addr:$dst)]>;
+ def NOT16m : I<0xF7, MRM2m, (ops i16mem:$dst), "not{w} $dst",
+ [(store (not (loadi16 addr:$dst)), addr:$dst)]>, OpSize;
+ def NOT32m : I<0xF7, MRM2m, (ops i32mem:$dst), "not{l} $dst",
+ [(store (not (loadi32 addr:$dst)), addr:$dst)]>;
+}
+} // CodeSize
+
+// TODO: inc/dec is slow for P4, but fast for Pentium-M.
+let CodeSize = 2 in
+def INC8r : I<0xFE, MRM0r, (ops GR8 :$dst, GR8 :$src), "inc{b} $dst",
+ [(set GR8:$dst, (add GR8:$src, 1))]>;
+let isConvertibleToThreeAddress = 1, CodeSize = 1 in { // Can xform into LEA.
+def INC16r : I<0x40, AddRegFrm, (ops GR16:$dst, GR16:$src), "inc{w} $dst",
+ [(set GR16:$dst, (add GR16:$src, 1))]>,
+ OpSize, Requires<[In32BitMode]>;
+def INC32r : I<0x40, AddRegFrm, (ops GR32:$dst, GR32:$src), "inc{l} $dst",
+ [(set GR32:$dst, (add GR32:$src, 1))]>, Requires<[In32BitMode]>;
+}
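+// The 16/32-bit register forms use the classic one-byte 0x40+reg encoding
+// (hence CodeSize = 1); those opcode bytes double as REX prefixes in 64-bit
+// mode, which is why they carry Requires<[In32BitMode]>.  For example,
+// "incl %eax" assembles to the single byte 0x40, versus the two-byte
+// 0xFF /0 form.  The 0x48+reg dec forms below have the same restriction.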
+let isTwoAddress = 0, CodeSize = 2 in {
+ def INC8m : I<0xFE, MRM0m, (ops i8mem :$dst), "inc{b} $dst",
+ [(store (add (loadi8 addr:$dst), 1), addr:$dst)]>;
+ def INC16m : I<0xFF, MRM0m, (ops i16mem:$dst), "inc{w} $dst",
+ [(store (add (loadi16 addr:$dst), 1), addr:$dst)]>, OpSize;
+ def INC32m : I<0xFF, MRM0m, (ops i32mem:$dst), "inc{l} $dst",
+ [(store (add (loadi32 addr:$dst), 1), addr:$dst)]>;
+}
+
+let CodeSize = 2 in
+def DEC8r : I<0xFE, MRM1r, (ops GR8 :$dst, GR8 :$src), "dec{b} $dst",
+ [(set GR8:$dst, (add GR8:$src, -1))]>;
+let isConvertibleToThreeAddress = 1, CodeSize = 1 in { // Can xform into LEA.
+def DEC16r : I<0x48, AddRegFrm, (ops GR16:$dst, GR16:$src), "dec{w} $dst",
+ [(set GR16:$dst, (add GR16:$src, -1))]>,
+ OpSize, Requires<[In32BitMode]>;
+def DEC32r : I<0x48, AddRegFrm, (ops GR32:$dst, GR32:$src), "dec{l} $dst",
+ [(set GR32:$dst, (add GR32:$src, -1))]>, Requires<[In32BitMode]>;
+}
+
+let isTwoAddress = 0, CodeSize = 2 in {
+ def DEC8m : I<0xFE, MRM1m, (ops i8mem :$dst), "dec{b} $dst",
+ [(store (add (loadi8 addr:$dst), -1), addr:$dst)]>;
+ def DEC16m : I<0xFF, MRM1m, (ops i16mem:$dst), "dec{w} $dst",
+ [(store (add (loadi16 addr:$dst), -1), addr:$dst)]>, OpSize;
+ def DEC32m : I<0xFF, MRM1m, (ops i32mem:$dst), "dec{l} $dst",
+ [(store (add (loadi32 addr:$dst), -1), addr:$dst)]>;
+}
+
+// Logical operators...
+let isCommutable = 1 in { // X = AND Y, Z --> X = AND Z, Y
+def AND8rr : I<0x20, MRMDestReg,
+ (ops GR8 :$dst, GR8 :$src1, GR8 :$src2),
+ "and{b} {$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (and GR8:$src1, GR8:$src2))]>;
+def AND16rr : I<0x21, MRMDestReg,
+ (ops GR16:$dst, GR16:$src1, GR16:$src2),
+ "and{w} {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (and GR16:$src1, GR16:$src2))]>, OpSize;
+def AND32rr : I<0x21, MRMDestReg,
+ (ops GR32:$dst, GR32:$src1, GR32:$src2),
+ "and{l} {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (and GR32:$src1, GR32:$src2))]>;
+}
+
+def AND8rm : I<0x22, MRMSrcMem,
+ (ops GR8 :$dst, GR8 :$src1, i8mem :$src2),
+ "and{b} {$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (and GR8:$src1, (load addr:$src2)))]>;
+def AND16rm : I<0x23, MRMSrcMem,
+ (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+ "and{w} {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (and GR16:$src1, (load addr:$src2)))]>, OpSize;
+def AND32rm : I<0x23, MRMSrcMem,
+ (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+ "and{l} {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (and GR32:$src1, (load addr:$src2)))]>;
+
+def AND8ri : Ii8<0x80, MRM4r,
+ (ops GR8 :$dst, GR8 :$src1, i8imm :$src2),
+ "and{b} {$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (and GR8:$src1, imm:$src2))]>;
+def AND16ri : Ii16<0x81, MRM4r,
+ (ops GR16:$dst, GR16:$src1, i16imm:$src2),
+ "and{w} {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (and GR16:$src1, imm:$src2))]>, OpSize;
+def AND32ri : Ii32<0x81, MRM4r,
+ (ops GR32:$dst, GR32:$src1, i32imm:$src2),
+ "and{l} {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (and GR32:$src1, imm:$src2))]>;
+def AND16ri8 : Ii8<0x83, MRM4r,
+ (ops GR16:$dst, GR16:$src1, i16i8imm:$src2),
+ "and{w} {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (and GR16:$src1, i16immSExt8:$src2))]>,
+ OpSize;
+def AND32ri8 : Ii8<0x83, MRM4r,
+ (ops GR32:$dst, GR32:$src1, i32i8imm:$src2),
+ "and{l} {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (and GR32:$src1, i32immSExt8:$src2))]>;
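+// The *ri8 forms use opcode 0x83, which sign-extends a one-byte immediate.
+// When the constant fits (i16immSExt8 / i32immSExt8) this is a shorter
+// encoding than the full-immediate 0x81 forms; e.g. "andl $15, %eax" is
+// 3 bytes as 0x83 /4 ib versus 6 bytes as 0x81 /4 id.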
+
+let isTwoAddress = 0 in {
+ def AND8mr : I<0x20, MRMDestMem,
+ (ops i8mem :$dst, GR8 :$src),
+ "and{b} {$src, $dst|$dst, $src}",
+ [(store (and (load addr:$dst), GR8:$src), addr:$dst)]>;
+ def AND16mr : I<0x21, MRMDestMem,
+ (ops i16mem:$dst, GR16:$src),
+ "and{w} {$src, $dst|$dst, $src}",
+ [(store (and (load addr:$dst), GR16:$src), addr:$dst)]>,
+ OpSize;
+ def AND32mr : I<0x21, MRMDestMem,
+ (ops i32mem:$dst, GR32:$src),
+ "and{l} {$src, $dst|$dst, $src}",
+ [(store (and (load addr:$dst), GR32:$src), addr:$dst)]>;
+ def AND8mi : Ii8<0x80, MRM4m,
+ (ops i8mem :$dst, i8imm :$src),
+ "and{b} {$src, $dst|$dst, $src}",
+ [(store (and (loadi8 addr:$dst), imm:$src), addr:$dst)]>;
+ def AND16mi : Ii16<0x81, MRM4m,
+ (ops i16mem:$dst, i16imm:$src),
+ "and{w} {$src, $dst|$dst, $src}",
+ [(store (and (loadi16 addr:$dst), imm:$src), addr:$dst)]>,
+ OpSize;
+ def AND32mi : Ii32<0x81, MRM4m,
+ (ops i32mem:$dst, i32imm:$src),
+ "and{l} {$src, $dst|$dst, $src}",
+ [(store (and (loadi32 addr:$dst), imm:$src), addr:$dst)]>;
+ def AND16mi8 : Ii8<0x83, MRM4m,
+ (ops i16mem:$dst, i16i8imm :$src),
+ "and{w} {$src, $dst|$dst, $src}",
+ [(store (and (load addr:$dst), i16immSExt8:$src), addr:$dst)]>,
+ OpSize;
+ def AND32mi8 : Ii8<0x83, MRM4m,
+ (ops i32mem:$dst, i32i8imm :$src),
+ "and{l} {$src, $dst|$dst, $src}",
+ [(store (and (load addr:$dst), i32immSExt8:$src), addr:$dst)]>;
+}
+
+
+let isCommutable = 1 in { // X = OR Y, Z --> X = OR Z, Y
+def OR8rr : I<0x08, MRMDestReg, (ops GR8 :$dst, GR8 :$src1, GR8 :$src2),
+ "or{b} {$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (or GR8:$src1, GR8:$src2))]>;
+def OR16rr : I<0x09, MRMDestReg, (ops GR16:$dst, GR16:$src1, GR16:$src2),
+ "or{w} {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (or GR16:$src1, GR16:$src2))]>, OpSize;
+def OR32rr : I<0x09, MRMDestReg, (ops GR32:$dst, GR32:$src1, GR32:$src2),
+ "or{l} {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (or GR32:$src1, GR32:$src2))]>;
+}
+def OR8rm : I<0x0A, MRMSrcMem , (ops GR8 :$dst, GR8 :$src1, i8mem :$src2),
+ "or{b} {$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (or GR8:$src1, (load addr:$src2)))]>;
+def OR16rm : I<0x0B, MRMSrcMem , (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+ "or{w} {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (or GR16:$src1, (load addr:$src2)))]>, OpSize;
+def OR32rm : I<0x0B, MRMSrcMem , (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+ "or{l} {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (or GR32:$src1, (load addr:$src2)))]>;
+
+def OR8ri : Ii8 <0x80, MRM1r, (ops GR8 :$dst, GR8 :$src1, i8imm:$src2),
+ "or{b} {$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (or GR8:$src1, imm:$src2))]>;
+def OR16ri : Ii16<0x81, MRM1r, (ops GR16:$dst, GR16:$src1, i16imm:$src2),
+ "or{w} {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (or GR16:$src1, imm:$src2))]>, OpSize;
+def OR32ri : Ii32<0x81, MRM1r, (ops GR32:$dst, GR32:$src1, i32imm:$src2),
+ "or{l} {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (or GR32:$src1, imm:$src2))]>;
+
+def OR16ri8 : Ii8<0x83, MRM1r, (ops GR16:$dst, GR16:$src1, i16i8imm:$src2),
+ "or{w} {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (or GR16:$src1, i16immSExt8:$src2))]>, OpSize;
+def OR32ri8 : Ii8<0x83, MRM1r, (ops GR32:$dst, GR32:$src1, i32i8imm:$src2),
+ "or{l} {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (or GR32:$src1, i32immSExt8:$src2))]>;
+let isTwoAddress = 0 in {
+ def OR8mr : I<0x08, MRMDestMem, (ops i8mem:$dst, GR8:$src),
+ "or{b} {$src, $dst|$dst, $src}",
+ [(store (or (load addr:$dst), GR8:$src), addr:$dst)]>;
+ def OR16mr : I<0x09, MRMDestMem, (ops i16mem:$dst, GR16:$src),
+ "or{w} {$src, $dst|$dst, $src}",
+ [(store (or (load addr:$dst), GR16:$src), addr:$dst)]>, OpSize;
+ def OR32mr : I<0x09, MRMDestMem, (ops i32mem:$dst, GR32:$src),
+ "or{l} {$src, $dst|$dst, $src}",
+ [(store (or (load addr:$dst), GR32:$src), addr:$dst)]>;
+ def OR8mi : Ii8<0x80, MRM1m, (ops i8mem :$dst, i8imm:$src),
+ "or{b} {$src, $dst|$dst, $src}",
+ [(store (or (loadi8 addr:$dst), imm:$src), addr:$dst)]>;
+ def OR16mi : Ii16<0x81, MRM1m, (ops i16mem:$dst, i16imm:$src),
+ "or{w} {$src, $dst|$dst, $src}",
+ [(store (or (loadi16 addr:$dst), imm:$src), addr:$dst)]>,
+ OpSize;
+ def OR32mi : Ii32<0x81, MRM1m, (ops i32mem:$dst, i32imm:$src),
+ "or{l} {$src, $dst|$dst, $src}",
+ [(store (or (loadi32 addr:$dst), imm:$src), addr:$dst)]>;
+ def OR16mi8 : Ii8<0x83, MRM1m, (ops i16mem:$dst, i16i8imm:$src),
+ "or{w} {$src, $dst|$dst, $src}",
+ [(store (or (load addr:$dst), i16immSExt8:$src), addr:$dst)]>,
+ OpSize;
+ def OR32mi8 : Ii8<0x83, MRM1m, (ops i32mem:$dst, i32i8imm:$src),
+ "or{l} {$src, $dst|$dst, $src}",
+ [(store (or (load addr:$dst), i32immSExt8:$src), addr:$dst)]>;
+}
+
+
+let isCommutable = 1 in { // X = XOR Y, Z --> X = XOR Z, Y
+def XOR8rr : I<0x30, MRMDestReg,
+ (ops GR8 :$dst, GR8 :$src1, GR8 :$src2),
+ "xor{b} {$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (xor GR8:$src1, GR8:$src2))]>;
+def XOR16rr : I<0x31, MRMDestReg,
+ (ops GR16:$dst, GR16:$src1, GR16:$src2),
+ "xor{w} {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (xor GR16:$src1, GR16:$src2))]>, OpSize;
+def XOR32rr : I<0x31, MRMDestReg,
+ (ops GR32:$dst, GR32:$src1, GR32:$src2),
+ "xor{l} {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (xor GR32:$src1, GR32:$src2))]>;
+}
+
+def XOR8rm : I<0x32, MRMSrcMem ,
+ (ops GR8 :$dst, GR8:$src1, i8mem :$src2),
+ "xor{b} {$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (xor GR8:$src1, (load addr:$src2)))]>;
+def XOR16rm : I<0x33, MRMSrcMem ,
+ (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+ "xor{w} {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (xor GR16:$src1, (load addr:$src2)))]>, OpSize;
+def XOR32rm : I<0x33, MRMSrcMem ,
+ (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+ "xor{l} {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (xor GR32:$src1, (load addr:$src2)))]>;
+
+def XOR8ri : Ii8<0x80, MRM6r,
+ (ops GR8:$dst, GR8:$src1, i8imm:$src2),
+ "xor{b} {$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (xor GR8:$src1, imm:$src2))]>;
+def XOR16ri : Ii16<0x81, MRM6r,
+ (ops GR16:$dst, GR16:$src1, i16imm:$src2),
+ "xor{w} {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (xor GR16:$src1, imm:$src2))]>, OpSize;
+def XOR32ri : Ii32<0x81, MRM6r,
+ (ops GR32:$dst, GR32:$src1, i32imm:$src2),
+ "xor{l} {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (xor GR32:$src1, imm:$src2))]>;
+def XOR16ri8 : Ii8<0x83, MRM6r,
+ (ops GR16:$dst, GR16:$src1, i16i8imm:$src2),
+ "xor{w} {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (xor GR16:$src1, i16immSExt8:$src2))]>,
+ OpSize;
+def XOR32ri8 : Ii8<0x83, MRM6r,
+ (ops GR32:$dst, GR32:$src1, i32i8imm:$src2),
+ "xor{l} {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (xor GR32:$src1, i32immSExt8:$src2))]>;
+let isTwoAddress = 0 in {
+ def XOR8mr : I<0x30, MRMDestMem,
+ (ops i8mem :$dst, GR8 :$src),
+ "xor{b} {$src, $dst|$dst, $src}",
+ [(store (xor (load addr:$dst), GR8:$src), addr:$dst)]>;
+ def XOR16mr : I<0x31, MRMDestMem,
+ (ops i16mem:$dst, GR16:$src),
+ "xor{w} {$src, $dst|$dst, $src}",
+ [(store (xor (load addr:$dst), GR16:$src), addr:$dst)]>,
+ OpSize;
+ def XOR32mr : I<0x31, MRMDestMem,
+ (ops i32mem:$dst, GR32:$src),
+ "xor{l} {$src, $dst|$dst, $src}",
+ [(store (xor (load addr:$dst), GR32:$src), addr:$dst)]>;
+ def XOR8mi : Ii8<0x80, MRM6m,
+ (ops i8mem :$dst, i8imm :$src),
+ "xor{b} {$src, $dst|$dst, $src}",
+ [(store (xor (loadi8 addr:$dst), imm:$src), addr:$dst)]>;
+ def XOR16mi : Ii16<0x81, MRM6m,
+ (ops i16mem:$dst, i16imm:$src),
+ "xor{w} {$src, $dst|$dst, $src}",
+ [(store (xor (loadi16 addr:$dst), imm:$src), addr:$dst)]>,
+ OpSize;
+ def XOR32mi : Ii32<0x81, MRM6m,
+ (ops i32mem:$dst, i32imm:$src),
+ "xor{l} {$src, $dst|$dst, $src}",
+ [(store (xor (loadi32 addr:$dst), imm:$src), addr:$dst)]>;
+ def XOR16mi8 : Ii8<0x83, MRM6m,
+ (ops i16mem:$dst, i16i8imm :$src),
+ "xor{w} {$src, $dst|$dst, $src}",
+ [(store (xor (load addr:$dst), i16immSExt8:$src), addr:$dst)]>,
+ OpSize;
+ def XOR32mi8 : Ii8<0x83, MRM6m,
+ (ops i32mem:$dst, i32i8imm :$src),
+ "xor{l} {$src, $dst|$dst, $src}",
+ [(store (xor (load addr:$dst), i32immSExt8:$src), addr:$dst)]>;
+}
+
+// Shift instructions
+def SHL8rCL : I<0xD2, MRM4r, (ops GR8 :$dst, GR8 :$src),
+ "shl{b} {%cl, $dst|$dst, %CL}",
+ [(set GR8:$dst, (shl GR8:$src, CL))]>, Imp<[CL],[]>;
+def SHL16rCL : I<0xD3, MRM4r, (ops GR16:$dst, GR16:$src),
+ "shl{w} {%cl, $dst|$dst, %CL}",
+ [(set GR16:$dst, (shl GR16:$src, CL))]>, Imp<[CL],[]>, OpSize;
+def SHL32rCL : I<0xD3, MRM4r, (ops GR32:$dst, GR32:$src),
+ "shl{l} {%cl, $dst|$dst, %CL}",
+ [(set GR32:$dst, (shl GR32:$src, CL))]>, Imp<[CL],[]>;
+
+def SHL8ri : Ii8<0xC0, MRM4r, (ops GR8 :$dst, GR8 :$src1, i8imm:$src2),
+ "shl{b} {$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))]>;
+let isConvertibleToThreeAddress = 1 in { // Can transform into LEA.
+def SHL16ri : Ii8<0xC1, MRM4r, (ops GR16:$dst, GR16:$src1, i8imm:$src2),
+ "shl{w} {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))]>, OpSize;
+def SHL32ri : Ii8<0xC1, MRM4r, (ops GR32:$dst, GR32:$src1, i8imm:$src2),
+ "shl{l} {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))]>;
+}
+
+// Shift left by one. Not used because (add x, x) is slightly cheaper.
+def SHL8r1 : I<0xD0, MRM4r, (ops GR8 :$dst, GR8 :$src1),
+ "shl{b} $dst", []>;
+def SHL16r1 : I<0xD1, MRM4r, (ops GR16:$dst, GR16:$src1),
+ "shl{w} $dst", []>, OpSize;
+def SHL32r1 : I<0xD1, MRM4r, (ops GR32:$dst, GR32:$src1),
+ "shl{l} $dst", []>;
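+// For illustration, a selectable pattern here would mirror the shr/sar
+// by-one forms below, e.g. [(set GR32:$dst, (shl GR32:$src1, (i8 1)))];
+// it is left empty so isel keeps using (add x, x) for shifts by one.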
+
+let isTwoAddress = 0 in {
+ def SHL8mCL : I<0xD2, MRM4m, (ops i8mem :$dst),
+ "shl{b} {%cl, $dst|$dst, %CL}",
+ [(store (shl (loadi8 addr:$dst), CL), addr:$dst)]>,
+ Imp<[CL],[]>;
+ def SHL16mCL : I<0xD3, MRM4m, (ops i16mem:$dst),
+ "shl{w} {%cl, $dst|$dst, %CL}",
+ [(store (shl (loadi16 addr:$dst), CL), addr:$dst)]>,
+ Imp<[CL],[]>, OpSize;
+ def SHL32mCL : I<0xD3, MRM4m, (ops i32mem:$dst),
+ "shl{l} {%cl, $dst|$dst, %CL}",
+ [(store (shl (loadi32 addr:$dst), CL), addr:$dst)]>,
+ Imp<[CL],[]>;
+ def SHL8mi : Ii8<0xC0, MRM4m, (ops i8mem :$dst, i8imm:$src),
+ "shl{b} {$src, $dst|$dst, $src}",
+ [(store (shl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+ def SHL16mi : Ii8<0xC1, MRM4m, (ops i16mem:$dst, i8imm:$src),
+ "shl{w} {$src, $dst|$dst, $src}",
+ [(store (shl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize;
+ def SHL32mi : Ii8<0xC1, MRM4m, (ops i32mem:$dst, i8imm:$src),
+ "shl{l} {$src, $dst|$dst, $src}",
+ [(store (shl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+
+ // Shift by 1
+ def SHL8m1 : I<0xD0, MRM4m, (ops i8mem :$dst),
+ "shl{b} $dst",
+ [(store (shl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+ def SHL16m1 : I<0xD1, MRM4m, (ops i16mem:$dst),
+ "shl{w} $dst",
+ [(store (shl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize;
+ def SHL32m1 : I<0xD1, MRM4m, (ops i32mem:$dst),
+ "shl{l} $dst",
+ [(store (shl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+}
+
+def SHR8rCL : I<0xD2, MRM5r, (ops GR8 :$dst, GR8 :$src),
+ "shr{b} {%cl, $dst|$dst, %CL}",
+ [(set GR8:$dst, (srl GR8:$src, CL))]>, Imp<[CL],[]>;
+def SHR16rCL : I<0xD3, MRM5r, (ops GR16:$dst, GR16:$src),
+ "shr{w} {%cl, $dst|$dst, %CL}",
+ [(set GR16:$dst, (srl GR16:$src, CL))]>, Imp<[CL],[]>, OpSize;
+def SHR32rCL : I<0xD3, MRM5r, (ops GR32:$dst, GR32:$src),
+ "shr{l} {%cl, $dst|$dst, %CL}",
+ [(set GR32:$dst, (srl GR32:$src, CL))]>, Imp<[CL],[]>;
+
+def SHR8ri : Ii8<0xC0, MRM5r, (ops GR8:$dst, GR8:$src1, i8imm:$src2),
+ "shr{b} {$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (srl GR8:$src1, (i8 imm:$src2)))]>;
+def SHR16ri : Ii8<0xC1, MRM5r, (ops GR16:$dst, GR16:$src1, i8imm:$src2),
+ "shr{w} {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (srl GR16:$src1, (i8 imm:$src2)))]>, OpSize;
+def SHR32ri : Ii8<0xC1, MRM5r, (ops GR32:$dst, GR32:$src1, i8imm:$src2),
+ "shr{l} {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (srl GR32:$src1, (i8 imm:$src2)))]>;
+
+// Shift by 1
+def SHR8r1 : I<0xD0, MRM5r, (ops GR8:$dst, GR8:$src1),
+ "shr{b} $dst",
+ [(set GR8:$dst, (srl GR8:$src1, (i8 1)))]>;
+def SHR16r1 : I<0xD1, MRM5r, (ops GR16:$dst, GR16:$src1),
+ "shr{w} $dst",
+ [(set GR16:$dst, (srl GR16:$src1, (i8 1)))]>, OpSize;
+def SHR32r1 : I<0xD1, MRM5r, (ops GR32:$dst, GR32:$src1),
+ "shr{l} $dst",
+ [(set GR32:$dst, (srl GR32:$src1, (i8 1)))]>;
+
+let isTwoAddress = 0 in {
+ def SHR8mCL : I<0xD2, MRM5m, (ops i8mem :$dst),
+ "shr{b} {%cl, $dst|$dst, %CL}",
+ [(store (srl (loadi8 addr:$dst), CL), addr:$dst)]>,
+ Imp<[CL],[]>;
+ def SHR16mCL : I<0xD3, MRM5m, (ops i16mem:$dst),
+ "shr{w} {%cl, $dst|$dst, %CL}",
+ [(store (srl (loadi16 addr:$dst), CL), addr:$dst)]>,
+ Imp<[CL],[]>, OpSize;
+ def SHR32mCL : I<0xD3, MRM5m, (ops i32mem:$dst),
+ "shr{l} {%cl, $dst|$dst, %CL}",
+ [(store (srl (loadi32 addr:$dst), CL), addr:$dst)]>,
+ Imp<[CL],[]>;
+ def SHR8mi : Ii8<0xC0, MRM5m, (ops i8mem :$dst, i8imm:$src),
+ "shr{b} {$src, $dst|$dst, $src}",
+ [(store (srl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+ def SHR16mi : Ii8<0xC1, MRM5m, (ops i16mem:$dst, i8imm:$src),
+ "shr{w} {$src, $dst|$dst, $src}",
+ [(store (srl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize;
+ def SHR32mi : Ii8<0xC1, MRM5m, (ops i32mem:$dst, i8imm:$src),
+ "shr{l} {$src, $dst|$dst, $src}",
+ [(store (srl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+
+ // Shift by 1
+ def SHR8m1 : I<0xD0, MRM5m, (ops i8mem :$dst),
+ "shr{b} $dst",
+ [(store (srl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+ def SHR16m1 : I<0xD1, MRM5m, (ops i16mem:$dst),
+ "shr{w} $dst",
+ [(store (srl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,OpSize;
+ def SHR32m1 : I<0xD1, MRM5m, (ops i32mem:$dst),
+ "shr{l} $dst",
+ [(store (srl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+}
+
+def SAR8rCL : I<0xD2, MRM7r, (ops GR8 :$dst, GR8 :$src),
+ "sar{b} {%cl, $dst|$dst, %CL}",
+ [(set GR8:$dst, (sra GR8:$src, CL))]>, Imp<[CL],[]>;
+def SAR16rCL : I<0xD3, MRM7r, (ops GR16:$dst, GR16:$src),
+ "sar{w} {%cl, $dst|$dst, %CL}",
+ [(set GR16:$dst, (sra GR16:$src, CL))]>, Imp<[CL],[]>, OpSize;
+def SAR32rCL : I<0xD3, MRM7r, (ops GR32:$dst, GR32:$src),
+ "sar{l} {%cl, $dst|$dst, %CL}",
+ [(set GR32:$dst, (sra GR32:$src, CL))]>, Imp<[CL],[]>;
+
+def SAR8ri : Ii8<0xC0, MRM7r, (ops GR8 :$dst, GR8 :$src1, i8imm:$src2),
+ "sar{b} {$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (sra GR8:$src1, (i8 imm:$src2)))]>;
+def SAR16ri : Ii8<0xC1, MRM7r, (ops GR16:$dst, GR16:$src1, i8imm:$src2),
+ "sar{w} {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (sra GR16:$src1, (i8 imm:$src2)))]>,
+ OpSize;
+def SAR32ri : Ii8<0xC1, MRM7r, (ops GR32:$dst, GR32:$src1, i8imm:$src2),
+ "sar{l} {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sra GR32:$src1, (i8 imm:$src2)))]>;
+
+// Shift by 1
+def SAR8r1 : I<0xD0, MRM7r, (ops GR8 :$dst, GR8 :$src1),
+ "sar{b} $dst",
+ [(set GR8:$dst, (sra GR8:$src1, (i8 1)))]>;
+def SAR16r1 : I<0xD1, MRM7r, (ops GR16:$dst, GR16:$src1),
+ "sar{w} $dst",
+ [(set GR16:$dst, (sra GR16:$src1, (i8 1)))]>, OpSize;
+def SAR32r1 : I<0xD1, MRM7r, (ops GR32:$dst, GR32:$src1),
+ "sar{l} $dst",
+ [(set GR32:$dst, (sra GR32:$src1, (i8 1)))]>;
+
+let isTwoAddress = 0 in {
+ def SAR8mCL : I<0xD2, MRM7m, (ops i8mem :$dst),
+ "sar{b} {%cl, $dst|$dst, %CL}",
+ [(store (sra (loadi8 addr:$dst), CL), addr:$dst)]>,
+ Imp<[CL],[]>;
+ def SAR16mCL : I<0xD3, MRM7m, (ops i16mem:$dst),
+ "sar{w} {%cl, $dst|$dst, %CL}",
+ [(store (sra (loadi16 addr:$dst), CL), addr:$dst)]>,
+ Imp<[CL],[]>, OpSize;
+ def SAR32mCL : I<0xD3, MRM7m, (ops i32mem:$dst),
+ "sar{l} {%cl, $dst|$dst, %CL}",
+ [(store (sra (loadi32 addr:$dst), CL), addr:$dst)]>,
+ Imp<[CL],[]>;
+ def SAR8mi : Ii8<0xC0, MRM7m, (ops i8mem :$dst, i8imm:$src),
+ "sar{b} {$src, $dst|$dst, $src}",
+ [(store (sra (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+ def SAR16mi : Ii8<0xC1, MRM7m, (ops i16mem:$dst, i8imm:$src),
+ "sar{w} {$src, $dst|$dst, $src}",
+ [(store (sra (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize;
+ def SAR32mi : Ii8<0xC1, MRM7m, (ops i32mem:$dst, i8imm:$src),
+ "sar{l} {$src, $dst|$dst, $src}",
+ [(store (sra (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+
+ // Shift by 1
+ def SAR8m1 : I<0xD0, MRM7m, (ops i8mem :$dst),
+ "sar{b} $dst",
+ [(store (sra (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+ def SAR16m1 : I<0xD1, MRM7m, (ops i16mem:$dst),
+ "sar{w} $dst",
+ [(store (sra (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize;
+ def SAR32m1 : I<0xD1, MRM7m, (ops i32mem:$dst),
+ "sar{l} $dst",
+ [(store (sra (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+}
+
+// Rotate instructions
+// FIXME: provide shorter instructions when imm8 == 1
+def ROL8rCL : I<0xD2, MRM0r, (ops GR8 :$dst, GR8 :$src),
+ "rol{b} {%cl, $dst|$dst, %CL}",
+ [(set GR8:$dst, (rotl GR8:$src, CL))]>, Imp<[CL],[]>;
+def ROL16rCL : I<0xD3, MRM0r, (ops GR16:$dst, GR16:$src),
+ "rol{w} {%cl, $dst|$dst, %CL}",
+ [(set GR16:$dst, (rotl GR16:$src, CL))]>, Imp<[CL],[]>, OpSize;
+def ROL32rCL : I<0xD3, MRM0r, (ops GR32:$dst, GR32:$src),
+ "rol{l} {%cl, $dst|$dst, %CL}",
+ [(set GR32:$dst, (rotl GR32:$src, CL))]>, Imp<[CL],[]>;
+
+def ROL8ri : Ii8<0xC0, MRM0r, (ops GR8 :$dst, GR8 :$src1, i8imm:$src2),
+ "rol{b} {$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))]>;
+def ROL16ri : Ii8<0xC1, MRM0r, (ops GR16:$dst, GR16:$src1, i8imm:$src2),
+ "rol{w} {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))]>, OpSize;
+def ROL32ri : Ii8<0xC1, MRM0r, (ops GR32:$dst, GR32:$src1, i8imm:$src2),
+ "rol{l} {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))]>;
+
+// Rotate by 1
+def ROL8r1 : I<0xD0, MRM0r, (ops GR8 :$dst, GR8 :$src1),
+ "rol{b} $dst",
+ [(set GR8:$dst, (rotl GR8:$src1, (i8 1)))]>;
+def ROL16r1 : I<0xD1, MRM0r, (ops GR16:$dst, GR16:$src1),
+ "rol{w} $dst",
+ [(set GR16:$dst, (rotl GR16:$src1, (i8 1)))]>, OpSize;
+def ROL32r1 : I<0xD1, MRM0r, (ops GR32:$dst, GR32:$src1),
+ "rol{l} $dst",
+ [(set GR32:$dst, (rotl GR32:$src1, (i8 1)))]>;
+
+let isTwoAddress = 0 in {
+ def ROL8mCL : I<0xD2, MRM0m, (ops i8mem :$dst),
+ "rol{b} {%cl, $dst|$dst, %CL}",
+ [(store (rotl (loadi8 addr:$dst), CL), addr:$dst)]>,
+ Imp<[CL],[]>;
+ def ROL16mCL : I<0xD3, MRM0m, (ops i16mem:$dst),
+ "rol{w} {%cl, $dst|$dst, %CL}",
+ [(store (rotl (loadi16 addr:$dst), CL), addr:$dst)]>,
+ Imp<[CL],[]>, OpSize;
+ def ROL32mCL : I<0xD3, MRM0m, (ops i32mem:$dst),
+ "rol{l} {%cl, $dst|$dst, %CL}",
+ [(store (rotl (loadi32 addr:$dst), CL), addr:$dst)]>,
+ Imp<[CL],[]>;
+ def ROL8mi : Ii8<0xC0, MRM0m, (ops i8mem :$dst, i8imm:$src),
+ "rol{b} {$src, $dst|$dst, $src}",
+ [(store (rotl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+ def ROL16mi : Ii8<0xC1, MRM0m, (ops i16mem:$dst, i8imm:$src),
+ "rol{w} {$src, $dst|$dst, $src}",
+ [(store (rotl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize;
+ def ROL32mi : Ii8<0xC1, MRM0m, (ops i32mem:$dst, i8imm:$src),
+ "rol{l} {$src, $dst|$dst, $src}",
+ [(store (rotl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+
+ // Rotate by 1
+ def ROL8m1 : I<0xD0, MRM0m, (ops i8mem :$dst),
+ "rol{b} $dst",
+ [(store (rotl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+ def ROL16m1 : I<0xD1, MRM0m, (ops i16mem:$dst),
+ "rol{w} $dst",
+ [(store (rotl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize;
+ def ROL32m1 : I<0xD1, MRM0m, (ops i32mem:$dst),
+ "rol{l} $dst",
+ [(store (rotl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+}
+
+def ROR8rCL : I<0xD2, MRM1r, (ops GR8 :$dst, GR8 :$src),
+ "ror{b} {%cl, $dst|$dst, %CL}",
+ [(set GR8:$dst, (rotr GR8:$src, CL))]>, Imp<[CL],[]>;
+def ROR16rCL : I<0xD3, MRM1r, (ops GR16:$dst, GR16:$src),
+ "ror{w} {%cl, $dst|$dst, %CL}",
+ [(set GR16:$dst, (rotr GR16:$src, CL))]>, Imp<[CL],[]>, OpSize;
+def ROR32rCL : I<0xD3, MRM1r, (ops GR32:$dst, GR32:$src),
+ "ror{l} {%cl, $dst|$dst, %CL}",
+ [(set GR32:$dst, (rotr GR32:$src, CL))]>, Imp<[CL],[]>;
+
+def ROR8ri : Ii8<0xC0, MRM1r, (ops GR8 :$dst, GR8 :$src1, i8imm:$src2),
+ "ror{b} {$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (rotr GR8:$src1, (i8 imm:$src2)))]>;
+def ROR16ri : Ii8<0xC1, MRM1r, (ops GR16:$dst, GR16:$src1, i8imm:$src2),
+ "ror{w} {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (rotr GR16:$src1, (i8 imm:$src2)))]>, OpSize;
+def ROR32ri : Ii8<0xC1, MRM1r, (ops GR32:$dst, GR32:$src1, i8imm:$src2),
+ "ror{l} {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))]>;
+
+// Rotate by 1
+def ROR8r1 : I<0xD0, MRM1r, (ops GR8 :$dst, GR8 :$src1),
+ "ror{b} $dst",
+ [(set GR8:$dst, (rotr GR8:$src1, (i8 1)))]>;
+def ROR16r1 : I<0xD1, MRM1r, (ops GR16:$dst, GR16:$src1),
+ "ror{w} $dst",
+ [(set GR16:$dst, (rotr GR16:$src1, (i8 1)))]>, OpSize;
+def ROR32r1 : I<0xD1, MRM1r, (ops GR32:$dst, GR32:$src1),
+ "ror{l} $dst",
+ [(set GR32:$dst, (rotr GR32:$src1, (i8 1)))]>;
+
+let isTwoAddress = 0 in {
+ def ROR8mCL : I<0xD2, MRM1m, (ops i8mem :$dst),
+ "ror{b} {%cl, $dst|$dst, %CL}",
+ [(store (rotr (loadi8 addr:$dst), CL), addr:$dst)]>,
+ Imp<[CL],[]>;
+ def ROR16mCL : I<0xD3, MRM1m, (ops i16mem:$dst),
+ "ror{w} {%cl, $dst|$dst, %CL}",
+ [(store (rotr (loadi16 addr:$dst), CL), addr:$dst)]>,
+ Imp<[CL],[]>, OpSize;
+ def ROR32mCL : I<0xD3, MRM1m, (ops i32mem:$dst),
+ "ror{l} {%cl, $dst|$dst, %CL}",
+ [(store (rotr (loadi32 addr:$dst), CL), addr:$dst)]>,
+ Imp<[CL],[]>;
+ def ROR8mi : Ii8<0xC0, MRM1m, (ops i8mem :$dst, i8imm:$src),
+ "ror{b} {$src, $dst|$dst, $src}",
+ [(store (rotr (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+ def ROR16mi : Ii8<0xC1, MRM1m, (ops i16mem:$dst, i8imm:$src),
+ "ror{w} {$src, $dst|$dst, $src}",
+ [(store (rotr (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+ OpSize;
+ def ROR32mi : Ii8<0xC1, MRM1m, (ops i32mem:$dst, i8imm:$src),
+ "ror{l} {$src, $dst|$dst, $src}",
+ [(store (rotr (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+
+ // Rotate by 1
+ def ROR8m1 : I<0xD0, MRM1m, (ops i8mem :$dst),
+ "ror{b} $dst",
+ [(store (rotr (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+ def ROR16m1 : I<0xD1, MRM1m, (ops i16mem:$dst),
+ "ror{w} $dst",
+ [(store (rotr (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+ OpSize;
+ def ROR32m1 : I<0xD1, MRM1m, (ops i32mem:$dst),
+ "ror{l} $dst",
+ [(store (rotr (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+}
+
+
+
+// Double shift instructions (generalizations of rotate)
+def SHLD32rrCL : I<0xA5, MRMDestReg, (ops GR32:$dst, GR32:$src1, GR32:$src2),
+ "shld{l} {%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))]>,
+ Imp<[CL],[]>, TB;
+def SHRD32rrCL : I<0xAD, MRMDestReg, (ops GR32:$dst, GR32:$src1, GR32:$src2),
+ "shrd{l} {%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))]>,
+ Imp<[CL],[]>, TB;
+def SHLD16rrCL : I<0xA5, MRMDestReg, (ops GR16:$dst, GR16:$src1, GR16:$src2),
+ "shld{w} {%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))]>,
+ Imp<[CL],[]>, TB, OpSize;
+def SHRD16rrCL : I<0xAD, MRMDestReg, (ops GR16:$dst, GR16:$src1, GR16:$src2),
+ "shrd{w} {%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))]>,
+ Imp<[CL],[]>, TB, OpSize;
+
+let isCommutable = 1 in { // These instructions commute to each other.
+def SHLD32rri8 : Ii8<0xA4, MRMDestReg,
+ (ops GR32:$dst, GR32:$src1, GR32:$src2, i8imm:$src3),
+ "shld{l} {$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2,
+ (i8 imm:$src3)))]>,
+ TB;
+def SHRD32rri8 : Ii8<0xAC, MRMDestReg,
+ (ops GR32:$dst, GR32:$src1, GR32:$src2, i8imm:$src3),
+ "shrd{l} {$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2,
+ (i8 imm:$src3)))]>,
+ TB;
+def SHLD16rri8 : Ii8<0xA4, MRMDestReg,
+ (ops GR16:$dst, GR16:$src1, GR16:$src2, i8imm:$src3),
+ "shld{w} {$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2,
+ (i8 imm:$src3)))]>,
+ TB, OpSize;
+def SHRD16rri8 : Ii8<0xAC, MRMDestReg,
+ (ops GR16:$dst, GR16:$src1, GR16:$src2, i8imm:$src3),
+ "shrd{w} {$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2,
+ (i8 imm:$src3)))]>,
+ TB, OpSize;
+}
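+// The commutation relies on the identity shld(a, b, c) == shrd(b, a, 32-c)
+// (16-c for the 16-bit forms): swapping the operands and complementing the
+// count produces the same result, so either opcode can be chosen.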
+
+let isTwoAddress = 0 in {
+ def SHLD32mrCL : I<0xA5, MRMDestMem, (ops i32mem:$dst, GR32:$src2),
+ "shld{l} {%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(store (X86shld (loadi32 addr:$dst), GR32:$src2, CL),
+ addr:$dst)]>,
+ Imp<[CL],[]>, TB;
+ def SHRD32mrCL : I<0xAD, MRMDestMem, (ops i32mem:$dst, GR32:$src2),
+ "shrd{l} {%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL),
+ addr:$dst)]>,
+ Imp<[CL],[]>, TB;
+ def SHLD32mri8 : Ii8<0xA4, MRMDestMem,
+ (ops i32mem:$dst, GR32:$src2, i8imm:$src3),
+ "shld{l} {$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shld (loadi32 addr:$dst), GR32:$src2,
+ (i8 imm:$src3)), addr:$dst)]>,
+ TB;
+ def SHRD32mri8 : Ii8<0xAC, MRMDestMem,
+ (ops i32mem:$dst, GR32:$src2, i8imm:$src3),
+ "shrd{l} {$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shrd (loadi32 addr:$dst), GR32:$src2,
+ (i8 imm:$src3)), addr:$dst)]>,
+ TB;
+
+ def SHLD16mrCL : I<0xA5, MRMDestMem, (ops i16mem:$dst, GR16:$src2),
+ "shld{w} {%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL),
+ addr:$dst)]>,
+ Imp<[CL],[]>, TB, OpSize;
+ def SHRD16mrCL : I<0xAD, MRMDestMem, (ops i16mem:$dst, GR16:$src2),
+ "shrd{w} {%cl, $src2, $dst|$dst, $src2, %CL}",
+ [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, CL),
+ addr:$dst)]>,
+ Imp<[CL],[]>, TB, OpSize;
+ def SHLD16mri8 : Ii8<0xA4, MRMDestMem,
+ (ops i16mem:$dst, GR16:$src2, i8imm:$src3),
+ "shld{w} {$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shld (loadi16 addr:$dst), GR16:$src2,
+ (i8 imm:$src3)), addr:$dst)]>,
+ TB, OpSize;
+ def SHRD16mri8 : Ii8<0xAC, MRMDestMem,
+ (ops i16mem:$dst, GR16:$src2, i8imm:$src3),
+ "shrd{w} {$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(store (X86shrd (loadi16 addr:$dst), GR16:$src2,
+ (i8 imm:$src3)), addr:$dst)]>,
+ TB, OpSize;
+}
+
+
+// Arithmetic.
+let isCommutable = 1 in { // X = ADD Y, Z --> X = ADD Z, Y
+def ADD8rr : I<0x00, MRMDestReg, (ops GR8 :$dst, GR8 :$src1, GR8 :$src2),
+ "add{b} {$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (add GR8:$src1, GR8:$src2))]>;
+let isConvertibleToThreeAddress = 1 in { // Can transform into LEA.
+def ADD16rr : I<0x01, MRMDestReg, (ops GR16:$dst, GR16:$src1, GR16:$src2),
+ "add{w} {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (add GR16:$src1, GR16:$src2))]>, OpSize;
+def ADD32rr : I<0x01, MRMDestReg, (ops GR32:$dst, GR32:$src1, GR32:$src2),
+ "add{l} {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (add GR32:$src1, GR32:$src2))]>;
+} // end isConvertibleToThreeAddress
+} // end isCommutable
+def ADD8rm : I<0x02, MRMSrcMem, (ops GR8 :$dst, GR8 :$src1, i8mem :$src2),
+ "add{b} {$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (add GR8:$src1, (load addr:$src2)))]>;
+def ADD16rm : I<0x03, MRMSrcMem, (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+ "add{w} {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (add GR16:$src1, (load addr:$src2)))]>, OpSize;
+def ADD32rm : I<0x03, MRMSrcMem, (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+ "add{l} {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (add GR32:$src1, (load addr:$src2)))]>;
+
+def ADD8ri : Ii8<0x80, MRM0r, (ops GR8:$dst, GR8:$src1, i8imm:$src2),
+ "add{b} {$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (add GR8:$src1, imm:$src2))]>;
+
+let isConvertibleToThreeAddress = 1 in { // Can transform into LEA.
+def ADD16ri : Ii16<0x81, MRM0r, (ops GR16:$dst, GR16:$src1, i16imm:$src2),
+ "add{w} {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (add GR16:$src1, imm:$src2))]>, OpSize;
+def ADD32ri : Ii32<0x81, MRM0r, (ops GR32:$dst, GR32:$src1, i32imm:$src2),
+ "add{l} {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (add GR32:$src1, imm:$src2))]>;
+def ADD16ri8 : Ii8<0x83, MRM0r, (ops GR16:$dst, GR16:$src1, i16i8imm:$src2),
+ "add{w} {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (add GR16:$src1, i16immSExt8:$src2))]>,
+ OpSize;
+def ADD32ri8 : Ii8<0x83, MRM0r, (ops GR32:$dst, GR32:$src1, i32i8imm:$src2),
+ "add{l} {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (add GR32:$src1, i32immSExt8:$src2))]>;
+}
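+// The LEA transform mentioned above sidesteps the two-address constraint
+// when the source register must stay live; e.g. instead of
+//   movl %eax, %ecx
+//   addl $4, %ecx
+// the same value can be produced by the single three-address
+//   leal 4(%eax), %ecx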
+
+let isTwoAddress = 0 in {
+ def ADD8mr : I<0x00, MRMDestMem, (ops i8mem :$dst, GR8 :$src2),
+ "add{b} {$src2, $dst|$dst, $src2}",
+ [(store (add (load addr:$dst), GR8:$src2), addr:$dst)]>;
+ def ADD16mr : I<0x01, MRMDestMem, (ops i16mem:$dst, GR16:$src2),
+ "add{w} {$src2, $dst|$dst, $src2}",
+ [(store (add (load addr:$dst), GR16:$src2), addr:$dst)]>,
+ OpSize;
+ def ADD32mr : I<0x01, MRMDestMem, (ops i32mem:$dst, GR32:$src2),
+ "add{l} {$src2, $dst|$dst, $src2}",
+ [(store (add (load addr:$dst), GR32:$src2), addr:$dst)]>;
+ def ADD8mi : Ii8<0x80, MRM0m, (ops i8mem :$dst, i8imm :$src2),
+ "add{b} {$src2, $dst|$dst, $src2}",
+ [(store (add (loadi8 addr:$dst), imm:$src2), addr:$dst)]>;
+ def ADD16mi : Ii16<0x81, MRM0m, (ops i16mem:$dst, i16imm:$src2),
+ "add{w} {$src2, $dst|$dst, $src2}",
+ [(store (add (loadi16 addr:$dst), imm:$src2), addr:$dst)]>,
+ OpSize;
+ def ADD32mi : Ii32<0x81, MRM0m, (ops i32mem:$dst, i32imm:$src2),
+ "add{l} {$src2, $dst|$dst, $src2}",
+ [(store (add (loadi32 addr:$dst), imm:$src2), addr:$dst)]>;
+ def ADD16mi8 : Ii8<0x83, MRM0m, (ops i16mem:$dst, i16i8imm :$src2),
+ "add{w} {$src2, $dst|$dst, $src2}",
+ [(store (add (load addr:$dst), i16immSExt8:$src2), addr:$dst)]>,
+ OpSize;
+ def ADD32mi8 : Ii8<0x83, MRM0m, (ops i32mem:$dst, i32i8imm :$src2),
+ "add{l} {$src2, $dst|$dst, $src2}",
+ [(store (add (load addr:$dst), i32immSExt8:$src2), addr:$dst)]>;
+}
+
+let isCommutable = 1 in { // X = ADC Y, Z --> X = ADC Z, Y
+def ADC32rr : I<0x11, MRMDestReg, (ops GR32:$dst, GR32:$src1, GR32:$src2),
+ "adc{l} {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (adde GR32:$src1, GR32:$src2))]>;
+}
+def ADC32rm : I<0x13, MRMSrcMem , (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+ "adc{l} {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (adde GR32:$src1, (load addr:$src2)))]>;
+def ADC32ri : Ii32<0x81, MRM2r, (ops GR32:$dst, GR32:$src1, i32imm:$src2),
+ "adc{l} {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (adde GR32:$src1, imm:$src2))]>;
+def ADC32ri8 : Ii8<0x83, MRM2r, (ops GR32:$dst, GR32:$src1, i32i8imm:$src2),
+ "adc{l} {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (adde GR32:$src1, i32immSExt8:$src2))]>;
+
+let isTwoAddress = 0 in {
+ def ADC32mr : I<0x11, MRMDestMem, (ops i32mem:$dst, GR32:$src2),
+ "adc{l} {$src2, $dst|$dst, $src2}",
+ [(store (adde (load addr:$dst), GR32:$src2), addr:$dst)]>;
+ def ADC32mi : Ii32<0x81, MRM2m, (ops i32mem:$dst, i32imm:$src2),
+ "adc{l} {$src2, $dst|$dst, $src2}",
+ [(store (adde (loadi32 addr:$dst), imm:$src2), addr:$dst)]>;
+ def ADC32mi8 : Ii8<0x83, MRM2m, (ops i32mem:$dst, i32i8imm :$src2),
+ "adc{l} {$src2, $dst|$dst, $src2}",
+ [(store (adde (load addr:$dst), i32immSExt8:$src2), addr:$dst)]>;
+}
+
+def SUB8rr : I<0x28, MRMDestReg, (ops GR8 :$dst, GR8 :$src1, GR8 :$src2),
+ "sub{b} {$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (sub GR8:$src1, GR8:$src2))]>;
+def SUB16rr : I<0x29, MRMDestReg, (ops GR16:$dst, GR16:$src1, GR16:$src2),
+ "sub{w} {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (sub GR16:$src1, GR16:$src2))]>, OpSize;
+def SUB32rr : I<0x29, MRMDestReg, (ops GR32:$dst, GR32:$src1, GR32:$src2),
+ "sub{l} {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sub GR32:$src1, GR32:$src2))]>;
+def SUB8rm : I<0x2A, MRMSrcMem, (ops GR8 :$dst, GR8 :$src1, i8mem :$src2),
+ "sub{b} {$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (sub GR8:$src1, (load addr:$src2)))]>;
+def SUB16rm : I<0x2B, MRMSrcMem, (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+ "sub{w} {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (sub GR16:$src1, (load addr:$src2)))]>, OpSize;
+def SUB32rm : I<0x2B, MRMSrcMem, (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+ "sub{l} {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sub GR32:$src1, (load addr:$src2)))]>;
+
+def SUB8ri : Ii8 <0x80, MRM5r, (ops GR8:$dst, GR8:$src1, i8imm:$src2),
+ "sub{b} {$src2, $dst|$dst, $src2}",
+ [(set GR8:$dst, (sub GR8:$src1, imm:$src2))]>;
+def SUB16ri : Ii16<0x81, MRM5r, (ops GR16:$dst, GR16:$src1, i16imm:$src2),
+ "sub{w} {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (sub GR16:$src1, imm:$src2))]>, OpSize;
+def SUB32ri : Ii32<0x81, MRM5r, (ops GR32:$dst, GR32:$src1, i32imm:$src2),
+ "sub{l} {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sub GR32:$src1, imm:$src2))]>;
+def SUB16ri8 : Ii8<0x83, MRM5r, (ops GR16:$dst, GR16:$src1, i16i8imm:$src2),
+ "sub{w} {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (sub GR16:$src1, i16immSExt8:$src2))]>,
+ OpSize;
+def SUB32ri8 : Ii8<0x83, MRM5r, (ops GR32:$dst, GR32:$src1, i32i8imm:$src2),
+ "sub{l} {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sub GR32:$src1, i32immSExt8:$src2))]>;
+let isTwoAddress = 0 in {
+ def SUB8mr : I<0x28, MRMDestMem, (ops i8mem :$dst, GR8 :$src2),
+ "sub{b} {$src2, $dst|$dst, $src2}",
+ [(store (sub (load addr:$dst), GR8:$src2), addr:$dst)]>;
+ def SUB16mr : I<0x29, MRMDestMem, (ops i16mem:$dst, GR16:$src2),
+ "sub{w} {$src2, $dst|$dst, $src2}",
+ [(store (sub (load addr:$dst), GR16:$src2), addr:$dst)]>,
+ OpSize;
+ def SUB32mr : I<0x29, MRMDestMem, (ops i32mem:$dst, GR32:$src2),
+ "sub{l} {$src2, $dst|$dst, $src2}",
+ [(store (sub (load addr:$dst), GR32:$src2), addr:$dst)]>;
+ def SUB8mi : Ii8<0x80, MRM5m, (ops i8mem :$dst, i8imm:$src2),
+ "sub{b} {$src2, $dst|$dst, $src2}",
+ [(store (sub (loadi8 addr:$dst), imm:$src2), addr:$dst)]>;
+ def SUB16mi : Ii16<0x81, MRM5m, (ops i16mem:$dst, i16imm:$src2),
+ "sub{w} {$src2, $dst|$dst, $src2}",
+ [(store (sub (loadi16 addr:$dst), imm:$src2), addr:$dst)]>,
+ OpSize;
+ def SUB32mi : Ii32<0x81, MRM5m, (ops i32mem:$dst, i32imm:$src2),
+ "sub{l} {$src2, $dst|$dst, $src2}",
+ [(store (sub (loadi32 addr:$dst), imm:$src2), addr:$dst)]>;
+ def SUB16mi8 : Ii8<0x83, MRM5m, (ops i16mem:$dst, i16i8imm :$src2),
+ "sub{w} {$src2, $dst|$dst, $src2}",
+ [(store (sub (load addr:$dst), i16immSExt8:$src2), addr:$dst)]>,
+ OpSize;
+ def SUB32mi8 : Ii8<0x83, MRM5m, (ops i32mem:$dst, i32i8imm :$src2),
+ "sub{l} {$src2, $dst|$dst, $src2}",
+ [(store (sub (load addr:$dst), i32immSExt8:$src2), addr:$dst)]>;
+}
+
+def SBB32rr : I<0x19, MRMDestReg, (ops GR32:$dst, GR32:$src1, GR32:$src2),
+ "sbb{l} {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sube GR32:$src1, GR32:$src2))]>;
+
+let isTwoAddress = 0 in {
+ def SBB32mr : I<0x19, MRMDestMem, (ops i32mem:$dst, GR32:$src2),
+ "sbb{l} {$src2, $dst|$dst, $src2}",
+ [(store (sube (load addr:$dst), GR32:$src2), addr:$dst)]>;
+  def SBB8mi  : Ii8<0x80, MRM3m, (ops i8mem:$dst, i8imm:$src2),
+ "sbb{b} {$src2, $dst|$dst, $src2}",
+ [(store (sube (loadi8 addr:$dst), imm:$src2), addr:$dst)]>;
+ def SBB32mi : Ii32<0x81, MRM3m, (ops i32mem:$dst, i32imm:$src2),
+ "sbb{l} {$src2, $dst|$dst, $src2}",
+ [(store (sube (loadi32 addr:$dst), imm:$src2), addr:$dst)]>;
+ def SBB32mi8 : Ii8<0x83, MRM3m, (ops i32mem:$dst, i32i8imm :$src2),
+ "sbb{l} {$src2, $dst|$dst, $src2}",
+ [(store (sube (load addr:$dst), i32immSExt8:$src2), addr:$dst)]>;
+}
+def SBB32rm : I<0x1B, MRMSrcMem, (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+ "sbb{l} {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sube GR32:$src1, (load addr:$src2)))]>;
+def SBB32ri : Ii32<0x81, MRM3r, (ops GR32:$dst, GR32:$src1, i32imm:$src2),
+ "sbb{l} {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sube GR32:$src1, imm:$src2))]>;
+def SBB32ri8 : Ii8<0x83, MRM3r, (ops GR32:$dst, GR32:$src1, i32i8imm:$src2),
+ "sbb{l} {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (sube GR32:$src1, i32immSExt8:$src2))]>;
+
+let isCommutable = 1 in { // X = IMUL Y, Z --> X = IMUL Z, Y
+def IMUL16rr : I<0xAF, MRMSrcReg, (ops GR16:$dst, GR16:$src1, GR16:$src2),
+ "imul{w} {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (mul GR16:$src1, GR16:$src2))]>, TB, OpSize;
+def IMUL32rr : I<0xAF, MRMSrcReg, (ops GR32:$dst, GR32:$src1, GR32:$src2),
+ "imul{l} {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (mul GR32:$src1, GR32:$src2))]>, TB;
+}
+def IMUL16rm : I<0xAF, MRMSrcMem, (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+ "imul{w} {$src2, $dst|$dst, $src2}",
+ [(set GR16:$dst, (mul GR16:$src1, (load addr:$src2)))]>,
+ TB, OpSize;
+def IMUL32rm : I<0xAF, MRMSrcMem, (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+ "imul{l} {$src2, $dst|$dst, $src2}",
+ [(set GR32:$dst, (mul GR32:$src1, (load addr:$src2)))]>, TB;
+
+} // end Two Address instructions
+
+// Surprisingly enough, these are not two-address instructions!
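+// These encode a true three-operand multiply: e.g. "imull $5, %esi, %edi"
+// computes EDI = ESI * 5 without modifying ESI, so no tied dst/src1 operand
+// is required.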
+def IMUL16rri : Ii16<0x69, MRMSrcReg, // GR16 = GR16*I16
+ (ops GR16:$dst, GR16:$src1, i16imm:$src2),
+ "imul{w} {$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, (mul GR16:$src1, imm:$src2))]>, OpSize;
+def IMUL32rri : Ii32<0x69, MRMSrcReg, // GR32 = GR32*I32
+ (ops GR32:$dst, GR32:$src1, i32imm:$src2),
+ "imul{l} {$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, (mul GR32:$src1, imm:$src2))]>;
+def IMUL16rri8 : Ii8<0x6B, MRMSrcReg, // GR16 = GR16*I8
+ (ops GR16:$dst, GR16:$src1, i16i8imm:$src2),
+ "imul{w} {$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, (mul GR16:$src1, i16immSExt8:$src2))]>,
+ OpSize;
+def IMUL32rri8 : Ii8<0x6B, MRMSrcReg, // GR32 = GR32*I8
+ (ops GR32:$dst, GR32:$src1, i32i8imm:$src2),
+ "imul{l} {$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, (mul GR32:$src1, i32immSExt8:$src2))]>;
+
+def IMUL16rmi : Ii16<0x69, MRMSrcMem, // GR16 = [mem16]*I16
+ (ops GR16:$dst, i16mem:$src1, i16imm:$src2),
+ "imul{w} {$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, (mul (load addr:$src1), imm:$src2))]>,
+ OpSize;
+def IMUL32rmi : Ii32<0x69, MRMSrcMem, // GR32 = [mem32]*I32
+ (ops GR32:$dst, i32mem:$src1, i32imm:$src2),
+ "imul{l} {$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, (mul (load addr:$src1), imm:$src2))]>;
+def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem, // GR16 = [mem16]*I8
+ (ops GR16:$dst, i16mem:$src1, i16i8imm :$src2),
+ "imul{w} {$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR16:$dst, (mul (load addr:$src1), i16immSExt8:$src2))]>,
+ OpSize;
+def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem, // GR32 = [mem32]*I8
+ (ops GR32:$dst, i32mem:$src1, i32i8imm: $src2),
+ "imul{l} {$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, (mul (load addr:$src1), i32immSExt8:$src2))]>;
+
+//===----------------------------------------------------------------------===//
+// Test instructions are just like AND, except they don't generate a result.
+//
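+// A common use is comparing a value against zero without spending an
+// immediate: "testl %eax, %eax" feeds the (X86cmp (and X, X), 0) patterns
+// below and is shorter than "cmpl $0, %eax".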
+let isCommutable = 1 in { // TEST X, Y --> TEST Y, X
+def TEST8rr : I<0x84, MRMDestReg, (ops GR8:$src1, GR8:$src2),
+ "test{b} {$src2, $src1|$src1, $src2}",
+ [(X86cmp (and GR8:$src1, GR8:$src2), 0)]>;
+def TEST16rr : I<0x85, MRMDestReg, (ops GR16:$src1, GR16:$src2),
+ "test{w} {$src2, $src1|$src1, $src2}",
+ [(X86cmp (and GR16:$src1, GR16:$src2), 0)]>, OpSize;
+def TEST32rr : I<0x85, MRMDestReg, (ops GR32:$src1, GR32:$src2),
+ "test{l} {$src2, $src1|$src1, $src2}",
+ [(X86cmp (and GR32:$src1, GR32:$src2), 0)]>;
+}
+
+def TEST8rm : I<0x84, MRMSrcMem, (ops GR8 :$src1, i8mem :$src2),
+ "test{b} {$src2, $src1|$src1, $src2}",
+ [(X86cmp (and GR8:$src1, (loadi8 addr:$src2)), 0)]>;
+def TEST16rm : I<0x85, MRMSrcMem, (ops GR16:$src1, i16mem:$src2),
+ "test{w} {$src2, $src1|$src1, $src2}",
+ [(X86cmp (and GR16:$src1, (loadi16 addr:$src2)), 0)]>,
+ OpSize;
+def TEST32rm : I<0x85, MRMSrcMem, (ops GR32:$src1, i32mem:$src2),
+ "test{l} {$src2, $src1|$src1, $src2}",
+ [(X86cmp (and GR32:$src1, (loadi32 addr:$src2)), 0)]>;
+
+def TEST8ri : Ii8 <0xF6, MRM0r, // flags = GR8 & imm8
+ (ops GR8:$src1, i8imm:$src2),
+ "test{b} {$src2, $src1|$src1, $src2}",
+ [(X86cmp (and GR8:$src1, imm:$src2), 0)]>;
+def TEST16ri : Ii16<0xF7, MRM0r, // flags = GR16 & imm16
+ (ops GR16:$src1, i16imm:$src2),
+ "test{w} {$src2, $src1|$src1, $src2}",
+ [(X86cmp (and GR16:$src1, imm:$src2), 0)]>, OpSize;
+def TEST32ri : Ii32<0xF7, MRM0r, // flags = GR32 & imm32
+ (ops GR32:$src1, i32imm:$src2),
+ "test{l} {$src2, $src1|$src1, $src2}",
+ [(X86cmp (and GR32:$src1, imm:$src2), 0)]>;
+
+def TEST8mi : Ii8 <0xF6, MRM0m, // flags = [mem8] & imm8
+ (ops i8mem:$src1, i8imm:$src2),
+ "test{b} {$src2, $src1|$src1, $src2}",
+ [(X86cmp (and (loadi8 addr:$src1), imm:$src2), 0)]>;
+def TEST16mi : Ii16<0xF7, MRM0m, // flags = [mem16] & imm16
+ (ops i16mem:$src1, i16imm:$src2),
+ "test{w} {$src2, $src1|$src1, $src2}",
+ [(X86cmp (and (loadi16 addr:$src1), imm:$src2), 0)]>,
+ OpSize;
+def TEST32mi : Ii32<0xF7, MRM0m, // flags = [mem32] & imm32
+ (ops i32mem:$src1, i32imm:$src2),
+ "test{l} {$src2, $src1|$src1, $src2}",
+ [(X86cmp (and (loadi32 addr:$src1), imm:$src2), 0)]>;
+
+
+// Condition code ops, incl. set if equal/not equal/...
+def SAHF : I<0x9E, RawFrm, (ops), "sahf", []>, Imp<[AH],[]>; // flags = AH
+def LAHF : I<0x9F, RawFrm, (ops), "lahf", []>, Imp<[],[AH]>; // AH = flags
+
+def SETEr : I<0x94, MRM0r,
+ (ops GR8 :$dst),
+ "sete $dst",
+ [(set GR8:$dst, (X86setcc X86_COND_E))]>,
+ TB; // GR8 = ==
+def SETEm : I<0x94, MRM0m,
+ (ops i8mem:$dst),
+ "sete $dst",
+ [(store (X86setcc X86_COND_E), addr:$dst)]>,
+ TB; // [mem8] = ==
+def SETNEr : I<0x95, MRM0r,
+ (ops GR8 :$dst),
+ "setne $dst",
+ [(set GR8:$dst, (X86setcc X86_COND_NE))]>,
+ TB; // GR8 = !=
+def SETNEm : I<0x95, MRM0m,
+ (ops i8mem:$dst),
+ "setne $dst",
+ [(store (X86setcc X86_COND_NE), addr:$dst)]>,
+ TB; // [mem8] = !=
+def SETLr : I<0x9C, MRM0r,
+ (ops GR8 :$dst),
+ "setl $dst",
+ [(set GR8:$dst, (X86setcc X86_COND_L))]>,
+ TB; // GR8 = < signed
+def SETLm : I<0x9C, MRM0m,
+ (ops i8mem:$dst),
+ "setl $dst",
+ [(store (X86setcc X86_COND_L), addr:$dst)]>,
+ TB; // [mem8] = < signed
+def SETGEr : I<0x9D, MRM0r,
+ (ops GR8 :$dst),
+ "setge $dst",
+ [(set GR8:$dst, (X86setcc X86_COND_GE))]>,
+ TB; // GR8 = >= signed
+def SETGEm : I<0x9D, MRM0m,
+ (ops i8mem:$dst),
+ "setge $dst",
+ [(store (X86setcc X86_COND_GE), addr:$dst)]>,
+ TB; // [mem8] = >= signed
+def SETLEr : I<0x9E, MRM0r,
+ (ops GR8 :$dst),
+ "setle $dst",
+ [(set GR8:$dst, (X86setcc X86_COND_LE))]>,
+ TB; // GR8 = <= signed
+def SETLEm : I<0x9E, MRM0m,
+ (ops i8mem:$dst),
+ "setle $dst",
+ [(store (X86setcc X86_COND_LE), addr:$dst)]>,
+ TB; // [mem8] = <= signed
+def SETGr : I<0x9F, MRM0r,
+ (ops GR8 :$dst),
+ "setg $dst",
+ [(set GR8:$dst, (X86setcc X86_COND_G))]>,
+ TB; // GR8 = > signed
+def SETGm : I<0x9F, MRM0m,
+ (ops i8mem:$dst),
+ "setg $dst",
+ [(store (X86setcc X86_COND_G), addr:$dst)]>,
+ TB; // [mem8] = > signed
+
+def SETBr : I<0x92, MRM0r,
+ (ops GR8 :$dst),
+ "setb $dst",
+ [(set GR8:$dst, (X86setcc X86_COND_B))]>,
+ TB; // GR8 = < unsign
+def SETBm : I<0x92, MRM0m,
+ (ops i8mem:$dst),
+ "setb $dst",
+ [(store (X86setcc X86_COND_B), addr:$dst)]>,
+ TB; // [mem8] = < unsign
+def SETAEr : I<0x93, MRM0r,
+ (ops GR8 :$dst),
+ "setae $dst",
+ [(set GR8:$dst, (X86setcc X86_COND_AE))]>,
+ TB; // GR8 = >= unsign
+def SETAEm : I<0x93, MRM0m,
+ (ops i8mem:$dst),
+ "setae $dst",
+ [(store (X86setcc X86_COND_AE), addr:$dst)]>,
+ TB; // [mem8] = >= unsign
+def SETBEr : I<0x96, MRM0r,
+ (ops GR8 :$dst),
+ "setbe $dst",
+ [(set GR8:$dst, (X86setcc X86_COND_BE))]>,
+ TB; // GR8 = <= unsign
+def SETBEm : I<0x96, MRM0m,
+ (ops i8mem:$dst),
+ "setbe $dst",
+ [(store (X86setcc X86_COND_BE), addr:$dst)]>,
+ TB; // [mem8] = <= unsign
+def SETAr : I<0x97, MRM0r,
+ (ops GR8 :$dst),
+ "seta $dst",
+ [(set GR8:$dst, (X86setcc X86_COND_A))]>,
+                TB;                        // GR8 = >  unsign
+def SETAm : I<0x97, MRM0m,
+ (ops i8mem:$dst),
+ "seta $dst",
+ [(store (X86setcc X86_COND_A), addr:$dst)]>,
+                TB;                        // [mem8] = >  unsign
+
+def SETSr : I<0x98, MRM0r,
+ (ops GR8 :$dst),
+ "sets $dst",
+ [(set GR8:$dst, (X86setcc X86_COND_S))]>,
+ TB; // GR8 = <sign bit>
+def SETSm : I<0x98, MRM0m,
+ (ops i8mem:$dst),
+ "sets $dst",
+ [(store (X86setcc X86_COND_S), addr:$dst)]>,
+ TB; // [mem8] = <sign bit>
+def SETNSr : I<0x99, MRM0r,
+ (ops GR8 :$dst),
+ "setns $dst",
+ [(set GR8:$dst, (X86setcc X86_COND_NS))]>,
+ TB; // GR8 = !<sign bit>
+def SETNSm : I<0x99, MRM0m,
+ (ops i8mem:$dst),
+ "setns $dst",
+ [(store (X86setcc X86_COND_NS), addr:$dst)]>,
+ TB; // [mem8] = !<sign bit>
+def SETPr : I<0x9A, MRM0r,
+ (ops GR8 :$dst),
+ "setp $dst",
+ [(set GR8:$dst, (X86setcc X86_COND_P))]>,
+ TB; // GR8 = parity
+def SETPm : I<0x9A, MRM0m,
+ (ops i8mem:$dst),
+ "setp $dst",
+ [(store (X86setcc X86_COND_P), addr:$dst)]>,
+ TB; // [mem8] = parity
+def SETNPr : I<0x9B, MRM0r,
+ (ops GR8 :$dst),
+ "setnp $dst",
+ [(set GR8:$dst, (X86setcc X86_COND_NP))]>,
+ TB; // GR8 = not parity
+def SETNPm : I<0x9B, MRM0m,
+ (ops i8mem:$dst),
+ "setnp $dst",
+ [(store (X86setcc X86_COND_NP), addr:$dst)]>,
+ TB; // [mem8] = not parity
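+// Materializing a boolean typically becomes a cmp or test followed by one of
+// these setcc forms; e.g. an i32 equality compare selects roughly to
+//   cmpl %esi, %edi
+//   sete %al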
+
+// Integer comparisons
+def CMP8rr : I<0x38, MRMDestReg,
+ (ops GR8 :$src1, GR8 :$src2),
+ "cmp{b} {$src2, $src1|$src1, $src2}",
+ [(X86cmp GR8:$src1, GR8:$src2)]>;
+def CMP16rr : I<0x39, MRMDestReg,
+ (ops GR16:$src1, GR16:$src2),
+ "cmp{w} {$src2, $src1|$src1, $src2}",
+ [(X86cmp GR16:$src1, GR16:$src2)]>, OpSize;
+def CMP32rr : I<0x39, MRMDestReg,
+ (ops GR32:$src1, GR32:$src2),
+ "cmp{l} {$src2, $src1|$src1, $src2}",
+ [(X86cmp GR32:$src1, GR32:$src2)]>;
+def CMP8mr : I<0x38, MRMDestMem,
+ (ops i8mem :$src1, GR8 :$src2),
+ "cmp{b} {$src2, $src1|$src1, $src2}",
+ [(X86cmp (loadi8 addr:$src1), GR8:$src2)]>;
+def CMP16mr : I<0x39, MRMDestMem,
+ (ops i16mem:$src1, GR16:$src2),
+ "cmp{w} {$src2, $src1|$src1, $src2}",
+ [(X86cmp (loadi16 addr:$src1), GR16:$src2)]>, OpSize;
+def CMP32mr : I<0x39, MRMDestMem,
+ (ops i32mem:$src1, GR32:$src2),
+ "cmp{l} {$src2, $src1|$src1, $src2}",
+ [(X86cmp (loadi32 addr:$src1), GR32:$src2)]>;
+def CMP8rm : I<0x3A, MRMSrcMem,
+ (ops GR8 :$src1, i8mem :$src2),
+ "cmp{b} {$src2, $src1|$src1, $src2}",
+ [(X86cmp GR8:$src1, (loadi8 addr:$src2))]>;
+def CMP16rm : I<0x3B, MRMSrcMem,
+ (ops GR16:$src1, i16mem:$src2),
+ "cmp{w} {$src2, $src1|$src1, $src2}",
+ [(X86cmp GR16:$src1, (loadi16 addr:$src2))]>, OpSize;
+def CMP32rm : I<0x3B, MRMSrcMem,
+ (ops GR32:$src1, i32mem:$src2),
+ "cmp{l} {$src2, $src1|$src1, $src2}",
+ [(X86cmp GR32:$src1, (loadi32 addr:$src2))]>;
+def CMP8ri : Ii8<0x80, MRM7r,
+ (ops GR8:$src1, i8imm:$src2),
+ "cmp{b} {$src2, $src1|$src1, $src2}",
+ [(X86cmp GR8:$src1, imm:$src2)]>;
+def CMP16ri : Ii16<0x81, MRM7r,
+ (ops GR16:$src1, i16imm:$src2),
+ "cmp{w} {$src2, $src1|$src1, $src2}",
+ [(X86cmp GR16:$src1, imm:$src2)]>, OpSize;
+def CMP32ri : Ii32<0x81, MRM7r,
+ (ops GR32:$src1, i32imm:$src2),
+ "cmp{l} {$src2, $src1|$src1, $src2}",
+ [(X86cmp GR32:$src1, imm:$src2)]>;
+def CMP8mi : Ii8 <0x80, MRM7m,
+ (ops i8mem :$src1, i8imm :$src2),
+ "cmp{b} {$src2, $src1|$src1, $src2}",
+ [(X86cmp (loadi8 addr:$src1), imm:$src2)]>;
+def CMP16mi : Ii16<0x81, MRM7m,
+ (ops i16mem:$src1, i16imm:$src2),
+ "cmp{w} {$src2, $src1|$src1, $src2}",
+ [(X86cmp (loadi16 addr:$src1), imm:$src2)]>, OpSize;
+def CMP32mi : Ii32<0x81, MRM7m,
+ (ops i32mem:$src1, i32imm:$src2),
+ "cmp{l} {$src2, $src1|$src1, $src2}",
+ [(X86cmp (loadi32 addr:$src1), imm:$src2)]>;
+def CMP16ri8 : Ii8<0x83, MRM7r,
+ (ops GR16:$src1, i16i8imm:$src2),
+ "cmp{w} {$src2, $src1|$src1, $src2}",
+ [(X86cmp GR16:$src1, i16immSExt8:$src2)]>, OpSize;
+def CMP16mi8 : Ii8<0x83, MRM7m,
+ (ops i16mem:$src1, i16i8imm:$src2),
+ "cmp{w} {$src2, $src1|$src1, $src2}",
+ [(X86cmp (loadi16 addr:$src1), i16immSExt8:$src2)]>, OpSize;
+def CMP32mi8 : Ii8<0x83, MRM7m,
+ (ops i32mem:$src1, i32i8imm:$src2),
+ "cmp{l} {$src2, $src1|$src1, $src2}",
+ [(X86cmp (loadi32 addr:$src1), i32immSExt8:$src2)]>;
+def CMP32ri8 : Ii8<0x83, MRM7r,
+ (ops GR32:$src1, i32i8imm:$src2),
+ "cmp{l} {$src2, $src1|$src1, $src2}",
+ [(X86cmp GR32:$src1, i32immSExt8:$src2)]>;
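+
+// Encoding note (for reference): the CMP*ri8 / CMP*mi8 forms above use the
+// 0x83 group opcode, which sign-extends an 8-bit immediate to 16 or 32 bits.
+// When the immediate fits in a signed byte this saves 1 (16-bit) or 3
+// (32-bit) bytes of immediate over the full-width 0x81 forms; the
+// i16immSExt8 / i32immSExt8 predicates restrict these patterns to immediates
+// where the short encoding is legal.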
+
+// Sign/Zero extenders
+def MOVSX16rr8 : I<0xBE, MRMSrcReg, (ops GR16:$dst, GR8 :$src),
+ "movs{bw|x} {$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (sext GR8:$src))]>, TB, OpSize;
+def MOVSX16rm8 : I<0xBE, MRMSrcMem, (ops GR16:$dst, i8mem :$src),
+ "movs{bw|x} {$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (sextloadi16i8 addr:$src))]>, TB, OpSize;
+def MOVSX32rr8 : I<0xBE, MRMSrcReg, (ops GR32:$dst, GR8 :$src),
+ "movs{bl|x} {$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (sext GR8:$src))]>, TB;
+def MOVSX32rm8 : I<0xBE, MRMSrcMem, (ops GR32:$dst, i8mem :$src),
+ "movs{bl|x} {$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (sextloadi32i8 addr:$src))]>, TB;
+def MOVSX32rr16: I<0xBF, MRMSrcReg, (ops GR32:$dst, GR16:$src),
+ "movs{wl|x} {$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (sext GR16:$src))]>, TB;
+def MOVSX32rm16: I<0xBF, MRMSrcMem, (ops GR32:$dst, i16mem:$src),
+ "movs{wl|x} {$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (sextloadi32i16 addr:$src))]>, TB;
+
+def MOVZX16rr8 : I<0xB6, MRMSrcReg, (ops GR16:$dst, GR8 :$src),
+ "movz{bw|x} {$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (zext GR8:$src))]>, TB, OpSize;
+def MOVZX16rm8 : I<0xB6, MRMSrcMem, (ops GR16:$dst, i8mem :$src),
+ "movz{bw|x} {$src, $dst|$dst, $src}",
+ [(set GR16:$dst, (zextloadi16i8 addr:$src))]>, TB, OpSize;
+def MOVZX32rr8 : I<0xB6, MRMSrcReg, (ops GR32:$dst, GR8 :$src),
+ "movz{bl|x} {$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (zext GR8:$src))]>, TB;
+def MOVZX32rm8 : I<0xB6, MRMSrcMem, (ops GR32:$dst, i8mem :$src),
+ "movz{bl|x} {$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (zextloadi32i8 addr:$src))]>, TB;
+def MOVZX32rr16: I<0xB7, MRMSrcReg, (ops GR32:$dst, GR16:$src),
+ "movz{wl|x} {$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (zext GR16:$src))]>, TB;
+def MOVZX32rm16: I<0xB7, MRMSrcMem, (ops GR32:$dst, i16mem:$src),
+ "movz{wl|x} {$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (zextloadi32i16 addr:$src))]>, TB;
+
+def CBW : I<0x98, RawFrm, (ops),
+ "{cbtw|cbw}", []>, Imp<[AL],[AX]>, OpSize; // AX = signext(AL)
+def CWDE : I<0x98, RawFrm, (ops),
+ "{cwtl|cwde}", []>, Imp<[AX],[EAX]>; // EAX = signext(AX)
+
+def CWD : I<0x99, RawFrm, (ops),
+ "{cwtd|cwd}", []>, Imp<[AX],[AX,DX]>, OpSize; // DX:AX = signext(AX)
+def CDQ : I<0x99, RawFrm, (ops),
+ "{cltd|cdq}", []>, Imp<[EAX],[EAX,EDX]>; // EDX:EAX = signext(EAX)
+
+
+//===----------------------------------------------------------------------===//
+// Alias Instructions
+//===----------------------------------------------------------------------===//
+
+// Alias instructions that map movr0 to xor.
+// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
+def MOV8r0 : I<0x30, MRMInitReg, (ops GR8 :$dst),
+ "xor{b} $dst, $dst",
+ [(set GR8:$dst, 0)]>;
+def MOV16r0 : I<0x31, MRMInitReg, (ops GR16:$dst),
+ "xor{w} $dst, $dst",
+ [(set GR16:$dst, 0)]>, OpSize;
+def MOV32r0 : I<0x31, MRMInitReg, (ops GR32:$dst),
+ "xor{l} $dst, $dst",
+ [(set GR32:$dst, 0)]>;
+
+// Basic operations on the GR16 / GR32 subclasses GR16_ and GR32_, which contain
+// only those registers that have GR8 sub-registers (i.e. AX - DX, EAX - EDX).
+def MOV16to16_ : I<0x89, MRMDestReg, (ops GR16_:$dst, GR16:$src),
+ "mov{w} {$src, $dst|$dst, $src}", []>, OpSize;
+def MOV32to32_ : I<0x89, MRMDestReg, (ops GR32_:$dst, GR32:$src),
+ "mov{l} {$src, $dst|$dst, $src}", []>;
+
+def MOV16_rr : I<0x89, MRMDestReg, (ops GR16_:$dst, GR16_:$src),
+ "mov{w} {$src, $dst|$dst, $src}", []>, OpSize;
+def MOV32_rr : I<0x89, MRMDestReg, (ops GR32_:$dst, GR32_:$src),
+ "mov{l} {$src, $dst|$dst, $src}", []>;
+def MOV16_rm : I<0x8B, MRMSrcMem, (ops GR16_:$dst, i16mem:$src),
+ "mov{w} {$src, $dst|$dst, $src}", []>, OpSize;
+def MOV32_rm : I<0x8B, MRMSrcMem, (ops GR32_:$dst, i32mem:$src),
+ "mov{l} {$src, $dst|$dst, $src}", []>;
+def MOV16_mr : I<0x89, MRMDestMem, (ops i16mem:$dst, GR16_:$src),
+ "mov{w} {$src, $dst|$dst, $src}", []>, OpSize;
+def MOV32_mr : I<0x89, MRMDestMem, (ops i32mem:$dst, GR32_:$src),
+ "mov{l} {$src, $dst|$dst, $src}", []>;
+
+//===----------------------------------------------------------------------===//
+// Thread Local Storage Instructions
+//
+
+def TLS_addr : I<0, Pseudo, (ops GR32:$dst, i32imm:$sym),
+ "leal ${sym:mem}(,%ebx,1), $dst",
+ [(set GR32:$dst, (X86tlsaddr tglobaltlsaddr:$sym))]>,
+ Imp<[EBX],[]>;
+
+let AddedComplexity = 10 in
+def TLS_gs_rr : I<0, Pseudo, (ops GR32:$dst, GR32:$src),
+ "movl %gs:($src), $dst",
+ [(set GR32:$dst, (load (add X86TLStp, GR32:$src)))]>;
+
+let AddedComplexity = 15 in
+def TLS_gs_ri : I<0, Pseudo, (ops GR32:$dst, i32imm:$src),
+ "movl %gs:${src:mem}, $dst",
+ [(set GR32:$dst,
+ (load (add X86TLStp, (X86Wrapper tglobaltlsaddr:$src))))]>;
+
+def TLS_tp : I<0, Pseudo, (ops GR32:$dst),
+ "movl %gs:0, $dst",
+ [(set GR32:$dst, X86TLStp)]>;
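+
+// For reference (a sketch of the intended use, not additional definitions):
+// on x86 ELF targets the thread pointer lives at %gs:0, so TLS_tp reads it
+// directly, TLS_gs_rr / TLS_gs_ri fold an offset from the thread pointer into
+// a single %gs-relative load, and TLS_addr materializes the operand for the
+// TLS-model lowering that expects the GOT pointer in %ebx (hence the implicit
+// EBX use above).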
+
+//===----------------------------------------------------------------------===//
+// DWARF Pseudo Instructions
+//
+
+def DWARF_LOC : I<0, Pseudo, (ops i32imm:$line, i32imm:$col, i32imm:$file),
+ "; .loc $file, $line, $col",
+ [(dwarf_loc (i32 imm:$line), (i32 imm:$col),
+ (i32 imm:$file))]>;
+
+//===----------------------------------------------------------------------===//
+// EH Pseudo Instructions
+//
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+ hasCtrlDep = 1, noResults = 1 in {
+def EH_RETURN : I<0xC3, RawFrm, (ops GR32:$addr),
+ "ret #eh_return, addr: $addr",
+ [(X86ehret GR32:$addr)]>;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// ConstantPool, GlobalAddress, ExternalSymbol, and JumpTable
+def : Pat<(i32 (X86Wrapper tconstpool :$dst)), (MOV32ri tconstpool :$dst)>;
+def : Pat<(i32 (X86Wrapper tjumptable :$dst)), (MOV32ri tjumptable :$dst)>;
+def : Pat<(i32 (X86Wrapper tglobaltlsaddr:$dst)), (MOV32ri tglobaltlsaddr:$dst)>;
+def : Pat<(i32 (X86Wrapper tglobaladdr :$dst)), (MOV32ri tglobaladdr :$dst)>;
+def : Pat<(i32 (X86Wrapper texternalsym:$dst)), (MOV32ri texternalsym:$dst)>;
+
+def : Pat<(add GR32:$src1, (X86Wrapper tconstpool:$src2)),
+ (ADD32ri GR32:$src1, tconstpool:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper tjumptable:$src2)),
+ (ADD32ri GR32:$src1, tjumptable:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper tglobaladdr :$src2)),
+ (ADD32ri GR32:$src1, tglobaladdr:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper texternalsym:$src2)),
+ (ADD32ri GR32:$src1, texternalsym:$src2)>;
+
+def : Pat<(store (i32 (X86Wrapper tglobaladdr:$src)), addr:$dst),
+ (MOV32mi addr:$dst, tglobaladdr:$src)>;
+def : Pat<(store (i32 (X86Wrapper texternalsym:$src)), addr:$dst),
+ (MOV32mi addr:$dst, texternalsym:$src)>;
+
+// Calls
+def : Pat<(X86tailcall GR32:$dst),
+ (CALL32r GR32:$dst)>;
+
+def : Pat<(X86tailcall (i32 tglobaladdr:$dst)),
+ (CALLpcrel32 tglobaladdr:$dst)>;
+def : Pat<(X86tailcall (i32 texternalsym:$dst)),
+ (CALLpcrel32 texternalsym:$dst)>;
+
+def : Pat<(X86call (i32 tglobaladdr:$dst)),
+ (CALLpcrel32 tglobaladdr:$dst)>;
+def : Pat<(X86call (i32 texternalsym:$dst)),
+ (CALLpcrel32 texternalsym:$dst)>;
+
+// X86-specific add/sub nodes (addc/subc) that also produce a flag.
+def : Pat<(addc GR32:$src1, GR32:$src2),
+ (ADD32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(addc GR32:$src1, (load addr:$src2)),
+ (ADD32rm GR32:$src1, addr:$src2)>;
+def : Pat<(addc GR32:$src1, imm:$src2),
+ (ADD32ri GR32:$src1, imm:$src2)>;
+def : Pat<(addc GR32:$src1, i32immSExt8:$src2),
+ (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+def : Pat<(subc GR32:$src1, GR32:$src2),
+ (SUB32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(subc GR32:$src1, (load addr:$src2)),
+ (SUB32rm GR32:$src1, addr:$src2)>;
+def : Pat<(subc GR32:$src1, imm:$src2),
+ (SUB32ri GR32:$src1, imm:$src2)>;
+def : Pat<(subc GR32:$src1, i32immSExt8:$src2),
+ (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+def : Pat<(truncstorei1 (i8 imm:$src), addr:$dst),
+ (MOV8mi addr:$dst, imm:$src)>;
+def : Pat<(truncstorei1 GR8:$src, addr:$dst),
+ (MOV8mr addr:$dst, GR8:$src)>;
+
+// Comparisons.
+
+// TEST R,R is smaller than CMP R,0
+def : Pat<(X86cmp GR8:$src1, 0),
+ (TEST8rr GR8:$src1, GR8:$src1)>;
+def : Pat<(X86cmp GR16:$src1, 0),
+ (TEST16rr GR16:$src1, GR16:$src1)>;
+def : Pat<(X86cmp GR32:$src1, 0),
+ (TEST32rr GR32:$src1, GR32:$src1)>;
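+
+// Why TEST is preferred here: "test r, r" encodes as opcode + ModRM (two
+// bytes for the 32-bit form), while "cmp r, 0" needs at least one extra
+// immediate byte, and both set ZF/SF the same way for a compare against zero.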
+
+// {s|z}extload bool -> {s|z}extload byte
+def : Pat<(sextloadi16i1 addr:$src), (MOVSX16rm8 addr:$src)>;
+def : Pat<(sextloadi32i1 addr:$src), (MOVSX32rm8 addr:$src)>;
+def : Pat<(zextloadi8i1 addr:$src), (MOV8rm addr:$src)>;
+def : Pat<(zextloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>;
+def : Pat<(zextloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
+
+// extload bool -> extload byte
+def : Pat<(extloadi8i1 addr:$src), (MOV8rm addr:$src)>;
+def : Pat<(extloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>;
+def : Pat<(extloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
+def : Pat<(extloadi16i8 addr:$src), (MOVZX16rm8 addr:$src)>;
+def : Pat<(extloadi32i8 addr:$src), (MOVZX32rm8 addr:$src)>;
+def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>;
+
+// anyext -> zext
+def : Pat<(i16 (anyext GR8 :$src)), (MOVZX16rr8 GR8 :$src)>;
+def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8 GR8 :$src)>;
+def : Pat<(i32 (anyext GR16:$src)), (MOVZX32rr16 GR16:$src)>;
+def : Pat<(i16 (anyext (loadi8 addr:$src))), (MOVZX16rm8 addr:$src)>;
+def : Pat<(i32 (anyext (loadi8 addr:$src))), (MOVZX32rm8 addr:$src)>;
+def : Pat<(i32 (anyext (loadi16 addr:$src))), (MOVZX32rm16 addr:$src)>;
+
+//===----------------------------------------------------------------------===//
+// Some peepholes
+//===----------------------------------------------------------------------===//
+
+// (shl x, 1) ==> (add x, x)
+def : Pat<(shl GR8 :$src1, (i8 1)), (ADD8rr GR8 :$src1, GR8 :$src1)>;
+def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>;
+def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>;
+
+// (or (x >> c) | (y << (32 - c))) ==> (shrd32 x, y, c)
+def : Pat<(or (srl GR32:$src1, CL:$amt),
+ (shl GR32:$src2, (sub 32, CL:$amt))),
+ (SHRD32rrCL GR32:$src1, GR32:$src2)>;
+
+def : Pat<(store (or (srl (loadi32 addr:$dst), CL:$amt),
+ (shl GR32:$src2, (sub 32, CL:$amt))), addr:$dst),
+ (SHRD32mrCL addr:$dst, GR32:$src2)>;
+
+// (or (x << c) | (y >> (32 - c))) ==> (shld32 x, y, c)
+def : Pat<(or (shl GR32:$src1, CL:$amt),
+ (srl GR32:$src2, (sub 32, CL:$amt))),
+ (SHLD32rrCL GR32:$src1, GR32:$src2)>;
+
+def : Pat<(store (or (shl (loadi32 addr:$dst), CL:$amt),
+ (srl GR32:$src2, (sub 32, CL:$amt))), addr:$dst),
+ (SHLD32mrCL addr:$dst, GR32:$src2)>;
+
+// (or (x >> c) | (y << (16 - c))) ==> (shrd16 x, y, c)
+def : Pat<(or (srl GR16:$src1, CL:$amt),
+ (shl GR16:$src2, (sub 16, CL:$amt))),
+ (SHRD16rrCL GR16:$src1, GR16:$src2)>;
+
+def : Pat<(store (or (srl (loadi16 addr:$dst), CL:$amt),
+ (shl GR16:$src2, (sub 16, CL:$amt))), addr:$dst),
+ (SHRD16mrCL addr:$dst, GR16:$src2)>;
+
+// (or (x << c) | (y >> (16 - c))) ==> (shld16 x, y, c)
+def : Pat<(or (shl GR16:$src1, CL:$amt),
+ (srl GR16:$src2, (sub 16, CL:$amt))),
+ (SHLD16rrCL GR16:$src1, GR16:$src2)>;
+
+def : Pat<(store (or (shl (loadi16 addr:$dst), CL:$amt),
+ (srl GR16:$src2, (sub 16, CL:$amt))), addr:$dst),
+ (SHLD16mrCL addr:$dst, GR16:$src2)>;
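+
+// All four pattern pairs above match the usual double-shift idiom, e.g.
+//   r = (x >> c) | (y << (32 - c))   // 32-bit case, with 0 < c < 32
+// which maps onto a single shrd/shld with the count in CL; the 16-bit
+// patterns are the same idiom with 16 in place of 32.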
+
+
+//===----------------------------------------------------------------------===//
+// Floating Point Stack Support
+//===----------------------------------------------------------------------===//
+
+include "X86InstrFPStack.td"
+
+//===----------------------------------------------------------------------===//
+// MMX and XMM Packed Integer support (requires MMX, SSE, and SSE2)
+//===----------------------------------------------------------------------===//
+
+include "X86InstrMMX.td"
+
+//===----------------------------------------------------------------------===//
+// XMM Floating point support (requires SSE / SSE2)
+//===----------------------------------------------------------------------===//
+
+include "X86InstrSSE.td"
+
+//===----------------------------------------------------------------------===//
+// X86-64 Support
+//===----------------------------------------------------------------------===//
+
+include "X86InstrX86-64.td"
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
new file mode 100644
index 0000000..c774460
--- /dev/null
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -0,0 +1,645 @@
+//====- X86InstrMMX.td - Describe the X86 MMX Instruction Set -*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by Evan Cheng and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 MMX instruction set, defining the instructions,
+// and properties of the instructions which are needed for code generation,
+// machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction templates
+//===----------------------------------------------------------------------===//
+
+// MMXI   - MMX instructions with TB prefix.
+// MMXRI  - MMX instructions with TB prefix and REX.W.
+// MMX2I  - MMX / SSE2 instructions with TB and OpSize prefixes.
+// MMXIi8 - MMX instructions with ImmT == Imm8 and TB prefix.
+// MMXID  - MMX instructions with XD prefix.
+// MMXIS  - MMX instructions with XS prefix.
+class MMXI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+ : I<o, F, ops, asm, pattern>, TB, Requires<[HasMMX]>;
+class MMXRI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+ : I<o, F, ops, asm, pattern>, TB, REX_W, Requires<[HasMMX]>;
+class MMX2I<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+ : I<o, F, ops, asm, pattern>, TB, OpSize, Requires<[HasMMX]>;
+class MMXIi8<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+ : Ii8<o, F, ops, asm, pattern>, TB, Requires<[HasMMX]>;
+class MMXID<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+ : Ii8<o, F, ops, asm, pattern>, XD, Requires<[HasMMX]>;
+class MMXIS<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+ : Ii8<o, F, ops, asm, pattern>, XS, Requires<[HasMMX]>;
+
+// Some 'special' instructions
+def IMPLICIT_DEF_VR64 : I<0, Pseudo, (ops VR64:$dst),
+ "#IMPLICIT_DEF $dst",
+ [(set VR64:$dst, (v8i8 (undef)))]>,
+ Requires<[HasMMX]>;
+
+// 64-bit vector undefs.
+def : Pat<(v8i8 (undef)), (IMPLICIT_DEF_VR64)>;
+def : Pat<(v4i16 (undef)), (IMPLICIT_DEF_VR64)>;
+def : Pat<(v2i32 (undef)), (IMPLICIT_DEF_VR64)>;
+def : Pat<(v1i64 (undef)), (IMPLICIT_DEF_VR64)>;
+
+//===----------------------------------------------------------------------===//
+// MMX Pattern Fragments
+//===----------------------------------------------------------------------===//
+
+def load_mmx : PatFrag<(ops node:$ptr), (v1i64 (load node:$ptr))>;
+
+def bc_v8i8 : PatFrag<(ops node:$in), (v8i8 (bitconvert node:$in))>;
+def bc_v4i16 : PatFrag<(ops node:$in), (v4i16 (bitconvert node:$in))>;
+def bc_v2i32 : PatFrag<(ops node:$in), (v2i32 (bitconvert node:$in))>;
+def bc_v1i64 : PatFrag<(ops node:$in), (v1i64 (bitconvert node:$in))>;
+
+//===----------------------------------------------------------------------===//
+// MMX Masks
+//===----------------------------------------------------------------------===//
+
+// MMX_SHUFFLE_get_shuf_imm xform function: convert vector_shuffle mask to
+// PSHUFW imm.
+def MMX_SHUFFLE_get_shuf_imm : SDNodeXForm<build_vector, [{
+ return getI8Imm(X86::getShuffleSHUFImmediate(N));
+}]>;
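+
+// For reference: the PSHUFW immediate packs four 2-bit source indices,
+// destination word 0 in bits 1:0 up through word 3 in bits 7:6.  A reversal
+// mask <3, 2, 1, 0>, for example, becomes the immediate 0x1B (0b00011011).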
+
+// Patterns for: vector_shuffle v1, v2, <2, 6, 3, 7, ...>
+def MMX_UNPCKH_shuffle_mask : PatLeaf<(build_vector), [{
+ return X86::isUNPCKHMask(N);
+}]>;
+
+// Patterns for: vector_shuffle v1, v2, <0, 4, 2, 5, ...>
+def MMX_UNPCKL_shuffle_mask : PatLeaf<(build_vector), [{
+ return X86::isUNPCKLMask(N);
+}]>;
+
+// Patterns for: vector_shuffle v1, <undef>, <0, 0, 1, 1, ...>
+def MMX_UNPCKH_v_undef_shuffle_mask : PatLeaf<(build_vector), [{
+ return X86::isUNPCKH_v_undef_Mask(N);
+}]>;
+
+// Patterns for: vector_shuffle v1, <undef>, <2, 2, 3, 3, ...>
+def MMX_UNPCKL_v_undef_shuffle_mask : PatLeaf<(build_vector), [{
+ return X86::isUNPCKL_v_undef_Mask(N);
+}]>;
+
+// Patterns for shuffling.
+def MMX_PSHUFW_shuffle_mask : PatLeaf<(build_vector), [{
+ return X86::isPSHUFDMask(N);
+}], MMX_SHUFFLE_get_shuf_imm>;
+
+// Patterns for: vector_shuffle v1, v2, <4, 5, 2, 3>; etc.
+def MMX_MOVL_shuffle_mask : PatLeaf<(build_vector), [{
+ return X86::isMOVLMask(N);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// MMX Multiclasses
+//===----------------------------------------------------------------------===//
+
+let isTwoAddress = 1 in {
+ // MMXI_binop_rm - Simple MMX binary operator.
+ multiclass MMXI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, bit Commutable = 0> {
+ def rr : MMXI<opc, MRMSrcReg, (ops VR64:$dst, VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (OpVT (OpNode VR64:$src1, VR64:$src2)))]> {
+ let isCommutable = Commutable;
+ }
+ def rm : MMXI<opc, MRMSrcMem, (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (OpVT (OpNode VR64:$src1,
+ (bitconvert
+ (load_mmx addr:$src2)))))]>;
+ }
+
+ multiclass MMXI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+ bit Commutable = 0> {
+ def rr : MMXI<opc, MRMSrcReg, (ops VR64:$dst, VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]> {
+ let isCommutable = Commutable;
+ }
+ def rm : MMXI<opc, MRMSrcMem, (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId VR64:$src1,
+ (bitconvert (load_mmx addr:$src2))))]>;
+ }
+
+ // MMXI_binop_rm_v1i64 - Simple MMX binary operator whose type is v1i64.
+ //
+ // FIXME: we could eliminate this and use MMXI_binop_rm instead if tblgen knew
+ // to collapse (bitconvert VT to VT) into its operand.
+ //
+ multiclass MMXI_binop_rm_v1i64<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ bit Commutable = 0> {
+ def rr : MMXI<opc, MRMSrcReg, (ops VR64:$dst, VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (v1i64 (OpNode VR64:$src1, VR64:$src2)))]> {
+ let isCommutable = Commutable;
+ }
+ def rm : MMXI<opc, MRMSrcMem, (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst,
+ (OpNode VR64:$src1,(load_mmx addr:$src2)))]>;
+ }
+
+ multiclass MMXI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
+ string OpcodeStr, Intrinsic IntId> {
+ def rr : MMXI<opc, MRMSrcReg, (ops VR64:$dst, VR64:$src1, VR64:$src2),
+ !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]>;
+ def rm : MMXI<opc, MRMSrcMem, (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+ !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId VR64:$src1,
+ (bitconvert (load_mmx addr:$src2))))]>;
+ def ri : MMXIi8<opc2, ImmForm, (ops VR64:$dst, VR64:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+ [(set VR64:$dst, (IntId VR64:$src1,
+ (scalar_to_vector (i32 imm:$src2))))]>;
+ }
+}
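+
+// Illustrative expansion (a sketch, not additional definitions): a later use
+// such as
+//   defm MMX_PADDB : MMXI_binop_rm<0xFC, "paddb", add, v8i8, 1>;
+// concatenates the defm name with the per-def suffixes, yielding a
+// register-register MMX_PADDBrr and a register-memory MMX_PADDBrm, with the
+// rr form marked commutable because Commutable is 1.  The _int and _v1i64
+// variants expand the same way, differing only in how their patterns are
+// expressed.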
+
+//===----------------------------------------------------------------------===//
+// MMX EMMS & FEMMS Instructions
+//===----------------------------------------------------------------------===//
+
+def MMX_EMMS : MMXI<0x77, RawFrm, (ops), "emms", [(int_x86_mmx_emms)]>;
+def MMX_FEMMS : MMXI<0x0E, RawFrm, (ops), "femms", [(int_x86_mmx_femms)]>;
+
+//===----------------------------------------------------------------------===//
+// MMX Scalar Instructions
+//===----------------------------------------------------------------------===//
+
+// Data Transfer Instructions
+def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (ops VR64:$dst, GR32:$src),
+ "movd {$src, $dst|$dst, $src}", []>;
+def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (ops VR64:$dst, i32mem:$src),
+ "movd {$src, $dst|$dst, $src}", []>;
+def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (ops i32mem:$dst, VR64:$src),
+ "movd {$src, $dst|$dst, $src}", []>;
+
+def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (ops VR64:$dst, GR64:$src),
+ "movd {$src, $dst|$dst, $src}", []>;
+
+def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (ops VR64:$dst, VR64:$src),
+ "movq {$src, $dst|$dst, $src}", []>;
+def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (ops VR64:$dst, i64mem:$src),
+ "movq {$src, $dst|$dst, $src}",
+ [(set VR64:$dst, (load_mmx addr:$src))]>;
+def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (ops i64mem:$dst, VR64:$src),
+ "movq {$src, $dst|$dst, $src}",
+ [(store (v1i64 VR64:$src), addr:$dst)]>;
+
+def MMX_MOVDQ2Qrr : MMXID<0xD6, MRMDestMem, (ops VR64:$dst, VR128:$src),
+ "movdq2q {$src, $dst|$dst, $src}",
+ [(set VR64:$dst,
+ (v1i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))]>;
+
+def MMX_MOVQ2DQrr : MMXIS<0xD6, MRMDestMem, (ops VR128:$dst, VR64:$src),
+ "movq2dq {$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (bitconvert (v1i64 VR64:$src)))]>;
+
+def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (ops i64mem:$dst, VR64:$src),
+ "movntq {$src, $dst|$dst, $src}",
+ [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)]>;
+
+let AddedComplexity = 15 in
+// movd to MMX register zero-extends
+def MMX_MOVZDI2PDIrr : MMX2I<0x6E, MRMSrcReg, (ops VR64:$dst, GR32:$src),
+ "movd {$src, $dst|$dst, $src}",
+ [(set VR64:$dst,
+ (v2i32 (vector_shuffle immAllZerosV,
+ (v2i32 (scalar_to_vector GR32:$src)),
+ MMX_MOVL_shuffle_mask)))]>;
+let AddedComplexity = 20 in
+def MMX_MOVZDI2PDIrm : MMX2I<0x6E, MRMSrcMem, (ops VR64:$dst, i32mem:$src),
+ "movd {$src, $dst|$dst, $src}",
+ [(set VR64:$dst,
+ (v2i32 (vector_shuffle immAllZerosV,
+ (v2i32 (scalar_to_vector
+ (loadi32 addr:$src))),
+ MMX_MOVL_shuffle_mask)))]>;
+
+// Arithmetic Instructions
+
+// -- Addition
+defm MMX_PADDB : MMXI_binop_rm<0xFC, "paddb", add, v8i8, 1>;
+defm MMX_PADDW : MMXI_binop_rm<0xFD, "paddw", add, v4i16, 1>;
+defm MMX_PADDD : MMXI_binop_rm<0xFE, "paddd", add, v2i32, 1>;
+defm MMX_PADDQ : MMXI_binop_rm<0xD4, "paddq", add, v1i64, 1>;
+
+defm MMX_PADDSB : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b, 1>;
+defm MMX_PADDSW : MMXI_binop_rm_int<0xED, "paddsw" , int_x86_mmx_padds_w, 1>;
+
+defm MMX_PADDUSB : MMXI_binop_rm_int<0xDC, "paddusb", int_x86_mmx_paddus_b, 1>;
+defm MMX_PADDUSW : MMXI_binop_rm_int<0xDD, "paddusw", int_x86_mmx_paddus_w, 1>;
+
+// -- Subtraction
+defm MMX_PSUBB : MMXI_binop_rm<0xF8, "psubb", sub, v8i8>;
+defm MMX_PSUBW : MMXI_binop_rm<0xF9, "psubw", sub, v4i16>;
+defm MMX_PSUBD : MMXI_binop_rm<0xFA, "psubd", sub, v2i32>;
+defm MMX_PSUBQ : MMXI_binop_rm<0xFB, "psubq", sub, v1i64>;
+
+defm MMX_PSUBSB : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b>;
+defm MMX_PSUBSW : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w>;
+
+defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b>;
+defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w>;
+
+// -- Multiplication
+defm MMX_PMULLW : MMXI_binop_rm<0xD5, "pmullw", mul, v4i16, 1>;
+
+defm MMX_PMULHW : MMXI_binop_rm_int<0xE5, "pmulhw", int_x86_mmx_pmulh_w, 1>;
+defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w, 1>;
+defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq, 1>;
+
+// -- Miscellanea
+defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd, 1>;
+
+defm MMX_PAVGB : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b, 1>;
+defm MMX_PAVGW : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w, 1>;
+
+defm MMX_PMINUB : MMXI_binop_rm_int<0xDA, "pminub", int_x86_mmx_pminu_b, 1>;
+defm MMX_PMINSW : MMXI_binop_rm_int<0xEA, "pminsw", int_x86_mmx_pmins_w, 1>;
+
+defm MMX_PMAXUB : MMXI_binop_rm_int<0xDE, "pmaxub", int_x86_mmx_pmaxu_b, 1>;
+defm MMX_PMAXSW : MMXI_binop_rm_int<0xEE, "pmaxsw", int_x86_mmx_pmaxs_w, 1>;
+
+defm MMX_PSADBW : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw, 1>;
+
+// Logical Instructions
+defm MMX_PAND : MMXI_binop_rm_v1i64<0xDB, "pand", and, 1>;
+defm MMX_POR : MMXI_binop_rm_v1i64<0xEB, "por" , or, 1>;
+defm MMX_PXOR : MMXI_binop_rm_v1i64<0xEF, "pxor", xor, 1>;
+
+let isTwoAddress = 1 in {
+ def MMX_PANDNrr : MMXI<0xDF, MRMSrcReg,
+ (ops VR64:$dst, VR64:$src1, VR64:$src2),
+ "pandn {$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst, (v1i64 (and (vnot VR64:$src1),
+ VR64:$src2)))]>;
+ def MMX_PANDNrm : MMXI<0xDF, MRMSrcMem,
+ (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+ "pandn {$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst, (v1i64 (and (vnot VR64:$src1),
+ (load addr:$src2))))]>;
+}
+
+// Shift Instructions
+defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw",
+ int_x86_mmx_psrl_w>;
+defm MMX_PSRLD : MMXI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld",
+ int_x86_mmx_psrl_d>;
+defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq",
+ int_x86_mmx_psrl_q>;
+
+defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw",
+ int_x86_mmx_psll_w>;
+defm MMX_PSLLD : MMXI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld",
+ int_x86_mmx_psll_d>;
+defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq",
+ int_x86_mmx_psll_q>;
+
+defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
+ int_x86_mmx_psra_w>;
+defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
+ int_x86_mmx_psra_d>;
+
+// Comparison Instructions
+defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b>;
+defm MMX_PCMPEQW : MMXI_binop_rm_int<0x75, "pcmpeqw", int_x86_mmx_pcmpeq_w>;
+defm MMX_PCMPEQD : MMXI_binop_rm_int<0x76, "pcmpeqd", int_x86_mmx_pcmpeq_d>;
+
+defm MMX_PCMPGTB : MMXI_binop_rm_int<0x64, "pcmpgtb", int_x86_mmx_pcmpgt_b>;
+defm MMX_PCMPGTW : MMXI_binop_rm_int<0x65, "pcmpgtw", int_x86_mmx_pcmpgt_w>;
+defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d>;
+
+// Conversion Instructions
+
+// -- Unpack Instructions
+let isTwoAddress = 1 in {
+ // Unpack High Packed Data Instructions
+ def MMX_PUNPCKHBWrr : MMXI<0x68, MRMSrcReg,
+ (ops VR64:$dst, VR64:$src1, VR64:$src2),
+ "punpckhbw {$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v8i8 (vector_shuffle VR64:$src1, VR64:$src2,
+ MMX_UNPCKH_shuffle_mask)))]>;
+ def MMX_PUNPCKHBWrm : MMXI<0x68, MRMSrcMem,
+ (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+ "punpckhbw {$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v8i8 (vector_shuffle VR64:$src1,
+ (bc_v8i8 (load_mmx addr:$src2)),
+ MMX_UNPCKH_shuffle_mask)))]>;
+
+ def MMX_PUNPCKHWDrr : MMXI<0x69, MRMSrcReg,
+ (ops VR64:$dst, VR64:$src1, VR64:$src2),
+ "punpckhwd {$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v4i16 (vector_shuffle VR64:$src1, VR64:$src2,
+ MMX_UNPCKH_shuffle_mask)))]>;
+ def MMX_PUNPCKHWDrm : MMXI<0x69, MRMSrcMem,
+ (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+ "punpckhwd {$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v4i16 (vector_shuffle VR64:$src1,
+ (bc_v4i16 (load_mmx addr:$src2)),
+ MMX_UNPCKH_shuffle_mask)))]>;
+
+ def MMX_PUNPCKHDQrr : MMXI<0x6A, MRMSrcReg,
+ (ops VR64:$dst, VR64:$src1, VR64:$src2),
+ "punpckhdq {$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v2i32 (vector_shuffle VR64:$src1, VR64:$src2,
+ MMX_UNPCKH_shuffle_mask)))]>;
+ def MMX_PUNPCKHDQrm : MMXI<0x6A, MRMSrcMem,
+ (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+ "punpckhdq {$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v2i32 (vector_shuffle VR64:$src1,
+ (bc_v2i32 (load_mmx addr:$src2)),
+ MMX_UNPCKH_shuffle_mask)))]>;
+
+ // Unpack Low Packed Data Instructions
+ def MMX_PUNPCKLBWrr : MMXI<0x60, MRMSrcReg,
+ (ops VR64:$dst, VR64:$src1, VR64:$src2),
+ "punpcklbw {$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v8i8 (vector_shuffle VR64:$src1, VR64:$src2,
+ MMX_UNPCKL_shuffle_mask)))]>;
+ def MMX_PUNPCKLBWrm : MMXI<0x60, MRMSrcMem,
+ (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+ "punpcklbw {$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v8i8 (vector_shuffle VR64:$src1,
+ (bc_v8i8 (load_mmx addr:$src2)),
+ MMX_UNPCKL_shuffle_mask)))]>;
+
+ def MMX_PUNPCKLWDrr : MMXI<0x61, MRMSrcReg,
+ (ops VR64:$dst, VR64:$src1, VR64:$src2),
+ "punpcklwd {$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v4i16 (vector_shuffle VR64:$src1, VR64:$src2,
+ MMX_UNPCKL_shuffle_mask)))]>;
+ def MMX_PUNPCKLWDrm : MMXI<0x61, MRMSrcMem,
+ (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+ "punpcklwd {$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v4i16 (vector_shuffle VR64:$src1,
+ (bc_v4i16 (load_mmx addr:$src2)),
+ MMX_UNPCKL_shuffle_mask)))]>;
+
+ def MMX_PUNPCKLDQrr : MMXI<0x62, MRMSrcReg,
+ (ops VR64:$dst, VR64:$src1, VR64:$src2),
+ "punpckldq {$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v2i32 (vector_shuffle VR64:$src1, VR64:$src2,
+ MMX_UNPCKL_shuffle_mask)))]>;
+ def MMX_PUNPCKLDQrm : MMXI<0x62, MRMSrcMem,
+ (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+ "punpckldq {$src2, $dst|$dst, $src2}",
+ [(set VR64:$dst,
+ (v2i32 (vector_shuffle VR64:$src1,
+ (bc_v2i32 (load_mmx addr:$src2)),
+ MMX_UNPCKL_shuffle_mask)))]>;
+}
+
+// -- Pack Instructions
+defm MMX_PACKSSWB : MMXI_binop_rm_int<0x63, "packsswb", int_x86_mmx_packsswb>;
+defm MMX_PACKSSDW : MMXI_binop_rm_int<0x6B, "packssdw", int_x86_mmx_packssdw>;
+defm MMX_PACKUSWB : MMXI_binop_rm_int<0x67, "packuswb", int_x86_mmx_packuswb>;
+
+// -- Shuffle Instructions
+def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg,
+ (ops VR64:$dst, VR64:$src1, i8imm:$src2),
+ "pshufw {$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR64:$dst,
+ (v4i16 (vector_shuffle
+ VR64:$src1, (undef),
+ MMX_PSHUFW_shuffle_mask:$src2)))]>;
+def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem,
+ (ops VR64:$dst, i64mem:$src1, i8imm:$src2),
+ "pshufw {$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR64:$dst,
+ (v4i16 (vector_shuffle
+ (bc_v4i16 (load_mmx addr:$src1)),
+ (undef),
+ MMX_PSHUFW_shuffle_mask:$src2)))]>;
+
+// -- Conversion Instructions
+def MMX_CVTPD2PIrr : MMX2I<0x2D, MRMSrcReg, (ops VR64:$dst, VR128:$src),
+ "cvtpd2pi {$src, $dst|$dst, $src}", []>;
+def MMX_CVTPD2PIrm : MMX2I<0x2D, MRMSrcMem, (ops VR64:$dst, f128mem:$src),
+ "cvtpd2pi {$src, $dst|$dst, $src}", []>;
+
+def MMX_CVTPI2PDrr : MMX2I<0x2A, MRMSrcReg, (ops VR128:$dst, VR64:$src),
+ "cvtpi2pd {$src, $dst|$dst, $src}", []>;
+def MMX_CVTPI2PDrm : MMX2I<0x2A, MRMSrcMem, (ops VR128:$dst, i64mem:$src),
+ "cvtpi2pd {$src, $dst|$dst, $src}", []>;
+
+def MMX_CVTPI2PSrr : MMXI<0x2A, MRMSrcReg, (ops VR128:$dst, VR64:$src),
+ "cvtpi2ps {$src, $dst|$dst, $src}", []>;
+def MMX_CVTPI2PSrm : MMXI<0x2A, MRMSrcMem, (ops VR128:$dst, i64mem:$src),
+ "cvtpi2ps {$src, $dst|$dst, $src}", []>;
+
+def MMX_CVTPS2PIrr : MMXI<0x2D, MRMSrcReg, (ops VR64:$dst, VR128:$src),
+ "cvtps2pi {$src, $dst|$dst, $src}", []>;
+def MMX_CVTPS2PIrm : MMXI<0x2D, MRMSrcMem, (ops VR64:$dst, f64mem:$src),
+ "cvtps2pi {$src, $dst|$dst, $src}", []>;
+
+def MMX_CVTTPD2PIrr : MMX2I<0x2C, MRMSrcReg, (ops VR64:$dst, VR128:$src),
+ "cvttpd2pi {$src, $dst|$dst, $src}", []>;
+def MMX_CVTTPD2PIrm : MMX2I<0x2C, MRMSrcMem, (ops VR64:$dst, f128mem:$src),
+ "cvttpd2pi {$src, $dst|$dst, $src}", []>;
+
+def MMX_CVTTPS2PIrr : MMXI<0x2C, MRMSrcReg, (ops VR64:$dst, VR128:$src),
+ "cvttps2pi {$src, $dst|$dst, $src}", []>;
+def MMX_CVTTPS2PIrm : MMXI<0x2C, MRMSrcMem, (ops VR64:$dst, f64mem:$src),
+ "cvttps2pi {$src, $dst|$dst, $src}", []>;
+
+// Extract / Insert
+def MMX_X86pextrw : SDNode<"X86ISD::PEXTRW", SDTypeProfile<1, 2, []>, []>;
+def MMX_X86pinsrw : SDNode<"X86ISD::PINSRW", SDTypeProfile<1, 3, []>, []>;
+
+def MMX_PEXTRWri : MMXIi8<0xC5, MRMSrcReg,
+ (ops GR32:$dst, VR64:$src1, i16i8imm:$src2),
+ "pextrw {$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, (MMX_X86pextrw (v4i16 VR64:$src1),
+ (iPTR imm:$src2)))]>;
+let isTwoAddress = 1 in {
+ def MMX_PINSRWrri : MMXIi8<0xC4, MRMSrcReg,
+ (ops VR64:$dst, VR64:$src1, GR32:$src2, i16i8imm:$src3),
+ "pinsrw {$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR64:$dst, (v4i16 (MMX_X86pinsrw (v4i16 VR64:$src1),
+ GR32:$src2, (iPTR imm:$src3))))]>;
+ def MMX_PINSRWrmi : MMXIi8<0xC4, MRMSrcMem,
+ (ops VR64:$dst, VR64:$src1, i16mem:$src2, i16i8imm:$src3),
+ "pinsrw {$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR64:$dst,
+ (v4i16 (MMX_X86pinsrw (v4i16 VR64:$src1),
+ (i32 (anyext (loadi16 addr:$src2))),
+ (iPTR imm:$src3))))]>;
+}
+
+// Mask creation
+def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (ops GR32:$dst, VR64:$src),
+ "pmovmskb {$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_mmx_pmovmskb VR64:$src))]>;
+
+// Misc.
+def MMX_MASKMOVQ : MMXI<0xF7, MRMDestMem, (ops VR64:$src, VR64:$mask),
+ "maskmovq {$mask, $src|$src, $mask}",
+ [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)]>,
+ Imp<[EDI],[]>;
+
+//===----------------------------------------------------------------------===//
+// Alias Instructions
+//===----------------------------------------------------------------------===//
+
+// Alias instructions that map zero vector to pxor.
+// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
+let isReMaterializable = 1 in {
+ def MMX_V_SET0 : MMXI<0xEF, MRMInitReg, (ops VR64:$dst),
+ "pxor $dst, $dst",
+ [(set VR64:$dst, (v1i64 immAllZerosV))]>;
+ def MMX_V_SETALLONES : MMXI<0x76, MRMInitReg, (ops VR64:$dst),
+ "pcmpeqd $dst, $dst",
+ [(set VR64:$dst, (v1i64 immAllOnesV))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// Store 64-bit integer vector values.
+def : Pat<(store (v8i8 VR64:$src), addr:$dst),
+ (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
+def : Pat<(store (v4i16 VR64:$src), addr:$dst),
+ (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
+def : Pat<(store (v2i32 VR64:$src), addr:$dst),
+ (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
+def : Pat<(store (v1i64 VR64:$src), addr:$dst),
+ (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
+
+// 64-bit vector all zeros.
+def : Pat<(v8i8 immAllZerosV), (MMX_V_SET0)>;
+def : Pat<(v4i16 immAllZerosV), (MMX_V_SET0)>;
+def : Pat<(v2i32 immAllZerosV), (MMX_V_SET0)>;
+def : Pat<(v1i64 immAllZerosV), (MMX_V_SET0)>;
+
+// 64-bit vector all ones.
+def : Pat<(v8i8 immAllOnesV), (MMX_V_SETALLONES)>;
+def : Pat<(v4i16 immAllOnesV), (MMX_V_SETALLONES)>;
+def : Pat<(v2i32 immAllOnesV), (MMX_V_SETALLONES)>;
+def : Pat<(v1i64 immAllOnesV), (MMX_V_SETALLONES)>;
+
+// Bit convert.
+def : Pat<(v8i8 (bitconvert (v1i64 VR64:$src))), (v8i8 VR64:$src)>;
+def : Pat<(v8i8 (bitconvert (v2i32 VR64:$src))), (v8i8 VR64:$src)>;
+def : Pat<(v8i8 (bitconvert (v4i16 VR64:$src))), (v8i8 VR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v1i64 VR64:$src))), (v4i16 VR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v2i32 VR64:$src))), (v4i16 VR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v8i8 VR64:$src))), (v4i16 VR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v1i64 VR64:$src))), (v2i32 VR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v4i16 VR64:$src))), (v2i32 VR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v8i8 VR64:$src))), (v2i32 VR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v2i32 VR64:$src))), (v1i64 VR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v4i16 VR64:$src))), (v1i64 VR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v8i8 VR64:$src))), (v1i64 VR64:$src)>;
+
+// 64-bit bit convert.
+def : Pat<(v1i64 (bitconvert (i64 GR64:$src))),
+ (MMX_MOVD64to64rr GR64:$src)>;
+def : Pat<(v2i32 (bitconvert (i64 GR64:$src))),
+ (MMX_MOVD64to64rr GR64:$src)>;
+def : Pat<(v4i16 (bitconvert (i64 GR64:$src))),
+ (MMX_MOVD64to64rr GR64:$src)>;
+def : Pat<(v8i8 (bitconvert (i64 GR64:$src))),
+ (MMX_MOVD64to64rr GR64:$src)>;
+
+def MMX_X86s2vec : SDNode<"X86ISD::S2VEC", SDTypeProfile<1, 1, []>, []>;
+
+// Move scalar to MMX register, zero-extending the upper elements.
+// movd to an MMX register zero-extends.
+let AddedComplexity = 15 in {
+ def : Pat<(v8i8 (vector_shuffle immAllZerosV,
+ (v8i8 (MMX_X86s2vec GR32:$src)), MMX_MOVL_shuffle_mask)),
+ (MMX_MOVZDI2PDIrr GR32:$src)>;
+ def : Pat<(v4i16 (vector_shuffle immAllZerosV,
+ (v4i16 (MMX_X86s2vec GR32:$src)), MMX_MOVL_shuffle_mask)),
+ (MMX_MOVZDI2PDIrr GR32:$src)>;
+ def : Pat<(v2i32 (vector_shuffle immAllZerosV,
+ (v2i32 (MMX_X86s2vec GR32:$src)), MMX_MOVL_shuffle_mask)),
+ (MMX_MOVZDI2PDIrr GR32:$src)>;
+}
+
+// Scalar to v2i32 / v4i16 / v8i8. The source may be a GR32; for the v8i8 and
+// v4i16 cases only the lower 8 or 16 bits matter.
+def : Pat<(v8i8 (MMX_X86s2vec GR32:$src)), (MMX_MOVD64rr GR32:$src)>;
+def : Pat<(v4i16 (MMX_X86s2vec GR32:$src)), (MMX_MOVD64rr GR32:$src)>;
+def : Pat<(v2i32 (MMX_X86s2vec GR32:$src)), (MMX_MOVD64rr GR32:$src)>;
+
+// Patterns to perform canonical versions of vector shuffling.
+let AddedComplexity = 10 in {
+ def : Pat<(v8i8 (vector_shuffle VR64:$src, (undef),
+ MMX_UNPCKL_v_undef_shuffle_mask)),
+ (MMX_PUNPCKLBWrr VR64:$src, VR64:$src)>;
+ def : Pat<(v4i16 (vector_shuffle VR64:$src, (undef),
+ MMX_UNPCKL_v_undef_shuffle_mask)),
+ (MMX_PUNPCKLWDrr VR64:$src, VR64:$src)>;
+ def : Pat<(v2i32 (vector_shuffle VR64:$src, (undef),
+ MMX_UNPCKL_v_undef_shuffle_mask)),
+ (MMX_PUNPCKLDQrr VR64:$src, VR64:$src)>;
+}
+
+let AddedComplexity = 10 in {
+ def : Pat<(v8i8 (vector_shuffle VR64:$src, (undef),
+ MMX_UNPCKH_v_undef_shuffle_mask)),
+ (MMX_PUNPCKHBWrr VR64:$src, VR64:$src)>;
+ def : Pat<(v4i16 (vector_shuffle VR64:$src, (undef),
+ MMX_UNPCKH_v_undef_shuffle_mask)),
+ (MMX_PUNPCKHWDrr VR64:$src, VR64:$src)>;
+ def : Pat<(v2i32 (vector_shuffle VR64:$src, (undef),
+ MMX_UNPCKH_v_undef_shuffle_mask)),
+ (MMX_PUNPCKHDQrr VR64:$src, VR64:$src)>;
+}
+
+// Patterns to perform vector shuffling with a zeroed out vector.
+let AddedComplexity = 20 in {
+ def : Pat<(bc_v2i32 (vector_shuffle immAllZerosV,
+ (v2i32 (scalar_to_vector (load_mmx addr:$src))),
+ MMX_UNPCKL_shuffle_mask)),
+ (MMX_PUNPCKLDQrm VR64:$src, VR64:$src)>;
+}
+
+// Some special case PANDN patterns.
+// FIXME: Get rid of these.
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v2i32 immAllOnesV))),
+ VR64:$src2)),
+ (MMX_PANDNrr VR64:$src1, VR64:$src2)>;
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v4i16 immAllOnesV))),
+ VR64:$src2)),
+ (MMX_PANDNrr VR64:$src1, VR64:$src2)>;
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v8i8 immAllOnesV))),
+ VR64:$src2)),
+ (MMX_PANDNrr VR64:$src1, VR64:$src2)>;
+
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v2i32 immAllOnesV))),
+ (load addr:$src2))),
+ (MMX_PANDNrm VR64:$src1, addr:$src2)>;
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v4i16 immAllOnesV))),
+ (load addr:$src2))),
+ (MMX_PANDNrm VR64:$src1, addr:$src2)>;
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v8i8 immAllOnesV))),
+ (load addr:$src2))),
+ (MMX_PANDNrm VR64:$src1, addr:$src2)>;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
new file mode 100644
index 0000000..5fc7a65
--- /dev/null
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -0,0 +1,2572 @@
+//====- X86InstrSSE.td - Describe the X86 SSE Instruction Set -*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by Evan Cheng and is distributed under the University
+// of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 SSE instruction set, defining the instructions,
+// and properties of the instructions which are needed for code generation,
+// machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// SSE specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDTX86FPShiftOp : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>,
+ SDTCisFP<0>, SDTCisInt<2> ]>;
+
+def X86loadp : SDNode<"X86ISD::LOAD_PACK", SDTLoad, [SDNPHasChain]>;
+def X86loadu : SDNode<"X86ISD::LOAD_UA", SDTLoad, [SDNPHasChain]>;
+def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>;
+def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>;
+def X86fand : SDNode<"X86ISD::FAND", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def X86for : SDNode<"X86ISD::FOR", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def X86fxor : SDNode<"X86ISD::FXOR", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>;
+def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>;
+def X86fsrl : SDNode<"X86ISD::FSRL", SDTX86FPShiftOp>;
+def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest,
+ [SDNPHasChain, SDNPOutFlag]>;
+def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest,
+ [SDNPHasChain, SDNPOutFlag]>;
+def X86s2vec : SDNode<"X86ISD::S2VEC", SDTypeProfile<1, 1, []>, []>;
+def X86pextrw : SDNode<"X86ISD::PEXTRW", SDTypeProfile<1, 2, []>, []>;
+def X86pinsrw : SDNode<"X86ISD::PINSRW", SDTypeProfile<1, 3, []>, []>;
+
+//===----------------------------------------------------------------------===//
+// SSE 'Special' Instructions
+//===----------------------------------------------------------------------===//
+
+def IMPLICIT_DEF_VR128 : I<0, Pseudo, (ops VR128:$dst),
+ "#IMPLICIT_DEF $dst",
+ [(set VR128:$dst, (v4f32 (undef)))]>,
+ Requires<[HasSSE1]>;
+def IMPLICIT_DEF_FR32 : I<0, Pseudo, (ops FR32:$dst),
+ "#IMPLICIT_DEF $dst",
+ [(set FR32:$dst, (undef))]>, Requires<[HasSSE2]>;
+def IMPLICIT_DEF_FR64 : I<0, Pseudo, (ops FR64:$dst),
+ "#IMPLICIT_DEF $dst",
+ [(set FR64:$dst, (undef))]>, Requires<[HasSSE2]>;
+
+//===----------------------------------------------------------------------===//
+// SSE Complex Patterns
+//===----------------------------------------------------------------------===//
+
+// These are 'extloads' from a scalar to the low element of a vector, zeroing
+// the top elements. These are used for the SSE 'ss' and 'sd' instruction
+// forms.
+def sse_load_f32 : ComplexPattern<v4f32, 4, "SelectScalarSSELoad", [],
+ [SDNPHasChain]>;
+def sse_load_f64 : ComplexPattern<v2f64, 4, "SelectScalarSSELoad", [],
+ [SDNPHasChain]>;
+
+def ssmem : Operand<v4f32> {
+ let PrintMethod = "printf32mem";
+ let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc, i32imm);
+}
+def sdmem : Operand<v2f64> {
+ let PrintMethod = "printf64mem";
+ let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc, i32imm);
+}
+
+//===----------------------------------------------------------------------===//
+// SSE pattern fragments
+//===----------------------------------------------------------------------===//
+
+def X86loadpf32 : PatFrag<(ops node:$ptr), (f32 (X86loadp node:$ptr))>;
+def X86loadpf64 : PatFrag<(ops node:$ptr), (f64 (X86loadp node:$ptr))>;
+
+def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>;
+def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>;
+def loadv4i32 : PatFrag<(ops node:$ptr), (v4i32 (load node:$ptr))>;
+def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>;
+
+def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>;
+def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>;
+def bc_v16i8 : PatFrag<(ops node:$in), (v16i8 (bitconvert node:$in))>;
+def bc_v8i16 : PatFrag<(ops node:$in), (v8i16 (bitconvert node:$in))>;
+def bc_v4i32 : PatFrag<(ops node:$in), (v4i32 (bitconvert node:$in))>;
+def bc_v2i64 : PatFrag<(ops node:$in), (v2i64 (bitconvert node:$in))>;
+
+def fp32imm0 : PatLeaf<(f32 fpimm), [{
+ return N->isExactlyValue(+0.0);
+}]>;
+
+def PSxLDQ_imm : SDNodeXForm<imm, [{
+ // Transformation function: imm >> 3
+ return getI32Imm(N->getValue() >> 3);
+}]>;
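+
+// PSLLDQ/PSRLDQ shift by whole bytes while the DAG-level amount is in bits,
+// hence the divide-by-8 (imm >> 3) above; a 32-bit (4-byte) shift amount at
+// the DAG level, for example, becomes the byte-count immediate 4.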
+
+// SHUFFLE_get_shuf_imm xform function: convert vector_shuffle mask to PSHUF*,
+// SHUFP* etc. imm.
+def SHUFFLE_get_shuf_imm : SDNodeXForm<build_vector, [{
+ return getI8Imm(X86::getShuffleSHUFImmediate(N));
+}]>;
+
+// SHUFFLE_get_pshufhw_imm xform function: convert vector_shuffle mask to
+// PSHUFHW imm.
+def SHUFFLE_get_pshufhw_imm : SDNodeXForm<build_vector, [{
+ return getI8Imm(X86::getShufflePSHUFHWImmediate(N));
+}]>;
+
+// SHUFFLE_get_pshuflw_imm xform function: convert vector_shuffle mask to
+// PSHUFLW imm.
+def SHUFFLE_get_pshuflw_imm : SDNodeXForm<build_vector, [{
+ return getI8Imm(X86::getShufflePSHUFLWImmediate(N));
+}]>;
+
+def SSE_splat_mask : PatLeaf<(build_vector), [{
+ return X86::isSplatMask(N);
+}], SHUFFLE_get_shuf_imm>;
+
+def SSE_splat_lo_mask : PatLeaf<(build_vector), [{
+ return X86::isSplatLoMask(N);
+}]>;
+
+def MOVHLPS_shuffle_mask : PatLeaf<(build_vector), [{
+ return X86::isMOVHLPSMask(N);
+}]>;
+
+def MOVHLPS_v_undef_shuffle_mask : PatLeaf<(build_vector), [{
+ return X86::isMOVHLPS_v_undef_Mask(N);
+}]>;
+
+def MOVHP_shuffle_mask : PatLeaf<(build_vector), [{
+ return X86::isMOVHPMask(N);
+}]>;
+
+def MOVLP_shuffle_mask : PatLeaf<(build_vector), [{
+ return X86::isMOVLPMask(N);
+}]>;
+
+def MOVL_shuffle_mask : PatLeaf<(build_vector), [{
+ return X86::isMOVLMask(N);
+}]>;
+
+def MOVSHDUP_shuffle_mask : PatLeaf<(build_vector), [{
+ return X86::isMOVSHDUPMask(N);
+}]>;
+
+def MOVSLDUP_shuffle_mask : PatLeaf<(build_vector), [{
+ return X86::isMOVSLDUPMask(N);
+}]>;
+
+def UNPCKL_shuffle_mask : PatLeaf<(build_vector), [{
+ return X86::isUNPCKLMask(N);
+}]>;
+
+def UNPCKH_shuffle_mask : PatLeaf<(build_vector), [{
+ return X86::isUNPCKHMask(N);
+}]>;
+
+def UNPCKL_v_undef_shuffle_mask : PatLeaf<(build_vector), [{
+ return X86::isUNPCKL_v_undef_Mask(N);
+}]>;
+
+def UNPCKH_v_undef_shuffle_mask : PatLeaf<(build_vector), [{
+ return X86::isUNPCKH_v_undef_Mask(N);
+}]>;
+
+def PSHUFD_shuffle_mask : PatLeaf<(build_vector), [{
+ return X86::isPSHUFDMask(N);
+}], SHUFFLE_get_shuf_imm>;
+
+def PSHUFHW_shuffle_mask : PatLeaf<(build_vector), [{
+ return X86::isPSHUFHWMask(N);
+}], SHUFFLE_get_pshufhw_imm>;
+
+def PSHUFLW_shuffle_mask : PatLeaf<(build_vector), [{
+ return X86::isPSHUFLWMask(N);
+}], SHUFFLE_get_pshuflw_imm>;
+
+def SHUFP_unary_shuffle_mask : PatLeaf<(build_vector), [{
+ return X86::isPSHUFDMask(N);
+}], SHUFFLE_get_shuf_imm>;
+
+def SHUFP_shuffle_mask : PatLeaf<(build_vector), [{
+ return X86::isSHUFPMask(N);
+}], SHUFFLE_get_shuf_imm>;
+
+def PSHUFD_binary_shuffle_mask : PatLeaf<(build_vector), [{
+ return X86::isSHUFPMask(N);
+}], SHUFFLE_get_shuf_imm>;
+
+//===----------------------------------------------------------------------===//
+// SSE scalar FP Instructions
+//===----------------------------------------------------------------------===//
+
+// CMOV* - Used to implement the SSE SELECT DAG operation. Expanded by the
+// scheduler into a branch sequence.
+let usesCustomDAGSchedInserter = 1 in { // Expanded by the scheduler.
+ def CMOV_FR32 : I<0, Pseudo,
+ (ops FR32:$dst, FR32:$t, FR32:$f, i8imm:$cond),
+ "#CMOV_FR32 PSEUDO!",
+ [(set FR32:$dst, (X86cmov FR32:$t, FR32:$f, imm:$cond))]>;
+ def CMOV_FR64 : I<0, Pseudo,
+ (ops FR64:$dst, FR64:$t, FR64:$f, i8imm:$cond),
+ "#CMOV_FR64 PSEUDO!",
+ [(set FR64:$dst, (X86cmov FR64:$t, FR64:$f, imm:$cond))]>;
+ def CMOV_V4F32 : I<0, Pseudo,
+ (ops VR128:$dst, VR128:$t, VR128:$f, i8imm:$cond),
+ "#CMOV_V4F32 PSEUDO!",
+ [(set VR128:$dst,
+ (v4f32 (X86cmov VR128:$t, VR128:$f, imm:$cond)))]>;
+ def CMOV_V2F64 : I<0, Pseudo,
+ (ops VR128:$dst, VR128:$t, VR128:$f, i8imm:$cond),
+ "#CMOV_V2F64 PSEUDO!",
+ [(set VR128:$dst,
+ (v2f64 (X86cmov VR128:$t, VR128:$f, imm:$cond)))]>;
+ def CMOV_V2I64 : I<0, Pseudo,
+ (ops VR128:$dst, VR128:$t, VR128:$f, i8imm:$cond),
+ "#CMOV_V2I64 PSEUDO!",
+ [(set VR128:$dst,
+ (v2i64 (X86cmov VR128:$t, VR128:$f, imm:$cond)))]>;
+}
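+
+// Roughly speaking (a sketch of the custom inserter, not a definition), each
+// CMOV_* pseudo is expanded into a conditional branch around a copy: the
+// current block branches on $cond, a small block supplies one operand, and
+// the join block merges $t and $f with a PHI to produce $dst.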
+
+//===----------------------------------------------------------------------===//
+// SSE1 Instructions
+//===----------------------------------------------------------------------===//
+
+// SSE1 Instruction Templates:
+//
+// SSI - SSE1 instructions with XS prefix.
+// PSI - SSE1 instructions with TB prefix.
+// PSIi8 - SSE1 instructions with ImmT == Imm8 and TB prefix.
+
+class SSI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+ : I<o, F, ops, asm, pattern>, XS, Requires<[HasSSE1]>;
+class PSI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+ : I<o, F, ops, asm, pattern>, TB, Requires<[HasSSE1]>;
+class PSIi8<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+ : Ii8<o, F, ops, asm, pattern>, TB, Requires<[HasSSE1]>;
+
+// Move Instructions
+def MOVSSrr : SSI<0x10, MRMSrcReg, (ops FR32:$dst, FR32:$src),
+ "movss {$src, $dst|$dst, $src}", []>;
+def MOVSSrm : SSI<0x10, MRMSrcMem, (ops FR32:$dst, f32mem:$src),
+ "movss {$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (loadf32 addr:$src))]>;
+def MOVSSmr : SSI<0x11, MRMDestMem, (ops f32mem:$dst, FR32:$src),
+ "movss {$src, $dst|$dst, $src}",
+ [(store FR32:$src, addr:$dst)]>;
+
+// Conversion instructions
+def CVTTSS2SIrr : SSI<0x2C, MRMSrcReg, (ops GR32:$dst, FR32:$src),
+ "cvttss2si {$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (fp_to_sint FR32:$src))]>;
+def CVTTSS2SIrm : SSI<0x2C, MRMSrcMem, (ops GR32:$dst, f32mem:$src),
+ "cvttss2si {$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (fp_to_sint (loadf32 addr:$src)))]>;
+def CVTSI2SSrr : SSI<0x2A, MRMSrcReg, (ops FR32:$dst, GR32:$src),
+ "cvtsi2ss {$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (sint_to_fp GR32:$src))]>;
+def CVTSI2SSrm : SSI<0x2A, MRMSrcMem, (ops FR32:$dst, i32mem:$src),
+ "cvtsi2ss {$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (sint_to_fp (loadi32 addr:$src)))]>;
+
+// Match intrinsics which expect XMM operand(s).
+def Int_CVTSS2SIrr : SSI<0x2D, MRMSrcReg, (ops GR32:$dst, VR128:$src),
+ "cvtss2si {$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse_cvtss2si VR128:$src))]>;
+def Int_CVTSS2SIrm : SSI<0x2D, MRMSrcMem, (ops GR32:$dst, f32mem:$src),
+ "cvtss2si {$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse_cvtss2si
+ (load addr:$src)))]>;
+
+// Aliases for intrinsics
+def Int_CVTTSS2SIrr : SSI<0x2C, MRMSrcReg, (ops GR32:$dst, VR128:$src),
+ "cvttss2si {$src, $dst|$dst, $src}",
+ [(set GR32:$dst,
+ (int_x86_sse_cvttss2si VR128:$src))]>;
+def Int_CVTTSS2SIrm : SSI<0x2C, MRMSrcMem, (ops GR32:$dst, f32mem:$src),
+ "cvttss2si {$src, $dst|$dst, $src}",
+ [(set GR32:$dst,
+ (int_x86_sse_cvttss2si(load addr:$src)))]>;
+
+let isTwoAddress = 1 in {
+ def Int_CVTSI2SSrr : SSI<0x2A, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, GR32:$src2),
+ "cvtsi2ss {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse_cvtsi2ss VR128:$src1,
+ GR32:$src2))]>;
+ def Int_CVTSI2SSrm : SSI<0x2A, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, i32mem:$src2),
+ "cvtsi2ss {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse_cvtsi2ss VR128:$src1,
+ (loadi32 addr:$src2)))]>;
+}
+
+// Comparison instructions
+let isTwoAddress = 1 in {
+ def CMPSSrr : SSI<0xC2, MRMSrcReg,
+ (ops FR32:$dst, FR32:$src1, FR32:$src, SSECC:$cc),
+ "cmp${cc}ss {$src, $dst|$dst, $src}",
+ []>;
+ def CMPSSrm : SSI<0xC2, MRMSrcMem,
+ (ops FR32:$dst, FR32:$src1, f32mem:$src, SSECC:$cc),
+ "cmp${cc}ss {$src, $dst|$dst, $src}", []>;
+}
+
+def UCOMISSrr: PSI<0x2E, MRMSrcReg, (ops FR32:$src1, FR32:$src2),
+ "ucomiss {$src2, $src1|$src1, $src2}",
+ [(X86cmp FR32:$src1, FR32:$src2)]>;
+def UCOMISSrm: PSI<0x2E, MRMSrcMem, (ops FR32:$src1, f32mem:$src2),
+ "ucomiss {$src2, $src1|$src1, $src2}",
+ [(X86cmp FR32:$src1, (loadf32 addr:$src2))]>;
+
+// Aliases to match intrinsics which expect XMM operand(s).
+let isTwoAddress = 1 in {
+ def Int_CMPSSrr : SSI<0xC2, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src, SSECC:$cc),
+ "cmp${cc}ss {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse_cmp_ss VR128:$src1,
+ VR128:$src, imm:$cc))]>;
+ def Int_CMPSSrm : SSI<0xC2, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, f32mem:$src, SSECC:$cc),
+ "cmp${cc}ss {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse_cmp_ss VR128:$src1,
+ (load addr:$src), imm:$cc))]>;
+}
+
+def Int_UCOMISSrr: PSI<0x2E, MRMSrcReg, (ops VR128:$src1, VR128:$src2),
+ "ucomiss {$src2, $src1|$src1, $src2}",
+ [(X86ucomi (v4f32 VR128:$src1), VR128:$src2)]>;
+def Int_UCOMISSrm: PSI<0x2E, MRMSrcMem, (ops VR128:$src1, f128mem:$src2),
+ "ucomiss {$src2, $src1|$src1, $src2}",
+ [(X86ucomi (v4f32 VR128:$src1), (load addr:$src2))]>;
+
+def Int_COMISSrr: PSI<0x2F, MRMSrcReg, (ops VR128:$src1, VR128:$src2),
+ "comiss {$src2, $src1|$src1, $src2}",
+ [(X86comi (v4f32 VR128:$src1), VR128:$src2)]>;
+def Int_COMISSrm: PSI<0x2F, MRMSrcMem, (ops VR128:$src1, f128mem:$src2),
+ "comiss {$src2, $src1|$src1, $src2}",
+ [(X86comi (v4f32 VR128:$src1), (load addr:$src2))]>;
+
+// Aliases of packed SSE1 instructions for scalar use. These all have names that
+// start with 'Fs'.
+
+// Alias instructions that map fld0 to pxor for sse.
+def FsFLD0SS : I<0xEF, MRMInitReg, (ops FR32:$dst),
+ "pxor $dst, $dst", [(set FR32:$dst, fp32imm0)]>,
+ Requires<[HasSSE1]>, TB, OpSize;
+
+// Alias instruction to do FR32 reg-to-reg copy using movaps. Upper bits are
+// disregarded.
+def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (ops FR32:$dst, FR32:$src),
+ "movaps {$src, $dst|$dst, $src}", []>;
+
+// Alias instruction to load FR32 from f128mem using movaps. Upper bits are
+// disregarded.
+def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (ops FR32:$dst, f128mem:$src),
+ "movaps {$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (X86loadpf32 addr:$src))]>;
+
+// Alias bitwise logical operations using SSE logical ops on packed FP values.
+let isTwoAddress = 1 in {
+let isCommutable = 1 in {
+ def FsANDPSrr : PSI<0x54, MRMSrcReg, (ops FR32:$dst, FR32:$src1, FR32:$src2),
+ "andps {$src2, $dst|$dst, $src2}",
+ [(set FR32:$dst, (X86fand FR32:$src1, FR32:$src2))]>;
+ def FsORPSrr : PSI<0x56, MRMSrcReg, (ops FR32:$dst, FR32:$src1, FR32:$src2),
+ "orps {$src2, $dst|$dst, $src2}",
+ [(set FR32:$dst, (X86for FR32:$src1, FR32:$src2))]>;
+ def FsXORPSrr : PSI<0x57, MRMSrcReg, (ops FR32:$dst, FR32:$src1, FR32:$src2),
+ "xorps {$src2, $dst|$dst, $src2}",
+ [(set FR32:$dst, (X86fxor FR32:$src1, FR32:$src2))]>;
+}
+
+def FsANDPSrm : PSI<0x54, MRMSrcMem, (ops FR32:$dst, FR32:$src1, f128mem:$src2),
+ "andps {$src2, $dst|$dst, $src2}",
+ [(set FR32:$dst, (X86fand FR32:$src1,
+ (X86loadpf32 addr:$src2)))]>;
+def FsORPSrm : PSI<0x56, MRMSrcMem, (ops FR32:$dst, FR32:$src1, f128mem:$src2),
+ "orps {$src2, $dst|$dst, $src2}",
+ [(set FR32:$dst, (X86for FR32:$src1,
+ (X86loadpf32 addr:$src2)))]>;
+def FsXORPSrm : PSI<0x57, MRMSrcMem, (ops FR32:$dst, FR32:$src1, f128mem:$src2),
+ "xorps {$src2, $dst|$dst, $src2}",
+ [(set FR32:$dst, (X86fxor FR32:$src1,
+ (X86loadpf32 addr:$src2)))]>;
+
+def FsANDNPSrr : PSI<0x55, MRMSrcReg,
+ (ops FR32:$dst, FR32:$src1, FR32:$src2),
+ "andnps {$src2, $dst|$dst, $src2}", []>;
+def FsANDNPSrm : PSI<0x55, MRMSrcMem,
+ (ops FR32:$dst, FR32:$src1, f128mem:$src2),
+ "andnps {$src2, $dst|$dst, $src2}", []>;
+}
+
+/// basic_sse1_fp_binop_rm - SSE1 binops come in both scalar and vector forms.
+///
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation. This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a scalar)
+/// and leaves the top elements undefined.
+///
+/// These three forms can each be reg+reg or reg+mem, so there are a total of
+/// six "instructions".
+///
+let isTwoAddress = 1 in {
+multiclass basic_sse1_fp_binop_rm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, Intrinsic F32Int,
+ bit Commutable = 0> {
+ // Scalar operation, reg+reg.
+ def SSrr : SSI<opc, MRMSrcReg, (ops FR32:$dst, FR32:$src1, FR32:$src2),
+ !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"),
+ [(set FR32:$dst, (OpNode FR32:$src1, FR32:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Scalar operation, reg+mem.
+ def SSrm : SSI<opc, MRMSrcMem, (ops FR32:$dst, FR32:$src1, f32mem:$src2),
+ !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"),
+ [(set FR32:$dst, (OpNode FR32:$src1, (load addr:$src2)))]>;
+
+ // Vector operation, reg+reg.
+ def PSrr : PSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v4f32 (OpNode VR128:$src1, VR128:$src2)))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector operation, reg+mem.
+ def PSrm : PSI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+ !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (OpNode VR128:$src1, (loadv4f32 addr:$src2)))]>;
+
+ // Intrinsic operation, reg+reg.
+ def SSrr_Int : SSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Intrinsic operation, reg+mem.
+ def SSrm_Int : SSI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, ssmem:$src2),
+ !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (F32Int VR128:$src1,
+ sse_load_f32:$src2))]>;
+}
+}
+
+// Arithmetic instructions
+defm ADD : basic_sse1_fp_binop_rm<0x58, "add", fadd, int_x86_sse_add_ss, 1>;
+defm MUL : basic_sse1_fp_binop_rm<0x59, "mul", fmul, int_x86_sse_mul_ss, 1>;
+defm SUB : basic_sse1_fp_binop_rm<0x5C, "sub", fsub, int_x86_sse_sub_ss>;
+defm DIV : basic_sse1_fp_binop_rm<0x5E, "div", fdiv, int_x86_sse_div_ss>;
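+
+// A sketch of what tblgen should produce from the defms above (names simply
+// prepend the defm name to each suffix in basic_sse1_fp_binop_rm); e.g.
+// 'defm ADD' yields the six records:
+//   ADDSSrr, ADDSSrm          - scalar "addss", reg/reg and reg/mem
+//   ADDPSrr, ADDPSrm          - packed "addps", reg/reg and reg/mem
+//   ADDSSrr_Int, ADDSSrm_Int  - "addss" on whole vectors for int_x86_sse_add_ss
+// and likewise for MUL, SUB, and DIV.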
+
+/// sse1_fp_binop_rm - Other SSE1 binops
+///
+/// This multiclass is like basic_sse1_fp_binop_rm, with the addition of
+/// instructions for a full-vector intrinsic form. Operations that map
+/// onto C operators don't use this form since they just use the plain
+/// vector form instead of having a separate vector intrinsic form.
+///
+/// This provides a total of eight "instructions".
+///
+let isTwoAddress = 1 in {
+multiclass sse1_fp_binop_rm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode,
+ Intrinsic F32Int,
+ Intrinsic V4F32Int,
+ bit Commutable = 0> {
+
+ // Scalar operation, reg+reg.
+ def SSrr : SSI<opc, MRMSrcReg, (ops FR32:$dst, FR32:$src1, FR32:$src2),
+ !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"),
+ [(set FR32:$dst, (OpNode FR32:$src1, FR32:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Scalar operation, reg+mem.
+ def SSrm : SSI<opc, MRMSrcMem, (ops FR32:$dst, FR32:$src1, f32mem:$src2),
+ !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"),
+ [(set FR32:$dst, (OpNode FR32:$src1, (load addr:$src2)))]>;
+
+ // Vector operation, reg+reg.
+ def PSrr : PSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v4f32 (OpNode VR128:$src1, VR128:$src2)))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector operation, reg+mem.
+ def PSrm : PSI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+ !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (OpNode VR128:$src1, (loadv4f32 addr:$src2)))]>;
+
+ // Intrinsic operation, reg+reg.
+ def SSrr_Int : SSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Intrinsic operation, reg+mem.
+ def SSrm_Int : SSI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, ssmem:$src2),
+ !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (F32Int VR128:$src1,
+ sse_load_f32:$src2))]>;
+
+ // Vector intrinsic operation, reg+reg.
+ def PSrr_Int : PSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (V4F32Int VR128:$src1, VR128:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector intrinsic operation, reg+mem.
+ def PSrm_Int : PSI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f32mem:$src2),
+ !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (V4F32Int VR128:$src1, (load addr:$src2)))]>;
+}
+}
+
+defm MAX : sse1_fp_binop_rm<0x5F, "max", X86fmax,
+ int_x86_sse_max_ss, int_x86_sse_max_ps>;
+defm MIN : sse1_fp_binop_rm<0x5D, "min", X86fmin,
+ int_x86_sse_min_ss, int_x86_sse_min_ps>;
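+
+// As with the basic binops, these defms expand by prefixing; e.g. 'defm MAX'
+// should give the eight records MAXSSrr, MAXSSrm, MAXPSrr, MAXPSrm,
+// MAXSSrr_Int, MAXSSrm_Int, MAXPSrr_Int, and MAXPSrm_Int. The extra packed
+// *_Int forms exist because max/min have no C operator, so int_x86_sse_max_ps
+// and int_x86_sse_min_ps need their own patterns.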
+
+//===----------------------------------------------------------------------===//
+// SSE packed FP Instructions
+
+// Move Instructions
+def MOVAPSrr : PSI<0x28, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+ "movaps {$src, $dst|$dst, $src}", []>;
+def MOVAPSrm : PSI<0x28, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+ "movaps {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (loadv4f32 addr:$src))]>;
+
+def MOVAPSmr : PSI<0x29, MRMDestMem, (ops f128mem:$dst, VR128:$src),
+ "movaps {$src, $dst|$dst, $src}",
+ [(store (v4f32 VR128:$src), addr:$dst)]>;
+
+def MOVUPSrr : PSI<0x10, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+ "movups {$src, $dst|$dst, $src}", []>;
+def MOVUPSrm : PSI<0x10, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+ "movups {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse_loadu_ps addr:$src))]>;
+def MOVUPSmr : PSI<0x11, MRMDestMem, (ops f128mem:$dst, VR128:$src),
+ "movups {$src, $dst|$dst, $src}",
+ [(int_x86_sse_storeu_ps addr:$dst, VR128:$src)]>;
+
+let isTwoAddress = 1 in {
+ let AddedComplexity = 20 in {
+ def MOVLPSrm : PSI<0x12, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, f64mem:$src2),
+ "movlps {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (vector_shuffle VR128:$src1,
+ (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2)))),
+ MOVLP_shuffle_mask)))]>;
+ def MOVHPSrm : PSI<0x16, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, f64mem:$src2),
+ "movhps {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (vector_shuffle VR128:$src1,
+ (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2)))),
+ MOVHP_shuffle_mask)))]>;
+ } // AddedComplexity
+} // isTwoAddress
+
+def MOVLPSmr : PSI<0x13, MRMDestMem, (ops f64mem:$dst, VR128:$src),
+ "movlps {$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
+ (iPTR 0))), addr:$dst)]>;
+
+// v2f64 extract element 1 is always custom lowered to unpack high to low
+// and extract element 0 so the non-store version isn't too horrible.
+def MOVHPSmr : PSI<0x17, MRMDestMem, (ops f64mem:$dst, VR128:$src),
+ "movhps {$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract
+ (v2f64 (vector_shuffle
+ (bc_v2f64 (v4f32 VR128:$src)), (undef),
+ UNPCKH_shuffle_mask)), (iPTR 0))),
+ addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+let AddedComplexity = 15 in {
+def MOVLHPSrr : PSI<0x16, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "movlhps {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (vector_shuffle VR128:$src1, VR128:$src2,
+ MOVHP_shuffle_mask)))]>;
+
+def MOVHLPSrr : PSI<0x12, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "movhlps {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (vector_shuffle VR128:$src1, VR128:$src2,
+ MOVHLPS_shuffle_mask)))]>;
+} // AddedComplexity
+} // isTwoAddress
+
+
+
+// Arithmetic
+
+/// sse1_fp_unop_rm - SSE1 unops come in both scalar and vector forms.
+///
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation. This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a
+/// scalar) and leaves the top elements undefined.
+///
+/// There is also a separate variant for the full-vector intrinsic form.
+///
+/// These four forms can each have a reg or a mem operand, so there are a
+/// total of eight "instructions".
+///
+multiclass sse1_fp_unop_rm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode,
+ Intrinsic F32Int,
+ Intrinsic V4F32Int,
+ bit Commutable = 0> {
+ // Scalar operation, reg.
+ def SSr : SSI<opc, MRMSrcReg, (ops FR32:$dst, FR32:$src),
+ !strconcat(OpcodeStr, "ss {$src, $dst|$dst, $src}"),
+ [(set FR32:$dst, (OpNode FR32:$src))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Scalar operation, mem.
+ def SSm : SSI<opc, MRMSrcMem, (ops FR32:$dst, f32mem:$src),
+ !strconcat(OpcodeStr, "ss {$src, $dst|$dst, $src}"),
+ [(set FR32:$dst, (OpNode (load addr:$src)))]>;
+
+ // Vector operation, reg.
+ def PSr : PSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+ !strconcat(OpcodeStr, "ps {$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector operation, mem.
+ def PSm : PSI<opc, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+ !strconcat(OpcodeStr, "ps {$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>;
+
+ // Intrinsic operation, reg.
+ def SSr_Int : SSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+ !strconcat(OpcodeStr, "ss {$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (F32Int VR128:$src))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Intrinsic operation, mem.
+ def SSm_Int : SSI<opc, MRMSrcMem, (ops VR128:$dst, ssmem:$src),
+ !strconcat(OpcodeStr, "ss {$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (F32Int sse_load_f32:$src))]>;
+
+ // Vector intrinsic operation, reg
+ def PSr_Int : PSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+ !strconcat(OpcodeStr, "ps {$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (V4F32Int VR128:$src))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector intrinsic operation, mem
+ def PSm_Int : PSI<opc, MRMSrcMem, (ops VR128:$dst, f32mem:$src),
+ !strconcat(OpcodeStr, "ps {$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (V4F32Int (load addr:$src)))]>;
+}
+
+// Square root.
+defm SQRT : sse1_fp_unop_rm<0x51, "sqrt", fsqrt,
+ int_x86_sse_sqrt_ss, int_x86_sse_sqrt_ps>;
+
+// Reciprocal approximations. Note that these typically require refinement
+// in order to obtain suitable precision.
+defm RSQRT : sse1_fp_unop_rm<0x52, "rsqrt", X86frsqrt,
+ int_x86_sse_rsqrt_ss, int_x86_sse_rsqrt_ps>;
+defm RCP : sse1_fp_unop_rm<0x53, "rcp", X86frcp,
+ int_x86_sse_rcp_ss, int_x86_sse_rcp_ps>;
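+
+// Each of these unop defms should expand to the eight records named in
+// sse1_fp_unop_rm (e.g. SQRTSSr, SQRTSSm, SQRTPSr, SQRTPSm, SQRTSSr_Int,
+// SQRTSSm_Int, SQRTPSr_Int, SQRTPSm_Int). For reference, the usual
+// Newton-Raphson refinement steps (standard numerics, not modeled in this
+// file) are: given y0 = rcpss(x), one step is y1 = y0 * (2 - x * y0);
+// given y0 = rsqrtss(x), one step is y1 = 0.5 * y0 * (3 - x * y0 * y0).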
+
+// Logical
+let isTwoAddress = 1 in {
+ let isCommutable = 1 in {
+ def ANDPSrr : PSI<0x54, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "andps {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (v2i64
+ (and VR128:$src1, VR128:$src2)))]>;
+ def ORPSrr : PSI<0x56, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "orps {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (v2i64
+ (or VR128:$src1, VR128:$src2)))]>;
+ def XORPSrr : PSI<0x57, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "xorps {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (v2i64
+ (xor VR128:$src1, VR128:$src2)))]>;
+ }
+
+ def ANDPSrm : PSI<0x54, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+ "andps {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (and VR128:$src1,
+ (bc_v2i64 (loadv4f32 addr:$src2))))]>;
+ def ORPSrm : PSI<0x56, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+ "orps {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (or VR128:$src1,
+ (bc_v2i64 (loadv4f32 addr:$src2))))]>;
+ def XORPSrm : PSI<0x57, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+ "xorps {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (xor VR128:$src1,
+ (bc_v2i64 (loadv4f32 addr:$src2))))]>;
+ def ANDNPSrr : PSI<0x55, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "andnps {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2i64 (and (xor VR128:$src1,
+ (bc_v2i64 (v4i32 immAllOnesV))),
+ VR128:$src2)))]>;
+ def ANDNPSrm : PSI<0x55, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1,f128mem:$src2),
+ "andnps {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2i64 (and (xor VR128:$src1,
+ (bc_v2i64 (v4i32 immAllOnesV))),
+ (bc_v2i64 (loadv4f32 addr:$src2)))))]>;
+}
+
+let isTwoAddress = 1 in {
+ def CMPPSrri : PSIi8<0xC2, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src, SSECC:$cc),
+ "cmp${cc}ps {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse_cmp_ps VR128:$src1,
+ VR128:$src, imm:$cc))]>;
+ def CMPPSrmi : PSIi8<0xC2, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, f128mem:$src, SSECC:$cc),
+ "cmp${cc}ps {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse_cmp_ps VR128:$src1,
+ (load addr:$src), imm:$cc))]>;
+}
+
+// Shuffle and unpack instructions
+let isTwoAddress = 1 in {
+ let isConvertibleToThreeAddress = 1 in // Convert to pshufd
+ def SHUFPSrri : PSIi8<0xC6, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1,
+ VR128:$src2, i32i8imm:$src3),
+ "shufps {$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (v4f32 (vector_shuffle
+ VR128:$src1, VR128:$src2,
+ SHUFP_shuffle_mask:$src3)))]>;
+ def SHUFPSrmi : PSIi8<0xC6, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1,
+ f128mem:$src2, i32i8imm:$src3),
+ "shufps {$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (v4f32 (vector_shuffle
+ VR128:$src1, (load addr:$src2),
+ SHUFP_shuffle_mask:$src3)))]>;
+
+ let AddedComplexity = 10 in {
+ def UNPCKHPSrr : PSI<0x15, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "unpckhps {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (vector_shuffle
+ VR128:$src1, VR128:$src2,
+ UNPCKH_shuffle_mask)))]>;
+ def UNPCKHPSrm : PSI<0x15, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+ "unpckhps {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (vector_shuffle
+ VR128:$src1, (load addr:$src2),
+ UNPCKH_shuffle_mask)))]>;
+
+ def UNPCKLPSrr : PSI<0x14, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "unpcklps {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (vector_shuffle
+ VR128:$src1, VR128:$src2,
+ UNPCKL_shuffle_mask)))]>;
+ def UNPCKLPSrm : PSI<0x14, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+ "unpcklps {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (vector_shuffle
+ VR128:$src1, (load addr:$src2),
+ UNPCKL_shuffle_mask)))]>;
+ } // AddedComplexity
+} // isTwoAddress
+
+// Mask creation
+def MOVMSKPSrr : PSI<0x50, MRMSrcReg, (ops GR32:$dst, VR128:$src),
+ "movmskps {$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse_movmsk_ps VR128:$src))]>;
+def MOVMSKPDrr : PSI<0x50, MRMSrcReg, (ops GR32:$dst, VR128:$src),
+ "movmskpd {$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse2_movmsk_pd VR128:$src))]>;
+
+// Prefetching loads.
+// TODO: no intrinsics for these?
+def PREFETCHT0 : PSI<0x18, MRM1m, (ops i8mem:$src), "prefetcht0 $src", []>;
+def PREFETCHT1 : PSI<0x18, MRM2m, (ops i8mem:$src), "prefetcht1 $src", []>;
+def PREFETCHT2 : PSI<0x18, MRM3m, (ops i8mem:$src), "prefetcht2 $src", []>;
+def PREFETCHNTA : PSI<0x18, MRM0m, (ops i8mem:$src), "prefetchnta $src", []>;
+
+// Non-temporal stores
+def MOVNTPSmr : PSI<0x2B, MRMDestMem, (ops f128mem:$dst, VR128:$src),
+ "movntps {$src, $dst|$dst, $src}",
+ [(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>;
+
+// Store fence
+def SFENCE : PSI<0xAE, MRM7m, (ops), "sfence", [(int_x86_sse_sfence)]>;
+
+// MXCSR register
+def LDMXCSR : PSI<0xAE, MRM2m, (ops i32mem:$src),
+ "ldmxcsr $src", [(int_x86_sse_ldmxcsr addr:$src)]>;
+def STMXCSR : PSI<0xAE, MRM3m, (ops i32mem:$dst),
+ "stmxcsr $dst", [(int_x86_sse_stmxcsr addr:$dst)]>;
+
+// Alias instructions that map zero vector to pxor / xorp* for sse.
+// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
+let isReMaterializable = 1 in
+def V_SET0 : PSI<0x57, MRMInitReg, (ops VR128:$dst),
+ "xorps $dst, $dst",
+ [(set VR128:$dst, (v4f32 immAllZerosV))]>;
+
+// FR32 to 128-bit vector conversion.
+def MOVSS2PSrr : SSI<0x10, MRMSrcReg, (ops VR128:$dst, FR32:$src),
+ "movss {$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4f32 (scalar_to_vector FR32:$src)))]>;
+def MOVSS2PSrm : SSI<0x10, MRMSrcMem, (ops VR128:$dst, f32mem:$src),
+ "movss {$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4f32 (scalar_to_vector (loadf32 addr:$src))))]>;
+
+// FIXME: we may not be able to eliminate this movss with coalescing since the
+// src and dest register classes are different. We really want to write this
+// pattern like this:
+// def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
+// (f32 FR32:$src)>;
+def MOVPS2SSrr : SSI<0x10, MRMSrcReg, (ops FR32:$dst, VR128:$src),
+ "movss {$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (vector_extract (v4f32 VR128:$src),
+ (iPTR 0)))]>;
+def MOVPS2SSmr : SSI<0x11, MRMDestMem, (ops f32mem:$dst, VR128:$src),
+ "movss {$src, $dst|$dst, $src}",
+ [(store (f32 (vector_extract (v4f32 VR128:$src),
+ (iPTR 0))), addr:$dst)]>;
+
+
+// Move to lower bits of a VR128, leaving upper bits alone.
+// Three operand (but two address) aliases.
+let isTwoAddress = 1 in {
+ def MOVLSS2PSrr : SSI<0x10, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, FR32:$src2),
+ "movss {$src2, $dst|$dst, $src2}", []>;
+
+ let AddedComplexity = 15 in
+ def MOVLPSrr : SSI<0x10, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "movss {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4f32 (vector_shuffle VR128:$src1, VR128:$src2,
+ MOVL_shuffle_mask)))]>;
+}
+
+// Move to the lower bits of a VR128, zeroing the upper bits.
+// Loading from memory automatically zeroes the upper bits.
+let AddedComplexity = 20 in
+def MOVZSS2PSrm : SSI<0x10, MRMSrcMem, (ops VR128:$dst, f32mem:$src),
+ "movss {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v4f32 (vector_shuffle immAllZerosV,
+ (v4f32 (scalar_to_vector (loadf32 addr:$src))),
+ MOVL_shuffle_mask)))]>;
+
+
+//===----------------------------------------------------------------------===//
+// SSE2 Instructions
+//===----------------------------------------------------------------------===//
+
+// SSE2 Instruction Templates:
+//
+// SDI - SSE2 instructions with XD prefix.
+// PDI - SSE2 instructions with TB and OpSize prefixes.
+// PDIi8 - SSE2 instructions with ImmT == Imm8 and TB and OpSize prefixes.
+
+class SDI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+ : I<o, F, ops, asm, pattern>, XD, Requires<[HasSSE2]>;
+class PDI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+ : I<o, F, ops, asm, pattern>, TB, OpSize, Requires<[HasSSE2]>;
+class PDIi8<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+ : Ii8<o, F, ops, asm, pattern>, TB, OpSize, Requires<[HasSSE2]>;
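+
+// Encoding note (a reference sketch; the byte values are the standard x86
+// meanings of these prefix classes, not something defined in this file):
+// XD is the 0xF2 prefix, XS is 0xF3, OpSize is 0x66, and TB is the 0x0F
+// two-byte opcode escape. So a PDI opcode like 0x28 (movapd below) encodes
+// as 66 0F 28, the SSE1 PSI form of the same opcode (movaps) is 0F 28, and
+// an SDI opcode like 0x10 (movsd below) encodes as F2 0F 10.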
+
+// Move Instructions
+def MOVSDrr : SDI<0x10, MRMSrcReg, (ops FR64:$dst, FR64:$src),
+ "movsd {$src, $dst|$dst, $src}", []>;
+def MOVSDrm : SDI<0x10, MRMSrcMem, (ops FR64:$dst, f64mem:$src),
+ "movsd {$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (loadf64 addr:$src))]>;
+def MOVSDmr : SDI<0x11, MRMDestMem, (ops f64mem:$dst, FR64:$src),
+ "movsd {$src, $dst|$dst, $src}",
+ [(store FR64:$src, addr:$dst)]>;
+
+// Conversion instructions
+def CVTTSD2SIrr : SDI<0x2C, MRMSrcReg, (ops GR32:$dst, FR64:$src),
+ "cvttsd2si {$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (fp_to_sint FR64:$src))]>;
+def CVTTSD2SIrm : SDI<0x2C, MRMSrcMem, (ops GR32:$dst, f64mem:$src),
+ "cvttsd2si {$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (fp_to_sint (loadf64 addr:$src)))]>;
+def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (ops FR32:$dst, FR64:$src),
+ "cvtsd2ss {$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (fround FR64:$src))]>;
+def CVTSD2SSrm : SDI<0x5A, MRMSrcMem, (ops FR32:$dst, f64mem:$src),
+ "cvtsd2ss {$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (fround (loadf64 addr:$src)))]>;
+def CVTSI2SDrr : SDI<0x2A, MRMSrcReg, (ops FR64:$dst, GR32:$src),
+ "cvtsi2sd {$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (sint_to_fp GR32:$src))]>;
+def CVTSI2SDrm : SDI<0x2A, MRMSrcMem, (ops FR64:$dst, i32mem:$src),
+ "cvtsi2sd {$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (sint_to_fp (loadi32 addr:$src)))]>;
+
+// SSE2 instructions with XS prefix
+def CVTSS2SDrr : I<0x5A, MRMSrcReg, (ops FR64:$dst, FR32:$src),
+ "cvtss2sd {$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (fextend FR32:$src))]>, XS,
+ Requires<[HasSSE2]>;
+def CVTSS2SDrm : I<0x5A, MRMSrcMem, (ops FR64:$dst, f32mem:$src),
+ "cvtss2sd {$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (extloadf32 addr:$src))]>, XS,
+ Requires<[HasSSE2]>;
+
+// Match intrinsics which expect XMM operand(s).
+def Int_CVTSD2SIrr : SDI<0x2D, MRMSrcReg, (ops GR32:$dst, VR128:$src),
+ "cvtsd2si {$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse2_cvtsd2si VR128:$src))]>;
+def Int_CVTSD2SIrm : SDI<0x2D, MRMSrcMem, (ops GR32:$dst, f128mem:$src),
+ "cvtsd2si {$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse2_cvtsd2si
+ (load addr:$src)))]>;
+
+// Aliases for intrinsics
+def Int_CVTTSD2SIrr : SDI<0x2C, MRMSrcReg, (ops GR32:$dst, VR128:$src),
+ "cvttsd2si {$src, $dst|$dst, $src}",
+ [(set GR32:$dst,
+ (int_x86_sse2_cvttsd2si VR128:$src))]>;
+def Int_CVTTSD2SIrm : SDI<0x2C, MRMSrcMem, (ops GR32:$dst, f128mem:$src),
+ "cvttsd2si {$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse2_cvttsd2si
+ (load addr:$src)))]>;
+
+// Comparison instructions
+let isTwoAddress = 1 in {
+ def CMPSDrr : SDI<0xC2, MRMSrcReg,
+ (ops FR64:$dst, FR64:$src1, FR64:$src, SSECC:$cc),
+ "cmp${cc}sd {$src, $dst|$dst, $src}", []>;
+ def CMPSDrm : SDI<0xC2, MRMSrcMem,
+ (ops FR64:$dst, FR64:$src1, f64mem:$src, SSECC:$cc),
+ "cmp${cc}sd {$src, $dst|$dst, $src}", []>;
+}
+
+def UCOMISDrr: PDI<0x2E, MRMSrcReg, (ops FR64:$src1, FR64:$src2),
+ "ucomisd {$src2, $src1|$src1, $src2}",
+ [(X86cmp FR64:$src1, FR64:$src2)]>;
+def UCOMISDrm: PDI<0x2E, MRMSrcMem, (ops FR64:$src1, f64mem:$src2),
+ "ucomisd {$src2, $src1|$src1, $src2}",
+ [(X86cmp FR64:$src1, (loadf64 addr:$src2))]>;
+
+// Aliases to match intrinsics which expect XMM operand(s).
+let isTwoAddress = 1 in {
+ def Int_CMPSDrr : SDI<0xC2, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src, SSECC:$cc),
+ "cmp${cc}sd {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cmp_sd VR128:$src1,
+ VR128:$src, imm:$cc))]>;
+ def Int_CMPSDrm : SDI<0xC2, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, f64mem:$src, SSECC:$cc),
+ "cmp${cc}sd {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cmp_sd VR128:$src1,
+ (load addr:$src), imm:$cc))]>;
+}
+
+def Int_UCOMISDrr: PDI<0x2E, MRMSrcReg, (ops VR128:$src1, VR128:$src2),
+ "ucomisd {$src2, $src1|$src1, $src2}",
+ [(X86ucomi (v2f64 VR128:$src1), (v2f64 VR128:$src2))]>;
+def Int_UCOMISDrm: PDI<0x2E, MRMSrcMem, (ops VR128:$src1, f128mem:$src2),
+ "ucomisd {$src2, $src1|$src1, $src2}",
+ [(X86ucomi (v2f64 VR128:$src1), (load addr:$src2))]>;
+
+def Int_COMISDrr: PDI<0x2F, MRMSrcReg, (ops VR128:$src1, VR128:$src2),
+ "comisd {$src2, $src1|$src1, $src2}",
+ [(X86comi (v2f64 VR128:$src1), (v2f64 VR128:$src2))]>;
+def Int_COMISDrm: PDI<0x2F, MRMSrcMem, (ops VR128:$src1, f128mem:$src2),
+ "comisd {$src2, $src1|$src1, $src2}",
+ [(X86comi (v2f64 VR128:$src1), (load addr:$src2))]>;
+
+// Aliases of packed SSE2 instructions for scalar use. These all have names that
+// start with 'Fs'.
+
+// Alias instructions that map fld0 to pxor for sse.
+def FsFLD0SD : I<0xEF, MRMInitReg, (ops FR64:$dst),
+ "pxor $dst, $dst", [(set FR64:$dst, fpimm0)]>,
+ Requires<[HasSSE2]>, TB, OpSize;
+
+// Alias instruction to do FR64 reg-to-reg copy using movapd. Upper bits are
+// disregarded.
+def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (ops FR64:$dst, FR64:$src),
+ "movapd {$src, $dst|$dst, $src}", []>;
+
+// Alias instruction to load FR64 from f128mem using movapd. Upper bits are
+// disregarded.
+def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (ops FR64:$dst, f128mem:$src),
+ "movapd {$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (X86loadpf64 addr:$src))]>;
+
+// Alias bitwise logical operations using SSE logical ops on packed FP values.
+let isTwoAddress = 1 in {
+let isCommutable = 1 in {
+ def FsANDPDrr : PDI<0x54, MRMSrcReg, (ops FR64:$dst, FR64:$src1, FR64:$src2),
+ "andpd {$src2, $dst|$dst, $src2}",
+ [(set FR64:$dst, (X86fand FR64:$src1, FR64:$src2))]>;
+ def FsORPDrr : PDI<0x56, MRMSrcReg, (ops FR64:$dst, FR64:$src1, FR64:$src2),
+ "orpd {$src2, $dst|$dst, $src2}",
+ [(set FR64:$dst, (X86for FR64:$src1, FR64:$src2))]>;
+ def FsXORPDrr : PDI<0x57, MRMSrcReg, (ops FR64:$dst, FR64:$src1, FR64:$src2),
+ "xorpd {$src2, $dst|$dst, $src2}",
+ [(set FR64:$dst, (X86fxor FR64:$src1, FR64:$src2))]>;
+}
+
+def FsANDPDrm : PDI<0x54, MRMSrcMem, (ops FR64:$dst, FR64:$src1, f128mem:$src2),
+ "andpd {$src2, $dst|$dst, $src2}",
+ [(set FR64:$dst, (X86fand FR64:$src1,
+ (X86loadpf64 addr:$src2)))]>;
+def FsORPDrm : PDI<0x56, MRMSrcMem, (ops FR64:$dst, FR64:$src1, f128mem:$src2),
+ "orpd {$src2, $dst|$dst, $src2}",
+ [(set FR64:$dst, (X86for FR64:$src1,
+ (X86loadpf64 addr:$src2)))]>;
+def FsXORPDrm : PDI<0x57, MRMSrcMem, (ops FR64:$dst, FR64:$src1, f128mem:$src2),
+ "xorpd {$src2, $dst|$dst, $src2}",
+ [(set FR64:$dst, (X86fxor FR64:$src1,
+ (X86loadpf64 addr:$src2)))]>;
+
+def FsANDNPDrr : PDI<0x55, MRMSrcReg,
+ (ops FR64:$dst, FR64:$src1, FR64:$src2),
+ "andnpd {$src2, $dst|$dst, $src2}", []>;
+def FsANDNPDrm : PDI<0x55, MRMSrcMem,
+ (ops FR64:$dst, FR64:$src1, f128mem:$src2),
+ "andnpd {$src2, $dst|$dst, $src2}", []>;
+}
+
+/// basic_sse2_fp_binop_rm - SSE2 binops come in both scalar and vector forms.
+///
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation. This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a scalar)
+/// and leaves the top elements undefined.
+///
+/// These three forms can each be reg+reg or reg+mem, so there are a total of
+/// six "instructions".
+///
+let isTwoAddress = 1 in {
+multiclass basic_sse2_fp_binop_rm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, Intrinsic F64Int,
+ bit Commutable = 0> {
+ // Scalar operation, reg+reg.
+ def SDrr : SDI<opc, MRMSrcReg, (ops FR64:$dst, FR64:$src1, FR64:$src2),
+ !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
+ [(set FR64:$dst, (OpNode FR64:$src1, FR64:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Scalar operation, reg+mem.
+ def SDrm : SDI<opc, MRMSrcMem, (ops FR64:$dst, FR64:$src1, f64mem:$src2),
+ !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
+ [(set FR64:$dst, (OpNode FR64:$src1, (load addr:$src2)))]>;
+
+ // Vector operation, reg+reg.
+ def PDrr : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "pd {$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v2f64 (OpNode VR128:$src1, VR128:$src2)))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector operation, reg+mem.
+ def PDrm : PDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+ !strconcat(OpcodeStr, "pd {$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (OpNode VR128:$src1, (loadv2f64 addr:$src2)))]>;
+
+ // Intrinsic operation, reg+reg.
+ def SDrr_Int : SDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Intrinsic operation, reg+mem.
+ def SDrm_Int : SDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, sdmem:$src2),
+ !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (F64Int VR128:$src1,
+ sse_load_f64:$src2))]>;
+}
+}
+
+// Arithmetic instructions
+defm ADD : basic_sse2_fp_binop_rm<0x58, "add", fadd, int_x86_sse2_add_sd, 1>;
+defm MUL : basic_sse2_fp_binop_rm<0x59, "mul", fmul, int_x86_sse2_mul_sd, 1>;
+defm SUB : basic_sse2_fp_binop_rm<0x5C, "sub", fsub, int_x86_sse2_sub_sd>;
+defm DIV : basic_sse2_fp_binop_rm<0x5E, "div", fdiv, int_x86_sse2_div_sd>;
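+
+// These expand in the same way as the SSE1 defms, with SD/PD suffixes; e.g.
+// 'defm ADD' here should yield ADDSDrr, ADDSDrm, ADDPDrr, ADDPDrm,
+// ADDSDrr_Int, and ADDSDrm_Int for scalar and packed double precision.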
+
+/// sse2_fp_binop_rm - Other SSE2 binops
+///
+/// This multiclass is like basic_sse2_fp_binop_rm, with the addition of
+/// instructions for a full-vector intrinsic form. Operations that map
+/// onto C operators don't use this form since they just use the plain
+/// vector form instead of having a separate vector intrinsic form.
+///
+/// This provides a total of eight "instructions".
+///
+let isTwoAddress = 1 in {
+multiclass sse2_fp_binop_rm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode,
+ Intrinsic F64Int,
+ Intrinsic V2F64Int,
+ bit Commutable = 0> {
+
+ // Scalar operation, reg+reg.
+ def SDrr : SDI<opc, MRMSrcReg, (ops FR64:$dst, FR64:$src1, FR64:$src2),
+ !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
+ [(set FR64:$dst, (OpNode FR64:$src1, FR64:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Scalar operation, reg+mem.
+ def SDrm : SDI<opc, MRMSrcMem, (ops FR64:$dst, FR64:$src1, f64mem:$src2),
+ !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
+ [(set FR64:$dst, (OpNode FR64:$src1, (load addr:$src2)))]>;
+
+ // Vector operation, reg+reg.
+ def PDrr : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "pd {$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v2f64 (OpNode VR128:$src1, VR128:$src2)))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector operation, reg+mem.
+ def PDrm : PDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+ !strconcat(OpcodeStr, "pd {$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (OpNode VR128:$src1, (loadv2f64 addr:$src2)))]>;
+
+ // Intrinsic operation, reg+reg.
+ def SDrr_Int : SDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Intrinsic operation, reg+mem.
+ def SDrm_Int : SDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, sdmem:$src2),
+ !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (F64Int VR128:$src1,
+ sse_load_f64:$src2))]>;
+
+ // Vector intrinsic operation, reg+reg.
+ def PDrr_Int : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "pd {$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (V2F64Int VR128:$src1, VR128:$src2))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector intrinsic operation, reg+mem.
+ def PDrm_Int : PDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f64mem:$src2),
+ !strconcat(OpcodeStr, "pd {$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (V2F64Int VR128:$src1, (load addr:$src2)))]>;
+}
+}
+
+defm MAX : sse2_fp_binop_rm<0x5F, "max", X86fmax,
+ int_x86_sse2_max_sd, int_x86_sse2_max_pd>;
+defm MIN : sse2_fp_binop_rm<0x5D, "min", X86fmin,
+ int_x86_sse2_min_sd, int_x86_sse2_min_pd>;
+
+//===----------------------------------------------------------------------===//
+// SSE packed FP Instructions
+
+// Move Instructions
+def MOVAPDrr : PDI<0x28, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+ "movapd {$src, $dst|$dst, $src}", []>;
+def MOVAPDrm : PDI<0x28, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+ "movapd {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (loadv2f64 addr:$src))]>;
+
+def MOVAPDmr : PDI<0x29, MRMDestMem, (ops f128mem:$dst, VR128:$src),
+ "movapd {$src, $dst|$dst, $src}",
+ [(store (v2f64 VR128:$src), addr:$dst)]>;
+
+def MOVUPDrr : PDI<0x10, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+ "movupd {$src, $dst|$dst, $src}", []>;
+def MOVUPDrm : PDI<0x10, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+ "movupd {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_loadu_pd addr:$src))]>;
+def MOVUPDmr : PDI<0x11, MRMDestMem, (ops f128mem:$dst, VR128:$src),
+ "movupd {$src, $dst|$dst, $src}",
+ [(int_x86_sse2_storeu_pd addr:$dst, VR128:$src)]>;
+
+let isTwoAddress = 1 in {
+ let AddedComplexity = 20 in {
+ def MOVLPDrm : PDI<0x12, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, f64mem:$src2),
+ "movlpd {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2f64 (vector_shuffle VR128:$src1,
+ (scalar_to_vector (loadf64 addr:$src2)),
+ MOVLP_shuffle_mask)))]>;
+ def MOVHPDrm : PDI<0x16, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, f64mem:$src2),
+ "movhpd {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2f64 (vector_shuffle VR128:$src1,
+ (scalar_to_vector (loadf64 addr:$src2)),
+ MOVHP_shuffle_mask)))]>;
+ } // AddedComplexity
+} // isTwoAddress
+
+def MOVLPDmr : PDI<0x13, MRMDestMem, (ops f64mem:$dst, VR128:$src),
+ "movlpd {$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract (v2f64 VR128:$src),
+ (iPTR 0))), addr:$dst)]>;
+
+// v2f64 extract element 1 is always custom lowered to unpack high to low
+// and extract element 0 so the non-store version isn't too horrible.
+def MOVHPDmr : PDI<0x17, MRMDestMem, (ops f64mem:$dst, VR128:$src),
+ "movhpd {$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract
+ (v2f64 (vector_shuffle VR128:$src, (undef),
+ UNPCKH_shuffle_mask)), (iPTR 0))),
+ addr:$dst)]>;
+
+// SSE2 instructions without OpSize prefix
+def Int_CVTDQ2PSrr : I<0x5B, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+ "cvtdq2ps {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))]>,
+ TB, Requires<[HasSSE2]>;
+def Int_CVTDQ2PSrm : I<0x5B, MRMSrcMem, (ops VR128:$dst, i128mem:$src),
+ "cvtdq2ps {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtdq2ps
+ (bitconvert (loadv2i64 addr:$src))))]>,
+ TB, Requires<[HasSSE2]>;
+
+// SSE2 instructions with XS prefix
+def Int_CVTDQ2PDrr : I<0xE6, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+ "cvtdq2pd {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))]>,
+ XS, Requires<[HasSSE2]>;
+def Int_CVTDQ2PDrm : I<0xE6, MRMSrcMem, (ops VR128:$dst, i64mem:$src),
+ "cvtdq2pd {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtdq2pd
+ (bitconvert (loadv2i64 addr:$src))))]>,
+ XS, Requires<[HasSSE2]>;
+
+def Int_CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+ "cvtps2dq {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))]>;
+def Int_CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+ "cvtps2dq {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtps2dq
+ (load addr:$src)))]>;
+// SSE2 packed instructions with XS prefix
+def Int_CVTTPS2DQrr : I<0x5B, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+ "cvttps2dq {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))]>,
+ XS, Requires<[HasSSE2]>;
+def Int_CVTTPS2DQrm : I<0x5B, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+ "cvttps2dq {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvttps2dq
+ (load addr:$src)))]>,
+ XS, Requires<[HasSSE2]>;
+
+// SSE2 packed instructions with XD prefix
+def Int_CVTPD2DQrr : I<0xE6, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+ "cvtpd2dq {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>,
+ XD, Requires<[HasSSE2]>;
+def Int_CVTPD2DQrm : I<0xE6, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+ "cvtpd2dq {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtpd2dq
+ (load addr:$src)))]>,
+ XD, Requires<[HasSSE2]>;
+
+def Int_CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+ "cvttpd2dq {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))]>;
+def Int_CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+ "cvttpd2dq {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
+ (load addr:$src)))]>;
+
+// SSE2 instructions without OpSize prefix
+def Int_CVTPS2PDrr : I<0x5A, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+ "cvtps2pd {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>,
+ TB, Requires<[HasSSE2]>;
+def Int_CVTPS2PDrm : I<0x5A, MRMSrcMem, (ops VR128:$dst, f64mem:$src),
+ "cvtps2pd {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtps2pd
+ (load addr:$src)))]>,
+ TB, Requires<[HasSSE2]>;
+
+def Int_CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+ "cvtpd2ps {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))]>;
+def Int_CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+ "cvtpd2ps {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cvtpd2ps
+ (load addr:$src)))]>;
+
+// Aliases to match intrinsics which expect XMM operand(s).
+let isTwoAddress = 1 in {
+def Int_CVTSI2SDrr: SDI<0x2A, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, GR32:$src2),
+ "cvtsi2sd {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse2_cvtsi2sd VR128:$src1,
+ GR32:$src2))]>;
+def Int_CVTSI2SDrm: SDI<0x2A, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, i32mem:$src2),
+ "cvtsi2sd {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse2_cvtsi2sd VR128:$src1,
+ (loadi32 addr:$src2)))]>;
+def Int_CVTSD2SSrr: SDI<0x5A, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "cvtsd2ss {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1,
+ VR128:$src2))]>;
+def Int_CVTSD2SSrm: SDI<0x5A, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, f64mem:$src2),
+ "cvtsd2ss {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1,
+ (load addr:$src2)))]>;
+def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "cvtss2sd {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1,
+ VR128:$src2))]>, XS,
+ Requires<[HasSSE2]>;
+def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, f32mem:$src2),
+ "cvtss2sd {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1,
+ (load addr:$src2)))]>, XS,
+ Requires<[HasSSE2]>;
+}
+
+// Arithmetic
+
+/// sse2_fp_unop_rm - SSE2 unops come in both scalar and vector forms.
+///
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation. This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a
+/// scalar) and leaves the top elements undefined.
+///
+/// There is also a separate variant for the full-vector intrinsic form.
+///
+/// These four forms can each have a reg or a mem operand, so there are a
+/// total of eight "instructions".
+///
+multiclass sse2_fp_unop_rm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode,
+ Intrinsic F64Int,
+ Intrinsic V2F64Int,
+ bit Commutable = 0> {
+ // Scalar operation, reg.
+ def SDr : SDI<opc, MRMSrcReg, (ops FR64:$dst, FR64:$src),
+ !strconcat(OpcodeStr, "sd {$src, $dst|$dst, $src}"),
+ [(set FR64:$dst, (OpNode FR64:$src))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Scalar operation, mem.
+ def SDm : SDI<opc, MRMSrcMem, (ops FR64:$dst, f64mem:$src),
+ !strconcat(OpcodeStr, "sd {$src, $dst|$dst, $src}"),
+ [(set FR64:$dst, (OpNode (load addr:$src)))]>;
+
+ // Vector operation, reg.
+ def PDr : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+ !strconcat(OpcodeStr, "pd {$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector operation, mem.
+ def PDm : PDI<opc, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+ !strconcat(OpcodeStr, "pd {$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>;
+
+ // Intrinsic operation, reg.
+ def SDr_Int : SDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+ !strconcat(OpcodeStr, "sd {$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (F64Int VR128:$src))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Intrinsic operation, mem.
+ def SDm_Int : SDI<opc, MRMSrcMem, (ops VR128:$dst, sdmem:$src),
+ !strconcat(OpcodeStr, "sd {$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (F64Int sse_load_f64:$src))]>;
+
+ // Vector intrinsic operation, reg
+ def PDr_Int : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+ !strconcat(OpcodeStr, "pd {$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (V2F64Int VR128:$src))]> {
+ let isCommutable = Commutable;
+ }
+
+ // Vector intrinsic operation, mem
+ def PDm_Int : PDI<opc, MRMSrcMem, (ops VR128:$dst, f64mem:$src),
+ !strconcat(OpcodeStr, "pd {$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (V2F64Int (load addr:$src)))]>;
+}
+
+// Square root.
+defm SQRT : sse2_fp_unop_rm<0x51, "sqrt", fsqrt,
+ int_x86_sse2_sqrt_sd, int_x86_sse2_sqrt_pd>;
+
+// There is no f64 version of the reciprocal approximation instructions.
+
+// Logical
+let isTwoAddress = 1 in {
+ let isCommutable = 1 in {
+ def ANDPDrr : PDI<0x54, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "andpd {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (and (bc_v2i64 (v2f64 VR128:$src1)),
+ (bc_v2i64 (v2f64 VR128:$src2))))]>;
+ def ORPDrr : PDI<0x56, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "orpd {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (or (bc_v2i64 (v2f64 VR128:$src1)),
+ (bc_v2i64 (v2f64 VR128:$src2))))]>;
+ def XORPDrr : PDI<0x57, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "xorpd {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (xor (bc_v2i64 (v2f64 VR128:$src1)),
+ (bc_v2i64 (v2f64 VR128:$src2))))]>;
+ }
+
+ def ANDPDrm : PDI<0x54, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+ "andpd {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (and (bc_v2i64 (v2f64 VR128:$src1)),
+ (bc_v2i64 (loadv2f64 addr:$src2))))]>;
+ def ORPDrm : PDI<0x56, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+ "orpd {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (or (bc_v2i64 (v2f64 VR128:$src1)),
+ (bc_v2i64 (loadv2f64 addr:$src2))))]>;
+ def XORPDrm : PDI<0x57, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+ "xorpd {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (xor (bc_v2i64 (v2f64 VR128:$src1)),
+ (bc_v2i64 (loadv2f64 addr:$src2))))]>;
+ def ANDNPDrr : PDI<0x55, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "andnpd {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (and (vnot (bc_v2i64 (v2f64 VR128:$src1))),
+ (bc_v2i64 (v2f64 VR128:$src2))))]>;
+ def ANDNPDrm : PDI<0x55, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1,f128mem:$src2),
+ "andnpd {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (and (vnot (bc_v2i64 (v2f64 VR128:$src1))),
+ (bc_v2i64 (loadv2f64 addr:$src2))))]>;
+}
+
+let isTwoAddress = 1 in {
+ def CMPPDrri : PDIi8<0xC2, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src, SSECC:$cc),
+ "cmp${cc}pd {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cmp_pd VR128:$src1,
+ VR128:$src, imm:$cc))]>;
+ def CMPPDrmi : PDIi8<0xC2, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, f128mem:$src, SSECC:$cc),
+ "cmp${cc}pd {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_cmp_pd VR128:$src1,
+ (load addr:$src), imm:$cc))]>;
+}
+
+// Shuffle and unpack instructions
+let isTwoAddress = 1 in {
+ def SHUFPDrri : PDIi8<0xC6, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src2, i8imm:$src3),
+ "shufpd {$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst, (v2f64 (vector_shuffle
+ VR128:$src1, VR128:$src2,
+ SHUFP_shuffle_mask:$src3)))]>;
+ def SHUFPDrmi : PDIi8<0xC6, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1,
+ f128mem:$src2, i8imm:$src3),
+ "shufpd {$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (v2f64 (vector_shuffle
+ VR128:$src1, (load addr:$src2),
+ SHUFP_shuffle_mask:$src3)))]>;
+
+ let AddedComplexity = 10 in {
+ def UNPCKHPDrr : PDI<0x15, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "unpckhpd {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2f64 (vector_shuffle
+ VR128:$src1, VR128:$src2,
+ UNPCKH_shuffle_mask)))]>;
+ def UNPCKHPDrm : PDI<0x15, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+ "unpckhpd {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2f64 (vector_shuffle
+ VR128:$src1, (load addr:$src2),
+ UNPCKH_shuffle_mask)))]>;
+
+ def UNPCKLPDrr : PDI<0x14, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "unpcklpd {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2f64 (vector_shuffle
+ VR128:$src1, VR128:$src2,
+ UNPCKL_shuffle_mask)))]>;
+ def UNPCKLPDrm : PDI<0x14, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+ "unpcklpd {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2f64 (vector_shuffle
+ VR128:$src1, (load addr:$src2),
+ UNPCKL_shuffle_mask)))]>;
+ } // AddedComplexity
+} // isTwoAddress
+
+
+//===----------------------------------------------------------------------===//
+// SSE integer instructions
+
+// Move Instructions
+def MOVDQArr : PDI<0x6F, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+ "movdqa {$src, $dst|$dst, $src}", []>;
+def MOVDQArm : PDI<0x6F, MRMSrcMem, (ops VR128:$dst, i128mem:$src),
+ "movdqa {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (loadv2i64 addr:$src))]>;
+def MOVDQAmr : PDI<0x7F, MRMDestMem, (ops i128mem:$dst, VR128:$src),
+ "movdqa {$src, $dst|$dst, $src}",
+ [(store (v2i64 VR128:$src), addr:$dst)]>;
+def MOVDQUrm : I<0x6F, MRMSrcMem, (ops VR128:$dst, i128mem:$src),
+ "movdqu {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_loadu_dq addr:$src))]>,
+ XS, Requires<[HasSSE2]>;
+def MOVDQUmr : I<0x7F, MRMDestMem, (ops i128mem:$dst, VR128:$src),
+ "movdqu {$src, $dst|$dst, $src}",
+ [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)]>,
+ XS, Requires<[HasSSE2]>;
+
+
+let isTwoAddress = 1 in {
+
+multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+ bit Commutable = 0> {
+ def rr : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]> {
+ let isCommutable = Commutable;
+ }
+ def rm : PDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId VR128:$src1,
+ (bitconvert (loadv2i64 addr:$src2))))]>;
+}
+
+multiclass PDI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
+ string OpcodeStr, Intrinsic IntId> {
+ def rr : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]>;
+ def rm : PDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId VR128:$src1,
+ (bitconvert (loadv2i64 addr:$src2))))]>;
+ def ri : PDIi8<opc2, ImmForm, (ops VR128:$dst, VR128:$src1, i32i8imm:$src2),
+ !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId VR128:$src1,
+ (scalar_to_vector (i32 imm:$src2))))]>;
+}
+
+
+/// PDI_binop_rm - Simple SSE2 binary operator.
+multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, bit Commutable = 0> {
+ def rr : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (OpVT (OpNode VR128:$src1, VR128:$src2)))]> {
+ let isCommutable = Commutable;
+ }
+ def rm : PDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (OpVT (OpNode VR128:$src1,
+ (bitconvert (loadv2i64 addr:$src2)))))]>;
+}
+
+/// PDI_binop_rm_v2i64 - Simple SSE2 binary operator whose type is v2i64.
+///
+/// FIXME: we could eliminate this and use PDI_binop_rm instead if tblgen knew
+/// to collapse (bitconvert VT to VT) into its operand.
+///
+multiclass PDI_binop_rm_v2i64<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ bit Commutable = 0> {
+ def rr : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))]> {
+ let isCommutable = Commutable;
+ }
+ def rm : PDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (OpNode VR128:$src1,(loadv2i64 addr:$src2)))]>;
+}
+
+} // isTwoAddress
+
+// 128-bit Integer Arithmetic
+
+defm PADDB : PDI_binop_rm<0xFC, "paddb", add, v16i8, 1>;
+defm PADDW : PDI_binop_rm<0xFD, "paddw", add, v8i16, 1>;
+defm PADDD : PDI_binop_rm<0xFE, "paddd", add, v4i32, 1>;
+defm PADDQ : PDI_binop_rm_v2i64<0xD4, "paddq", add, 1>;
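+
+// Expansion sketch: 'defm PADDB' should produce PADDBrr and PADDBrm, matching
+// (v16i8 (add ...)) with the memory form bitconverting a v2i64 load, and
+// likewise for PADDW/PADDD at their element types. PADDQ (and PSUBQ below)
+// use the _v2i64 variant, which skips the bitconvert since the load is
+// already v2i64. The *_int defms that follow expand to rr/rm pairs the same
+// way, but feed the named intrinsic instead of a plain SDNode.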
+
+defm PADDSB : PDI_binop_rm_int<0xEC, "paddsb" , int_x86_sse2_padds_b, 1>;
+defm PADDSW : PDI_binop_rm_int<0xED, "paddsw" , int_x86_sse2_padds_w, 1>;
+defm PADDUSB : PDI_binop_rm_int<0xDC, "paddusb", int_x86_sse2_paddus_b, 1>;
+defm PADDUSW : PDI_binop_rm_int<0xDD, "paddusw", int_x86_sse2_paddus_w, 1>;
+
+defm PSUBB : PDI_binop_rm<0xF8, "psubb", sub, v16i8>;
+defm PSUBW : PDI_binop_rm<0xF9, "psubw", sub, v8i16>;
+defm PSUBD : PDI_binop_rm<0xFA, "psubd", sub, v4i32>;
+defm PSUBQ : PDI_binop_rm_v2i64<0xFB, "psubq", sub>;
+
+defm PSUBSB : PDI_binop_rm_int<0xE8, "psubsb" , int_x86_sse2_psubs_b>;
+defm PSUBSW : PDI_binop_rm_int<0xE9, "psubsw" , int_x86_sse2_psubs_w>;
+defm PSUBUSB : PDI_binop_rm_int<0xD8, "psubusb", int_x86_sse2_psubus_b>;
+defm PSUBUSW : PDI_binop_rm_int<0xD9, "psubusw", int_x86_sse2_psubus_w>;
+
+defm PMULLW : PDI_binop_rm<0xD5, "pmullw", mul, v8i16, 1>;
+
+defm PMULHUW : PDI_binop_rm_int<0xE4, "pmulhuw", int_x86_sse2_pmulhu_w, 1>;
+defm PMULHW : PDI_binop_rm_int<0xE5, "pmulhw" , int_x86_sse2_pmulh_w , 1>;
+defm PMULUDQ : PDI_binop_rm_int<0xF4, "pmuludq", int_x86_sse2_pmulu_dq, 1>;
+
+defm PMADDWD : PDI_binop_rm_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd, 1>;
+
+defm PAVGB : PDI_binop_rm_int<0xE0, "pavgb", int_x86_sse2_pavg_b, 1>;
+defm PAVGW : PDI_binop_rm_int<0xE3, "pavgw", int_x86_sse2_pavg_w, 1>;
+
+
+defm PMINUB : PDI_binop_rm_int<0xDA, "pminub", int_x86_sse2_pminu_b, 1>;
+defm PMINSW : PDI_binop_rm_int<0xEA, "pminsw", int_x86_sse2_pmins_w, 1>;
+defm PMAXUB : PDI_binop_rm_int<0xDE, "pmaxub", int_x86_sse2_pmaxu_b, 1>;
+defm PMAXSW : PDI_binop_rm_int<0xEE, "pmaxsw", int_x86_sse2_pmaxs_w, 1>;
+defm PSADBW : PDI_binop_rm_int<0xF6, "psadbw", int_x86_sse2_psad_bw, 1>;
+
+
+defm PSLLW : PDI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw", int_x86_sse2_psll_w>;
+defm PSLLD : PDI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld", int_x86_sse2_psll_d>;
+defm PSLLQ : PDI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq", int_x86_sse2_psll_q>;
+
+defm PSRLW : PDI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw", int_x86_sse2_psrl_w>;
+defm PSRLD : PDI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld", int_x86_sse2_psrl_d>;
+defm PSRLQ : PDI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq", int_x86_sse2_psrl_q>;
+
+defm PSRAW : PDI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw", int_x86_sse2_psra_w>;
+defm PSRAD : PDI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad", int_x86_sse2_psra_d>;
+// PSRAQ doesn't exist in SSE[1-3].
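+
+// Expansion sketch for the shift defms: e.g. 'defm PSLLW' should produce
+// PSLLWrr (count in an XMM register), PSLLWrm (count loaded from memory),
+// and PSLLWri (immediate count, using the second opcode 0x71 with the MRM6r
+// form), all feeding int_x86_sse2_psll_w.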
+
+// 128-bit logical shifts.
+let isTwoAddress = 1 in {
+ def PSLLDQri : PDIi8<0x73, MRM7r,
+ (ops VR128:$dst, VR128:$src1, i32i8imm:$src2),
+ "pslldq {$src2, $dst|$dst, $src2}", []>;
+ def PSRLDQri : PDIi8<0x73, MRM3r,
+ (ops VR128:$dst, VR128:$src1, i32i8imm:$src2),
+ "psrldq {$src2, $dst|$dst, $src2}", []>;
+ // PSRADQri doesn't exist in SSE[1-3].
+}
+
+let Predicates = [HasSSE2] in {
+ def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
+ (v2i64 (PSLLDQri VR128:$src1, (PSxLDQ_imm imm:$src2)))>;
+ def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
+ (v2i64 (PSRLDQri VR128:$src1, (PSxLDQ_imm imm:$src2)))>;
+ def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
+ (v2f64 (PSRLDQri VR128:$src1, (PSxLDQ_imm imm:$src2)))>;
+}
+
+// Logical
+defm PAND : PDI_binop_rm_v2i64<0xDB, "pand", and, 1>;
+defm POR : PDI_binop_rm_v2i64<0xEB, "por" , or , 1>;
+defm PXOR : PDI_binop_rm_v2i64<0xEF, "pxor", xor, 1>;
+
+let isTwoAddress = 1 in {
+ def PANDNrr : PDI<0xDF, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "pandn {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1),
+ VR128:$src2)))]>;
+
+ def PANDNrm : PDI<0xDF, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+ "pandn {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1),
+ (load addr:$src2))))]>;
+}
+
+// SSE2 Integer comparison
+defm PCMPEQB : PDI_binop_rm_int<0x74, "pcmpeqb", int_x86_sse2_pcmpeq_b>;
+defm PCMPEQW : PDI_binop_rm_int<0x75, "pcmpeqw", int_x86_sse2_pcmpeq_w>;
+defm PCMPEQD : PDI_binop_rm_int<0x76, "pcmpeqd", int_x86_sse2_pcmpeq_d>;
+defm PCMPGTB : PDI_binop_rm_int<0x64, "pcmpgtb", int_x86_sse2_pcmpgt_b>;
+defm PCMPGTW : PDI_binop_rm_int<0x65, "pcmpgtw", int_x86_sse2_pcmpgt_w>;
+defm PCMPGTD : PDI_binop_rm_int<0x66, "pcmpgtd", int_x86_sse2_pcmpgt_d>;
+
+// Pack instructions
+defm PACKSSWB : PDI_binop_rm_int<0x63, "packsswb", int_x86_sse2_packsswb_128>;
+defm PACKSSDW : PDI_binop_rm_int<0x6B, "packssdw", int_x86_sse2_packssdw_128>;
+defm PACKUSWB : PDI_binop_rm_int<0x67, "packuswb", int_x86_sse2_packuswb_128>;
+
+// Shuffle and unpack instructions
+def PSHUFDri : PDIi8<0x70, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, i8imm:$src2),
+ "pshufd {$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (v4i32 (vector_shuffle
+ VR128:$src1, (undef),
+ PSHUFD_shuffle_mask:$src2)))]>;
+def PSHUFDmi : PDIi8<0x70, MRMSrcMem,
+ (ops VR128:$dst, i128mem:$src1, i8imm:$src2),
+ "pshufd {$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (v4i32 (vector_shuffle
+ (bc_v4i32(loadv2i64 addr:$src1)),
+ (undef),
+ PSHUFD_shuffle_mask:$src2)))]>;
+
+// SSE2 with ImmT == Imm8 and XS prefix.
+def PSHUFHWri : Ii8<0x70, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, i8imm:$src2),
+ "pshufhw {$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (v8i16 (vector_shuffle
+ VR128:$src1, (undef),
+ PSHUFHW_shuffle_mask:$src2)))]>,
+ XS, Requires<[HasSSE2]>;
+def PSHUFHWmi : Ii8<0x70, MRMSrcMem,
+ (ops VR128:$dst, i128mem:$src1, i8imm:$src2),
+ "pshufhw {$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (v8i16 (vector_shuffle
+ (bc_v8i16 (loadv2i64 addr:$src1)),
+ (undef),
+ PSHUFHW_shuffle_mask:$src2)))]>,
+ XS, Requires<[HasSSE2]>;
+
+// SSE2 with ImmT == Imm8 and XD prefix.
+def PSHUFLWri : Ii8<0x70, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, i32i8imm:$src2),
+ "pshuflw {$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (v8i16 (vector_shuffle
+ VR128:$src1, (undef),
+ PSHUFLW_shuffle_mask:$src2)))]>,
+ XD, Requires<[HasSSE2]>;
+def PSHUFLWmi : Ii8<0x70, MRMSrcMem,
+ (ops VR128:$dst, i128mem:$src1, i32i8imm:$src2),
+ "pshuflw {$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128:$dst, (v8i16 (vector_shuffle
+ (bc_v8i16 (loadv2i64 addr:$src1)),
+ (undef),
+ PSHUFLW_shuffle_mask:$src2)))]>,
+ XD, Requires<[HasSSE2]>;
+
+
+let isTwoAddress = 1 in {
+ def PUNPCKLBWrr : PDI<0x60, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "punpcklbw {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v16i8 (vector_shuffle VR128:$src1, VR128:$src2,
+ UNPCKL_shuffle_mask)))]>;
+ def PUNPCKLBWrm : PDI<0x60, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+ "punpcklbw {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v16i8 (vector_shuffle VR128:$src1,
+ (bc_v16i8 (loadv2i64 addr:$src2)),
+ UNPCKL_shuffle_mask)))]>;
+ def PUNPCKLWDrr : PDI<0x61, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "punpcklwd {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v8i16 (vector_shuffle VR128:$src1, VR128:$src2,
+ UNPCKL_shuffle_mask)))]>;
+ def PUNPCKLWDrm : PDI<0x61, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+ "punpcklwd {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v8i16 (vector_shuffle VR128:$src1,
+ (bc_v8i16 (loadv2i64 addr:$src2)),
+ UNPCKL_shuffle_mask)))]>;
+ def PUNPCKLDQrr : PDI<0x62, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "punpckldq {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4i32 (vector_shuffle VR128:$src1, VR128:$src2,
+ UNPCKL_shuffle_mask)))]>;
+ def PUNPCKLDQrm : PDI<0x62, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+ "punpckldq {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4i32 (vector_shuffle VR128:$src1,
+ (bc_v4i32 (loadv2i64 addr:$src2)),
+ UNPCKL_shuffle_mask)))]>;
+ def PUNPCKLQDQrr : PDI<0x6C, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "punpcklqdq {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2i64 (vector_shuffle VR128:$src1, VR128:$src2,
+ UNPCKL_shuffle_mask)))]>;
+ def PUNPCKLQDQrm : PDI<0x6C, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+ "punpcklqdq {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2i64 (vector_shuffle VR128:$src1,
+ (loadv2i64 addr:$src2),
+ UNPCKL_shuffle_mask)))]>;
+
+ def PUNPCKHBWrr : PDI<0x68, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "punpckhbw {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v16i8 (vector_shuffle VR128:$src1, VR128:$src2,
+ UNPCKH_shuffle_mask)))]>;
+ def PUNPCKHBWrm : PDI<0x68, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+ "punpckhbw {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v16i8 (vector_shuffle VR128:$src1,
+ (bc_v16i8 (loadv2i64 addr:$src2)),
+ UNPCKH_shuffle_mask)))]>;
+ def PUNPCKHWDrr : PDI<0x69, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "punpckhwd {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v8i16 (vector_shuffle VR128:$src1, VR128:$src2,
+ UNPCKH_shuffle_mask)))]>;
+ def PUNPCKHWDrm : PDI<0x69, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+ "punpckhwd {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v8i16 (vector_shuffle VR128:$src1,
+ (bc_v8i16 (loadv2i64 addr:$src2)),
+ UNPCKH_shuffle_mask)))]>;
+ def PUNPCKHDQrr : PDI<0x6A, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "punpckhdq {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4i32 (vector_shuffle VR128:$src1, VR128:$src2,
+ UNPCKH_shuffle_mask)))]>;
+ def PUNPCKHDQrm : PDI<0x6A, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+ "punpckhdq {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v4i32 (vector_shuffle VR128:$src1,
+ (bc_v4i32 (loadv2i64 addr:$src2)),
+ UNPCKH_shuffle_mask)))]>;
+ def PUNPCKHQDQrr : PDI<0x6D, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "punpckhqdq {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2i64 (vector_shuffle VR128:$src1, VR128:$src2,
+ UNPCKH_shuffle_mask)))]>;
+ def PUNPCKHQDQrm : PDI<0x6D, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+ "punpckhqdq {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2i64 (vector_shuffle VR128:$src1,
+ (loadv2i64 addr:$src2),
+ UNPCKH_shuffle_mask)))]>;
+}
+
+// Extract / Insert
+def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
+ (ops GR32:$dst, VR128:$src1, i32i8imm:$src2),
+ "pextrw {$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1),
+ (iPTR imm:$src2)))]>;
+let isTwoAddress = 1 in {
+ def PINSRWrri : PDIi8<0xC4, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1,
+ GR32:$src2, i32i8imm:$src3),
+ "pinsrw {$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (v8i16 (X86pinsrw (v8i16 VR128:$src1),
+ GR32:$src2, (iPTR imm:$src3))))]>;
+ def PINSRWrmi : PDIi8<0xC4, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1,
+ i16mem:$src2, i32i8imm:$src3),
+ "pinsrw {$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (v8i16 (X86pinsrw (v8i16 VR128:$src1),
+ (i32 (anyext (loadi16 addr:$src2))),
+ (iPTR imm:$src3))))]>;
+}
+
+// Mask creation
+def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (ops GR32:$dst, VR128:$src),
+ "pmovmskb {$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>;
+
+// Conditional store
+def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (ops VR128:$src, VR128:$mask),
+ "maskmovdqu {$mask, $src|$src, $mask}",
+ [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
+ Imp<[EDI],[]>;
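+// Note that maskmovdqu takes its store address implicitly from EDI (hence the
+// Imp<[EDI],[]> above): the bytes of $src selected by the high bit of each
+// corresponding byte in $mask are written to the memory at [EDI].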
+
+// Non-temporal stores
+def MOVNTPDmr : PDI<0x2B, MRMDestMem, (ops i128mem:$dst, VR128:$src),
+ "movntpd {$src, $dst|$dst, $src}",
+ [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>;
+def MOVNTDQmr : PDI<0xE7, MRMDestMem, (ops f128mem:$dst, VR128:$src),
+ "movntdq {$src, $dst|$dst, $src}",
+ [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>;
+def MOVNTImr : I<0xC3, MRMDestMem, (ops i32mem:$dst, GR32:$src),
+ "movnti {$src, $dst|$dst, $src}",
+ [(int_x86_sse2_movnt_i addr:$dst, GR32:$src)]>,
+ TB, Requires<[HasSSE2]>;
+
+// Flush cache
+def CLFLUSH : I<0xAE, MRM7m, (ops i8mem:$src),
+ "clflush $src", [(int_x86_sse2_clflush addr:$src)]>,
+ TB, Requires<[HasSSE2]>;
+
+// Load, store, and memory fence
+def LFENCE : I<0xAE, MRM5m, (ops),
+ "lfence", [(int_x86_sse2_lfence)]>, TB, Requires<[HasSSE2]>;
+def MFENCE : I<0xAE, MRM6m, (ops),
+ "mfence", [(int_x86_sse2_mfence)]>, TB, Requires<[HasSSE2]>;
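+// lfence (0F AE /5) orders only loads, while mfence (0F AE /6) orders both
+// loads and stores; the /5 and /6 ModRM extensions are what the MRM5m and
+// MRM6m formats above encode.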
+
+
+// Alias instruction that maps an all-ones vector to pcmpeqd.
+// FIXME: remove when we can teach regalloc that pcmpeqd reg, reg is ok.
+let isReMaterializable = 1 in
+ def V_SETALLONES : PDI<0x76, MRMInitReg, (ops VR128:$dst),
+ "pcmpeqd $dst, $dst",
+ [(set VR128:$dst, (v2f64 immAllOnesV))]>;
+
+// FR64 to 128-bit vector conversion.
+def MOVSD2PDrr : SDI<0x10, MRMSrcReg, (ops VR128:$dst, FR64:$src),
+ "movsd {$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2f64 (scalar_to_vector FR64:$src)))]>;
+def MOVSD2PDrm : SDI<0x10, MRMSrcMem, (ops VR128:$dst, f64mem:$src),
+ "movsd {$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2f64 (scalar_to_vector (loadf64 addr:$src))))]>;
+
+def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (ops VR128:$dst, GR32:$src),
+ "movd {$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (scalar_to_vector GR32:$src)))]>;
+def MOVDI2PDIrm : PDI<0x6E, MRMSrcMem, (ops VR128:$dst, i32mem:$src),
+ "movd {$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>;
+
+def MOVDI2SSrr : PDI<0x6E, MRMSrcReg, (ops FR32:$dst, GR32:$src),
+ "movd {$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (bitconvert GR32:$src))]>;
+
+def MOVDI2SSrm : PDI<0x6E, MRMSrcMem, (ops FR32:$dst, i32mem:$src),
+ "movd {$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>;
+
+// SSE2 instructions with XS prefix
+def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (ops VR128:$dst, i64mem:$src),
+ "movq {$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
+ Requires<[HasSSE2]>;
+def MOVPQI2QImr : PDI<0xD6, MRMDestMem, (ops i64mem:$dst, VR128:$src),
+ "movq {$src, $dst|$dst, $src}",
+ [(store (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))), addr:$dst)]>;
+
+// FIXME: we may not be able to eliminate this movsd with coalescing, since the
+// src and dest register classes are different. We really want to write this
+// pattern like this:
+// def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
+//           (f64 FR64:$src)>;
+def MOVPD2SDrr : SDI<0x10, MRMSrcReg, (ops FR64:$dst, VR128:$src),
+ "movsd {$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (vector_extract (v2f64 VR128:$src),
+ (iPTR 0)))]>;
+def MOVPD2SDmr : SDI<0x11, MRMDestMem, (ops f64mem:$dst, VR128:$src),
+ "movsd {$src, $dst|$dst, $src}",
+ [(store (f64 (vector_extract (v2f64 VR128:$src),
+ (iPTR 0))), addr:$dst)]>;
+def MOVPDI2DIrr : PDI<0x7E, MRMDestReg, (ops GR32:$dst, VR128:$src),
+ "movd {$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
+ (iPTR 0)))]>;
+def MOVPDI2DImr : PDI<0x7E, MRMDestMem, (ops i32mem:$dst, VR128:$src),
+ "movd {$src, $dst|$dst, $src}",
+ [(store (i32 (vector_extract (v4i32 VR128:$src),
+ (iPTR 0))), addr:$dst)]>;
+
+def MOVSS2DIrr : PDI<0x7E, MRMDestReg, (ops GR32:$dst, FR32:$src),
+ "movd {$src, $dst|$dst, $src}",
+ [(set GR32:$dst, (bitconvert FR32:$src))]>;
+def MOVSS2DImr : PDI<0x7E, MRMDestMem, (ops i32mem:$dst, FR32:$src),
+ "movd {$src, $dst|$dst, $src}",
+ [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>;
+
+
+// Move to lower bits of a VR128, leaving upper bits alone.
+// Three operand (but two address) aliases.
+let isTwoAddress = 1 in {
+ def MOVLSD2PDrr : SDI<0x10, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, FR64:$src2),
+ "movsd {$src2, $dst|$dst, $src2}", []>;
+
+ let AddedComplexity = 15 in
+ def MOVLPDrr : SDI<0x10, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "movsd {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst,
+ (v2f64 (vector_shuffle VR128:$src1, VR128:$src2,
+ MOVL_shuffle_mask)))]>;
+}
+
+// Store / copy lower 64-bits of a XMM register.
+def MOVLQ128mr : PDI<0xD6, MRMDestMem, (ops i64mem:$dst, VR128:$src),
+ "movq {$src, $dst|$dst, $src}",
+ [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>;
+
+// Move to the lower bits of a VR128, zeroing the upper bits.
+// Loading from memory automatically zeroes the upper bits.
+let AddedComplexity = 20 in
+ def MOVZSD2PDrm : SDI<0x10, MRMSrcMem, (ops VR128:$dst, f64mem:$src),
+ "movsd {$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2f64 (vector_shuffle immAllZerosV,
+ (v2f64 (scalar_to_vector
+ (loadf64 addr:$src))),
+ MOVL_shuffle_mask)))]>;
+
+let AddedComplexity = 15 in
+// movd / movq to XMM register zero-extends
+def MOVZDI2PDIrr : PDI<0x6E, MRMSrcReg, (ops VR128:$dst, GR32:$src),
+ "movd {$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (vector_shuffle immAllZerosV,
+ (v4i32 (scalar_to_vector GR32:$src)),
+ MOVL_shuffle_mask)))]>;
+let AddedComplexity = 20 in
+def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (ops VR128:$dst, i32mem:$src),
+ "movd {$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v4i32 (vector_shuffle immAllZerosV,
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))),
+ MOVL_shuffle_mask)))]>;
+
+// Move from XMM to XMM, clearing the upper 64 bits.
+let AddedComplexity = 15 in
+def MOVZQI2PQIrr : I<0x7E, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+ "movq {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_movl_dq VR128:$src))]>,
+ XS, Requires<[HasSSE2]>;
+let AddedComplexity = 20 in
+def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (ops VR128:$dst, i64mem:$src),
+ "movq {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse2_movl_dq
+ (bitconvert (loadv2i64 addr:$src))))]>,
+ XS, Requires<[HasSSE2]>;
+
+
+//===----------------------------------------------------------------------===//
+// SSE3 Instructions
+//===----------------------------------------------------------------------===//
+
+// SSE3 Instruction Templates:
+//
+// S3I - SSE3 instructions with TB and OpSize prefixes.
+// S3SI - SSE3 instructions with XS prefix.
+// S3DI - SSE3 instructions with XD prefix.
+
+class S3SI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+ : I<o, F, ops, asm, pattern>, XS, Requires<[HasSSE3]>;
+class S3DI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+ : I<o, F, ops, asm, pattern>, XD, Requires<[HasSSE3]>;
+class S3I<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+ : I<o, F, ops, asm, pattern>, TB, OpSize, Requires<[HasSSE3]>;
+
+// Move Instructions
+def MOVSHDUPrr : S3SI<0x16, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+ "movshdup {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v4f32 (vector_shuffle
+ VR128:$src, (undef),
+ MOVSHDUP_shuffle_mask)))]>;
+def MOVSHDUPrm : S3SI<0x16, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+ "movshdup {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v4f32 (vector_shuffle
+ (loadv4f32 addr:$src), (undef),
+ MOVSHDUP_shuffle_mask)))]>;
+
+def MOVSLDUPrr : S3SI<0x12, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+ "movsldup {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v4f32 (vector_shuffle
+ VR128:$src, (undef),
+ MOVSLDUP_shuffle_mask)))]>;
+def MOVSLDUPrm : S3SI<0x12, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+ "movsldup {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v4f32 (vector_shuffle
+ (loadv4f32 addr:$src), (undef),
+ MOVSLDUP_shuffle_mask)))]>;
+
+def MOVDDUPrr : S3DI<0x12, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+ "movddup {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2f64 (vector_shuffle
+ VR128:$src, (undef),
+ SSE_splat_lo_mask)))]>;
+def MOVDDUPrm : S3DI<0x12, MRMSrcMem, (ops VR128:$dst, f64mem:$src),
+ "movddup {$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2f64 (vector_shuffle
+ (scalar_to_vector (loadf64 addr:$src)),
+ (undef),
+ SSE_splat_lo_mask)))]>;
+
+// Arithmetic
+let isTwoAddress = 1 in {
+ def ADDSUBPSrr : S3DI<0xD0, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "addsubps {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse3_addsub_ps VR128:$src1,
+ VR128:$src2))]>;
+ def ADDSUBPSrm : S3DI<0xD0, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+ "addsubps {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse3_addsub_ps VR128:$src1,
+ (load addr:$src2)))]>;
+ def ADDSUBPDrr : S3I<0xD0, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ "addsubpd {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse3_addsub_pd VR128:$src1,
+ VR128:$src2))]>;
+ def ADDSUBPDrm : S3I<0xD0, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+ "addsubpd {$src2, $dst|$dst, $src2}",
+ [(set VR128:$dst, (int_x86_sse3_addsub_pd VR128:$src1,
+ (load addr:$src2)))]>;
+}
+
+def LDDQUrm : S3DI<0xF0, MRMSrcMem, (ops VR128:$dst, i128mem:$src),
+ "lddqu {$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>;
+
+// Horizontal ops
+class S3D_Intrr<bits<8> o, string OpcodeStr, Intrinsic IntId>
+ : S3DI<o, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v4f32 (IntId VR128:$src1, VR128:$src2)))]>;
+class S3D_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId>
+ : S3DI<o, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+ !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v4f32 (IntId VR128:$src1, (load addr:$src2))))]>;
+class S3_Intrr<bits<8> o, string OpcodeStr, Intrinsic IntId>
+ : S3I<o, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v2f64 (IntId VR128:$src1, VR128:$src2)))]>;
+class S3_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId>
+ : S3I<o, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+ !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (v2f64 (IntId VR128:$src1, (load addr:$src2))))]>;
+
+let isTwoAddress = 1 in {
+ def HADDPSrr : S3D_Intrr<0x7C, "haddps", int_x86_sse3_hadd_ps>;
+ def HADDPSrm : S3D_Intrm<0x7C, "haddps", int_x86_sse3_hadd_ps>;
+ def HADDPDrr : S3_Intrr <0x7C, "haddpd", int_x86_sse3_hadd_pd>;
+ def HADDPDrm : S3_Intrm <0x7C, "haddpd", int_x86_sse3_hadd_pd>;
+ def HSUBPSrr : S3D_Intrr<0x7D, "hsubps", int_x86_sse3_hsub_ps>;
+ def HSUBPSrm : S3D_Intrm<0x7D, "hsubps", int_x86_sse3_hsub_ps>;
+ def HSUBPDrr : S3_Intrr <0x7D, "hsubpd", int_x86_sse3_hsub_pd>;
+ def HSUBPDrm : S3_Intrm <0x7D, "hsubpd", int_x86_sse3_hsub_pd>;
+}
+
+// Thread synchronization
+def MONITOR : I<0xC8, RawFrm, (ops), "monitor",
+ [(int_x86_sse3_monitor EAX, ECX, EDX)]>,TB, Requires<[HasSSE3]>;
+def MWAIT : I<0xC9, RawFrm, (ops), "mwait",
+ [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
+
+// vector_shuffle v1, <undef> <1, 1, 3, 3>
+let AddedComplexity = 15 in
+def : Pat<(v4i32 (vector_shuffle VR128:$src, (undef),
+ MOVSHDUP_shuffle_mask)),
+ (MOVSHDUPrr VR128:$src)>, Requires<[HasSSE3]>;
+let AddedComplexity = 20 in
+def : Pat<(v4i32 (vector_shuffle (bc_v4i32 (loadv2i64 addr:$src)), (undef),
+ MOVSHDUP_shuffle_mask)),
+ (MOVSHDUPrm addr:$src)>, Requires<[HasSSE3]>;
+
+// vector_shuffle v1, <undef> <0, 0, 2, 2>
+let AddedComplexity = 15 in
+ def : Pat<(v4i32 (vector_shuffle VR128:$src, (undef),
+ MOVSLDUP_shuffle_mask)),
+ (MOVSLDUPrr VR128:$src)>, Requires<[HasSSE3]>;
+let AddedComplexity = 20 in
+ def : Pat<(v4i32 (vector_shuffle (bc_v4i32 (loadv2i64 addr:$src)), (undef),
+ MOVSLDUP_shuffle_mask)),
+ (MOVSLDUPrm addr:$src)>, Requires<[HasSSE3]>;
+
+//===----------------------------------------------------------------------===//
+// SSSE3 Instructions
+//===----------------------------------------------------------------------===//
+
+// SSSE3 Instruction Templates:
+//
+// SS38I - SSSE3 instructions with T8 and OpSize prefixes.
+// SS3AI - SSSE3 instructions with TA and OpSize prefixes.
+
+class SS38I<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+ : I<o, F, ops, asm, pattern>, T8, OpSize, Requires<[HasSSSE3]>;
+class SS3AI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+ : I<o, F, ops, asm, pattern>, TA, OpSize, Requires<[HasSSSE3]>;
+
+/// SS3I_binop_rm_int - Simple SSSE3 binary operator whose type is v2i64.
+let isTwoAddress = 1 in {
+ multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+ bit Commutable = 0> {
+ def rr : SS38I<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]> {
+ let isCommutable = Commutable;
+ }
+ def rm : SS38I<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+ !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+ [(set VR128:$dst,
+ (IntId VR128:$src1,
+ (bitconvert (loadv2i64 addr:$src2))))]>;
+ }
+}
+
+defm PMULHRSW128 : SS3I_binop_rm_int<0x0B, "pmulhrsw",
+ int_x86_ssse3_pmulhrsw_128, 1>;
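+// The defm above instantiates both forms of the multiclass, yielding
+// PMULHRSW128rr (register-register) and PMULHRSW128rm (register-memory).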
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// 128-bit vector undef's.
+def : Pat<(v2f64 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>;
+def : Pat<(v16i8 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>;
+def : Pat<(v8i16 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>;
+
+// 128-bit vector all zero's.
+def : Pat<(v16i8 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>;
+def : Pat<(v8i16 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>;
+def : Pat<(v2f64 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>;
+
+// 128-bit vector all one's.
+def : Pat<(v16i8 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>;
+def : Pat<(v8i16 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>;
+def : Pat<(v4f32 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE1]>;
+
+// Store 128-bit integer vector values.
+def : Pat<(store (v16i8 VR128:$src), addr:$dst),
+ (MOVDQAmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(store (v8i16 VR128:$src), addr:$dst),
+ (MOVDQAmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(store (v4i32 VR128:$src), addr:$dst),
+ (MOVDQAmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+
+// Scalar to v8i16 / v16i8. The source may be a GR32, but only the lower 8 or
+// 16-bits matter.
+def : Pat<(v8i16 (X86s2vec GR32:$src)), (MOVDI2PDIrr GR32:$src)>,
+ Requires<[HasSSE2]>;
+def : Pat<(v16i8 (X86s2vec GR32:$src)), (MOVDI2PDIrr GR32:$src)>,
+ Requires<[HasSSE2]>;
+
+// bit_convert
+let Predicates = [HasSSE2] in {
+ def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
+ def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
+ def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
+ def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
+ def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
+ def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
+ def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
+ def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
+ def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
+ def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
+ def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
+ def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
+ def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
+ def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
+ def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
+ def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
+ def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
+ def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
+ def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
+ def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
+ def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
+ def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
+ def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
+ def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
+ def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
+ def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
+ def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
+ def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
+ def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
+ def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
+}
+
+// Move scalar to XMM zero-extended
+// movd to XMM register zero-extends
+let AddedComplexity = 15 in {
+def : Pat<(v8i16 (vector_shuffle immAllZerosV,
+ (v8i16 (X86s2vec GR32:$src)), MOVL_shuffle_mask)),
+ (MOVZDI2PDIrr GR32:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v16i8 (vector_shuffle immAllZerosV,
+ (v16i8 (X86s2vec GR32:$src)), MOVL_shuffle_mask)),
+ (MOVZDI2PDIrr GR32:$src)>, Requires<[HasSSE2]>;
+// Zero a VR128, then do a MOVS{S|D} to the lower bits.
+def : Pat<(v2f64 (vector_shuffle immAllZerosV,
+ (v2f64 (scalar_to_vector FR64:$src)), MOVL_shuffle_mask)),
+ (MOVLSD2PDrr (V_SET0), FR64:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v4f32 (vector_shuffle immAllZerosV,
+ (v4f32 (scalar_to_vector FR32:$src)), MOVL_shuffle_mask)),
+ (MOVLSS2PSrr (V_SET0), FR32:$src)>, Requires<[HasSSE2]>;
+}
+
+// Splat v2f64 / v2i64
+let AddedComplexity = 10 in {
+def : Pat<(vector_shuffle (v2f64 VR128:$src), (undef), SSE_splat_lo_mask:$sm),
+ (UNPCKLPDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(vector_shuffle (v2f64 VR128:$src), (undef), UNPCKH_shuffle_mask:$sm),
+ (UNPCKHPDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(vector_shuffle (v2i64 VR128:$src), (undef), SSE_splat_lo_mask:$sm),
+ (PUNPCKLQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(vector_shuffle (v2i64 VR128:$src), (undef), UNPCKH_shuffle_mask:$sm),
+ (PUNPCKHQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+}
+
+// Splat v4f32
+def : Pat<(vector_shuffle (v4f32 VR128:$src), (undef), SSE_splat_mask:$sm),
+ (SHUFPSrri VR128:$src, VR128:$src, SSE_splat_mask:$sm)>,
+ Requires<[HasSSE1]>;
+
+// Special unary SHUFPSrri case.
+// FIXME: when we want non-two-address code, should we use PSHUFD instead?
+def : Pat<(vector_shuffle (v4f32 VR128:$src1), (undef),
+ SHUFP_unary_shuffle_mask:$sm),
+ (SHUFPSrri VR128:$src1, VR128:$src1, SHUFP_unary_shuffle_mask:$sm)>,
+ Requires<[HasSSE1]>;
+// Unary v4f32 shuffle with PSHUF* in order to fold a load.
+def : Pat<(vector_shuffle (loadv4f32 addr:$src1), (undef),
+ SHUFP_unary_shuffle_mask:$sm),
+ (PSHUFDmi addr:$src1, SHUFP_unary_shuffle_mask:$sm)>,
+ Requires<[HasSSE2]>;
+// Special binary v4i32 shuffle cases with SHUFPS.
+def : Pat<(vector_shuffle (v4i32 VR128:$src1), (v4i32 VR128:$src2),
+ PSHUFD_binary_shuffle_mask:$sm),
+ (SHUFPSrri VR128:$src1, VR128:$src2, PSHUFD_binary_shuffle_mask:$sm)>,
+ Requires<[HasSSE2]>;
+def : Pat<(vector_shuffle (v4i32 VR128:$src1),
+ (bc_v4i32 (loadv2i64 addr:$src2)), PSHUFD_binary_shuffle_mask:$sm),
+ (SHUFPSrmi VR128:$src1, addr:$src2, PSHUFD_binary_shuffle_mask:$sm)>,
+ Requires<[HasSSE2]>;
+
+// vector_shuffle v1, <undef>, <0, 0, 1, 1, ...>
+let AddedComplexity = 10 in {
+def : Pat<(v4f32 (vector_shuffle VR128:$src, (undef),
+ UNPCKL_v_undef_shuffle_mask)),
+ (UNPCKLPSrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v16i8 (vector_shuffle VR128:$src, (undef),
+ UNPCKL_v_undef_shuffle_mask)),
+ (PUNPCKLBWrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v8i16 (vector_shuffle VR128:$src, (undef),
+ UNPCKL_v_undef_shuffle_mask)),
+ (PUNPCKLWDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 (vector_shuffle VR128:$src, (undef),
+ UNPCKL_v_undef_shuffle_mask)),
+ (PUNPCKLDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE1]>;
+}
+
+// vector_shuffle v1, <undef>, <2, 2, 3, 3, ...>
+let AddedComplexity = 10 in {
+def : Pat<(v4f32 (vector_shuffle VR128:$src, (undef),
+ UNPCKH_v_undef_shuffle_mask)),
+ (UNPCKHPSrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v16i8 (vector_shuffle VR128:$src, (undef),
+ UNPCKH_v_undef_shuffle_mask)),
+ (PUNPCKHBWrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v8i16 (vector_shuffle VR128:$src, (undef),
+ UNPCKH_v_undef_shuffle_mask)),
+ (PUNPCKHWDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 (vector_shuffle VR128:$src, (undef),
+ UNPCKH_v_undef_shuffle_mask)),
+ (PUNPCKHDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE1]>;
+}
+
+let AddedComplexity = 15 in {
+// vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS
+def : Pat<(v4i32 (vector_shuffle VR128:$src1, VR128:$src2,
+ MOVHP_shuffle_mask)),
+ (MOVLHPSrr VR128:$src1, VR128:$src2)>;
+
+// vector_shuffle v1, v2 <6, 7, 2, 3> using MOVHLPS
+def : Pat<(v4i32 (vector_shuffle VR128:$src1, VR128:$src2,
+ MOVHLPS_shuffle_mask)),
+ (MOVHLPSrr VR128:$src1, VR128:$src2)>;
+
+// vector_shuffle v1, undef <2, ?, ?, ?> using MOVHLPS
+def : Pat<(v4f32 (vector_shuffle VR128:$src1, (undef),
+ MOVHLPS_v_undef_shuffle_mask)),
+ (MOVHLPSrr VR128:$src1, VR128:$src1)>;
+def : Pat<(v4i32 (vector_shuffle VR128:$src1, (undef),
+ MOVHLPS_v_undef_shuffle_mask)),
+ (MOVHLPSrr VR128:$src1, VR128:$src1)>;
+}
+
+let AddedComplexity = 20 in {
+// vector_shuffle v1, (load v2) <4, 5, 2, 3> using MOVLPS
+// vector_shuffle v1, (load v2) <0, 1, 4, 5> using MOVHPS
+def : Pat<(v4f32 (vector_shuffle VR128:$src1, (loadv4f32 addr:$src2),
+ MOVLP_shuffle_mask)),
+ (MOVLPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE1]>;
+def : Pat<(v2f64 (vector_shuffle VR128:$src1, (loadv2f64 addr:$src2),
+ MOVLP_shuffle_mask)),
+ (MOVLPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v4f32 (vector_shuffle VR128:$src1, (loadv4f32 addr:$src2),
+ MOVHP_shuffle_mask)),
+ (MOVHPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE1]>;
+def : Pat<(v2f64 (vector_shuffle VR128:$src1, (loadv2f64 addr:$src2),
+ MOVHP_shuffle_mask)),
+ (MOVHPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+
+def : Pat<(v4i32 (vector_shuffle VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)),
+ MOVLP_shuffle_mask)),
+ (MOVLPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (vector_shuffle VR128:$src1, (loadv2i64 addr:$src2),
+ MOVLP_shuffle_mask)),
+ (MOVLPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 (vector_shuffle VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)),
+ MOVHP_shuffle_mask)),
+ (MOVHPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE1]>;
+def : Pat<(v2i64 (vector_shuffle VR128:$src1, (loadv2i64 addr:$src2),
+                  MOVHP_shuffle_mask)),
+          (MOVHPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+}
+
+let AddedComplexity = 15 in {
+// Setting the lowest element in the vector.
+def : Pat<(v4i32 (vector_shuffle VR128:$src1, VR128:$src2,
+ MOVL_shuffle_mask)),
+ (MOVLPSrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (vector_shuffle VR128:$src1, VR128:$src2,
+ MOVL_shuffle_mask)),
+ (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+
+// vector_shuffle v1, v2 <4, 5, 2, 3> using MOVLPDrr (movsd)
+def : Pat<(v4f32 (vector_shuffle VR128:$src1, VR128:$src2,
+ MOVLP_shuffle_mask)),
+ (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 (vector_shuffle VR128:$src1, VR128:$src2,
+ MOVLP_shuffle_mask)),
+ (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+}
+
+// Set lowest element and zero upper elements.
+let AddedComplexity = 20 in
+def : Pat<(bc_v2i64 (vector_shuffle immAllZerosV,
+ (v2f64 (scalar_to_vector (loadf64 addr:$src))),
+ MOVL_shuffle_mask)),
+ (MOVZQI2PQIrm addr:$src)>, Requires<[HasSSE2]>;
+
+// FIXME: Temporary workaround since 2-wide shuffle is broken.
+def : Pat<(int_x86_sse2_movs_d VR128:$src1, VR128:$src2),
+ (v2f64 (MOVLPDrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_loadh_pd VR128:$src1, addr:$src2),
+ (v2f64 (MOVHPDrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_loadl_pd VR128:$src1, addr:$src2),
+ (v2f64 (MOVLPDrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_shuf_pd VR128:$src1, VR128:$src2, imm:$src3),
+ (v2f64 (SHUFPDrri VR128:$src1, VR128:$src2, imm:$src3))>,
+ Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_shuf_pd VR128:$src1, (load addr:$src2), imm:$src3),
+ (v2f64 (SHUFPDrmi VR128:$src1, addr:$src2, imm:$src3))>,
+ Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_unpckh_pd VR128:$src1, VR128:$src2),
+ (v2f64 (UNPCKHPDrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_unpckh_pd VR128:$src1, (load addr:$src2)),
+ (v2f64 (UNPCKHPDrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_unpckl_pd VR128:$src1, VR128:$src2),
+ (v2f64 (UNPCKLPDrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_unpckl_pd VR128:$src1, (load addr:$src2)),
+ (v2f64 (UNPCKLPDrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_punpckh_qdq VR128:$src1, VR128:$src2),
+ (v2i64 (PUNPCKHQDQrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_punpckh_qdq VR128:$src1, (load addr:$src2)),
+ (v2i64 (PUNPCKHQDQrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_punpckl_qdq VR128:$src1, VR128:$src2),
+ (v2i64 (PUNPCKLQDQrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_punpckl_qdq VR128:$src1, (load addr:$src2)),
+ (PUNPCKLQDQrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+
+// Some special case pandn patterns.
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))),
+ VR128:$src2)),
+ (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))),
+ VR128:$src2)),
+ (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))),
+ VR128:$src2)),
+ (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))),
+ (load addr:$src2))),
+ (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))),
+ (load addr:$src2))),
+ (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))),
+ (load addr:$src2))),
+ (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+
+// Unaligned load
+def : Pat<(v4f32 (X86loadu addr:$src)), (MOVUPSrm addr:$src)>,
+ Requires<[HasSSE1]>;
diff --git a/lib/Target/X86/X86InstrX86-64.td b/lib/Target/X86/X86InstrX86-64.td
new file mode 100644
index 0000000..ac43846
--- /dev/null
+++ b/lib/Target/X86/X86InstrX86-64.td
@@ -0,0 +1,1165 @@
+//====- X86InstrX86-64.td - Describe the X86 Instruction Set ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by Evan Cheng and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86-64 instruction set, defining the instructions,
+// and properties of the instructions which are needed for code generation,
+// machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Operand Definitions...
+//
+
+// 64 bits, but only the low 32 bits are significant.
+def i64i32imm : Operand<i64>;
+// 64 bits, but only the low 8 bits are significant.
+def i64i8imm : Operand<i64>;
+
+def lea64mem : Operand<i64> {
+ let PrintMethod = "printi64mem";
+ let MIOperandInfo = (ops GR64, i8imm, GR64, i32imm);
+}
+
+def lea64_32mem : Operand<i32> {
+ let PrintMethod = "printlea64_32mem";
+ let MIOperandInfo = (ops GR32, i8imm, GR32, i32imm);
+}
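+// Both memory operands above follow the usual X86 layout of (base register,
+// scale immediate, index register, displacement); e.g. an access such as
+// 16(%rdi,%rcx,8) would be carried as base=RDI, scale=8, index=RCX, disp=16.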
+
+//===----------------------------------------------------------------------===//
+// Complex Pattern Definitions...
+//
+def lea64addr : ComplexPattern<i64, 4, "SelectLEAAddr",
+ [add, mul, shl, or, frameindex, X86Wrapper],
+ []>;
+
+//===----------------------------------------------------------------------===//
+// Instruction templates...
+//
+
+class RI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+ : I<o, F, ops, asm, pattern>, REX_W;
+class RIi8 <bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+ : Ii8<o, F, ops, asm, pattern>, REX_W;
+class RIi32 <bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+ : Ii32<o, F, ops, asm, pattern>, REX_W;
+
+class RIi64<bits<8> o, Format f, dag ops, string asm, list<dag> pattern>
+ : X86Inst<o, f, Imm64, ops, asm>, REX_W {
+ let Pattern = pattern;
+ let CodeSize = 3;
+}
+
+class RSSI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+ : SSI<o, F, ops, asm, pattern>, REX_W;
+class RSDI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+ : SDI<o, F, ops, asm, pattern>, REX_W;
+class RPDI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+ : PDI<o, F, ops, asm, pattern>, REX_W;
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments...
+//
+
+def i64immSExt32 : PatLeaf<(i64 imm), [{
+ // i64immSExt32 predicate - True if the 64-bit immediate fits in a 32-bit
+ // sign extended field.
+ return (int64_t)N->getValue() == (int32_t)N->getValue();
+}]>;
+
+def i64immZExt32 : PatLeaf<(i64 imm), [{
+ // i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit
+  // zero extended field.
+ return (uint64_t)N->getValue() == (uint32_t)N->getValue();
+}]>;
+
+def i64immSExt8 : PatLeaf<(i64 imm), [{
+  // i64immSExt8 predicate - True if the 64-bit immediate fits in an 8-bit
+ // sign extended field.
+ return (int64_t)N->getValue() == (int8_t)N->getValue();
+}]>;
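+// For example, -1 (0xFFFFFFFFFFFFFFFF) satisfies i64immSExt8 and i64immSExt32
+// but not i64immZExt32, whereas 0x00000000FFFFFFFF satisfies only
+// i64immZExt32, because sign-extending its low 32 bits would give all ones.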
+
+def sextloadi64i1 : PatFrag<(ops node:$ptr), (i64 (sextloadi1 node:$ptr))>;
+def sextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (sextloadi8 node:$ptr))>;
+def sextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (sextloadi16 node:$ptr))>;
+def sextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (sextloadi32 node:$ptr))>;
+
+def zextloadi64i1 : PatFrag<(ops node:$ptr), (i64 (zextloadi1 node:$ptr))>;
+def zextloadi64i8 : PatFrag<(ops node:$ptr), (i64 (zextloadi8 node:$ptr))>;
+def zextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (zextloadi16 node:$ptr))>;
+def zextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (zextloadi32 node:$ptr))>;
+
+def extloadi64i1 : PatFrag<(ops node:$ptr), (i64 (extloadi1 node:$ptr))>;
+def extloadi64i8 : PatFrag<(ops node:$ptr), (i64 (extloadi8 node:$ptr))>;
+def extloadi64i16 : PatFrag<(ops node:$ptr), (i64 (extloadi16 node:$ptr))>;
+def extloadi64i32 : PatFrag<(ops node:$ptr), (i64 (extloadi32 node:$ptr))>;
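+// These fragments let extending loads from narrower memory types be matched
+// directly; e.g. sextloadi64i32 covers an i32 load that is sign extended to
+// i64, which MOVSX64rm32 below selects as a single movslq from memory.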
+
+//===----------------------------------------------------------------------===//
+// Instruction list...
+//
+
+def IMPLICIT_DEF_GR64 : I<0, Pseudo, (ops GR64:$dst),
+ "#IMPLICIT_DEF $dst",
+ [(set GR64:$dst, (undef))]>;
+
+//===----------------------------------------------------------------------===//
+// Call Instructions...
+//
+let isCall = 1, noResults = 1 in
+ // All calls clobber the non-callee saved registers...
+ let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+ FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
+ MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+ XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15] in {
+ def CALL64pcrel32 : I<0xE8, RawFrm, (ops i64imm:$dst, variable_ops),
+ "call ${dst:call}", []>;
+ def CALL64r : I<0xFF, MRM2r, (ops GR64:$dst, variable_ops),
+ "call {*}$dst", [(X86call GR64:$dst)]>;
+ def CALL64m : I<0xFF, MRM2m, (ops i64mem:$dst, variable_ops),
+ "call {*}$dst", []>;
+ }
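+// The Defs list above corresponds to the registers that are caller-saved
+// under the standard x86-64 calling conventions, so values live across a call
+// must be kept in callee-saved registers or spilled.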
+
+// Branches
+let isBranch = 1, isTerminator = 1, noResults = 1, isBarrier = 1 in {
+ def JMP64r : I<0xFF, MRM4r, (ops GR64:$dst), "jmp{q} {*}$dst",
+ [(brind GR64:$dst)]>;
+ def JMP64m : I<0xFF, MRM4m, (ops i64mem:$dst), "jmp{q} {*}$dst",
+ [(brind (loadi64 addr:$dst))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Instructions...
+//
+def LEAVE64 : I<0xC9, RawFrm,
+ (ops), "leave", []>, Imp<[RBP,RSP],[RBP,RSP]>;
+def POP64r : I<0x58, AddRegFrm,
+ (ops GR64:$reg), "pop{q} $reg", []>, Imp<[RSP],[RSP]>;
+def PUSH64r : I<0x50, AddRegFrm,
+ (ops GR64:$reg), "push{q} $reg", []>, Imp<[RSP],[RSP]>;
+
+def LEA64_32r : I<0x8D, MRMSrcMem,
+ (ops GR32:$dst, lea64_32mem:$src),
+ "lea{l} {$src|$dst}, {$dst|$src}",
+ [(set GR32:$dst, lea32addr:$src)]>, Requires<[In64BitMode]>;
+
+def LEA64r : RI<0x8D, MRMSrcMem, (ops GR64:$dst, lea64mem:$src),
+ "lea{q} {$src|$dst}, {$dst|$src}",
+ [(set GR64:$dst, lea64addr:$src)]>;
+
+let isTwoAddress = 1 in
+def BSWAP64r : RI<0xC8, AddRegFrm, (ops GR64:$dst, GR64:$src),
+ "bswap{q} $dst",
+ [(set GR64:$dst, (bswap GR64:$src))]>, TB;
+// Exchange
+def XCHG64rr : RI<0x87, MRMDestReg, (ops GR64:$src1, GR64:$src2),
+ "xchg{q} {$src2|$src1}, {$src1|$src2}", []>;
+def XCHG64mr : RI<0x87, MRMDestMem, (ops i64mem:$src1, GR64:$src2),
+ "xchg{q} {$src2|$src1}, {$src1|$src2}", []>;
+def XCHG64rm : RI<0x87, MRMSrcMem, (ops GR64:$src1, i64mem:$src2),
+ "xchg{q} {$src2|$src1}, {$src1|$src2}", []>;
+
+// Repeat string ops
+def REP_MOVSQ : RI<0xA5, RawFrm, (ops), "{rep;movsq|rep movsq}",
+ [(X86rep_movs i64)]>,
+ Imp<[RCX,RDI,RSI], [RCX,RDI,RSI]>, REP;
+def REP_STOSQ : RI<0xAB, RawFrm, (ops), "{rep;stosq|rep stosq}",
+ [(X86rep_stos i64)]>,
+ Imp<[RAX,RCX,RDI], [RCX,RDI]>, REP;
+
+//===----------------------------------------------------------------------===//
+// Move Instructions...
+//
+
+def MOV64rr : RI<0x89, MRMDestReg, (ops GR64:$dst, GR64:$src),
+ "mov{q} {$src, $dst|$dst, $src}", []>;
+
+def MOV64ri : RIi64<0xB8, AddRegFrm, (ops GR64:$dst, i64imm:$src),
+ "movabs{q} {$src, $dst|$dst, $src}",
+ [(set GR64:$dst, imm:$src)]>;
+def MOV64ri32 : RIi32<0xC7, MRM0r, (ops GR64:$dst, i64i32imm:$src),
+ "mov{q} {$src, $dst|$dst, $src}",
+ [(set GR64:$dst, i64immSExt32:$src)]>;
+
+def MOV64rm : RI<0x8B, MRMSrcMem, (ops GR64:$dst, i64mem:$src),
+ "mov{q} {$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (load addr:$src))]>;
+
+def MOV64mr : RI<0x89, MRMDestMem, (ops i64mem:$dst, GR64:$src),
+ "mov{q} {$src, $dst|$dst, $src}",
+ [(store GR64:$src, addr:$dst)]>;
+def MOV64mi32 : RIi32<0xC7, MRM0m, (ops i64mem:$dst, i64i32imm:$src),
+ "mov{q} {$src, $dst|$dst, $src}",
+ [(store i64immSExt32:$src, addr:$dst)]>;
+
+// Sign/Zero extenders
+
+def MOVSX64rr8 : RI<0xBE, MRMSrcReg, (ops GR64:$dst, GR8 :$src),
+ "movs{bq|x} {$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sext GR8:$src))]>, TB;
+def MOVSX64rm8 : RI<0xBE, MRMSrcMem, (ops GR64:$dst, i8mem :$src),
+ "movs{bq|x} {$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sextloadi64i8 addr:$src))]>, TB;
+def MOVSX64rr16: RI<0xBF, MRMSrcReg, (ops GR64:$dst, GR16:$src),
+ "movs{wq|x} {$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sext GR16:$src))]>, TB;
+def MOVSX64rm16: RI<0xBF, MRMSrcMem, (ops GR64:$dst, i16mem:$src),
+ "movs{wq|x} {$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sextloadi64i16 addr:$src))]>, TB;
+def MOVSX64rr32: RI<0x63, MRMSrcReg, (ops GR64:$dst, GR32:$src),
+ "movs{lq|xd} {$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sext GR32:$src))]>;
+def MOVSX64rm32: RI<0x63, MRMSrcMem, (ops GR64:$dst, i32mem:$src),
+ "movs{lq|xd} {$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (sextloadi64i32 addr:$src))]>;
+
+def MOVZX64rr8 : RI<0xB6, MRMSrcReg, (ops GR64:$dst, GR8 :$src),
+ "movz{bq|x} {$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (zext GR8:$src))]>, TB;
+def MOVZX64rm8 : RI<0xB6, MRMSrcMem, (ops GR64:$dst, i8mem :$src),
+ "movz{bq|x} {$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (zextloadi64i8 addr:$src))]>, TB;
+def MOVZX64rr16: RI<0xB7, MRMSrcReg, (ops GR64:$dst, GR16:$src),
+ "movz{wq|x} {$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (zext GR16:$src))]>, TB;
+def MOVZX64rm16: RI<0xB7, MRMSrcMem, (ops GR64:$dst, i16mem:$src),
+ "movz{wq|x} {$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (zextloadi64i16 addr:$src))]>, TB;
+
+def CDQE : RI<0x98, RawFrm, (ops),
+ "{cltq|cdqe}", []>, Imp<[EAX],[RAX]>; // RAX = signext(EAX)
+
+def CQO : RI<0x99, RawFrm, (ops),
+ "{cqto|cqo}", []>, Imp<[RAX],[RAX,RDX]>; // RDX:RAX = signext(RAX)
+
+//===----------------------------------------------------------------------===//
+// Arithmetic Instructions...
+//
+
+let isTwoAddress = 1 in {
+let isConvertibleToThreeAddress = 1 in {
+let isCommutable = 1 in
+def ADD64rr : RI<0x01, MRMDestReg, (ops GR64:$dst, GR64:$src1, GR64:$src2),
+ "add{q} {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (add GR64:$src1, GR64:$src2))]>;
+
+def ADD64ri32 : RIi32<0x81, MRM0r, (ops GR64:$dst, GR64:$src1, i64i32imm:$src2),
+ "add{q} {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (add GR64:$src1, i64immSExt32:$src2))]>;
+def ADD64ri8 : RIi8<0x83, MRM0r, (ops GR64:$dst, GR64:$src1, i64i8imm:$src2),
+ "add{q} {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (add GR64:$src1, i64immSExt8:$src2))]>;
+} // isConvertibleToThreeAddress
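+// Marking these as convertible to three-address form lets the two-address
+// pass turn, for example, an add of a register and an 8-bit immediate whose
+// tied operand must stay live into an equivalent LEA64r (lea imm(%reg), %new),
+// avoiding an extra register copy.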
+
+def ADD64rm : RI<0x03, MRMSrcMem, (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+ "add{q} {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (add GR64:$src1, (load addr:$src2)))]>;
+} // isTwoAddress
+
+def ADD64mr : RI<0x01, MRMDestMem, (ops i64mem:$dst, GR64:$src2),
+ "add{q} {$src2, $dst|$dst, $src2}",
+ [(store (add (load addr:$dst), GR64:$src2), addr:$dst)]>;
+def ADD64mi32 : RIi32<0x81, MRM0m, (ops i64mem:$dst, i64i32imm :$src2),
+ "add{q} {$src2, $dst|$dst, $src2}",
+ [(store (add (load addr:$dst), i64immSExt32:$src2), addr:$dst)]>;
+def ADD64mi8 : RIi8<0x83, MRM0m, (ops i64mem:$dst, i64i8imm :$src2),
+ "add{q} {$src2, $dst|$dst, $src2}",
+ [(store (add (load addr:$dst), i64immSExt8:$src2), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+let isCommutable = 1 in
+def ADC64rr : RI<0x11, MRMDestReg, (ops GR64:$dst, GR64:$src1, GR64:$src2),
+ "adc{q} {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (adde GR64:$src1, GR64:$src2))]>;
+
+def ADC64rm : RI<0x13, MRMSrcMem , (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+ "adc{q} {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (adde GR64:$src1, (load addr:$src2)))]>;
+
+def ADC64ri32 : RIi32<0x81, MRM2r, (ops GR64:$dst, GR64:$src1, i64i32imm:$src2),
+ "adc{q} {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (adde GR64:$src1, i64immSExt32:$src2))]>;
+def ADC64ri8 : RIi8<0x83, MRM2r, (ops GR64:$dst, GR64:$src1, i64i8imm:$src2),
+ "adc{q} {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (adde GR64:$src1, i64immSExt8:$src2))]>;
+} // isTwoAddress
+
+def ADC64mr : RI<0x11, MRMDestMem, (ops i64mem:$dst, GR64:$src2),
+ "adc{q} {$src2, $dst|$dst, $src2}",
+ [(store (adde (load addr:$dst), GR64:$src2), addr:$dst)]>;
+def ADC64mi32 : RIi32<0x81, MRM2m, (ops i64mem:$dst, i64i32imm:$src2),
+                      "adc{q} {$src2, $dst|$dst, $src2}",
+             [(store (adde (load addr:$dst), i64immSExt32:$src2), addr:$dst)]>;
+def ADC64mi8 : RIi8<0x83, MRM2m, (ops i64mem:$dst, i64i8imm :$src2),
+ "adc{q} {$src2, $dst|$dst, $src2}",
+ [(store (adde (load addr:$dst), i64immSExt8:$src2), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+def SUB64rr : RI<0x29, MRMDestReg, (ops GR64:$dst, GR64:$src1, GR64:$src2),
+ "sub{q} {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sub GR64:$src1, GR64:$src2))]>;
+
+def SUB64rm : RI<0x2B, MRMSrcMem, (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+ "sub{q} {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sub GR64:$src1, (load addr:$src2)))]>;
+
+def SUB64ri32 : RIi32<0x81, MRM5r, (ops GR64:$dst, GR64:$src1, i64i32imm:$src2),
+ "sub{q} {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sub GR64:$src1, i64immSExt32:$src2))]>;
+def SUB64ri8 : RIi8<0x83, MRM5r, (ops GR64:$dst, GR64:$src1, i64i8imm:$src2),
+ "sub{q} {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sub GR64:$src1, i64immSExt8:$src2))]>;
+} // isTwoAddress
+
+def SUB64mr : RI<0x29, MRMDestMem, (ops i64mem:$dst, GR64:$src2),
+ "sub{q} {$src2, $dst|$dst, $src2}",
+ [(store (sub (load addr:$dst), GR64:$src2), addr:$dst)]>;
+def SUB64mi32 : RIi32<0x81, MRM5m, (ops i64mem:$dst, i64i32imm:$src2),
+ "sub{q} {$src2, $dst|$dst, $src2}",
+ [(store (sub (load addr:$dst), i64immSExt32:$src2), addr:$dst)]>;
+def SUB64mi8 : RIi8<0x83, MRM5m, (ops i64mem:$dst, i64i8imm :$src2),
+ "sub{q} {$src2, $dst|$dst, $src2}",
+ [(store (sub (load addr:$dst), i64immSExt8:$src2), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+def SBB64rr : RI<0x19, MRMDestReg, (ops GR64:$dst, GR64:$src1, GR64:$src2),
+ "sbb{q} {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sube GR64:$src1, GR64:$src2))]>;
+
+def SBB64rm : RI<0x1B, MRMSrcMem, (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+ "sbb{q} {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sube GR64:$src1, (load addr:$src2)))]>;
+
+def SBB64ri32 : RIi32<0x81, MRM3r, (ops GR64:$dst, GR64:$src1, i64i32imm:$src2),
+ "sbb{q} {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sube GR64:$src1, i64immSExt32:$src2))]>;
+def SBB64ri8 : RIi8<0x83, MRM3r, (ops GR64:$dst, GR64:$src1, i64i8imm:$src2),
+ "sbb{q} {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sube GR64:$src1, i64immSExt8:$src2))]>;
+} // isTwoAddress
+
+def SBB64mr : RI<0x19, MRMDestMem, (ops i64mem:$dst, GR64:$src2),
+ "sbb{q} {$src2, $dst|$dst, $src2}",
+ [(store (sube (load addr:$dst), GR64:$src2), addr:$dst)]>;
+def SBB64mi32 : RIi32<0x81, MRM3m, (ops i64mem:$dst, i64i32imm:$src2),
+ "sbb{q} {$src2, $dst|$dst, $src2}",
+ [(store (sube (load addr:$dst), i64immSExt32:$src2), addr:$dst)]>;
+def SBB64mi8 : RIi8<0x83, MRM3m, (ops i64mem:$dst, i64i8imm :$src2),
+ "sbb{q} {$src2, $dst|$dst, $src2}",
+ [(store (sube (load addr:$dst), i64immSExt8:$src2), addr:$dst)]>;
+
+// Unsigned multiplication
+def MUL64r : RI<0xF7, MRM4r, (ops GR64:$src),
+ "mul{q} $src", []>,
+ Imp<[RAX],[RAX,RDX]>; // RAX,RDX = RAX*GR64
+def MUL64m : RI<0xF7, MRM4m, (ops i64mem:$src),
+ "mul{q} $src", []>,
+ Imp<[RAX],[RAX,RDX]>; // RAX,RDX = RAX*[mem64]
+
+// Signed multiplication
+def IMUL64r : RI<0xF7, MRM5r, (ops GR64:$src),
+ "imul{q} $src", []>,
+ Imp<[RAX],[RAX,RDX]>; // RAX,RDX = RAX*GR64
+def IMUL64m : RI<0xF7, MRM5m, (ops i64mem:$src),
+ "imul{q} $src", []>,
+ Imp<[RAX],[RAX,RDX]>; // RAX,RDX = RAX*[mem64]
+
+let isTwoAddress = 1 in {
+let isCommutable = 1 in
+def IMUL64rr : RI<0xAF, MRMSrcReg, (ops GR64:$dst, GR64:$src1, GR64:$src2),
+ "imul{q} {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (mul GR64:$src1, GR64:$src2))]>, TB;
+
+def IMUL64rm : RI<0xAF, MRMSrcMem, (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+ "imul{q} {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (mul GR64:$src1, (load addr:$src2)))]>, TB;
+} // isTwoAddress
+
+// Surprisingly enough, these are not two-address instructions!
+def IMUL64rri32 : RIi32<0x69, MRMSrcReg, // GR64 = GR64*I32
+ (ops GR64:$dst, GR64:$src1, i64i32imm:$src2),
+ "imul{q} {$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, (mul GR64:$src1, i64immSExt32:$src2))]>;
+def IMUL64rri8 : RIi8<0x6B, MRMSrcReg, // GR64 = GR64*I8
+ (ops GR64:$dst, GR64:$src1, i64i8imm:$src2),
+ "imul{q} {$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, (mul GR64:$src1, i64immSExt8:$src2))]>;
+def IMUL64rmi32 : RIi32<0x69, MRMSrcMem, // GR64 = [mem64]*I32
+ (ops GR64:$dst, i64mem:$src1, i64i32imm:$src2),
+ "imul{q} {$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, (mul (load addr:$src1), i64immSExt32:$src2))]>;
+def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8
+ (ops GR64:$dst, i64mem:$src1, i64i8imm: $src2),
+ "imul{q} {$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set GR64:$dst, (mul (load addr:$src1), i64immSExt8:$src2))]>;
+
+// Unsigned division / remainder
+def DIV64r : RI<0xF7, MRM6r, (ops GR64:$src), // RDX:RAX/r64 = RAX,RDX
+ "div{q} $src", []>, Imp<[RAX,RDX],[RAX,RDX]>;
+def DIV64m : RI<0xF7, MRM6m, (ops i64mem:$src), // RDX:RAX/[mem64] = RAX,RDX
+ "div{q} $src", []>, Imp<[RAX,RDX],[RAX,RDX]>;
+
+// Signed division / remainder
+def IDIV64r: RI<0xF7, MRM7r, (ops GR64:$src), // RDX:RAX/r64 = RAX,RDX
+ "idiv{q} $src", []>, Imp<[RAX,RDX],[RAX,RDX]>;
+def IDIV64m: RI<0xF7, MRM7m, (ops i64mem:$src), // RDX:RAX/[mem64] = RAX,RDX
+ "idiv{q} $src", []>, Imp<[RAX,RDX],[RAX,RDX]>;
+
+// Unary instructions
+let CodeSize = 2 in {
+let isTwoAddress = 1 in
+def NEG64r : RI<0xF7, MRM3r, (ops GR64:$dst, GR64:$src), "neg{q} $dst",
+ [(set GR64:$dst, (ineg GR64:$src))]>;
+def NEG64m : RI<0xF7, MRM3m, (ops i64mem:$dst), "neg{q} $dst",
+ [(store (ineg (loadi64 addr:$dst)), addr:$dst)]>;
+
+let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in
+def INC64r : RI<0xFF, MRM0r, (ops GR64:$dst, GR64:$src), "inc{q} $dst",
+ [(set GR64:$dst, (add GR64:$src, 1))]>;
+def INC64m : RI<0xFF, MRM0m, (ops i64mem:$dst), "inc{q} $dst",
+ [(store (add (loadi64 addr:$dst), 1), addr:$dst)]>;
+
+let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in
+def DEC64r : RI<0xFF, MRM1r, (ops GR64:$dst, GR64:$src), "dec{q} $dst",
+ [(set GR64:$dst, (add GR64:$src, -1))]>;
+def DEC64m : RI<0xFF, MRM1m, (ops i64mem:$dst), "dec{q} $dst",
+ [(store (add (loadi64 addr:$dst), -1), addr:$dst)]>;
+
+// In 64-bit mode, the single-byte INC and DEC encodings cannot be used, since
+// the 0x40-0x4F opcodes are reassigned as REX prefixes.
+let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in {
+// Can transform into LEA.
+def INC64_16r : I<0xFF, MRM0r, (ops GR16:$dst, GR16:$src), "inc{w} $dst",
+ [(set GR16:$dst, (add GR16:$src, 1))]>,
+ OpSize, Requires<[In64BitMode]>;
+def INC64_32r : I<0xFF, MRM0r, (ops GR32:$dst, GR32:$src), "inc{l} $dst",
+ [(set GR32:$dst, (add GR32:$src, 1))]>,
+ Requires<[In64BitMode]>;
+def DEC64_16r : I<0xFF, MRM1r, (ops GR16:$dst, GR16:$src), "dec{w} $dst",
+ [(set GR16:$dst, (add GR16:$src, -1))]>,
+ OpSize, Requires<[In64BitMode]>;
+def DEC64_32r : I<0xFF, MRM1r, (ops GR32:$dst, GR32:$src), "dec{l} $dst",
+ [(set GR32:$dst, (add GR32:$src, -1))]>,
+ Requires<[In64BitMode]>;
+} // isConvertibleToThreeAddress
+} // CodeSize
+
+
+// Shift instructions
+let isTwoAddress = 1 in {
+def SHL64rCL : RI<0xD3, MRM4r, (ops GR64:$dst, GR64:$src),
+ "shl{q} {%cl, $dst|$dst, %CL}",
+ [(set GR64:$dst, (shl GR64:$src, CL))]>,
+ Imp<[CL],[]>;
+def SHL64ri : RIi8<0xC1, MRM4r, (ops GR64:$dst, GR64:$src1, i8imm:$src2),
+ "shl{q} {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))]>;
+def SHL64r1 : RI<0xD1, MRM4r, (ops GR64:$dst, GR64:$src1),
+ "shl{q} $dst", []>;
+} // isTwoAddress
+
+def SHL64mCL : RI<0xD3, MRM4m, (ops i64mem:$dst),
+ "shl{q} {%cl, $dst|$dst, %CL}",
+ [(store (shl (loadi64 addr:$dst), CL), addr:$dst)]>,
+ Imp<[CL],[]>;
+def SHL64mi : RIi8<0xC1, MRM4m, (ops i64mem:$dst, i8imm:$src),
+ "shl{q} {$src, $dst|$dst, $src}",
+ [(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def SHL64m1 : RI<0xD1, MRM4m, (ops i64mem:$dst),
+ "shl{q} $dst",
+ [(store (shl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+def SHR64rCL : RI<0xD3, MRM5r, (ops GR64:$dst, GR64:$src),
+ "shr{q} {%cl, $dst|$dst, %CL}",
+ [(set GR64:$dst, (srl GR64:$src, CL))]>,
+ Imp<[CL],[]>;
+def SHR64ri : RIi8<0xC1, MRM5r, (ops GR64:$dst, GR64:$src1, i8imm:$src2),
+ "shr{q} {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (srl GR64:$src1, (i8 imm:$src2)))]>;
+def SHR64r1 : RI<0xD1, MRM5r, (ops GR64:$dst, GR64:$src1),
+ "shr{q} $dst",
+ [(set GR64:$dst, (srl GR64:$src1, (i8 1)))]>;
+} // isTwoAddress
+
+def SHR64mCL : RI<0xD3, MRM5m, (ops i64mem:$dst),
+ "shr{q} {%cl, $dst|$dst, %CL}",
+ [(store (srl (loadi64 addr:$dst), CL), addr:$dst)]>,
+ Imp<[CL],[]>;
+def SHR64mi : RIi8<0xC1, MRM5m, (ops i64mem:$dst, i8imm:$src),
+ "shr{q} {$src, $dst|$dst, $src}",
+ [(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def SHR64m1 : RI<0xD1, MRM5m, (ops i64mem:$dst),
+ "shr{q} $dst",
+ [(store (srl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+def SAR64rCL : RI<0xD3, MRM7r, (ops GR64:$dst, GR64:$src),
+ "sar{q} {%cl, $dst|$dst, %CL}",
+ [(set GR64:$dst, (sra GR64:$src, CL))]>, Imp<[CL],[]>;
+def SAR64ri : RIi8<0xC1, MRM7r, (ops GR64:$dst, GR64:$src1, i8imm:$src2),
+ "sar{q} {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (sra GR64:$src1, (i8 imm:$src2)))]>;
+def SAR64r1 : RI<0xD1, MRM7r, (ops GR64:$dst, GR64:$src1),
+ "sar{q} $dst",
+ [(set GR64:$dst, (sra GR64:$src1, (i8 1)))]>;
+} // isTwoAddress
+
+def SAR64mCL : RI<0xD3, MRM7m, (ops i64mem:$dst),
+ "sar{q} {%cl, $dst|$dst, %CL}",
+ [(store (sra (loadi64 addr:$dst), CL), addr:$dst)]>,
+ Imp<[CL],[]>;
+def SAR64mi : RIi8<0xC1, MRM7m, (ops i64mem:$dst, i8imm:$src),
+ "sar{q} {$src, $dst|$dst, $src}",
+ [(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def SAR64m1 : RI<0xD1, MRM7m, (ops i64mem:$dst),
+ "sar{q} $dst",
+ [(store (sra (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
+// Rotate instructions
+let isTwoAddress = 1 in {
+def ROL64rCL : RI<0xD3, MRM0r, (ops GR64:$dst, GR64:$src),
+ "rol{q} {%cl, $dst|$dst, %CL}",
+ [(set GR64:$dst, (rotl GR64:$src, CL))]>, Imp<[CL],[]>;
+def ROL64ri : RIi8<0xC1, MRM0r, (ops GR64:$dst, GR64:$src1, i8imm:$src2),
+ "rol{q} {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))]>;
+def ROL64r1 : RI<0xD1, MRM0r, (ops GR64:$dst, GR64:$src1),
+ "rol{q} $dst",
+ [(set GR64:$dst, (rotl GR64:$src1, (i8 1)))]>;
+} // isTwoAddress
+
+def ROL64mCL : I<0xD3, MRM0m, (ops i64mem:$dst),
+ "rol{q} {%cl, $dst|$dst, %CL}",
+ [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)]>,
+ Imp<[CL],[]>;
+def ROL64mi : RIi8<0xC1, MRM0m, (ops i64mem:$dst, i8imm:$src),
+ "rol{q} {$src, $dst|$dst, $src}",
+ [(store (rotl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def ROL64m1 : RI<0xD1, MRM0m, (ops i64mem:$dst),
+ "rol{q} $dst",
+ [(store (rotl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+def ROR64rCL : RI<0xD3, MRM1r, (ops GR64:$dst, GR64:$src),
+ "ror{q} {%cl, $dst|$dst, %CL}",
+ [(set GR64:$dst, (rotr GR64:$src, CL))]>, Imp<[CL],[]>;
+def ROR64ri : RIi8<0xC1, MRM1r, (ops GR64:$dst, GR64:$src1, i8imm:$src2),
+ "ror{q} {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))]>;
+def ROR64r1 : RI<0xD1, MRM1r, (ops GR64:$dst, GR64:$src1),
+ "ror{q} $dst",
+ [(set GR64:$dst, (rotr GR64:$src1, (i8 1)))]>;
+} // isTwoAddress
+
+def ROR64mCL : RI<0xD3, MRM1m, (ops i64mem:$dst),
+ "ror{q} {%cl, $dst|$dst, %CL}",
+ [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)]>,
+ Imp<[CL],[]>;
+def ROR64mi : RIi8<0xC1, MRM1m, (ops i64mem:$dst, i8imm:$src),
+ "ror{q} {$src, $dst|$dst, $src}",
+ [(store (rotr (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def ROR64m1 : RI<0xD1, MRM1m, (ops i64mem:$dst),
+ "ror{q} $dst",
+ [(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
+// Double shift instructions (generalizations of rotate)
+let isTwoAddress = 1 in {
+def SHLD64rrCL : RI<0xA5, MRMDestReg, (ops GR64:$dst, GR64:$src1, GR64:$src2),
+ "shld{q} {%cl, $src2, $dst|$dst, $src2, %CL}", []>,
+ Imp<[CL],[]>, TB;
+def SHRD64rrCL : RI<0xAD, MRMDestReg, (ops GR64:$dst, GR64:$src1, GR64:$src2),
+ "shrd{q} {%cl, $src2, $dst|$dst, $src2, %CL}", []>,
+ Imp<[CL],[]>, TB;
+
+let isCommutable = 1 in { // FIXME: Update X86InstrInfo::commuteInstruction
+def SHLD64rri8 : RIi8<0xA4, MRMDestReg,
+ (ops GR64:$dst, GR64:$src1, GR64:$src2, i8imm:$src3),
+ "shld{q} {$src3, $src2, $dst|$dst, $src2, $src3}", []>,
+ TB;
+def SHRD64rri8 : RIi8<0xAC, MRMDestReg,
+ (ops GR64:$dst, GR64:$src1, GR64:$src2, i8imm:$src3),
+ "shrd{q} {$src3, $src2, $dst|$dst, $src2, $src3}", []>,
+ TB;
+} // isCommutable
+} // isTwoAddress
+
+// Temporary hack: there are no patterns associated with these instructions,
+// so we have to tell tblgen that they do not produce results.
+let noResults = 1 in {
+def SHLD64mrCL : RI<0xA5, MRMDestMem, (ops i64mem:$dst, GR64:$src2),
+ "shld{q} {%cl, $src2, $dst|$dst, $src2, %CL}", []>,
+ Imp<[CL],[]>, TB;
+def SHRD64mrCL : RI<0xAD, MRMDestMem, (ops i64mem:$dst, GR64:$src2),
+ "shrd{q} {%cl, $src2, $dst|$dst, $src2, %CL}", []>,
+ Imp<[CL],[]>, TB;
+def SHLD64mri8 : RIi8<0xA4, MRMDestMem,
+ (ops i64mem:$dst, GR64:$src2, i8imm:$src3),
+ "shld{q} {$src3, $src2, $dst|$dst, $src2, $src3}", []>,
+ TB;
+def SHRD64mri8 : RIi8<0xAC, MRMDestMem,
+ (ops i64mem:$dst, GR64:$src2, i8imm:$src3),
+ "shrd{q} {$src3, $src2, $dst|$dst, $src2, $src3}", []>,
+ TB;
+} // noResults
+
+//===----------------------------------------------------------------------===//
+// Logical Instructions...
+//
+
+let isTwoAddress = 1 in
+def NOT64r : RI<0xF7, MRM2r, (ops GR64:$dst, GR64:$src), "not{q} $dst",
+ [(set GR64:$dst, (not GR64:$src))]>;
+def NOT64m : RI<0xF7, MRM2m, (ops i64mem:$dst), "not{q} $dst",
+ [(store (not (loadi64 addr:$dst)), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+let isCommutable = 1 in
+def AND64rr : RI<0x21, MRMDestReg,
+ (ops GR64:$dst, GR64:$src1, GR64:$src2),
+ "and{q} {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (and GR64:$src1, GR64:$src2))]>;
+def AND64rm : RI<0x23, MRMSrcMem,
+ (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+ "and{q} {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (and GR64:$src1, (load addr:$src2)))]>;
+def AND64ri32 : RIi32<0x81, MRM4r,
+ (ops GR64:$dst, GR64:$src1, i64i32imm:$src2),
+ "and{q} {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (and GR64:$src1, i64immSExt32:$src2))]>;
+def AND64ri8 : RIi8<0x83, MRM4r,
+ (ops GR64:$dst, GR64:$src1, i64i8imm:$src2),
+ "and{q} {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (and GR64:$src1, i64immSExt8:$src2))]>;
+} // isTwoAddress
+
+def AND64mr : RI<0x21, MRMDestMem,
+ (ops i64mem:$dst, GR64:$src),
+ "and{q} {$src, $dst|$dst, $src}",
+ [(store (and (load addr:$dst), GR64:$src), addr:$dst)]>;
+def AND64mi32 : RIi32<0x81, MRM4m,
+ (ops i64mem:$dst, i64i32imm:$src),
+ "and{q} {$src, $dst|$dst, $src}",
+ [(store (and (loadi64 addr:$dst), i64immSExt32:$src), addr:$dst)]>;
+def AND64mi8 : RIi8<0x83, MRM4m,
+ (ops i64mem:$dst, i64i8imm :$src),
+ "and{q} {$src, $dst|$dst, $src}",
+ [(store (and (load addr:$dst), i64immSExt8:$src), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+let isCommutable = 1 in
+def OR64rr : RI<0x09, MRMDestReg, (ops GR64:$dst, GR64:$src1, GR64:$src2),
+ "or{q} {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (or GR64:$src1, GR64:$src2))]>;
+def OR64rm : RI<0x0B, MRMSrcMem , (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+ "or{q} {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (or GR64:$src1, (load addr:$src2)))]>;
+def OR64ri32 : RIi32<0x81, MRM1r, (ops GR64:$dst, GR64:$src1, i64i32imm:$src2),
+ "or{q} {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (or GR64:$src1, i64immSExt32:$src2))]>;
+def OR64ri8 : RIi8<0x83, MRM1r, (ops GR64:$dst, GR64:$src1, i64i8imm:$src2),
+ "or{q} {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (or GR64:$src1, i64immSExt8:$src2))]>;
+} // isTwoAddress
+
+def OR64mr : RI<0x09, MRMDestMem, (ops i64mem:$dst, GR64:$src),
+ "or{q} {$src, $dst|$dst, $src}",
+ [(store (or (load addr:$dst), GR64:$src), addr:$dst)]>;
+def OR64mi32 : RIi32<0x81, MRM1m, (ops i64mem:$dst, i64i32imm:$src),
+ "or{q} {$src, $dst|$dst, $src}",
+ [(store (or (loadi64 addr:$dst), i64immSExt32:$src), addr:$dst)]>;
+def OR64mi8 : RIi8<0x83, MRM1m, (ops i64mem:$dst, i64i8imm:$src),
+ "or{q} {$src, $dst|$dst, $src}",
+ [(store (or (load addr:$dst), i64immSExt8:$src), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+let isCommutable = 1 in
+def XOR64rr : RI<0x31, MRMDestReg, (ops GR64:$dst, GR64:$src1, GR64:$src2),
+ "xor{q} {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (xor GR64:$src1, GR64:$src2))]>;
+def XOR64rm : RI<0x33, MRMSrcMem, (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+ "xor{q} {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (xor GR64:$src1, (load addr:$src2)))]>;
+def XOR64ri32 : RIi32<0x81, MRM6r,
+ (ops GR64:$dst, GR64:$src1, i64i32imm:$src2),
+ "xor{q} {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (xor GR64:$src1, i64immSExt32:$src2))]>;
+def XOR64ri8 : RIi8<0x83, MRM6r, (ops GR64:$dst, GR64:$src1, i64i8imm:$src2),
+ "xor{q} {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (xor GR64:$src1, i64immSExt8:$src2))]>;
+} // isTwoAddress
+
+def XOR64mr : RI<0x31, MRMDestMem, (ops i64mem:$dst, GR64:$src),
+ "xor{q} {$src, $dst|$dst, $src}",
+ [(store (xor (load addr:$dst), GR64:$src), addr:$dst)]>;
+def XOR64mi32 : RIi32<0x81, MRM6m, (ops i64mem:$dst, i64i32imm:$src),
+ "xor{q} {$src, $dst|$dst, $src}",
+ [(store (xor (loadi64 addr:$dst), i64immSExt32:$src), addr:$dst)]>;
+def XOR64mi8 : RIi8<0x83, MRM6m, (ops i64mem:$dst, i64i8imm :$src),
+ "xor{q} {$src, $dst|$dst, $src}",
+ [(store (xor (load addr:$dst), i64immSExt8:$src), addr:$dst)]>;
+
+//===----------------------------------------------------------------------===//
+// Comparison Instructions...
+//
+
+// Integer comparison
+let isCommutable = 1 in
+def TEST64rr : RI<0x85, MRMDestReg, (ops GR64:$src1, GR64:$src2),
+ "test{q} {$src2, $src1|$src1, $src2}",
+ [(X86cmp (and GR64:$src1, GR64:$src2), 0)]>;
+def TEST64rm : RI<0x85, MRMSrcMem, (ops GR64:$src1, i64mem:$src2),
+ "test{q} {$src2, $src1|$src1, $src2}",
+ [(X86cmp (and GR64:$src1, (loadi64 addr:$src2)), 0)]>;
+def TEST64ri32 : RIi32<0xF7, MRM0r, (ops GR64:$src1, i64i32imm:$src2),
+ "test{q} {$src2, $src1|$src1, $src2}",
+ [(X86cmp (and GR64:$src1, i64immSExt32:$src2), 0)]>;
+def TEST64mi32 : RIi32<0xF7, MRM0m, (ops i64mem:$src1, i64i32imm:$src2),
+ "test{q} {$src2, $src1|$src1, $src2}",
+ [(X86cmp (and (loadi64 addr:$src1), i64immSExt32:$src2), 0)]>;
+
+def CMP64rr : RI<0x39, MRMDestReg, (ops GR64:$src1, GR64:$src2),
+ "cmp{q} {$src2, $src1|$src1, $src2}",
+ [(X86cmp GR64:$src1, GR64:$src2)]>;
+def CMP64mr : RI<0x39, MRMDestMem, (ops i64mem:$src1, GR64:$src2),
+ "cmp{q} {$src2, $src1|$src1, $src2}",
+ [(X86cmp (loadi64 addr:$src1), GR64:$src2)]>;
+def CMP64rm : RI<0x3B, MRMSrcMem, (ops GR64:$src1, i64mem:$src2),
+ "cmp{q} {$src2, $src1|$src1, $src2}",
+ [(X86cmp GR64:$src1, (loadi64 addr:$src2))]>;
+def CMP64ri32 : RIi32<0x81, MRM7r, (ops GR64:$src1, i64i32imm:$src2),
+ "cmp{q} {$src2, $src1|$src1, $src2}",
+ [(X86cmp GR64:$src1, i64immSExt32:$src2)]>;
+def CMP64mi32 : RIi32<0x81, MRM7m, (ops i64mem:$src1, i64i32imm:$src2),
+ "cmp{q} {$src2, $src1|$src1, $src2}",
+ [(X86cmp (loadi64 addr:$src1), i64immSExt32:$src2)]>;
+def CMP64mi8 : RIi8<0x83, MRM7m, (ops i64mem:$src1, i64i8imm:$src2),
+ "cmp{q} {$src2, $src1|$src1, $src2}",
+ [(X86cmp (loadi64 addr:$src1), i64immSExt8:$src2)]>;
+def CMP64ri8 : RIi8<0x83, MRM7r, (ops GR64:$src1, i64i8imm:$src2),
+ "cmp{q} {$src2, $src1|$src1, $src2}",
+ [(X86cmp GR64:$src1, i64immSExt8:$src2)]>;
+
+// Conditional moves
+let isTwoAddress = 1 in {
+def CMOVB64rr : RI<0x42, MRMSrcReg, // if <u, GR64 = GR64
+ (ops GR64:$dst, GR64:$src1, GR64:$src2),
+ "cmovb {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_B))]>, TB;
+def CMOVB64rm : RI<0x42, MRMSrcMem, // if <u, GR64 = [mem64]
+ (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+ "cmovb {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_B))]>, TB;
+def CMOVAE64rr: RI<0x43, MRMSrcReg, // if >=u, GR64 = GR64
+ (ops GR64:$dst, GR64:$src1, GR64:$src2),
+ "cmovae {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_AE))]>, TB;
+def CMOVAE64rm: RI<0x43, MRMSrcMem, // if >=u, GR64 = [mem64]
+ (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+ "cmovae {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_AE))]>, TB;
+def CMOVE64rr : RI<0x44, MRMSrcReg, // if ==, GR64 = GR64
+ (ops GR64:$dst, GR64:$src1, GR64:$src2),
+ "cmove {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_E))]>, TB;
+def CMOVE64rm : RI<0x44, MRMSrcMem, // if ==, GR64 = [mem64]
+ (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+ "cmove {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_E))]>, TB;
+def CMOVNE64rr: RI<0x45, MRMSrcReg, // if !=, GR64 = GR64
+ (ops GR64:$dst, GR64:$src1, GR64:$src2),
+ "cmovne {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_NE))]>, TB;
+def CMOVNE64rm: RI<0x45, MRMSrcMem, // if !=, GR64 = [mem64]
+ (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+ "cmovne {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_NE))]>, TB;
+def CMOVBE64rr: RI<0x46, MRMSrcReg, // if <=u, GR64 = GR64
+ (ops GR64:$dst, GR64:$src1, GR64:$src2),
+ "cmovbe {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_BE))]>, TB;
+def CMOVBE64rm: RI<0x46, MRMSrcMem, // if <=u, GR64 = [mem64]
+ (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+ "cmovbe {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_BE))]>, TB;
+def CMOVA64rr : RI<0x47, MRMSrcReg, // if >u, GR64 = GR64
+ (ops GR64:$dst, GR64:$src1, GR64:$src2),
+ "cmova {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_A))]>, TB;
+def CMOVA64rm : RI<0x47, MRMSrcMem, // if >u, GR64 = [mem64]
+ (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+ "cmova {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_A))]>, TB;
+def CMOVL64rr : RI<0x4C, MRMSrcReg, // if <s, GR64 = GR64
+ (ops GR64:$dst, GR64:$src1, GR64:$src2),
+ "cmovl {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_L))]>, TB;
+def CMOVL64rm : RI<0x4C, MRMSrcMem, // if <s, GR64 = [mem64]
+ (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+ "cmovl {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_L))]>, TB;
+def CMOVGE64rr: RI<0x4D, MRMSrcReg, // if >=s, GR64 = GR64
+ (ops GR64:$dst, GR64:$src1, GR64:$src2),
+ "cmovge {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_GE))]>, TB;
+def CMOVGE64rm: RI<0x4D, MRMSrcMem, // if >=s, GR64 = [mem64]
+ (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+ "cmovge {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_GE))]>, TB;
+def CMOVLE64rr: RI<0x4E, MRMSrcReg, // if <=s, GR64 = GR64
+ (ops GR64:$dst, GR64:$src1, GR64:$src2),
+ "cmovle {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_LE))]>, TB;
+def CMOVLE64rm: RI<0x4E, MRMSrcMem, // if <=s, GR64 = [mem64]
+ (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+ "cmovle {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_LE))]>, TB;
+def CMOVG64rr : RI<0x4F, MRMSrcReg, // if >s, GR64 = GR64
+ (ops GR64:$dst, GR64:$src1, GR64:$src2),
+ "cmovg {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_G))]>, TB;
+def CMOVG64rm : RI<0x4F, MRMSrcMem, // if >s, GR64 = [mem64]
+ (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+ "cmovg {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_G))]>, TB;
+def CMOVS64rr : RI<0x48, MRMSrcReg, // if signed, GR64 = GR64
+ (ops GR64:$dst, GR64:$src1, GR64:$src2),
+ "cmovs {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_S))]>, TB;
+def CMOVS64rm : RI<0x48, MRMSrcMem, // if signed, GR64 = [mem64]
+ (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+ "cmovs {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_S))]>, TB;
+def CMOVNS64rr: RI<0x49, MRMSrcReg, // if !signed, GR64 = GR64
+ (ops GR64:$dst, GR64:$src1, GR64:$src2),
+ "cmovns {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_NS))]>, TB;
+def CMOVNS64rm: RI<0x49, MRMSrcMem, // if !signed, GR64 = [mem64]
+ (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+ "cmovns {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_NS))]>, TB;
+def CMOVP64rr : RI<0x4A, MRMSrcReg, // if parity, GR64 = GR64
+ (ops GR64:$dst, GR64:$src1, GR64:$src2),
+ "cmovp {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_P))]>, TB;
+def CMOVP64rm : RI<0x4A, MRMSrcMem, // if parity, GR64 = [mem64]
+ (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+ "cmovp {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_P))]>, TB;
+def CMOVNP64rr : RI<0x4B, MRMSrcReg, // if !parity, GR64 = GR64
+ (ops GR64:$dst, GR64:$src1, GR64:$src2),
+ "cmovnp {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+ X86_COND_NP))]>, TB;
+def CMOVNP64rm : RI<0x4B, MRMSrcMem, // if !parity, GR64 = [mem64]
+ (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+ "cmovnp {$src2, $dst|$dst, $src2}",
+ [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+ X86_COND_NP))]>, TB;
+} // isTwoAddress
+
+//===----------------------------------------------------------------------===//
+// Conversion Instructions...
+//
+
+// f64 -> signed i64
+def Int_CVTSD2SI64rr: RSDI<0x2D, MRMSrcReg, (ops GR64:$dst, VR128:$src),
+ "cvtsd2si{q} {$src, $dst|$dst, $src}",
+ []>; // TODO: add intrinsic
+def Int_CVTSD2SI64rm: RSDI<0x2D, MRMSrcMem, (ops GR64:$dst, f128mem:$src),
+ "cvtsd2si{q} {$src, $dst|$dst, $src}",
+ []>; // TODO: add intrinsic
+def CVTTSD2SI64rr: RSDI<0x2C, MRMSrcReg, (ops GR64:$dst, FR64:$src),
+ "cvttsd2si{q} {$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (fp_to_sint FR64:$src))]>;
+def CVTTSD2SI64rm: RSDI<0x2C, MRMSrcMem, (ops GR64:$dst, f64mem:$src),
+ "cvttsd2si{q} {$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (fp_to_sint (loadf64 addr:$src)))]>;
+def Int_CVTTSD2SI64rr: RSDI<0x2C, MRMSrcReg, (ops GR64:$dst, VR128:$src),
+ "cvttsd2si{q} {$src, $dst|$dst, $src}",
+ []>; // TODO: add intrinsic
+def Int_CVTTSD2SI64rm: RSDI<0x2C, MRMSrcMem, (ops GR64:$dst, f128mem:$src),
+ "cvttsd2si{q} {$src, $dst|$dst, $src}",
+ []>; // TODO: add intrinsic
+
+// Signed i64 -> f64
+def CVTSI2SD64rr: RSDI<0x2A, MRMSrcReg, (ops FR64:$dst, GR64:$src),
+ "cvtsi2sd{q} {$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (sint_to_fp GR64:$src))]>;
+def CVTSI2SD64rm: RSDI<0x2A, MRMSrcMem, (ops FR64:$dst, i64mem:$src),
+ "cvtsi2sd{q} {$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (sint_to_fp (loadi64 addr:$src)))]>;
+let isTwoAddress = 1 in {
+def Int_CVTSI2SD64rr: RSDI<0x2A, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, GR64:$src2),
+ "cvtsi2sd{q} {$src2, $dst|$dst, $src2}",
+ []>; // TODO: add intrinsic
+def Int_CVTSI2SD64rm: RSDI<0x2A, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, i64mem:$src2),
+ "cvtsi2sd{q} {$src2, $dst|$dst, $src2}",
+ []>; // TODO: add intrinsic
+} // isTwoAddress
+
+// Signed i64 -> f32
+def CVTSI2SS64rr: RSSI<0x2A, MRMSrcReg, (ops FR32:$dst, GR64:$src),
+ "cvtsi2ss{q} {$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (sint_to_fp GR64:$src))]>;
+def CVTSI2SS64rm: RSSI<0x2A, MRMSrcMem, (ops FR32:$dst, i64mem:$src),
+ "cvtsi2ss{q} {$src, $dst|$dst, $src}",
+ [(set FR32:$dst, (sint_to_fp (loadi64 addr:$src)))]>;
+let isTwoAddress = 1 in {
+def Int_CVTSI2SS64rr: RSSI<0x2A, MRMSrcReg,
+ (ops VR128:$dst, VR128:$src1, GR64:$src2),
+ "cvtsi2ss{q} {$src2, $dst|$dst, $src2}",
+ []>; // TODO: add intrinsic
+def Int_CVTSI2SS64rm: RSSI<0x2A, MRMSrcMem,
+ (ops VR128:$dst, VR128:$src1, i64mem:$src2),
+ "cvtsi2ss{q} {$src2, $dst|$dst, $src2}",
+ []>; // TODO: add intrinsic
+} // isTwoAddress
+
+// f32 -> signed i64
+def Int_CVTSS2SI64rr: RSSI<0x2D, MRMSrcReg, (ops GR64:$dst, VR128:$src),
+ "cvtss2si{q} {$src, $dst|$dst, $src}",
+ []>; // TODO: add intrinsic
+def Int_CVTSS2SI64rm: RSSI<0x2D, MRMSrcMem, (ops GR64:$dst, f32mem:$src),
+ "cvtss2si{q} {$src, $dst|$dst, $src}",
+ []>; // TODO: add intrinsic
+def CVTTSS2SI64rr: RSSI<0x2C, MRMSrcReg, (ops GR64:$dst, FR32:$src),
+ "cvttss2si{q} {$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (fp_to_sint FR32:$src))]>;
+def CVTTSS2SI64rm: RSSI<0x2C, MRMSrcMem, (ops GR64:$dst, f32mem:$src),
+ "cvttss2si{q} {$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (fp_to_sint (loadf32 addr:$src)))]>;
+def Int_CVTTSS2SI64rr: RSSI<0x2C, MRMSrcReg, (ops GR64:$dst, VR128:$src),
+ "cvttss2si{q} {$src, $dst|$dst, $src}",
+ []>; // TODO: add intrinsic
+def Int_CVTTSS2SI64rm: RSSI<0x2C, MRMSrcMem, (ops GR64:$dst, f32mem:$src),
+ "cvttss2si{q} {$src, $dst|$dst, $src}",
+ []>; // TODO: add intrinsic
+
+//===----------------------------------------------------------------------===//
+// Alias Instructions
+//===----------------------------------------------------------------------===//
+
+// Truncate
+// In 64-bit mode, every 64-bit and 32-bit register has a low 8-bit sub-register.
+def TRUNC_64to8 : I<0x88, MRMDestReg, (ops GR8:$dst, GR64:$src),
+                    "mov{b} {${src:subreg8}, $dst|$dst, ${src:subreg8}}",
+ [(set GR8:$dst, (trunc GR64:$src))]>;
+def TRUNC_32to8 : I<0x88, MRMDestReg, (ops GR8:$dst, GR32:$src),
+                    "mov{b} {${src:subreg8}, $dst|$dst, ${src:subreg8}}",
+ [(set GR8:$dst, (trunc GR32:$src))]>,
+ Requires<[In64BitMode]>;
+def TRUNC_16to8 : I<0x88, MRMDestReg, (ops GR8:$dst, GR16:$src),
+ "mov{b} {${src:subreg8}, $dst|$dst, ${src:subreg8}}",
+ [(set GR8:$dst, (trunc GR16:$src))]>,
+ Requires<[In64BitMode]>;
+
+def TRUNC_64to16 : I<0x89, MRMDestReg, (ops GR16:$dst, GR64:$src),
+ "mov{w} {${src:subreg16}, $dst|$dst, ${src:subreg16}}",
+ [(set GR16:$dst, (trunc GR64:$src))]>;
+
+def TRUNC_64to32 : I<0x89, MRMDestReg, (ops GR32:$dst, GR64:$src),
+ "mov{l} {${src:subreg32}, $dst|$dst, ${src:subreg32}}",
+ [(set GR32:$dst, (trunc GR64:$src))]>;
+
+// Zero-extension
+// TODO: Remove this after proper i32 -> i64 zext support.
+def PsMOVZX64rr32: I<0x89, MRMDestReg, (ops GR64:$dst, GR32:$src),
+ "mov{l} {$src, ${dst:subreg32}|${dst:subreg32}, $src}",
+ [(set GR64:$dst, (zext GR32:$src))]>;
+def PsMOVZX64rm32: I<0x8B, MRMSrcMem, (ops GR64:$dst, i32mem:$src),
+ "mov{l} {$src, ${dst:subreg32}|${dst:subreg32}, $src}",
+ [(set GR64:$dst, (zextloadi64i32 addr:$src))]>;
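+// In 64-bit mode a 32-bit mov implicitly zero-extends its result into the
+// full 64-bit register, which is why these pseudos can be printed as a plain
+// mov{l} with no separate zero-extension instruction.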
+
+
+// Alias instructions that map movr0 to xor.
+// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
+// FIXME: AddedComplexity gives MOV64r0 a higher priority than MOV64ri32. Remove
+// when we have a better way to specify isel priority.
+let AddedComplexity = 1 in
+def MOV64r0 : RI<0x31, MRMInitReg, (ops GR64:$dst),
+ "xor{q} $dst, $dst",
+ [(set GR64:$dst, 0)]>;
+
+// Materialize an i64 constant whose top 32 bits are zero.
+let AddedComplexity = 1 in
+def MOV64ri64i32 : Ii32<0xB8, AddRegFrm, (ops GR64:$dst, i64i32imm:$src),
+ "mov{l} {$src, ${dst:subreg32}|${dst:subreg32}, $src}",
+ [(set GR64:$dst, i64immZExt32:$src)]>;
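+// The 32-bit immediate form (opcode B8+rd, 5 bytes) zero-extends into the
+// full 64-bit register, making it preferable to the 10-byte full-width mov
+// (movabs) encoding whenever the upper half of the constant is known to be
+// zero.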
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable
+def : Pat<(i64 (X86Wrapper tconstpool :$dst)),
+ (MOV64ri tconstpool :$dst)>, Requires<[NotSmallCode]>;
+def : Pat<(i64 (X86Wrapper tjumptable :$dst)),
+ (MOV64ri tjumptable :$dst)>, Requires<[NotSmallCode]>;
+def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
+ (MOV64ri tglobaladdr :$dst)>, Requires<[NotSmallCode]>;
+def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
+ (MOV64ri texternalsym:$dst)>, Requires<[NotSmallCode]>;
+
+def : Pat<(store (i64 (X86Wrapper tconstpool:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, tconstpool:$src)>,
+ Requires<[SmallCode, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper tjumptable:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, tjumptable:$src)>,
+ Requires<[SmallCode, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper tglobaladdr:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, tglobaladdr:$src)>,
+ Requires<[SmallCode, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper texternalsym:$src)), addr:$dst),
+ (MOV64mi32 addr:$dst, texternalsym:$src)>,
+ Requires<[SmallCode, IsStatic]>;
+
+// Calls
+// Direct PC relative function call for small code model. 32-bit displacement
+// sign extended to 64-bit.
+def : Pat<(X86call (i64 tglobaladdr:$dst)),
+ (CALL64pcrel32 tglobaladdr:$dst)>;
+def : Pat<(X86call (i64 texternalsym:$dst)),
+ (CALL64pcrel32 texternalsym:$dst)>;
+
+def : Pat<(X86tailcall (i64 tglobaladdr:$dst)),
+ (CALL64pcrel32 tglobaladdr:$dst)>;
+def : Pat<(X86tailcall (i64 texternalsym:$dst)),
+ (CALL64pcrel32 texternalsym:$dst)>;
+
+def : Pat<(X86tailcall GR64:$dst),
+ (CALL64r GR64:$dst)>;
+
+// {s|z}extload bool -> {s|z}extload byte
+def : Pat<(sextloadi64i1 addr:$src), (MOVSX64rm8 addr:$src)>;
+def : Pat<(zextloadi64i1 addr:$src), (MOVZX64rm8 addr:$src)>;
+
+// extload
+def : Pat<(extloadi64i1 addr:$src), (MOVZX64rm8 addr:$src)>;
+def : Pat<(extloadi64i8 addr:$src), (MOVZX64rm8 addr:$src)>;
+def : Pat<(extloadi64i16 addr:$src), (MOVZX64rm16 addr:$src)>;
+def : Pat<(extloadi64i32 addr:$src), (PsMOVZX64rm32 addr:$src)>;
+
+// anyext -> zext
+def : Pat<(i64 (anyext GR8 :$src)), (MOVZX64rr8 GR8 :$src)>;
+def : Pat<(i64 (anyext GR16:$src)), (MOVZX64rr16 GR16:$src)>;
+def : Pat<(i64 (anyext GR32:$src)), (PsMOVZX64rr32 GR32:$src)>;
+def : Pat<(i64 (anyext (loadi8 addr:$src))), (MOVZX64rm8 addr:$src)>;
+def : Pat<(i64 (anyext (loadi16 addr:$src))), (MOVZX64rm16 addr:$src)>;
+def : Pat<(i64 (anyext (loadi32 addr:$src))), (PsMOVZX64rm32 addr:$src)>;
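+// anyext leaves the upper bits of the result unspecified, so lowering it as
+// a zero-extension is always correct; it simply promises more than the node
+// strictly requires.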
+
+//===----------------------------------------------------------------------===//
+// Some peepholes
+//===----------------------------------------------------------------------===//
+
+// (shl x, 1) ==> (add x, x)
+def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>;
+
+// (or (x >> c) | (y << (64 - c))) ==> (shrd64 x, y, c)
+def : Pat<(or (srl GR64:$src1, CL:$amt),
+ (shl GR64:$src2, (sub 64, CL:$amt))),
+ (SHRD64rrCL GR64:$src1, GR64:$src2)>;
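+// For example, with %cl = 8 this matches (x >> 8) | (y << 56): the low 8 bits
+// of y are shifted in above the remaining 56 bits of x, which is exactly the
+// double-precision right shift that a single SHRD performs.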
+
+def : Pat<(store (or (srl (loadi64 addr:$dst), CL:$amt),
+ (shl GR64:$src2, (sub 64, CL:$amt))), addr:$dst),
+ (SHRD64mrCL addr:$dst, GR64:$src2)>;
+
+// (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
+def : Pat<(or (shl GR64:$src1, CL:$amt),
+ (srl GR64:$src2, (sub 64, CL:$amt))),
+ (SHLD64rrCL GR64:$src1, GR64:$src2)>;
+
+def : Pat<(store (or (shl (loadi64 addr:$dst), CL:$amt),
+ (srl GR64:$src2, (sub 64, CL:$amt))), addr:$dst),
+ (SHLD64mrCL addr:$dst, GR64:$src2)>;
+
+// X86 specific add which produces a flag.
+def : Pat<(addc GR64:$src1, GR64:$src2),
+ (ADD64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(addc GR64:$src1, (load addr:$src2)),
+ (ADD64rm GR64:$src1, addr:$src2)>;
+def : Pat<(addc GR64:$src1, i64immSExt32:$src2),
+ (ADD64ri32 GR64:$src1, imm:$src2)>;
+def : Pat<(addc GR64:$src1, i64immSExt8:$src2),
+ (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>;
+
+def : Pat<(subc GR64:$src1, GR64:$src2),
+ (SUB64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(subc GR64:$src1, (load addr:$src2)),
+ (SUB64rm GR64:$src1, addr:$src2)>;
+def : Pat<(subc GR64:$src1, imm:$src2),
+ (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>;
+def : Pat<(subc GR64:$src1, i64immSExt8:$src2),
+ (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>;
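+// addc and subc differ from add and sub only in that their carry result is
+// consumed by a later adde/sube; since the ADD64*/SUB64* instructions already
+// set the carry flag, the ordinary forms can be used directly.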
+
+
+//===----------------------------------------------------------------------===//
+// X86-64 SSE Instructions
+//===----------------------------------------------------------------------===//
+
+// Move instructions...
+
+def MOV64toPQIrr : RPDI<0x6E, MRMSrcReg, (ops VR128:$dst, GR64:$src),
+ "mov{d|q} {$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (scalar_to_vector GR64:$src)))]>;
+def MOV64toPQIrm : RPDI<0x6E, MRMSrcMem, (ops VR128:$dst, i64mem:$src),
+ "mov{d|q} {$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>;
+
+def MOVPQIto64rr : RPDI<0x7E, MRMDestReg, (ops GR64:$dst, VR128:$src),
+ "mov{d|q} {$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
+ (iPTR 0)))]>;
+def MOVPQIto64mr : RPDI<0x7E, MRMDestMem, (ops i64mem:$dst, VR128:$src),
+ "mov{d|q} {$src, $dst|$dst, $src}",
+ [(store (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))), addr:$dst)]>;
+
+def MOV64toSDrr : RPDI<0x6E, MRMSrcReg, (ops FR64:$dst, GR64:$src),
+ "mov{d|q} {$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (bitconvert GR64:$src))]>;
+def MOV64toSDrm : RPDI<0x6E, MRMSrcMem, (ops FR64:$dst, i64mem:$src),
+ "mov{d|q} {$src, $dst|$dst, $src}",
+ [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>;
+
+def MOVSDto64rr : RPDI<0x7E, MRMDestReg, (ops GR64:$dst, FR64:$src),
+ "mov{d|q} {$src, $dst|$dst, $src}",
+ [(set GR64:$dst, (bitconvert FR64:$src))]>;
+def MOVSDto64mr : RPDI<0x7E, MRMDestMem, (ops i64mem:$dst, FR64:$src),
+ "mov{d|q} {$src, $dst|$dst, $src}",
+ [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>;
diff --git a/lib/Target/X86/X86IntelAsmPrinter.cpp b/lib/Target/X86/X86IntelAsmPrinter.cpp
new file mode 100755
index 0000000..39b65ee
--- /dev/null
+++ b/lib/Target/X86/X86IntelAsmPrinter.cpp
@@ -0,0 +1,533 @@
+//===-- X86IntelAsmPrinter.cpp - Convert X86 LLVM code to Intel assembly --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to Intel format assembly language.
+// This printer is the output mechanism used by `llc'.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "X86IntelAsmPrinter.h"
+#include "X86TargetAsmInfo.h"
+#include "X86.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/Module.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(EmittedInsts, "Number of machine instrs printed");
+
+std::string X86IntelAsmPrinter::getSectionForFunction(const Function &F) const {
+ // Intel asm always emits functions to _text.
+ return "_text";
+}
+
+/// runOnMachineFunction - This uses the printMachineInstruction()
+/// method to print assembly for each instruction.
+///
+bool X86IntelAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+ SetupMachineFunction(MF);
+ O << "\n\n";
+
+ // Print out constants referenced by the function
+ EmitConstantPool(MF.getConstantPool());
+
+ // Print out labels for the function.
+ const Function *F = MF.getFunction();
+ unsigned CC = F->getCallingConv();
+
+  // Populate the function information map. Right now we only want to
+  // populate information for stdcall and fastcall functions.
+ if (CC == CallingConv::X86_StdCall || CC == CallingConv::X86_FastCall)
+ FunctionInfoMap[F] = *MF.getInfo<X86MachineFunctionInfo>();
+
+ X86SharedAsmPrinter::decorateName(CurrentFnName, F);
+
+ SwitchToTextSection(getSectionForFunction(*F).c_str(), F);
+
+ switch (F->getLinkage()) {
+ default: assert(0 && "Unsupported linkage type!");
+ case Function::InternalLinkage:
+ EmitAlignment(4);
+ break;
+ case Function::DLLExportLinkage:
+ DLLExportedFns.insert(CurrentFnName);
+ //FALLS THROUGH
+ case Function::ExternalLinkage:
+ O << "\tpublic " << CurrentFnName << "\n";
+ EmitAlignment(4);
+ break;
+ }
+
+ O << CurrentFnName << "\tproc near\n";
+
+ // Print out code for the function.
+ for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+ I != E; ++I) {
+ // Print a label for the basic block if there are any predecessors.
+ if (I->pred_begin() != I->pred_end()) {
+ printBasicBlockLabel(I, true);
+ O << '\n';
+ }
+ for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
+ II != E; ++II) {
+ // Print the assembly for the instruction.
+ O << "\t";
+ printMachineInstruction(II);
+ }
+ }
+
+ // Print out jump tables referenced by the function.
+ EmitJumpTableInfo(MF.getJumpTableInfo(), MF);
+
+ O << CurrentFnName << "\tendp\n";
+
+ // We didn't modify anything.
+ return false;
+}
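+// For an externally visible function whose mangled name were, say, _foo, the
+// skeleton printed above looks roughly like:
+//
+//          public _foo
+//   _foo   proc near
+//          ; ...basic block labels and instructions...
+//   _foo   endp
+//
+// with the exact name depending on the mangler and on any stdcall/fastcall
+// decoration applied by decorateName.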
+
+void X86IntelAsmPrinter::printSSECC(const MachineInstr *MI, unsigned Op) {
+ unsigned char value = MI->getOperand(Op).getImmedValue();
+ assert(value <= 7 && "Invalid ssecc argument!");
+ switch (value) {
+ case 0: O << "eq"; break;
+ case 1: O << "lt"; break;
+ case 2: O << "le"; break;
+ case 3: O << "unord"; break;
+ case 4: O << "neq"; break;
+ case 5: O << "nlt"; break;
+ case 6: O << "nle"; break;
+ case 7: O << "ord"; break;
+ }
+}
+
+void X86IntelAsmPrinter::printOp(const MachineOperand &MO,
+ const char *Modifier) {
+ const MRegisterInfo &RI = *TM.getRegisterInfo();
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register: {
+ if (MRegisterInfo::isPhysicalRegister(MO.getReg())) {
+ unsigned Reg = MO.getReg();
+ if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) {
+ MVT::ValueType VT = (strcmp(Modifier,"subreg64") == 0) ?
+ MVT::i64 : ((strcmp(Modifier, "subreg32") == 0) ? MVT::i32 :
+ ((strcmp(Modifier,"subreg16") == 0) ? MVT::i16 :MVT::i8));
+ Reg = getX86SubSuperRegister(Reg, VT);
+ }
+ O << RI.get(Reg).Name;
+ } else
+ O << "reg" << MO.getReg();
+ return;
+ }
+ case MachineOperand::MO_Immediate:
+ O << MO.getImmedValue();
+ return;
+ case MachineOperand::MO_MachineBasicBlock:
+ printBasicBlockLabel(MO.getMachineBasicBlock());
+ return;
+ case MachineOperand::MO_JumpTableIndex: {
+ bool isMemOp = Modifier && !strcmp(Modifier, "mem");
+ if (!isMemOp) O << "OFFSET ";
+ O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+ << "_" << MO.getJumpTableIndex();
+ return;
+ }
+ case MachineOperand::MO_ConstantPoolIndex: {
+ bool isMemOp = Modifier && !strcmp(Modifier, "mem");
+ if (!isMemOp) O << "OFFSET ";
+ O << "[" << TAI->getPrivateGlobalPrefix() << "CPI"
+ << getFunctionNumber() << "_" << MO.getConstantPoolIndex();
+ int Offset = MO.getOffset();
+ if (Offset > 0)
+ O << " + " << Offset;
+ else if (Offset < 0)
+ O << Offset;
+ O << "]";
+ return;
+ }
+ case MachineOperand::MO_GlobalAddress: {
+ bool isCallOp = Modifier && !strcmp(Modifier, "call");
+ bool isMemOp = Modifier && !strcmp(Modifier, "mem");
+ GlobalValue *GV = MO.getGlobal();
+ std::string Name = Mang->getValueName(GV);
+
+ X86SharedAsmPrinter::decorateName(Name, GV);
+
+ if (!isMemOp && !isCallOp) O << "OFFSET ";
+ if (GV->hasDLLImportLinkage()) {
+ // FIXME: This should be fixed with full support of stdcall & fastcall
+ // CC's
+ O << "__imp_";
+ }
+ O << Name;
+ int Offset = MO.getOffset();
+ if (Offset > 0)
+ O << " + " << Offset;
+ else if (Offset < 0)
+ O << Offset;
+ return;
+ }
+ case MachineOperand::MO_ExternalSymbol: {
+ bool isCallOp = Modifier && !strcmp(Modifier, "call");
+ if (!isCallOp) O << "OFFSET ";
+ O << TAI->getGlobalPrefix() << MO.getSymbolName();
+ return;
+ }
+ default:
+ O << "<unknown operand type>"; return;
+ }
+}
+
+void X86IntelAsmPrinter::printMemReference(const MachineInstr *MI, unsigned Op,
+ const char *Modifier) {
+ assert(isMem(MI, Op) && "Invalid memory reference!");
+
+ const MachineOperand &BaseReg = MI->getOperand(Op);
+ int ScaleVal = MI->getOperand(Op+1).getImmedValue();
+ const MachineOperand &IndexReg = MI->getOperand(Op+2);
+ const MachineOperand &DispSpec = MI->getOperand(Op+3);
+
+ O << "[";
+ bool NeedPlus = false;
+ if (BaseReg.getReg()) {
+ printOp(BaseReg, Modifier);
+ NeedPlus = true;
+ }
+
+ if (IndexReg.getReg()) {
+ if (NeedPlus) O << " + ";
+ if (ScaleVal != 1)
+ O << ScaleVal << "*";
+ printOp(IndexReg, Modifier);
+ NeedPlus = true;
+ }
+
+ if (DispSpec.isGlobalAddress() || DispSpec.isConstantPoolIndex() ||
+ DispSpec.isJumpTableIndex()) {
+ if (NeedPlus)
+ O << " + ";
+ printOp(DispSpec, "mem");
+ } else {
+ int DispVal = DispSpec.getImmedValue();
+ if (DispVal || (!BaseReg.getReg() && !IndexReg.getReg())) {
+ if (NeedPlus)
+ if (DispVal > 0)
+ O << " + ";
+ else {
+ O << " - ";
+ DispVal = -DispVal;
+ }
+ O << DispVal;
+ }
+ }
+ O << "]";
+}
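+// The printed operand is the usual Intel bracketed form; for example a
+// (base, scale, index, disp) tuple of (EBP, 1, no index, 8) comes out as
+// "[EBP + 8]" and (ESP, 4, EAX, 12) as "[ESP + 4*EAX + 12]" (register-name
+// casing follows the register definitions).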
+
+void X86IntelAsmPrinter::printPICLabel(const MachineInstr *MI, unsigned Op) {
+ O << "\"L" << getFunctionNumber() << "$pb\"\n";
+ O << "\"L" << getFunctionNumber() << "$pb\":";
+}
+
+bool X86IntelAsmPrinter::printAsmMRegister(const MachineOperand &MO,
+ const char Mode) {
+ const MRegisterInfo &RI = *TM.getRegisterInfo();
+ unsigned Reg = MO.getReg();
+ switch (Mode) {
+ default: return true; // Unknown mode.
+ case 'b': // Print QImode register
+ Reg = getX86SubSuperRegister(Reg, MVT::i8);
+ break;
+ case 'h': // Print QImode high register
+ Reg = getX86SubSuperRegister(Reg, MVT::i8, true);
+ break;
+ case 'w': // Print HImode register
+ Reg = getX86SubSuperRegister(Reg, MVT::i16);
+ break;
+ case 'k': // Print SImode register
+ Reg = getX86SubSuperRegister(Reg, MVT::i32);
+ break;
+ }
+
+ O << '%' << RI.get(Reg).Name;
+ return false;
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool X86IntelAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode) {
+ // Does this asm operand have a single letter operand modifier?
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
+ default: return true; // Unknown modifier.
+ case 'b': // Print QImode register
+ case 'h': // Print QImode high register
+ case 'w': // Print HImode register
+ case 'k': // Print SImode register
+ return printAsmMRegister(MI->getOperand(OpNo), ExtraCode[0]);
+ }
+ }
+
+ printOperand(MI, OpNo);
+ return false;
+}
+
+bool X86IntelAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+ unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode) {
+ if (ExtraCode && ExtraCode[0])
+ return true; // Unknown modifier.
+ printMemReference(MI, OpNo);
+ return false;
+}
+
+/// printMachineInstruction -- Print out a single X86 LLVM instruction
+/// MI in Intel syntax to the current output stream.
+///
+void X86IntelAsmPrinter::printMachineInstruction(const MachineInstr *MI) {
+ ++EmittedInsts;
+
+ // See if a truncate instruction can be turned into a nop.
+ switch (MI->getOpcode()) {
+ default: break;
+ case X86::TRUNC_64to32:
+ case X86::TRUNC_64to16:
+ case X86::TRUNC_32to16:
+ case X86::TRUNC_32to8:
+ case X86::TRUNC_16to8:
+ case X86::TRUNC_32_to8:
+ case X86::TRUNC_16_to8: {
+ const MachineOperand &MO0 = MI->getOperand(0);
+ const MachineOperand &MO1 = MI->getOperand(1);
+ unsigned Reg0 = MO0.getReg();
+ unsigned Reg1 = MO1.getReg();
+ unsigned Opc = MI->getOpcode();
+ if (Opc == X86::TRUNC_64to32)
+ Reg1 = getX86SubSuperRegister(Reg1, MVT::i32);
+ else if (Opc == X86::TRUNC_32to16 || Opc == X86::TRUNC_64to16)
+ Reg1 = getX86SubSuperRegister(Reg1, MVT::i16);
+ else
+ Reg1 = getX86SubSuperRegister(Reg1, MVT::i8);
+ O << TAI->getCommentString() << " TRUNCATE ";
+ if (Reg0 != Reg1)
+ O << "\n\t";
+ break;
+ }
+ case X86::PsMOVZX64rr32:
+ O << TAI->getCommentString() << " ZERO-EXTEND " << "\n\t";
+ break;
+ }
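+  // When the two registers turn out to be the same physical register, no
+  // newline is printed after the TRUNCATE marker above, so the mov emitted
+  // by printInstruction below lands on the comment line and becomes a no-op
+  // in the final assembly.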
+
+ // Call the autogenerated instruction printer routines.
+ printInstruction(MI);
+}
+
+bool X86IntelAsmPrinter::doInitialization(Module &M) {
+ X86SharedAsmPrinter::doInitialization(M);
+
+ Mang->markCharUnacceptable('.');
+
+ O << "\t.686\n\t.model flat\n\n";
+
+ // Emit declarations for external functions.
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+ if (I->isDeclaration()) {
+ std::string Name = Mang->getValueName(I);
+ X86SharedAsmPrinter::decorateName(Name, I);
+
+ O << "\textern " ;
+ if (I->hasDLLImportLinkage()) {
+ O << "__imp_";
+ }
+ O << Name << ":near\n";
+ }
+
+ // Emit declarations for external globals. Note that VC++ always declares
+ // external globals to have type byte, and if that's good enough for VC++...
+ for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I) {
+ if (I->isDeclaration()) {
+ std::string Name = Mang->getValueName(I);
+
+ O << "\textern " ;
+ if (I->hasDLLImportLinkage()) {
+ O << "__imp_";
+ }
+ O << Name << ":byte\n";
+ }
+ }
+
+ return false;
+}
+
+bool X86IntelAsmPrinter::doFinalization(Module &M) {
+ const TargetData *TD = TM.getTargetData();
+
+ // Print out module-level global variables here.
+ for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I) {
+    if (I->isDeclaration()) continue;   // External globals require no code.
+
+ // Check to see if this is a special global used by LLVM, if so, emit it.
+ if (EmitSpecialLLVMGlobal(I))
+ continue;
+
+ std::string name = Mang->getValueName(I);
+ Constant *C = I->getInitializer();
+ unsigned Align = TD->getPreferredAlignmentLog(I);
+ bool bCustomSegment = false;
+
+ switch (I->getLinkage()) {
+ case GlobalValue::LinkOnceLinkage:
+ case GlobalValue::WeakLinkage:
+ SwitchToDataSection("");
+ O << name << "?\tsegment common 'COMMON'\n";
+ bCustomSegment = true;
+ // FIXME: the default alignment is 16 bytes, but 1, 2, 4, and 256
+ // are also available.
+ break;
+ case GlobalValue::AppendingLinkage:
+ SwitchToDataSection("");
+ O << name << "?\tsegment public 'DATA'\n";
+ bCustomSegment = true;
+ // FIXME: the default alignment is 16 bytes, but 1, 2, 4, and 256
+ // are also available.
+ break;
+ case GlobalValue::DLLExportLinkage:
+ DLLExportedGVs.insert(name);
+ // FALL THROUGH
+ case GlobalValue::ExternalLinkage:
+ O << "\tpublic " << name << "\n";
+ // FALL THROUGH
+ case GlobalValue::InternalLinkage:
+ SwitchToDataSection(TAI->getDataSection(), I);
+ break;
+ default:
+ assert(0 && "Unknown linkage type!");
+ }
+
+ if (!bCustomSegment)
+ EmitAlignment(Align, I);
+
+ O << name << ":\t\t\t\t" << TAI->getCommentString()
+ << " " << I->getName() << '\n';
+
+ EmitGlobalConstant(C);
+
+ if (bCustomSegment)
+ O << name << "?\tends\n";
+ }
+
+ // Output linker support code for dllexported globals
+ if ((DLLExportedGVs.begin() != DLLExportedGVs.end()) ||
+ (DLLExportedFns.begin() != DLLExportedFns.end())) {
+ SwitchToDataSection("");
+    O << "; WARNING: The following code is valid only with MASM v8.x and (possibly) higher\n"
+      << "; This version of MASM is usually shipped with Microsoft Visual Studio 2005\n"
+      << "; or (possibly) later versions. Unfortunately, there is no way to support\n"
+      << "; dllexported symbols in earlier versions of MASM in a fully automatic way\n\n";
+ O << "_drectve\t segment info alias('.drectve')\n";
+ }
+
+ for (std::set<std::string>::iterator i = DLLExportedGVs.begin(),
+ e = DLLExportedGVs.end();
+ i != e; ++i) {
+ O << "\t db ' /EXPORT:" << *i << ",data'\n";
+ }
+
+ for (std::set<std::string>::iterator i = DLLExportedFns.begin(),
+ e = DLLExportedFns.end();
+ i != e; ++i) {
+ O << "\t db ' /EXPORT:" << *i << "'\n";
+ }
+
+ if ((DLLExportedGVs.begin() != DLLExportedGVs.end()) ||
+ (DLLExportedFns.begin() != DLLExportedFns.end())) {
+ O << "_drectve\t ends\n";
+ }
+
+ // Bypass X86SharedAsmPrinter::doFinalization().
+ AsmPrinter::doFinalization(M);
+ SwitchToDataSection("");
+ O << "\tend\n";
+ return false; // success
+}
+
+void X86IntelAsmPrinter::EmitString(const ConstantArray *CVA) const {
+ unsigned NumElts = CVA->getNumOperands();
+ if (NumElts) {
+ // ML does not have escape sequences except '' for '. It also has a maximum
+ // string length of 255.
+ unsigned len = 0;
+ bool inString = false;
+ for (unsigned i = 0; i < NumElts; i++) {
+ int n = cast<ConstantInt>(CVA->getOperand(i))->getZExtValue() & 255;
+ if (len == 0)
+ O << "\tdb ";
+
+ if (n >= 32 && n <= 127) {
+ if (!inString) {
+ if (len > 0) {
+ O << ",'";
+ len += 2;
+ } else {
+ O << "'";
+ len++;
+ }
+ inString = true;
+ }
+ if (n == '\'') {
+ O << "'";
+ len++;
+ }
+ O << char(n);
+ } else {
+ if (inString) {
+ O << "'";
+ len++;
+ inString = false;
+ }
+ if (len > 0) {
+ O << ",";
+ len++;
+ }
+ O << n;
+ len += 1 + (n > 9) + (n > 99);
+ }
+
+ if (len > 60) {
+ if (inString) {
+ O << "'";
+ inString = false;
+ }
+ O << "\n";
+ len = 0;
+ }
+ }
+
+ if (len > 0) {
+ if (inString)
+ O << "'";
+ O << "\n";
+ }
+ }
+}
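+// For example, the bytes 'H', 'i', 10, 0 come out as:
+//
+//   db 'Hi',10,0
+//
+// printable runs are quoted, everything else is emitted as a decimal byte,
+// and the line is broken long before ML's 255-character limit is reached.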
+
+// Include the auto-generated portion of the assembly writer.
+#include "X86GenAsmWriter1.inc"
diff --git a/lib/Target/X86/X86IntelAsmPrinter.h b/lib/Target/X86/X86IntelAsmPrinter.h
new file mode 100755
index 0000000..9ad11ff
--- /dev/null
+++ b/lib/Target/X86/X86IntelAsmPrinter.h
@@ -0,0 +1,112 @@
+//===-- X86IntelAsmPrinter.h - Convert X86 LLVM code to Intel assembly ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Intel assembly code printer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86INTELASMPRINTER_H
+#define X86INTELASMPRINTER_H
+
+#include "X86AsmPrinter.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Target/MRegisterInfo.h"
+
+namespace llvm {
+
+struct X86IntelAsmPrinter : public X86SharedAsmPrinter {
+ X86IntelAsmPrinter(std::ostream &O, X86TargetMachine &TM,
+ const TargetAsmInfo *T)
+ : X86SharedAsmPrinter(O, TM, T) {
+ }
+
+ virtual const char *getPassName() const {
+ return "X86 Intel-Style Assembly Printer";
+ }
+
+ /// printInstruction - This method is automatically generated by tablegen
+ /// from the instruction set description. This method returns true if the
+ /// machine instruction was sufficiently described to print it, otherwise it
+ /// returns false.
+ bool printInstruction(const MachineInstr *MI);
+
+ // This method is used by the tablegen'erated instruction printer.
+ void printOperand(const MachineInstr *MI, unsigned OpNo,
+ const char *Modifier = 0) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ if (MO.isRegister()) {
+ assert(MRegisterInfo::isPhysicalRegister(MO.getReg()) && "Not physreg??");
+ O << TM.getRegisterInfo()->get(MO.getReg()).Name;
+ } else {
+ printOp(MO, Modifier);
+ }
+ }
+
+ void printi8mem(const MachineInstr *MI, unsigned OpNo) {
+ O << "BYTE PTR ";
+ printMemReference(MI, OpNo);
+ }
+ void printi16mem(const MachineInstr *MI, unsigned OpNo) {
+ O << "WORD PTR ";
+ printMemReference(MI, OpNo);
+ }
+ void printi32mem(const MachineInstr *MI, unsigned OpNo) {
+ O << "DWORD PTR ";
+ printMemReference(MI, OpNo);
+ }
+ void printi64mem(const MachineInstr *MI, unsigned OpNo) {
+ O << "QWORD PTR ";
+ printMemReference(MI, OpNo);
+ }
+ void printi128mem(const MachineInstr *MI, unsigned OpNo) {
+ O << "XMMWORD PTR ";
+ printMemReference(MI, OpNo);
+ }
+ void printf32mem(const MachineInstr *MI, unsigned OpNo) {
+ O << "DWORD PTR ";
+ printMemReference(MI, OpNo);
+ }
+ void printf64mem(const MachineInstr *MI, unsigned OpNo) {
+ O << "QWORD PTR ";
+ printMemReference(MI, OpNo);
+ }
+ void printf128mem(const MachineInstr *MI, unsigned OpNo) {
+ O << "XMMWORD PTR ";
+ printMemReference(MI, OpNo);
+ }
+ void printlea64_32mem(const MachineInstr *MI, unsigned OpNo) {
+ O << "QWORD PTR ";
+ printMemReference(MI, OpNo, "subreg64");
+ }
+
+ bool printAsmMRegister(const MachineOperand &MO, const char Mode);
+ bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode);
+ bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode);
+ void printMachineInstruction(const MachineInstr *MI);
+ void printOp(const MachineOperand &MO, const char *Modifier = 0);
+ void printSSECC(const MachineInstr *MI, unsigned Op);
+ void printMemReference(const MachineInstr *MI, unsigned Op,
+ const char *Modifier=NULL);
+ void printPICLabel(const MachineInstr *MI, unsigned Op);
+ bool runOnMachineFunction(MachineFunction &F);
+ bool doInitialization(Module &M);
+ bool doFinalization(Module &M);
+
+ /// getSectionForFunction - Return the section that we should emit the
+ /// specified function body into.
+ virtual std::string getSectionForFunction(const Function &F) const;
+
+ virtual void EmitString(const ConstantArray *CVA) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp
new file mode 100644
index 0000000..b9e5d5b
--- /dev/null
+++ b/lib/Target/X86/X86JITInfo.cpp
@@ -0,0 +1,372 @@
+//===-- X86JITInfo.cpp - Implement the JIT interfaces for the X86 target --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the JIT interfaces for the X86 target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "X86JITInfo.h"
+#include "X86Relocations.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineCodeEmitter.h"
+#include "llvm/Config/alloca.h"
+#include <cstdlib>
+using namespace llvm;
+
+#ifdef _MSC_VER
+ extern "C" void *_AddressOfReturnAddress(void);
+ #pragma intrinsic(_AddressOfReturnAddress)
+#endif
+
+void X86JITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
+ unsigned char *OldByte = (unsigned char *)Old;
+ *OldByte++ = 0xE9; // Emit JMP opcode.
+ unsigned *OldWord = (unsigned *)OldByte;
+ unsigned NewAddr = (intptr_t)New;
+ unsigned OldAddr = (intptr_t)OldWord;
+ *OldWord = NewAddr - OldAddr - 4; // Emit PC-relative addr of New code.
+}
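+// The code patched in above is a 5-byte "jmp rel32" (opcode 0xE9); the
+// displacement is relative to the end of the instruction, hence the extra -4
+// on top of OldWord, which already points one byte past the opcode.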
+
+
+/// JITCompilerFunction - This contains the address of the JIT function used to
+/// compile a function lazily.
+static TargetJITInfo::JITCompilerFn JITCompilerFunction;
+
+// Get the ASMPREFIX for the current host. This is often '_'.
+#ifndef __USER_LABEL_PREFIX__
+#define __USER_LABEL_PREFIX__
+#endif
+#define GETASMPREFIX2(X) #X
+#define GETASMPREFIX(X) GETASMPREFIX2(X)
+#define ASMPREFIX GETASMPREFIX(__USER_LABEL_PREFIX__)
+
+// Provide a wrapper for X86CompilationCallback2 that saves non-traditional
+// callee saved registers, for the fastcc calling convention.
+extern "C" {
+#if defined(__x86_64__)
+ // No need to save EAX/EDX for X86-64.
+ void X86CompilationCallback(void);
+ asm(
+ ".text\n"
+ ".align 8\n"
+ ".globl " ASMPREFIX "X86CompilationCallback\n"
+ ASMPREFIX "X86CompilationCallback:\n"
+ // Save RBP
+ "pushq %rbp\n"
+ // Save RSP
+ "movq %rsp, %rbp\n"
+ // Save all int arg registers
+ "pushq %rdi\n"
+ "pushq %rsi\n"
+ "pushq %rdx\n"
+ "pushq %rcx\n"
+ "pushq %r8\n"
+ "pushq %r9\n"
+    // Align the stack on a 16-byte boundary. RSP might not be properly
+    // aligned (8 byte) if this is called from an indirect stub.
+ "andq $-16, %rsp\n"
+ // Save all XMM arg registers
+ "subq $128, %rsp\n"
+ "movaps %xmm0, (%rsp)\n"
+ "movaps %xmm1, 16(%rsp)\n"
+ "movaps %xmm2, 32(%rsp)\n"
+ "movaps %xmm3, 48(%rsp)\n"
+ "movaps %xmm4, 64(%rsp)\n"
+ "movaps %xmm5, 80(%rsp)\n"
+ "movaps %xmm6, 96(%rsp)\n"
+ "movaps %xmm7, 112(%rsp)\n"
+ // JIT callee
+ "movq %rbp, %rdi\n" // Pass prev frame and return address
+ "movq 8(%rbp), %rsi\n"
+ "call " ASMPREFIX "X86CompilationCallback2\n"
+ // Restore all XMM arg registers
+ "movaps 112(%rsp), %xmm7\n"
+ "movaps 96(%rsp), %xmm6\n"
+ "movaps 80(%rsp), %xmm5\n"
+ "movaps 64(%rsp), %xmm4\n"
+ "movaps 48(%rsp), %xmm3\n"
+ "movaps 32(%rsp), %xmm2\n"
+ "movaps 16(%rsp), %xmm1\n"
+ "movaps (%rsp), %xmm0\n"
+ // Restore RSP
+ "movq %rbp, %rsp\n"
+ // Restore all int arg registers
+ "subq $48, %rsp\n"
+ "popq %r9\n"
+ "popq %r8\n"
+ "popq %rcx\n"
+ "popq %rdx\n"
+ "popq %rsi\n"
+ "popq %rdi\n"
+ // Restore RBP
+ "popq %rbp\n"
+ "ret\n");
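+  // Note: once RSP has been realigned to 16 bytes its offset from the saved
+  // integer argument registers is unknown, so the epilogue restores RSP from
+  // RBP and then backs it down by 48 bytes (6 pushes x 8 bytes) to reach the
+  // spill slots before popping %r9..%rdi in reverse order.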
+#elif defined(__i386__) || defined(i386) || defined(_M_IX86)
+#ifndef _MSC_VER
+ void X86CompilationCallback(void);
+ asm(
+ ".text\n"
+ ".align 8\n"
+ ".globl " ASMPREFIX "X86CompilationCallback\n"
+ ASMPREFIX "X86CompilationCallback:\n"
+ "pushl %ebp\n"
+ "movl %esp, %ebp\n" // Standard prologue
+ "pushl %eax\n"
+ "pushl %edx\n" // Save EAX/EDX/ECX
+ "pushl %ecx\n"
+#if defined(__APPLE__)
+ "andl $-16, %esp\n" // Align ESP on 16-byte boundary
+#endif
+ "subl $16, %esp\n"
+ "movl 4(%ebp), %eax\n" // Pass prev frame and return address
+ "movl %eax, 4(%esp)\n"
+ "movl %ebp, (%esp)\n"
+ "call " ASMPREFIX "X86CompilationCallback2\n"
+ "movl %ebp, %esp\n" // Restore ESP
+ "subl $12, %esp\n"
+ "popl %ecx\n"
+ "popl %edx\n"
+ "popl %eax\n"
+ "popl %ebp\n"
+ "ret\n");
+
+ // Same as X86CompilationCallback but also saves XMM argument registers.
+ void X86CompilationCallback_SSE(void);
+ asm(
+ ".text\n"
+ ".align 8\n"
+ ".globl " ASMPREFIX "X86CompilationCallback_SSE\n"
+ ASMPREFIX "X86CompilationCallback_SSE:\n"
+ "pushl %ebp\n"
+ "movl %esp, %ebp\n" // Standard prologue
+ "pushl %eax\n"
+ "pushl %edx\n" // Save EAX/EDX/ECX
+ "pushl %ecx\n"
+ "andl $-16, %esp\n" // Align ESP on 16-byte boundary
+ // Save all XMM arg registers
+ "subl $64, %esp\n"
+ "movaps %xmm0, (%esp)\n"
+ "movaps %xmm1, 16(%esp)\n"
+ "movaps %xmm2, 32(%esp)\n"
+ "movaps %xmm3, 48(%esp)\n"
+ "subl $16, %esp\n"
+ "movl 4(%ebp), %eax\n" // Pass prev frame and return address
+ "movl %eax, 4(%esp)\n"
+ "movl %ebp, (%esp)\n"
+ "call " ASMPREFIX "X86CompilationCallback2\n"
+ "addl $16, %esp\n"
+ "movaps 48(%esp), %xmm3\n"
+ "movaps 32(%esp), %xmm2\n"
+ "movaps 16(%esp), %xmm1\n"
+ "movaps (%esp), %xmm0\n"
+ "movl %ebp, %esp\n" // Restore ESP
+ "subl $12, %esp\n"
+ "popl %ecx\n"
+ "popl %edx\n"
+ "popl %eax\n"
+ "popl %ebp\n"
+ "ret\n");
+#else
+ void X86CompilationCallback2(void);
+
+ _declspec(naked) void X86CompilationCallback(void) {
+ __asm {
+ push eax
+ push edx
+ push ecx
+ call X86CompilationCallback2
+ pop ecx
+ pop edx
+ pop eax
+ ret
+ }
+ }
+#endif // _MSC_VER
+
+#else // Not an i386 host
+ void X86CompilationCallback() {
+ assert(0 && "Cannot call X86CompilationCallback() on a non-x86 arch!\n");
+ abort();
+ }
+#endif
+}
+
+/// X86CompilationCallback - This is the target-specific function invoked by the
+/// function stub when we did not know the real target of a call. This function
+/// must locate the start of the stub or call site and pass it into the JIT
+/// compiler function.
+#ifdef _MSC_VER
+extern "C" void X86CompilationCallback2() {
+ assert(sizeof(size_t) == 4); // FIXME: handle Win64
+ intptr_t *RetAddrLoc = (intptr_t *)_AddressOfReturnAddress();
+ RetAddrLoc += 4; // skip over ret addr, edx, eax, ecx
+ intptr_t RetAddr = *RetAddrLoc;
+#else
+extern "C" void X86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr) {
+ intptr_t *RetAddrLoc = &StackPtr[1];
+#endif
+ assert(*RetAddrLoc == RetAddr &&
+ "Could not find return address on the stack!");
+
+ // It's a stub if there is an interrupt marker after the call.
+ bool isStub = ((unsigned char*)RetAddr)[0] == 0xCD;
+
+  // The call instruction should have pushed the return address onto the stack...
+#ifdef __x86_64__
+ RetAddr--; // Backtrack to the reference itself...
+#else
+ RetAddr -= 4; // Backtrack to the reference itself...
+#endif
+
+#if 0
+ DOUT << "In callback! Addr=" << (void*)RetAddr
+ << " ESP=" << (void*)StackPtr
+ << ": Resolving call to function: "
+ << TheVM->getFunctionReferencedName((void*)RetAddr) << "\n";
+#endif
+
+ // Sanity check to make sure this really is a call instruction.
+#ifdef __x86_64__
+ assert(((unsigned char*)RetAddr)[-2] == 0x41 &&"Not a call instr!");
+ assert(((unsigned char*)RetAddr)[-1] == 0xFF &&"Not a call instr!");
+#else
+ assert(((unsigned char*)RetAddr)[-1] == 0xE8 &&"Not a call instr!");
+#endif
+
+ intptr_t NewVal = (intptr_t)JITCompilerFunction((void*)RetAddr);
+
+ // Rewrite the call target... so that we don't end up here every time we
+ // execute the call.
+#ifdef __x86_64__
+ *(intptr_t *)(RetAddr - 0xa) = NewVal;
+#else
+ *(intptr_t *)RetAddr = (intptr_t)(NewVal-RetAddr-4);
+#endif
+
+ if (isStub) {
+ // If this is a stub, rewrite the call into an unconditional branch
+ // instruction so that two return addresses are not pushed onto the stack
+ // when the requested function finally gets called. This also makes the
+    // 0xCD byte (interrupt) dead, so the marker doesn't affect anything.
+#ifdef __x86_64__
+ ((unsigned char*)RetAddr)[0] = (2 | (4 << 3) | (3 << 6));
+#else
+ ((unsigned char*)RetAddr)[-1] = 0xE9;
+#endif
+ }
+
+ // Change the return address to reexecute the call instruction...
+#ifdef __x86_64__
+ *RetAddrLoc -= 0xd;
+#else
+ *RetAddrLoc -= 5;
+#endif
+}
+
+TargetJITInfo::LazyResolverFn
+X86JITInfo::getLazyResolverFunction(JITCompilerFn F) {
+ JITCompilerFunction = F;
+
+#if (defined(__i386__) || defined(i386) || defined(_M_IX86)) && \
+ !defined(_MSC_VER) && !defined(__x86_64__)
+ unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
+ union {
+ unsigned u[3];
+ char c[12];
+ } text;
+
+ if (!X86::GetCpuIDAndInfo(0, &EAX, text.u+0, text.u+2, text.u+1)) {
+ // FIXME: support for AMD family of processors.
+ if (memcmp(text.c, "GenuineIntel", 12) == 0) {
+ X86::GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX);
+ if ((EDX >> 25) & 0x1)
+ return X86CompilationCallback_SSE;
+ }
+ }
+#endif
+
+ return X86CompilationCallback;
+}
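+// The check above reads CPUID leaf 1, where bit 25 of EDX indicates SSE
+// support; only then is the SSE-aware callback returned, so the XMM argument
+// registers survive a trip through lazy compilation.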
+
+void *X86JITInfo::emitFunctionStub(void *Fn, MachineCodeEmitter &MCE) {
+ // Note, we cast to intptr_t here to silence a -pedantic warning that
+ // complains about casting a function pointer to a normal pointer.
+#if (defined(__i386__) || defined(i386) || defined(_M_IX86)) && \
+ !defined(_MSC_VER) && !defined(__x86_64__)
+ bool NotCC = (Fn != (void*)(intptr_t)X86CompilationCallback &&
+ Fn != (void*)(intptr_t)X86CompilationCallback_SSE);
+#else
+ bool NotCC = Fn != (void*)(intptr_t)X86CompilationCallback;
+#endif
+ if (NotCC) {
+#ifdef __x86_64__
+ MCE.startFunctionStub(13, 4);
+ MCE.emitByte(0x49); // REX prefix
+ MCE.emitByte(0xB8+2); // movabsq r10
+ MCE.emitWordLE(((unsigned *)&Fn)[0]);
+ MCE.emitWordLE(((unsigned *)&Fn)[1]);
+ MCE.emitByte(0x41); // REX prefix
+ MCE.emitByte(0xFF); // jmpq *r10
+ MCE.emitByte(2 | (4 << 3) | (3 << 6));
+#else
+ MCE.startFunctionStub(5, 4);
+ MCE.emitByte(0xE9);
+ MCE.emitWordLE((intptr_t)Fn-MCE.getCurrentPCValue()-4);
+#endif
+ return MCE.finishFunctionStub(0);
+ }
+
+#ifdef __x86_64__
+ MCE.startFunctionStub(14, 4);
+ MCE.emitByte(0x49); // REX prefix
+ MCE.emitByte(0xB8+2); // movabsq r10
+ MCE.emitWordLE(((unsigned *)&Fn)[0]);
+ MCE.emitWordLE(((unsigned *)&Fn)[1]);
+ MCE.emitByte(0x41); // REX prefix
+ MCE.emitByte(0xFF); // callq *r10
+ MCE.emitByte(2 | (2 << 3) | (3 << 6));
+#else
+ MCE.startFunctionStub(6, 4);
+ MCE.emitByte(0xE8); // Call with 32 bit pc-rel destination...
+
+ MCE.emitWordLE((intptr_t)Fn-MCE.getCurrentPCValue()-4);
+#endif
+
+ MCE.emitByte(0xCD); // Interrupt - Just a marker identifying the stub!
+ return MCE.finishFunctionStub(0);
+}
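+// The stubs emitted above are, byte for byte:
+//   direct jump, 32-bit:   E9 <rel32>                      jmp Fn
+//   direct jump, 64-bit:   49 BA <imm64> 41 FF E2          movabsq Fn,%r10; jmpq *%r10
+//   lazy callback, 32-bit: E8 <rel32> CD                   call callback + marker
+//   lazy callback, 64-bit: 49 BA <imm64> 41 FF D2 CD
+// The trailing 0xCD byte is never executed; it merely marks the stub for
+// X86CompilationCallback2.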
+
+/// relocate - Before the JIT can run a block of code that has been emitted,
+/// it must rewrite the code to contain the actual addresses of any
+/// referenced global symbols.
+void X86JITInfo::relocate(void *Function, MachineRelocation *MR,
+ unsigned NumRelocs, unsigned char* GOTBase) {
+ for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
+ void *RelocPos = (char*)Function + MR->getMachineCodeOffset();
+ intptr_t ResultPtr = (intptr_t)MR->getResultPointer();
+ switch ((X86::RelocationType)MR->getRelocationType()) {
+ case X86::reloc_pcrel_word: {
+ // PC relative relocation, add the relocated value to the value already in
+ // memory, after we adjust it for where the PC is.
+ ResultPtr = ResultPtr-(intptr_t)RelocPos-4-MR->getConstantVal();
+ *((unsigned*)RelocPos) += (unsigned)ResultPtr;
+ break;
+ }
+ case X86::reloc_absolute_word:
+ // Absolute relocation, just add the relocated value to the value already
+ // in memory.
+ *((unsigned*)RelocPos) += (unsigned)ResultPtr;
+ break;
+ case X86::reloc_absolute_dword:
+ *((intptr_t*)RelocPos) += ResultPtr;
+ break;
+ }
+ }
+}
diff --git a/lib/Target/X86/X86JITInfo.h b/lib/Target/X86/X86JITInfo.h
new file mode 100644
index 0000000..a4c731a
--- /dev/null
+++ b/lib/Target/X86/X86JITInfo.h
@@ -0,0 +1,50 @@
+//===- X86JITInfo.h - X86 implementation of the JIT interface --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetJITInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86JITINFO_H
+#define X86JITINFO_H
+
+#include "llvm/Target/TargetJITInfo.h"
+
+namespace llvm {
+ class X86TargetMachine;
+
+ class X86JITInfo : public TargetJITInfo {
+ X86TargetMachine &TM;
+ public:
+ X86JITInfo(X86TargetMachine &tm) : TM(tm) {useGOT = 0;}
+
+ /// replaceMachineCodeForFunction - Make it so that calling the function
+ /// whose machine code is at OLD turns into a call to NEW, perhaps by
+ /// overwriting OLD with a branch to NEW. This is used for self-modifying
+ /// code.
+ ///
+ virtual void replaceMachineCodeForFunction(void *Old, void *New);
+
+ /// emitFunctionStub - Use the specified MachineCodeEmitter object to emit a
+ /// small native function that simply calls the function at the specified
+ /// address.
+ virtual void *emitFunctionStub(void *Fn, MachineCodeEmitter &MCE);
+
+ /// getLazyResolverFunction - Expose the lazy resolver to the JIT.
+ virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn);
+
+ /// relocate - Before the JIT can run a block of code that has been emitted,
+ /// it must rewrite the code to contain the actual addresses of any
+ /// referenced global symbols.
+ virtual void relocate(void *Function, MachineRelocation *MR,
+ unsigned NumRelocs, unsigned char* GOTBase);
+ };
+}
+
+#endif
diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h
new file mode 100644
index 0000000..7a21fb2
--- /dev/null
+++ b/lib/Target/X86/X86MachineFunctionInfo.h
@@ -0,0 +1,74 @@
+//===-- X86MachineFunctionInfo.h - X86 machine function info ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by Evan Cheng and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares X86-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86MACHINEFUNCTIONINFO_H
+#define X86MACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+enum NameDecorationStyle {
+ None,
+ StdCall,
+ FastCall
+};
+
+/// X86MachineFunctionInfo - This class is derived from MachineFunctionInfo and
+/// contains private X86 target-specific information for each MachineFunction.
+class X86MachineFunctionInfo : public MachineFunctionInfo {
+  /// ForceFramePointer - True if the function is required to use a frame
+  /// pointer for reasons other than containing dynamic allocation or having
+  /// FP elimination turned off. For example, the Cygwin main function
+  /// contains stack pointer re-alignment code, which requires a frame pointer.
+ bool ForceFramePointer;
+
+ /// CalleeSavedFrameSize - Size of the callee-saved register portion of the
+ /// stack frame in bytes.
+ unsigned CalleeSavedFrameSize;
+
+  /// BytesToPopOnReturn - Number of bytes the function pops on return.
+  /// Used on Windows for stdcall & fastcall name decoration.
+ unsigned BytesToPopOnReturn;
+
+  /// DecorationStyle - If the function requires additional name decoration,
+  /// this holds the style to apply.
+ NameDecorationStyle DecorationStyle;
+
+public:
+ X86MachineFunctionInfo() : ForceFramePointer(false),
+ CalleeSavedFrameSize(0),
+ BytesToPopOnReturn(0),
+ DecorationStyle(None) {}
+
+ X86MachineFunctionInfo(MachineFunction &MF) : ForceFramePointer(false),
+ CalleeSavedFrameSize(0),
+ BytesToPopOnReturn(0),
+ DecorationStyle(None) {}
+
+ bool getForceFramePointer() const { return ForceFramePointer;}
+ void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; }
+
+ unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
+ void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
+
+ unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; }
+ void setBytesToPopOnReturn (unsigned bytes) { BytesToPopOnReturn = bytes;}
+
+ NameDecorationStyle getDecorationStyle() const { return DecorationStyle; }
+ void setDecorationStyle(NameDecorationStyle style) { DecorationStyle = style;}
+
+};
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
new file mode 100644
index 0000000..da65db0
--- /dev/null
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -0,0 +1,1613 @@
+//===- X86RegisterInfo.cpp - X86 Register Information -----------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the MRegisterInfo class. This
+// file is responsible for the frame pointer elimination optimization on X86.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86RegisterInfo.h"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Type.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+using namespace llvm;
+
+namespace {
+ cl::opt<bool>
+ NoFusing("disable-spill-fusing",
+ cl::desc("Disable fusing of spill code into instructions"));
+ cl::opt<bool>
+ PrintFailedFusing("print-failed-fuse-candidates",
+ cl::desc("Print instructions that the allocator wants to"
+ " fuse, but the X86 backend currently can't"),
+ cl::Hidden);
+}
+
+X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm,
+ const TargetInstrInfo &tii)
+ : X86GenRegisterInfo(X86::ADJCALLSTACKDOWN, X86::ADJCALLSTACKUP),
+ TM(tm), TII(tii) {
+ // Cache some information.
+ const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
+ Is64Bit = Subtarget->is64Bit();
+ if (Is64Bit) {
+ SlotSize = 8;
+ StackPtr = X86::RSP;
+ FramePtr = X86::RBP;
+ } else {
+ SlotSize = 4;
+ StackPtr = X86::ESP;
+ FramePtr = X86::EBP;
+ }
+}
+
+bool X86RegisterInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI) const {
+ if (CSI.empty())
+ return false;
+
+ MachineFunction &MF = *MBB.getParent();
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ X86FI->setCalleeSavedFrameSize(CSI.size() * SlotSize);
+ unsigned Opc = Is64Bit ? X86::PUSH64r : X86::PUSH32r;
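+  // Registers are pushed in reverse CSI order; restoreCalleeSavedRegisters
+  // pops them in forward order, so the pops mirror the pushes.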
+ for (unsigned i = CSI.size(); i != 0; --i) {
+ unsigned Reg = CSI[i-1].getReg();
+ // Add the callee-saved register as live-in. It's killed at the spill.
+ MBB.addLiveIn(Reg);
+ BuildMI(MBB, MI, TII.get(Opc)).addReg(Reg);
+ }
+ return true;
+}
+
+bool X86RegisterInfo::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI) const {
+ if (CSI.empty())
+ return false;
+
+ unsigned Opc = Is64Bit ? X86::POP64r : X86::POP32r;
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ unsigned Reg = CSI[i].getReg();
+ BuildMI(MBB, MI, TII.get(Opc), Reg);
+ }
+ return true;
+}
+
+void X86RegisterInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, int FrameIdx,
+ const TargetRegisterClass *RC) const {
+ unsigned Opc;
+ if (RC == &X86::GR64RegClass) {
+ Opc = X86::MOV64mr;
+ } else if (RC == &X86::GR32RegClass) {
+ Opc = X86::MOV32mr;
+ } else if (RC == &X86::GR16RegClass) {
+ Opc = X86::MOV16mr;
+ } else if (RC == &X86::GR8RegClass) {
+ Opc = X86::MOV8mr;
+ } else if (RC == &X86::GR32_RegClass) {
+ Opc = X86::MOV32_mr;
+ } else if (RC == &X86::GR16_RegClass) {
+ Opc = X86::MOV16_mr;
+ } else if (RC == &X86::RFP64RegClass || RC == &X86::RSTRegClass) {
+ Opc = X86::ST_Fp64m;
+ } else if (RC == &X86::RFP32RegClass) {
+ Opc = X86::ST_Fp32m;
+ } else if (RC == &X86::FR32RegClass) {
+ Opc = X86::MOVSSmr;
+ } else if (RC == &X86::FR64RegClass) {
+ Opc = X86::MOVSDmr;
+ } else if (RC == &X86::VR128RegClass) {
+ Opc = X86::MOVAPSmr;
+ } else if (RC == &X86::VR64RegClass) {
+ Opc = X86::MMX_MOVQ64mr;
+ } else {
+ assert(0 && "Unknown regclass");
+ abort();
+ }
+ addFrameReference(BuildMI(MBB, MI, TII.get(Opc)), FrameIdx)
+ .addReg(SrcReg, false, false, true);
+}
+
+void X86RegisterInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, int FrameIdx,
+ const TargetRegisterClass *RC) const{
+ unsigned Opc;
+ if (RC == &X86::GR64RegClass) {
+ Opc = X86::MOV64rm;
+ } else if (RC == &X86::GR32RegClass) {
+ Opc = X86::MOV32rm;
+ } else if (RC == &X86::GR16RegClass) {
+ Opc = X86::MOV16rm;
+ } else if (RC == &X86::GR8RegClass) {
+ Opc = X86::MOV8rm;
+ } else if (RC == &X86::GR32_RegClass) {
+ Opc = X86::MOV32_rm;
+ } else if (RC == &X86::GR16_RegClass) {
+ Opc = X86::MOV16_rm;
+ } else if (RC == &X86::RFP64RegClass || RC == &X86::RSTRegClass) {
+ Opc = X86::LD_Fp64m;
+ } else if (RC == &X86::RFP32RegClass) {
+ Opc = X86::LD_Fp32m;
+ } else if (RC == &X86::FR32RegClass) {
+ Opc = X86::MOVSSrm;
+ } else if (RC == &X86::FR64RegClass) {
+ Opc = X86::MOVSDrm;
+ } else if (RC == &X86::VR128RegClass) {
+ Opc = X86::MOVAPSrm;
+ } else if (RC == &X86::VR64RegClass) {
+ Opc = X86::MMX_MOVQ64rm;
+ } else {
+ assert(0 && "Unknown regclass");
+ abort();
+ }
+ addFrameReference(BuildMI(MBB, MI, TII.get(Opc), DestReg), FrameIdx);
+}
+
+void X86RegisterInfo::copyRegToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *RC) const {
+ unsigned Opc;
+ if (RC == &X86::GR64RegClass) {
+ Opc = X86::MOV64rr;
+ } else if (RC == &X86::GR32RegClass) {
+ Opc = X86::MOV32rr;
+ } else if (RC == &X86::GR16RegClass) {
+ Opc = X86::MOV16rr;
+ } else if (RC == &X86::GR8RegClass) {
+ Opc = X86::MOV8rr;
+ } else if (RC == &X86::GR32_RegClass) {
+ Opc = X86::MOV32_rr;
+ } else if (RC == &X86::GR16_RegClass) {
+ Opc = X86::MOV16_rr;
+ } else if (RC == &X86::RFP32RegClass) {
+ Opc = X86::MOV_Fp3232;
+ } else if (RC == &X86::RFP64RegClass || RC == &X86::RSTRegClass) {
+ Opc = X86::MOV_Fp6464;
+ } else if (RC == &X86::FR32RegClass) {
+ Opc = X86::FsMOVAPSrr;
+ } else if (RC == &X86::FR64RegClass) {
+ Opc = X86::FsMOVAPDrr;
+ } else if (RC == &X86::VR128RegClass) {
+ Opc = X86::MOVAPSrr;
+ } else if (RC == &X86::VR64RegClass) {
+ Opc = X86::MMX_MOVQ64rr;
+ } else {
+ assert(0 && "Unknown regclass");
+ abort();
+ }
+ BuildMI(MBB, MI, TII.get(Opc), DestReg).addReg(SrcReg);
+}
+
+
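+/// reMaterialize - Re-emit a clone of the original defining instruction at the
+/// insertion point, retargeting its definition to DestReg.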
+void X86RegisterInfo::reMaterialize(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DestReg,
+ const MachineInstr *Orig) const {
+ MachineInstr *MI = Orig->clone();
+ MI->getOperand(0).setReg(DestReg);
+ MBB.insert(I, MI);
+}
+
+static MachineInstr *FuseTwoAddrInst(unsigned Opcode, unsigned FrameIndex,
+ MachineInstr *MI,
+ const TargetInstrInfo &TII) {
+ unsigned NumOps = TII.getNumOperands(MI->getOpcode())-2;
+ // Create the base instruction with the memory operand as the first part.
+ MachineInstrBuilder MIB = addFrameReference(BuildMI(TII.get(Opcode)),
+ FrameIndex);
+
+  // Loop over the remaining operands of the original instruction, copying
+  // them over.
+ for (unsigned i = 0; i != NumOps; ++i) {
+ MachineOperand &MO = MI->getOperand(i+2);
+ if (MO.isReg())
+ MIB = MIB.addReg(MO.getReg(), false, MO.isImplicit());
+ else if (MO.isImm())
+ MIB = MIB.addImm(MO.getImm());
+ else if (MO.isGlobalAddress())
+ MIB = MIB.addGlobalAddress(MO.getGlobal(), MO.getOffset());
+ else if (MO.isJumpTableIndex())
+ MIB = MIB.addJumpTableIndex(MO.getJumpTableIndex());
+ else if (MO.isExternalSymbol())
+ MIB = MIB.addExternalSymbol(MO.getSymbolName());
+ else
+ assert(0 && "Unknown operand type!");
+ }
+ return MIB;
+}
+
+static MachineInstr *FuseInst(unsigned Opcode, unsigned OpNo,
+ unsigned FrameIndex, MachineInstr *MI,
+ const TargetInstrInfo &TII) {
+ MachineInstrBuilder MIB = BuildMI(TII.get(Opcode));
+
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (i == OpNo) {
+ assert(MO.isReg() && "Expected to fold into reg operand!");
+ MIB = addFrameReference(MIB, FrameIndex);
+ } else if (MO.isReg())
+ MIB = MIB.addReg(MO.getReg(), MO.isDef(), MO.isImplicit());
+ else if (MO.isImm())
+ MIB = MIB.addImm(MO.getImm());
+ else if (MO.isGlobalAddress())
+ MIB = MIB.addGlobalAddress(MO.getGlobal(), MO.getOffset());
+ else if (MO.isJumpTableIndex())
+ MIB = MIB.addJumpTableIndex(MO.getJumpTableIndex());
+ else if (MO.isExternalSymbol())
+ MIB = MIB.addExternalSymbol(MO.getSymbolName());
+ else
+ assert(0 && "Unknown operand for FuseInst!");
+ }
+ return MIB;
+}
+
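+/// MakeM0Inst - Build a "store immediate zero to the stack slot" instruction.
+/// This is how the MOV*r0 pseudo instructions are folded when their
+/// destination register is spilled (see foldMemoryOperand below).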
+static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII,
+ unsigned Opcode, unsigned FrameIndex,
+ MachineInstr *MI) {
+ return addFrameReference(BuildMI(TII.get(Opcode)), FrameIndex).addImm(0);
+}
+
+
+//===----------------------------------------------------------------------===//
+// Efficient Lookup Table Support
+//===----------------------------------------------------------------------===//
+
+namespace {
+  /// TableEntry - Maps the 'from' opcode to its fused 'to' opcode form.
+ ///
+ struct TableEntry {
+ unsigned from; // Original opcode.
+ unsigned to; // New opcode.
+
+    // Less-than operators used by STL searches.
+ bool operator<(const TableEntry &TE) const { return from < TE.from; }
+ friend bool operator<(const TableEntry &TE, unsigned V) {
+ return TE.from < V;
+ }
+ friend bool operator<(unsigned V, const TableEntry &TE) {
+ return V < TE.from;
+ }
+ };
+}
+
+/// TableIsSorted - Return true if the table is in 'from' opcode order.
+///
+static bool TableIsSorted(const TableEntry *Table, unsigned NumEntries) {
+ for (unsigned i = 1; i != NumEntries; ++i)
+ if (!(Table[i-1] < Table[i])) {
+ cerr << "Entries out of order " << Table[i-1].from
+ << " " << Table[i].from << "\n";
+ return false;
+ }
+ return true;
+}
+
+/// TableLookup - Return the table entry matching the specified opcode, or
+/// NULL if there is no match.
+static const TableEntry *TableLookup(const TableEntry *Table, unsigned N,
+ unsigned Opcode) {
+ const TableEntry *I = std::lower_bound(Table, Table+N, Opcode);
+ if (I != Table+N && I->from == Opcode)
+ return I;
+ return NULL;
+}
+
+#define ARRAY_SIZE(TABLE) \
+ (sizeof(TABLE)/sizeof(TABLE[0]))
+
+#ifdef NDEBUG
+#define ASSERT_SORTED(TABLE)
+#else
+#define ASSERT_SORTED(TABLE) \
+ { static bool TABLE##Checked = false; \
+ if (!TABLE##Checked) { \
+ assert(TableIsSorted(TABLE, ARRAY_SIZE(TABLE)) && \
+ "All lookup tables must be sorted for efficient access!"); \
+ TABLE##Checked = true; \
+ } \
+ }
+#endif
+
+
+MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI,
+ unsigned i,
+ int FrameIndex) const {
+ // Check switch flag
+ if (NoFusing) return NULL;
+
+ // Table (and size) to search
+ const TableEntry *OpcodeTablePtr = NULL;
+ unsigned OpcodeTableSize = 0;
+ bool isTwoAddrFold = false;
+ unsigned NumOps = TII.getNumOperands(MI->getOpcode());
+ bool isTwoAddr = NumOps > 1 &&
+ MI->getInstrDescriptor()->getOperandConstraint(1, TOI::TIED_TO) != -1;
+
+ MachineInstr *NewMI = NULL;
+ // Folding a memory location into the two-address part of a two-address
+ // instruction is different than folding it other places. It requires
+ // replacing the *two* registers with the memory location.
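+  // For example, folding a stack slot into the tied operands of
+  // "ADD32rr EAX, EAX, EBX" roughly produces "ADD32mr [slot], EBX", per the
+  // table below.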
+ if (isTwoAddr && NumOps >= 2 && i < 2 &&
+ MI->getOperand(0).isReg() &&
+ MI->getOperand(1).isReg() &&
+ MI->getOperand(0).getReg() == MI->getOperand(1).getReg()) {
+ static const TableEntry OpcodeTable[] = {
+ { X86::ADC32ri, X86::ADC32mi },
+ { X86::ADC32ri8, X86::ADC32mi8 },
+ { X86::ADC32rr, X86::ADC32mr },
+ { X86::ADC64ri32, X86::ADC64mi32 },
+ { X86::ADC64ri8, X86::ADC64mi8 },
+ { X86::ADC64rr, X86::ADC64mr },
+ { X86::ADD16ri, X86::ADD16mi },
+ { X86::ADD16ri8, X86::ADD16mi8 },
+ { X86::ADD16rr, X86::ADD16mr },
+ { X86::ADD32ri, X86::ADD32mi },
+ { X86::ADD32ri8, X86::ADD32mi8 },
+ { X86::ADD32rr, X86::ADD32mr },
+ { X86::ADD64ri32, X86::ADD64mi32 },
+ { X86::ADD64ri8, X86::ADD64mi8 },
+ { X86::ADD64rr, X86::ADD64mr },
+ { X86::ADD8ri, X86::ADD8mi },
+ { X86::ADD8rr, X86::ADD8mr },
+ { X86::AND16ri, X86::AND16mi },
+ { X86::AND16ri8, X86::AND16mi8 },
+ { X86::AND16rr, X86::AND16mr },
+ { X86::AND32ri, X86::AND32mi },
+ { X86::AND32ri8, X86::AND32mi8 },
+ { X86::AND32rr, X86::AND32mr },
+ { X86::AND64ri32, X86::AND64mi32 },
+ { X86::AND64ri8, X86::AND64mi8 },
+ { X86::AND64rr, X86::AND64mr },
+ { X86::AND8ri, X86::AND8mi },
+ { X86::AND8rr, X86::AND8mr },
+ { X86::DEC16r, X86::DEC16m },
+ { X86::DEC32r, X86::DEC32m },
+ { X86::DEC64_16r, X86::DEC16m },
+ { X86::DEC64_32r, X86::DEC32m },
+ { X86::DEC64r, X86::DEC64m },
+ { X86::DEC8r, X86::DEC8m },
+ { X86::INC16r, X86::INC16m },
+ { X86::INC32r, X86::INC32m },
+ { X86::INC64_16r, X86::INC16m },
+ { X86::INC64_32r, X86::INC32m },
+ { X86::INC64r, X86::INC64m },
+ { X86::INC8r, X86::INC8m },
+ { X86::NEG16r, X86::NEG16m },
+ { X86::NEG32r, X86::NEG32m },
+ { X86::NEG64r, X86::NEG64m },
+ { X86::NEG8r, X86::NEG8m },
+ { X86::NOT16r, X86::NOT16m },
+ { X86::NOT32r, X86::NOT32m },
+ { X86::NOT64r, X86::NOT64m },
+ { X86::NOT8r, X86::NOT8m },
+ { X86::OR16ri, X86::OR16mi },
+ { X86::OR16ri8, X86::OR16mi8 },
+ { X86::OR16rr, X86::OR16mr },
+ { X86::OR32ri, X86::OR32mi },
+ { X86::OR32ri8, X86::OR32mi8 },
+ { X86::OR32rr, X86::OR32mr },
+ { X86::OR64ri32, X86::OR64mi32 },
+ { X86::OR64ri8, X86::OR64mi8 },
+ { X86::OR64rr, X86::OR64mr },
+ { X86::OR8ri, X86::OR8mi },
+ { X86::OR8rr, X86::OR8mr },
+ { X86::ROL16r1, X86::ROL16m1 },
+ { X86::ROL16rCL, X86::ROL16mCL },
+ { X86::ROL16ri, X86::ROL16mi },
+ { X86::ROL32r1, X86::ROL32m1 },
+ { X86::ROL32rCL, X86::ROL32mCL },
+ { X86::ROL32ri, X86::ROL32mi },
+ { X86::ROL64r1, X86::ROL64m1 },
+ { X86::ROL64rCL, X86::ROL64mCL },
+ { X86::ROL64ri, X86::ROL64mi },
+ { X86::ROL8r1, X86::ROL8m1 },
+ { X86::ROL8rCL, X86::ROL8mCL },
+ { X86::ROL8ri, X86::ROL8mi },
+ { X86::ROR16r1, X86::ROR16m1 },
+ { X86::ROR16rCL, X86::ROR16mCL },
+ { X86::ROR16ri, X86::ROR16mi },
+ { X86::ROR32r1, X86::ROR32m1 },
+ { X86::ROR32rCL, X86::ROR32mCL },
+ { X86::ROR32ri, X86::ROR32mi },
+ { X86::ROR64r1, X86::ROR64m1 },
+ { X86::ROR64rCL, X86::ROR64mCL },
+ { X86::ROR64ri, X86::ROR64mi },
+ { X86::ROR8r1, X86::ROR8m1 },
+ { X86::ROR8rCL, X86::ROR8mCL },
+ { X86::ROR8ri, X86::ROR8mi },
+ { X86::SAR16r1, X86::SAR16m1 },
+ { X86::SAR16rCL, X86::SAR16mCL },
+ { X86::SAR16ri, X86::SAR16mi },
+ { X86::SAR32r1, X86::SAR32m1 },
+ { X86::SAR32rCL, X86::SAR32mCL },
+ { X86::SAR32ri, X86::SAR32mi },
+ { X86::SAR64r1, X86::SAR64m1 },
+ { X86::SAR64rCL, X86::SAR64mCL },
+ { X86::SAR64ri, X86::SAR64mi },
+ { X86::SAR8r1, X86::SAR8m1 },
+ { X86::SAR8rCL, X86::SAR8mCL },
+ { X86::SAR8ri, X86::SAR8mi },
+ { X86::SBB32ri, X86::SBB32mi },
+ { X86::SBB32ri8, X86::SBB32mi8 },
+ { X86::SBB32rr, X86::SBB32mr },
+ { X86::SBB64ri32, X86::SBB64mi32 },
+ { X86::SBB64ri8, X86::SBB64mi8 },
+ { X86::SBB64rr, X86::SBB64mr },
+ { X86::SHL16r1, X86::SHL16m1 },
+ { X86::SHL16rCL, X86::SHL16mCL },
+ { X86::SHL16ri, X86::SHL16mi },
+ { X86::SHL32r1, X86::SHL32m1 },
+ { X86::SHL32rCL, X86::SHL32mCL },
+ { X86::SHL32ri, X86::SHL32mi },
+ { X86::SHL64r1, X86::SHL64m1 },
+ { X86::SHL64rCL, X86::SHL64mCL },
+ { X86::SHL64ri, X86::SHL64mi },
+ { X86::SHL8r1, X86::SHL8m1 },
+ { X86::SHL8rCL, X86::SHL8mCL },
+ { X86::SHL8ri, X86::SHL8mi },
+ { X86::SHLD16rrCL, X86::SHLD16mrCL },
+ { X86::SHLD16rri8, X86::SHLD16mri8 },
+ { X86::SHLD32rrCL, X86::SHLD32mrCL },
+ { X86::SHLD32rri8, X86::SHLD32mri8 },
+ { X86::SHLD64rrCL, X86::SHLD64mrCL },
+ { X86::SHLD64rri8, X86::SHLD64mri8 },
+ { X86::SHR16r1, X86::SHR16m1 },
+ { X86::SHR16rCL, X86::SHR16mCL },
+ { X86::SHR16ri, X86::SHR16mi },
+ { X86::SHR32r1, X86::SHR32m1 },
+ { X86::SHR32rCL, X86::SHR32mCL },
+ { X86::SHR32ri, X86::SHR32mi },
+ { X86::SHR64r1, X86::SHR64m1 },
+ { X86::SHR64rCL, X86::SHR64mCL },
+ { X86::SHR64ri, X86::SHR64mi },
+ { X86::SHR8r1, X86::SHR8m1 },
+ { X86::SHR8rCL, X86::SHR8mCL },
+ { X86::SHR8ri, X86::SHR8mi },
+ { X86::SHRD16rrCL, X86::SHRD16mrCL },
+ { X86::SHRD16rri8, X86::SHRD16mri8 },
+ { X86::SHRD32rrCL, X86::SHRD32mrCL },
+ { X86::SHRD32rri8, X86::SHRD32mri8 },
+ { X86::SHRD64rrCL, X86::SHRD64mrCL },
+ { X86::SHRD64rri8, X86::SHRD64mri8 },
+ { X86::SUB16ri, X86::SUB16mi },
+ { X86::SUB16ri8, X86::SUB16mi8 },
+ { X86::SUB16rr, X86::SUB16mr },
+ { X86::SUB32ri, X86::SUB32mi },
+ { X86::SUB32ri8, X86::SUB32mi8 },
+ { X86::SUB32rr, X86::SUB32mr },
+ { X86::SUB64ri32, X86::SUB64mi32 },
+ { X86::SUB64ri8, X86::SUB64mi8 },
+ { X86::SUB64rr, X86::SUB64mr },
+ { X86::SUB8ri, X86::SUB8mi },
+ { X86::SUB8rr, X86::SUB8mr },
+ { X86::XOR16ri, X86::XOR16mi },
+ { X86::XOR16ri8, X86::XOR16mi8 },
+ { X86::XOR16rr, X86::XOR16mr },
+ { X86::XOR32ri, X86::XOR32mi },
+ { X86::XOR32ri8, X86::XOR32mi8 },
+ { X86::XOR32rr, X86::XOR32mr },
+ { X86::XOR64ri32, X86::XOR64mi32 },
+ { X86::XOR64ri8, X86::XOR64mi8 },
+ { X86::XOR64rr, X86::XOR64mr },
+ { X86::XOR8ri, X86::XOR8mi },
+ { X86::XOR8rr, X86::XOR8mr }
+ };
+ ASSERT_SORTED(OpcodeTable);
+ OpcodeTablePtr = OpcodeTable;
+ OpcodeTableSize = ARRAY_SIZE(OpcodeTable);
+ isTwoAddrFold = true;
+ } else if (i == 0) { // If operand 0
+ if (MI->getOpcode() == X86::MOV16r0)
+ NewMI = MakeM0Inst(TII, X86::MOV16mi, FrameIndex, MI);
+ else if (MI->getOpcode() == X86::MOV32r0)
+ NewMI = MakeM0Inst(TII, X86::MOV32mi, FrameIndex, MI);
+ else if (MI->getOpcode() == X86::MOV64r0)
+ NewMI = MakeM0Inst(TII, X86::MOV64mi32, FrameIndex, MI);
+ else if (MI->getOpcode() == X86::MOV8r0)
+ NewMI = MakeM0Inst(TII, X86::MOV8mi, FrameIndex, MI);
+ if (NewMI) {
+ NewMI->copyKillDeadInfo(MI);
+ return NewMI;
+ }
+
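+    // This table folds operand 0 into the memory reference, producing the
+    // memory-first form of the instruction (e.g. MOV32rr -> MOV32mr,
+    // CMP32ri -> CMP32mi).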
+ static const TableEntry OpcodeTable[] = {
+ { X86::CMP16ri, X86::CMP16mi },
+ { X86::CMP16ri8, X86::CMP16mi8 },
+ { X86::CMP32ri, X86::CMP32mi },
+ { X86::CMP32ri8, X86::CMP32mi8 },
+ { X86::CMP8ri, X86::CMP8mi },
+ { X86::DIV16r, X86::DIV16m },
+ { X86::DIV32r, X86::DIV32m },
+ { X86::DIV64r, X86::DIV64m },
+ { X86::DIV8r, X86::DIV8m },
+ { X86::FsMOVAPDrr, X86::MOVSDmr },
+ { X86::FsMOVAPSrr, X86::MOVSSmr },
+ { X86::IDIV16r, X86::IDIV16m },
+ { X86::IDIV32r, X86::IDIV32m },
+ { X86::IDIV64r, X86::IDIV64m },
+ { X86::IDIV8r, X86::IDIV8m },
+ { X86::IMUL16r, X86::IMUL16m },
+ { X86::IMUL32r, X86::IMUL32m },
+ { X86::IMUL64r, X86::IMUL64m },
+ { X86::IMUL8r, X86::IMUL8m },
+ { X86::MOV16ri, X86::MOV16mi },
+ { X86::MOV16rr, X86::MOV16mr },
+ { X86::MOV32ri, X86::MOV32mi },
+ { X86::MOV32rr, X86::MOV32mr },
+ { X86::MOV64ri32, X86::MOV64mi32 },
+ { X86::MOV64rr, X86::MOV64mr },
+ { X86::MOV8ri, X86::MOV8mi },
+ { X86::MOV8rr, X86::MOV8mr },
+ { X86::MOVAPDrr, X86::MOVAPDmr },
+ { X86::MOVAPSrr, X86::MOVAPSmr },
+ { X86::MOVPDI2DIrr, X86::MOVPDI2DImr },
+ { X86::MOVPQIto64rr,X86::MOVPQIto64mr },
+ { X86::MOVPS2SSrr, X86::MOVPS2SSmr },
+ { X86::MOVSDrr, X86::MOVSDmr },
+ { X86::MOVSDto64rr, X86::MOVSDto64mr },
+ { X86::MOVSS2DIrr, X86::MOVSS2DImr },
+ { X86::MOVSSrr, X86::MOVSSmr },
+ { X86::MOVUPDrr, X86::MOVUPDmr },
+ { X86::MOVUPSrr, X86::MOVUPSmr },
+ { X86::MUL16r, X86::MUL16m },
+ { X86::MUL32r, X86::MUL32m },
+ { X86::MUL64r, X86::MUL64m },
+ { X86::MUL8r, X86::MUL8m },
+ { X86::SETAEr, X86::SETAEm },
+ { X86::SETAr, X86::SETAm },
+ { X86::SETBEr, X86::SETBEm },
+ { X86::SETBr, X86::SETBm },
+ { X86::SETEr, X86::SETEm },
+ { X86::SETGEr, X86::SETGEm },
+ { X86::SETGr, X86::SETGm },
+ { X86::SETLEr, X86::SETLEm },
+ { X86::SETLr, X86::SETLm },
+ { X86::SETNEr, X86::SETNEm },
+ { X86::SETNPr, X86::SETNPm },
+ { X86::SETNSr, X86::SETNSm },
+ { X86::SETPr, X86::SETPm },
+ { X86::SETSr, X86::SETSm },
+ { X86::TEST16ri, X86::TEST16mi },
+ { X86::TEST32ri, X86::TEST32mi },
+ { X86::TEST64ri32, X86::TEST64mi32 },
+ { X86::TEST8ri, X86::TEST8mi },
+ { X86::XCHG16rr, X86::XCHG16mr },
+ { X86::XCHG32rr, X86::XCHG32mr },
+ { X86::XCHG64rr, X86::XCHG64mr },
+ { X86::XCHG8rr, X86::XCHG8mr }
+ };
+ ASSERT_SORTED(OpcodeTable);
+ OpcodeTablePtr = OpcodeTable;
+ OpcodeTableSize = ARRAY_SIZE(OpcodeTable);
+ } else if (i == 1) {
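+    // This table folds operand 1 into the memory reference, producing the
+    // reg/mem form of the instruction (e.g. MOV32rr -> MOV32rm,
+    // CMP32rr -> CMP32rm).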
+ static const TableEntry OpcodeTable[] = {
+ { X86::CMP16rr, X86::CMP16rm },
+ { X86::CMP32rr, X86::CMP32rm },
+ { X86::CMP64ri32, X86::CMP64mi32 },
+ { X86::CMP64ri8, X86::CMP64mi8 },
+ { X86::CMP64rr, X86::CMP64rm },
+ { X86::CMP8rr, X86::CMP8rm },
+ { X86::CMPPDrri, X86::CMPPDrmi },
+ { X86::CMPPSrri, X86::CMPPSrmi },
+ { X86::CMPSDrr, X86::CMPSDrm },
+ { X86::CMPSSrr, X86::CMPSSrm },
+ { X86::CVTSD2SSrr, X86::CVTSD2SSrm },
+ { X86::CVTSI2SD64rr, X86::CVTSI2SD64rm },
+ { X86::CVTSI2SDrr, X86::CVTSI2SDrm },
+ { X86::CVTSI2SS64rr, X86::CVTSI2SS64rm },
+ { X86::CVTSI2SSrr, X86::CVTSI2SSrm },
+ { X86::CVTSS2SDrr, X86::CVTSS2SDrm },
+ { X86::CVTTSD2SI64rr, X86::CVTTSD2SI64rm },
+ { X86::CVTTSD2SIrr, X86::CVTTSD2SIrm },
+ { X86::CVTTSS2SI64rr, X86::CVTTSS2SI64rm },
+ { X86::CVTTSS2SIrr, X86::CVTTSS2SIrm },
+ { X86::FsMOVAPDrr, X86::MOVSDrm },
+ { X86::FsMOVAPSrr, X86::MOVSSrm },
+ { X86::IMUL16rri, X86::IMUL16rmi },
+ { X86::IMUL16rri8, X86::IMUL16rmi8 },
+ { X86::IMUL32rri, X86::IMUL32rmi },
+ { X86::IMUL32rri8, X86::IMUL32rmi8 },
+ { X86::IMUL64rr, X86::IMUL64rm },
+ { X86::IMUL64rri32, X86::IMUL64rmi32 },
+ { X86::IMUL64rri8, X86::IMUL64rmi8 },
+ { X86::Int_CMPSDrr, X86::Int_CMPSDrm },
+ { X86::Int_CMPSSrr, X86::Int_CMPSSrm },
+ { X86::Int_COMISDrr, X86::Int_COMISDrm },
+ { X86::Int_COMISSrr, X86::Int_COMISSrm },
+ { X86::Int_CVTDQ2PDrr, X86::Int_CVTDQ2PDrm },
+ { X86::Int_CVTDQ2PSrr, X86::Int_CVTDQ2PSrm },
+ { X86::Int_CVTPD2DQrr, X86::Int_CVTPD2DQrm },
+ { X86::Int_CVTPD2PSrr, X86::Int_CVTPD2PSrm },
+ { X86::Int_CVTPS2DQrr, X86::Int_CVTPS2DQrm },
+ { X86::Int_CVTPS2PDrr, X86::Int_CVTPS2PDrm },
+ { X86::Int_CVTSD2SI64rr,X86::Int_CVTSD2SI64rm },
+ { X86::Int_CVTSD2SIrr, X86::Int_CVTSD2SIrm },
+ { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm },
+ { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm },
+ { X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm },
+ { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm },
+ { X86::Int_CVTSI2SSrr, X86::Int_CVTSI2SSrm },
+ { X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm },
+ { X86::Int_CVTSS2SI64rr,X86::Int_CVTSS2SI64rm },
+ { X86::Int_CVTSS2SIrr, X86::Int_CVTSS2SIrm },
+ { X86::Int_CVTTPD2DQrr, X86::Int_CVTTPD2DQrm },
+ { X86::Int_CVTTPS2DQrr, X86::Int_CVTTPS2DQrm },
+ { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm },
+ { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm },
+ { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm },
+ { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm },
+ { X86::Int_UCOMISDrr, X86::Int_UCOMISDrm },
+ { X86::Int_UCOMISSrr, X86::Int_UCOMISSrm },
+ { X86::MOV16rr, X86::MOV16rm },
+ { X86::MOV32rr, X86::MOV32rm },
+ { X86::MOV64rr, X86::MOV64rm },
+ { X86::MOV64toPQIrr, X86::MOV64toPQIrm },
+ { X86::MOV64toSDrr, X86::MOV64toSDrm },
+ { X86::MOV8rr, X86::MOV8rm },
+ { X86::MOVAPDrr, X86::MOVAPDrm },
+ { X86::MOVAPSrr, X86::MOVAPSrm },
+ { X86::MOVDDUPrr, X86::MOVDDUPrm },
+ { X86::MOVDI2PDIrr, X86::MOVDI2PDIrm },
+ { X86::MOVDI2SSrr, X86::MOVDI2SSrm },
+ { X86::MOVSD2PDrr, X86::MOVSD2PDrm },
+ { X86::MOVSDrr, X86::MOVSDrm },
+ { X86::MOVSHDUPrr, X86::MOVSHDUPrm },
+ { X86::MOVSLDUPrr, X86::MOVSLDUPrm },
+ { X86::MOVSS2PSrr, X86::MOVSS2PSrm },
+ { X86::MOVSSrr, X86::MOVSSrm },
+ { X86::MOVSX16rr8, X86::MOVSX16rm8 },
+ { X86::MOVSX32rr16, X86::MOVSX32rm16 },
+ { X86::MOVSX32rr8, X86::MOVSX32rm8 },
+ { X86::MOVSX64rr16, X86::MOVSX64rm16 },
+ { X86::MOVSX64rr32, X86::MOVSX64rm32 },
+ { X86::MOVSX64rr8, X86::MOVSX64rm8 },
+ { X86::MOVUPDrr, X86::MOVUPDrm },
+ { X86::MOVUPSrr, X86::MOVUPSrm },
+ { X86::MOVZX16rr8, X86::MOVZX16rm8 },
+ { X86::MOVZX32rr16, X86::MOVZX32rm16 },
+ { X86::MOVZX32rr8, X86::MOVZX32rm8 },
+ { X86::MOVZX64rr16, X86::MOVZX64rm16 },
+ { X86::MOVZX64rr8, X86::MOVZX64rm8 },
+ { X86::PSHUFDri, X86::PSHUFDmi },
+ { X86::PSHUFHWri, X86::PSHUFHWmi },
+ { X86::PSHUFLWri, X86::PSHUFLWmi },
+ { X86::PsMOVZX64rr32, X86::PsMOVZX64rm32 },
+ { X86::TEST16rr, X86::TEST16rm },
+ { X86::TEST32rr, X86::TEST32rm },
+ { X86::TEST64rr, X86::TEST64rm },
+ { X86::TEST8rr, X86::TEST8rm },
+ // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0
+ { X86::UCOMISDrr, X86::UCOMISDrm },
+ { X86::UCOMISSrr, X86::UCOMISSrm },
+ { X86::XCHG16rr, X86::XCHG16rm },
+ { X86::XCHG32rr, X86::XCHG32rm },
+ { X86::XCHG64rr, X86::XCHG64rm },
+ { X86::XCHG8rr, X86::XCHG8rm }
+ };
+ ASSERT_SORTED(OpcodeTable);
+ OpcodeTablePtr = OpcodeTable;
+ OpcodeTableSize = ARRAY_SIZE(OpcodeTable);
+ } else if (i == 2) {
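+    // This table folds operand 2 (the second source of a three-operand
+    // instruction) into the memory reference (e.g. ADD32rr -> ADD32rm).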
+ static const TableEntry OpcodeTable[] = {
+ { X86::ADC32rr, X86::ADC32rm },
+ { X86::ADC64rr, X86::ADC64rm },
+ { X86::ADD16rr, X86::ADD16rm },
+ { X86::ADD32rr, X86::ADD32rm },
+ { X86::ADD64rr, X86::ADD64rm },
+ { X86::ADD8rr, X86::ADD8rm },
+ { X86::ADDPDrr, X86::ADDPDrm },
+ { X86::ADDPSrr, X86::ADDPSrm },
+ { X86::ADDSDrr, X86::ADDSDrm },
+ { X86::ADDSSrr, X86::ADDSSrm },
+ { X86::ADDSUBPDrr, X86::ADDSUBPDrm },
+ { X86::ADDSUBPSrr, X86::ADDSUBPSrm },
+ { X86::AND16rr, X86::AND16rm },
+ { X86::AND32rr, X86::AND32rm },
+ { X86::AND64rr, X86::AND64rm },
+ { X86::AND8rr, X86::AND8rm },
+ { X86::ANDNPDrr, X86::ANDNPDrm },
+ { X86::ANDNPSrr, X86::ANDNPSrm },
+ { X86::ANDPDrr, X86::ANDPDrm },
+ { X86::ANDPSrr, X86::ANDPSrm },
+ { X86::CMOVA16rr, X86::CMOVA16rm },
+ { X86::CMOVA32rr, X86::CMOVA32rm },
+ { X86::CMOVA64rr, X86::CMOVA64rm },
+ { X86::CMOVAE16rr, X86::CMOVAE16rm },
+ { X86::CMOVAE32rr, X86::CMOVAE32rm },
+ { X86::CMOVAE64rr, X86::CMOVAE64rm },
+ { X86::CMOVB16rr, X86::CMOVB16rm },
+ { X86::CMOVB32rr, X86::CMOVB32rm },
+ { X86::CMOVB64rr, X86::CMOVB64rm },
+ { X86::CMOVBE16rr, X86::CMOVBE16rm },
+ { X86::CMOVBE32rr, X86::CMOVBE32rm },
+ { X86::CMOVBE64rr, X86::CMOVBE64rm },
+ { X86::CMOVE16rr, X86::CMOVE16rm },
+ { X86::CMOVE32rr, X86::CMOVE32rm },
+ { X86::CMOVE64rr, X86::CMOVE64rm },
+ { X86::CMOVG16rr, X86::CMOVG16rm },
+ { X86::CMOVG32rr, X86::CMOVG32rm },
+ { X86::CMOVG64rr, X86::CMOVG64rm },
+ { X86::CMOVGE16rr, X86::CMOVGE16rm },
+ { X86::CMOVGE32rr, X86::CMOVGE32rm },
+ { X86::CMOVGE64rr, X86::CMOVGE64rm },
+ { X86::CMOVL16rr, X86::CMOVL16rm },
+ { X86::CMOVL32rr, X86::CMOVL32rm },
+ { X86::CMOVL64rr, X86::CMOVL64rm },
+ { X86::CMOVLE16rr, X86::CMOVLE16rm },
+ { X86::CMOVLE32rr, X86::CMOVLE32rm },
+ { X86::CMOVLE64rr, X86::CMOVLE64rm },
+ { X86::CMOVNE16rr, X86::CMOVNE16rm },
+ { X86::CMOVNE32rr, X86::CMOVNE32rm },
+ { X86::CMOVNE64rr, X86::CMOVNE64rm },
+ { X86::CMOVNP16rr, X86::CMOVNP16rm },
+ { X86::CMOVNP32rr, X86::CMOVNP32rm },
+ { X86::CMOVNP64rr, X86::CMOVNP64rm },
+ { X86::CMOVNS16rr, X86::CMOVNS16rm },
+ { X86::CMOVNS32rr, X86::CMOVNS32rm },
+ { X86::CMOVNS64rr, X86::CMOVNS64rm },
+ { X86::CMOVP16rr, X86::CMOVP16rm },
+ { X86::CMOVP32rr, X86::CMOVP32rm },
+ { X86::CMOVP64rr, X86::CMOVP64rm },
+ { X86::CMOVS16rr, X86::CMOVS16rm },
+ { X86::CMOVS32rr, X86::CMOVS32rm },
+ { X86::CMOVS64rr, X86::CMOVS64rm },
+ { X86::DIVPDrr, X86::DIVPDrm },
+ { X86::DIVPSrr, X86::DIVPSrm },
+ { X86::DIVSDrr, X86::DIVSDrm },
+ { X86::DIVSSrr, X86::DIVSSrm },
+ { X86::HADDPDrr, X86::HADDPDrm },
+ { X86::HADDPSrr, X86::HADDPSrm },
+ { X86::HSUBPDrr, X86::HSUBPDrm },
+ { X86::HSUBPSrr, X86::HSUBPSrm },
+ { X86::IMUL16rr, X86::IMUL16rm },
+ { X86::IMUL32rr, X86::IMUL32rm },
+ { X86::MAXPDrr, X86::MAXPDrm },
+ { X86::MAXPDrr_Int, X86::MAXPDrm_Int },
+ { X86::MAXPSrr, X86::MAXPSrm },
+ { X86::MAXPSrr_Int, X86::MAXPSrm_Int },
+ { X86::MAXSDrr, X86::MAXSDrm },
+ { X86::MAXSDrr_Int, X86::MAXSDrm_Int },
+ { X86::MAXSSrr, X86::MAXSSrm },
+ { X86::MAXSSrr_Int, X86::MAXSSrm_Int },
+ { X86::MINPDrr, X86::MINPDrm },
+ { X86::MINPDrr_Int, X86::MINPDrm_Int },
+ { X86::MINPSrr, X86::MINPSrm },
+ { X86::MINPSrr_Int, X86::MINPSrm_Int },
+ { X86::MINSDrr, X86::MINSDrm },
+ { X86::MINSDrr_Int, X86::MINSDrm_Int },
+ { X86::MINSSrr, X86::MINSSrm },
+ { X86::MINSSrr_Int, X86::MINSSrm_Int },
+ { X86::MULPDrr, X86::MULPDrm },
+ { X86::MULPSrr, X86::MULPSrm },
+ { X86::MULSDrr, X86::MULSDrm },
+ { X86::MULSSrr, X86::MULSSrm },
+ { X86::OR16rr, X86::OR16rm },
+ { X86::OR32rr, X86::OR32rm },
+ { X86::OR64rr, X86::OR64rm },
+ { X86::OR8rr, X86::OR8rm },
+ { X86::ORPDrr, X86::ORPDrm },
+ { X86::ORPSrr, X86::ORPSrm },
+ { X86::PACKSSDWrr, X86::PACKSSDWrm },
+ { X86::PACKSSWBrr, X86::PACKSSWBrm },
+ { X86::PACKUSWBrr, X86::PACKUSWBrm },
+ { X86::PADDBrr, X86::PADDBrm },
+ { X86::PADDDrr, X86::PADDDrm },
+ { X86::PADDQrr, X86::PADDQrm },
+ { X86::PADDSBrr, X86::PADDSBrm },
+ { X86::PADDSWrr, X86::PADDSWrm },
+ { X86::PADDWrr, X86::PADDWrm },
+ { X86::PANDNrr, X86::PANDNrm },
+ { X86::PANDrr, X86::PANDrm },
+ { X86::PAVGBrr, X86::PAVGBrm },
+ { X86::PAVGWrr, X86::PAVGWrm },
+ { X86::PCMPEQBrr, X86::PCMPEQBrm },
+ { X86::PCMPEQDrr, X86::PCMPEQDrm },
+ { X86::PCMPEQWrr, X86::PCMPEQWrm },
+ { X86::PCMPGTBrr, X86::PCMPGTBrm },
+ { X86::PCMPGTDrr, X86::PCMPGTDrm },
+ { X86::PCMPGTWrr, X86::PCMPGTWrm },
+ { X86::PINSRWrri, X86::PINSRWrmi },
+ { X86::PMADDWDrr, X86::PMADDWDrm },
+ { X86::PMAXSWrr, X86::PMAXSWrm },
+ { X86::PMAXUBrr, X86::PMAXUBrm },
+ { X86::PMINSWrr, X86::PMINSWrm },
+ { X86::PMINUBrr, X86::PMINUBrm },
+ { X86::PMULHUWrr, X86::PMULHUWrm },
+ { X86::PMULHWrr, X86::PMULHWrm },
+ { X86::PMULLWrr, X86::PMULLWrm },
+ { X86::PMULUDQrr, X86::PMULUDQrm },
+ { X86::PORrr, X86::PORrm },
+ { X86::PSADBWrr, X86::PSADBWrm },
+ { X86::PSLLDrr, X86::PSLLDrm },
+ { X86::PSLLQrr, X86::PSLLQrm },
+ { X86::PSLLWrr, X86::PSLLWrm },
+ { X86::PSRADrr, X86::PSRADrm },
+ { X86::PSRAWrr, X86::PSRAWrm },
+ { X86::PSRLDrr, X86::PSRLDrm },
+ { X86::PSRLQrr, X86::PSRLQrm },
+ { X86::PSRLWrr, X86::PSRLWrm },
+ { X86::PSUBBrr, X86::PSUBBrm },
+ { X86::PSUBDrr, X86::PSUBDrm },
+ { X86::PSUBSBrr, X86::PSUBSBrm },
+ { X86::PSUBSWrr, X86::PSUBSWrm },
+ { X86::PSUBWrr, X86::PSUBWrm },
+ { X86::PUNPCKHBWrr, X86::PUNPCKHBWrm },
+ { X86::PUNPCKHDQrr, X86::PUNPCKHDQrm },
+ { X86::PUNPCKHQDQrr, X86::PUNPCKHQDQrm },
+ { X86::PUNPCKHWDrr, X86::PUNPCKHWDrm },
+ { X86::PUNPCKLBWrr, X86::PUNPCKLBWrm },
+ { X86::PUNPCKLDQrr, X86::PUNPCKLDQrm },
+ { X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm },
+ { X86::PUNPCKLWDrr, X86::PUNPCKLWDrm },
+ { X86::PXORrr, X86::PXORrm },
+ { X86::RCPPSr, X86::RCPPSm },
+ { X86::RCPPSr_Int, X86::RCPPSm_Int },
+ { X86::RSQRTPSr, X86::RSQRTPSm },
+ { X86::RSQRTPSr_Int, X86::RSQRTPSm_Int },
+ { X86::RSQRTSSr, X86::RSQRTSSm },
+ { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int },
+ { X86::SBB32rr, X86::SBB32rm },
+ { X86::SBB64rr, X86::SBB64rm },
+ { X86::SHUFPDrri, X86::SHUFPDrmi },
+ { X86::SHUFPSrri, X86::SHUFPSrmi },
+ { X86::SQRTPDr, X86::SQRTPDm },
+ { X86::SQRTPDr_Int, X86::SQRTPDm_Int },
+ { X86::SQRTPSr, X86::SQRTPSm },
+ { X86::SQRTPSr_Int, X86::SQRTPSm_Int },
+ { X86::SQRTSDr, X86::SQRTSDm },
+ { X86::SQRTSDr_Int, X86::SQRTSDm_Int },
+ { X86::SQRTSSr, X86::SQRTSSm },
+ { X86::SQRTSSr_Int, X86::SQRTSSm_Int },
+ { X86::SUB16rr, X86::SUB16rm },
+ { X86::SUB32rr, X86::SUB32rm },
+ { X86::SUB64rr, X86::SUB64rm },
+ { X86::SUB8rr, X86::SUB8rm },
+ { X86::SUBPDrr, X86::SUBPDrm },
+ { X86::SUBPSrr, X86::SUBPSrm },
+ { X86::SUBSDrr, X86::SUBSDrm },
+ { X86::SUBSSrr, X86::SUBSSrm },
+ // FIXME: TEST*rr -> swapped operand of TEST*mr.
+ { X86::UNPCKHPDrr, X86::UNPCKHPDrm },
+ { X86::UNPCKHPSrr, X86::UNPCKHPSrm },
+ { X86::UNPCKLPDrr, X86::UNPCKLPDrm },
+ { X86::UNPCKLPSrr, X86::UNPCKLPSrm },
+ { X86::XOR16rr, X86::XOR16rm },
+ { X86::XOR32rr, X86::XOR32rm },
+ { X86::XOR64rr, X86::XOR64rm },
+ { X86::XOR8rr, X86::XOR8rm },
+ { X86::XORPDrr, X86::XORPDrm },
+ { X86::XORPSrr, X86::XORPSrm }
+ };
+ ASSERT_SORTED(OpcodeTable);
+ OpcodeTablePtr = OpcodeTable;
+ OpcodeTableSize = ARRAY_SIZE(OpcodeTable);
+ }
+
+ // If table selected...
+ if (OpcodeTablePtr) {
+ // Find the Opcode to fuse
+ unsigned fromOpcode = MI->getOpcode();
+ // Lookup fromOpcode in table
+ if (const TableEntry *Entry = TableLookup(OpcodeTablePtr, OpcodeTableSize,
+ fromOpcode)) {
+ if (isTwoAddrFold)
+ NewMI = FuseTwoAddrInst(Entry->to, FrameIndex, MI, TII);
+ else
+ NewMI = FuseInst(Entry->to, i, FrameIndex, MI, TII);
+ NewMI->copyKillDeadInfo(MI);
+ return NewMI;
+ }
+ }
+
+ // No fusion
+ if (PrintFailedFusing)
+ cerr << "We failed to fuse ("
+ << ((i == 1) ? "r" : "s") << "): " << *MI;
+ return NULL;
+}
+
+
+const unsigned *X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
+ const {
+ static const unsigned CalleeSavedRegs32Bit[] = {
+ X86::ESI, X86::EDI, X86::EBX, X86::EBP, 0
+ };
+
+ static const unsigned CalleeSavedRegs32EHRet[] = {
+ X86::EAX, X86::EDX, X86::ESI, X86::EDI, X86::EBX, X86::EBP, 0
+ };
+
+ static const unsigned CalleeSavedRegs64Bit[] = {
+ X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
+ };
+
+ if (Is64Bit)
+ return CalleeSavedRegs64Bit;
+ else {
+ if (MF) {
+ MachineFrameInfo *MFI = MF->getFrameInfo();
+ MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+ if (MMI && MMI->callsEHReturn())
+ return CalleeSavedRegs32EHRet;
+ }
+ return CalleeSavedRegs32Bit;
+ }
+}
+
+const TargetRegisterClass* const*
+X86RegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+ static const TargetRegisterClass * const CalleeSavedRegClasses32Bit[] = {
+ &X86::GR32RegClass, &X86::GR32RegClass,
+ &X86::GR32RegClass, &X86::GR32RegClass, 0
+ };
+ static const TargetRegisterClass * const CalleeSavedRegClasses32EHRet[] = {
+ &X86::GR32RegClass, &X86::GR32RegClass,
+ &X86::GR32RegClass, &X86::GR32RegClass,
+ &X86::GR32RegClass, &X86::GR32RegClass, 0
+ };
+ static const TargetRegisterClass * const CalleeSavedRegClasses64Bit[] = {
+ &X86::GR64RegClass, &X86::GR64RegClass,
+ &X86::GR64RegClass, &X86::GR64RegClass,
+ &X86::GR64RegClass, &X86::GR64RegClass, 0
+ };
+
+ if (Is64Bit)
+ return CalleeSavedRegClasses64Bit;
+ else {
+ if (MF) {
+ MachineFrameInfo *MFI = MF->getFrameInfo();
+ MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+ if (MMI && MMI->callsEHReturn())
+ return CalleeSavedRegClasses32EHRet;
+ }
+ return CalleeSavedRegClasses32Bit;
+ }
+
+}
+
+BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+ BitVector Reserved(getNumRegs());
+ Reserved.set(X86::RSP);
+ Reserved.set(X86::ESP);
+ Reserved.set(X86::SP);
+ Reserved.set(X86::SPL);
+ if (hasFP(MF)) {
+ Reserved.set(X86::RBP);
+ Reserved.set(X86::EBP);
+ Reserved.set(X86::BP);
+ Reserved.set(X86::BPL);
+ }
+ return Reserved;
+}
+
+//===----------------------------------------------------------------------===//
+// Stack Frame Processing methods
+//===----------------------------------------------------------------------===//
+
+// hasFP - Return true if the specified function should have a dedicated frame
+// pointer register.  This is true if frame pointer elimination is disabled,
+// if the function has variable sized allocas, if a frame pointer is otherwise
+// forced (see X86MachineFunctionInfo::ForceFramePointer), or if the machine
+// module info indicates that the function calls unwind-init.
+//
+bool X86RegisterInfo::hasFP(const MachineFunction &MF) const {
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+
+ return (NoFramePointerElim ||
+ MF.getFrameInfo()->hasVarSizedObjects() ||
+ MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
+ (MMI && MMI->callsUnwindInit()));
+}
+
+void X86RegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ if (hasFP(MF)) {
+ // If we have a frame pointer, turn the adjcallstackup instruction into a
+ // 'sub ESP, <amt>' and the adjcallstackdown instruction into 'add ESP,
+ // <amt>'
+ MachineInstr *Old = I;
+ uint64_t Amount = Old->getOperand(0).getImm();
+ if (Amount != 0) {
+ // We need to keep the stack aligned properly. To do this, we round the
+ // amount of space needed for the outgoing arguments up to the next
+ // alignment boundary.
+ unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
+ Amount = (Amount+Align-1)/Align*Align;
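+      // For example, with a 16 byte stack alignment an Amount of 20 becomes
+      // (20+15)/16*16 = 32.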
+
+ MachineInstr *New = 0;
+ if (Old->getOpcode() == X86::ADJCALLSTACKDOWN) {
+ New=BuildMI(TII.get(Is64Bit ? X86::SUB64ri32 : X86::SUB32ri), StackPtr)
+ .addReg(StackPtr).addImm(Amount);
+ } else {
+ assert(Old->getOpcode() == X86::ADJCALLSTACKUP);
+ // factor out the amount the callee already popped.
+ uint64_t CalleeAmt = Old->getOperand(1).getImm();
+ Amount -= CalleeAmt;
+ if (Amount) {
+ unsigned Opc = (Amount < 128) ?
+ (Is64Bit ? X86::ADD64ri8 : X86::ADD32ri8) :
+ (Is64Bit ? X86::ADD64ri32 : X86::ADD32ri);
+ New = BuildMI(TII.get(Opc), StackPtr)
+ .addReg(StackPtr).addImm(Amount);
+ }
+ }
+
+ // Replace the pseudo instruction with a new instruction...
+ if (New) MBB.insert(I, New);
+ }
+ } else if (I->getOpcode() == X86::ADJCALLSTACKUP) {
+ // If we are performing frame pointer elimination and if the callee pops
+ // something off the stack pointer, add it back. We do this until we have
+ // more advanced stack pointer tracking ability.
+ if (uint64_t CalleeAmt = I->getOperand(1).getImm()) {
+ unsigned Opc = (CalleeAmt < 128) ?
+ (Is64Bit ? X86::SUB64ri8 : X86::SUB32ri8) :
+ (Is64Bit ? X86::SUB64ri32 : X86::SUB32ri);
+ MachineInstr *New =
+ BuildMI(TII.get(Opc), StackPtr).addReg(StackPtr).addImm(CalleeAmt);
+ MBB.insert(I, New);
+ }
+ }
+
+ MBB.erase(I);
+}
+
+void X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+ int SPAdj, RegScavenger *RS) const{
+ assert(SPAdj == 0 && "Unexpected");
+
+ unsigned i = 0;
+ MachineInstr &MI = *II;
+ MachineFunction &MF = *MI.getParent()->getParent();
+ while (!MI.getOperand(i).isFrameIndex()) {
+ ++i;
+ assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+ }
+
+ int FrameIndex = MI.getOperand(i).getFrameIndex();
+  // This must be part of a four operand memory reference.  Replace the
+  // FrameIndex operand with the base register (EBP or ESP), and add the frame
+  // object's offset to the existing displacement.
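+  // The four operands starting at index i are: base register, scale, index
+  // register, and displacement; operand i+3 is the displacement adjusted
+  // below.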
+ MI.getOperand(i).ChangeToRegister(hasFP(MF) ? FramePtr : StackPtr, false);
+
+ // Now add the frame object offset to the offset from EBP.
+ int64_t Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
+ MI.getOperand(i+3).getImm()+SlotSize;
+
+ if (!hasFP(MF))
+ Offset += MF.getFrameInfo()->getStackSize();
+ else
+ Offset += SlotSize; // Skip the saved EBP
+
+ MI.getOperand(i+3).ChangeToImmediate(Offset);
+}
+
+void
+X86RegisterInfo::processFunctionBeforeFrameFinalized(MachineFunction &MF) const{
+ if (hasFP(MF)) {
+ // Create a frame entry for the EBP register that must be saved.
+ int FrameIdx = MF.getFrameInfo()->CreateFixedObject(SlotSize,
+ (int)SlotSize * -2);
+ assert(FrameIdx == MF.getFrameInfo()->getObjectIndexBegin() &&
+ "Slot for EBP register must be last in order to be found!");
+ }
+}
+
+/// emitSPUpdate - Emit a series of instructions to increment / decrement the
+/// stack pointer by a constant value.
+static
+void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+ unsigned StackPtr, int64_t NumBytes, bool Is64Bit,
+ const TargetInstrInfo &TII) {
+ bool isSub = NumBytes < 0;
+ uint64_t Offset = isSub ? -NumBytes : NumBytes;
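+  // Adjustments smaller than 128 bytes fit the sign-extended 8-bit immediate
+  // forms (ri8), which have a shorter encoding.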
+ unsigned Opc = isSub
+ ? ((Offset < 128) ?
+ (Is64Bit ? X86::SUB64ri8 : X86::SUB32ri8) :
+ (Is64Bit ? X86::SUB64ri32 : X86::SUB32ri))
+ : ((Offset < 128) ?
+ (Is64Bit ? X86::ADD64ri8 : X86::ADD32ri8) :
+ (Is64Bit ? X86::ADD64ri32 : X86::ADD32ri));
+ uint64_t Chunk = (1LL << 31) - 1;
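+  // The immediate forms hold at most a signed 32-bit value, so larger
+  // adjustments are emitted as a sequence of chunk-sized updates.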
+
+ while (Offset) {
+ uint64_t ThisVal = (Offset > Chunk) ? Chunk : Offset;
+ BuildMI(MBB, MBBI, TII.get(Opc), StackPtr).addReg(StackPtr).addImm(ThisVal);
+ Offset -= ThisVal;
+ }
+}
+
+void X86RegisterInfo::emitPrologue(MachineFunction &MF) const {
+ MachineBasicBlock &MBB = MF.front(); // Prolog goes in entry BB
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
+ const Function* Fn = MF.getFunction();
+ const X86Subtarget* Subtarget = &MF.getTarget().getSubtarget<X86Subtarget>();
+ MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+
+ // Prepare for frame info.
+ unsigned FrameLabelId = 0, StartLabelId = 0;
+
+ // Get the number of bytes to allocate from the FrameInfo
+ uint64_t StackSize = MFI->getStackSize();
+ uint64_t NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();
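+  // Callee-saved registers are spilled with explicit pushes, so their space is
+  // not included in the stack pointer adjustment emitted below.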
+
+ if (MMI && MMI->needsFrameInfo()) {
+ // Mark function start
+ StartLabelId = MMI->NextLabelID();
+ BuildMI(MBB, MBBI, TII.get(X86::LABEL)).addImm(StartLabelId);
+ }
+
+ if (hasFP(MF)) {
+ // Get the offset of the stack slot for the EBP register... which is
+ // guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
+ // Update the frame offset adjustment.
+ MFI->setOffsetAdjustment(SlotSize-NumBytes);
+
+ // Save EBP into the appropriate stack slot...
+ BuildMI(MBB, MBBI, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
+ .addReg(FramePtr);
+ NumBytes -= SlotSize;
+
+ if (MMI && MMI->needsFrameInfo()) {
+ // Mark effective beginning of when frame pointer becomes valid.
+ FrameLabelId = MMI->NextLabelID();
+ BuildMI(MBB, MBBI, TII.get(X86::LABEL)).addImm(FrameLabelId);
+ }
+
+ // Update EBP with the new base value...
+ BuildMI(MBB, MBBI, TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), FramePtr)
+ .addReg(StackPtr);
+ }
+
+ unsigned ReadyLabelId = 0;
+ if (MMI && MMI->needsFrameInfo()) {
+ // Mark effective beginning of when frame pointer is ready.
+ ReadyLabelId = MMI->NextLabelID();
+ BuildMI(MBB, MBBI, TII.get(X86::LABEL)).addImm(ReadyLabelId);
+ }
+
+ // Skip the callee-saved push instructions.
+ while (MBBI != MBB.end() &&
+ (MBBI->getOpcode() == X86::PUSH32r ||
+ MBBI->getOpcode() == X86::PUSH64r))
+ ++MBBI;
+
+ if (NumBytes) { // adjust stack pointer: ESP -= numbytes
+ if (NumBytes >= 4096 && Subtarget->isTargetCygMing()) {
+      // Check whether EAX is live-in for this function.
+ bool isEAXAlive = false;
+ for (MachineFunction::livein_iterator II = MF.livein_begin(),
+ EE = MF.livein_end(); (II != EE) && !isEAXAlive; ++II) {
+ unsigned Reg = II->first;
+ isEAXAlive = (Reg == X86::EAX || Reg == X86::AX ||
+ Reg == X86::AH || Reg == X86::AL);
+ }
+
+ // Function prologue calls _alloca to probe the stack when allocating
+ // more than 4k bytes in one go. Touching the stack at 4K increments is
+ // necessary to ensure that the guard pages used by the OS virtual memory
+      // manager are allocated in the correct sequence.
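+      // The allocation size is passed to _alloca in EAX, which is why EAX is
+      // saved and restored around the call when it is live-in.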
+ if (!isEAXAlive) {
+ BuildMI(MBB, MBBI, TII.get(X86::MOV32ri), X86::EAX).addImm(NumBytes);
+ BuildMI(MBB, MBBI, TII.get(X86::CALLpcrel32))
+ .addExternalSymbol("_alloca");
+ } else {
+ // Save EAX
+ BuildMI(MBB, MBBI, TII.get(X86::PUSH32r), X86::EAX);
+ // Allocate NumBytes-4 bytes on stack. We'll also use 4 already
+ // allocated bytes for EAX.
+ BuildMI(MBB, MBBI, TII.get(X86::MOV32ri), X86::EAX).addImm(NumBytes-4);
+ BuildMI(MBB, MBBI, TII.get(X86::CALLpcrel32))
+ .addExternalSymbol("_alloca");
+ // Restore EAX
+ MachineInstr *MI = addRegOffset(BuildMI(TII.get(X86::MOV32rm),X86::EAX),
+ StackPtr, NumBytes-4);
+ MBB.insert(MBBI, MI);
+ }
+ } else {
+ // If there is an ADD32ri or SUB32ri of ESP immediately after this
+ // instruction, merge the two instructions.
+ if (MBBI != MBB.end()) {
+ MachineBasicBlock::iterator NI = next(MBBI);
+ unsigned Opc = MBBI->getOpcode();
+ if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
+ Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
+ MBBI->getOperand(0).getReg() == StackPtr) {
+ NumBytes -= MBBI->getOperand(2).getImm();
+ MBB.erase(MBBI);
+ MBBI = NI;
+ } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+ Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
+ MBBI->getOperand(0).getReg() == StackPtr) {
+ NumBytes += MBBI->getOperand(2).getImm();
+ MBB.erase(MBBI);
+ MBBI = NI;
+ }
+ }
+
+ if (NumBytes)
+ emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, TII);
+ }
+ }
+
+ if (MMI && MMI->needsFrameInfo()) {
+ std::vector<MachineMove> &Moves = MMI->getFrameMoves();
+ const TargetAsmInfo *TAI = MF.getTarget().getTargetAsmInfo();
+
+    // Calculate the number of bytes used for storing the return address.
+ int stackGrowth =
+ (MF.getTarget().getFrameInfo()->getStackGrowthDirection() ==
+ TargetFrameInfo::StackGrowsUp ?
+ TAI->getAddressSize() : -TAI->getAddressSize());
+
+ if (StackSize) {
+ // Show update of SP.
+ if (hasFP(MF)) {
+ // Adjust SP
+ MachineLocation SPDst(MachineLocation::VirtualFP);
+ MachineLocation SPSrc(MachineLocation::VirtualFP, 2*stackGrowth);
+ Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
+ } else {
+ MachineLocation SPDst(MachineLocation::VirtualFP);
+ MachineLocation SPSrc(MachineLocation::VirtualFP, -StackSize+stackGrowth);
+ Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
+ }
+ } else {
+ //FIXME: Verify & implement for FP
+ MachineLocation SPDst(StackPtr);
+ MachineLocation SPSrc(StackPtr, stackGrowth);
+ Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
+ }
+
+ // Add callee saved registers to move list.
+ const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+ for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+ int64_t Offset = MFI->getObjectOffset(CSI[I].getFrameIdx());
+ unsigned Reg = CSI[I].getReg();
+ MachineLocation CSDst(MachineLocation::VirtualFP, Offset);
+ MachineLocation CSSrc(Reg);
+ Moves.push_back(MachineMove(FrameLabelId, CSDst, CSSrc));
+ }
+
+ if (hasFP(MF)) {
+ // Save FP
+ MachineLocation FPDst(MachineLocation::VirtualFP, 2*stackGrowth);
+ MachineLocation FPSrc(FramePtr);
+ Moves.push_back(MachineMove(ReadyLabelId, FPDst, FPSrc));
+ }
+
+ MachineLocation FPDst(hasFP(MF) ? FramePtr : StackPtr);
+ MachineLocation FPSrc(MachineLocation::VirtualFP);
+ Moves.push_back(MachineMove(ReadyLabelId, FPDst, FPSrc));
+ }
+
+  // If this is main() on Cygwin/MinGW32, we should align the stack as well.
+ if (Fn->hasExternalLinkage() && Fn->getName() == "main" &&
+ Subtarget->isTargetCygMing()) {
+ BuildMI(MBB, MBBI, TII.get(X86::AND32ri), X86::ESP)
+ .addReg(X86::ESP).addImm(-Align);
+
+ // Probe the stack
+ BuildMI(MBB, MBBI, TII.get(X86::MOV32ri), X86::EAX).addImm(Align);
+ BuildMI(MBB, MBBI, TII.get(X86::CALLpcrel32)).addExternalSymbol("_alloca");
+ }
+}
+
+void X86RegisterInfo::emitEpilogue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ MachineBasicBlock::iterator MBBI = prior(MBB.end());
+ unsigned RetOpcode = MBBI->getOpcode();
+
+ switch (RetOpcode) {
+ case X86::RET:
+ case X86::RETI:
+ case X86::EH_RETURN:
+ case X86::TAILJMPd:
+ case X86::TAILJMPr:
+ case X86::TAILJMPm: break; // These are ok
+ default:
+ assert(0 && "Can only insert epilog into returning blocks");
+ }
+
+ // Get the number of bytes to allocate from the FrameInfo
+ uint64_t StackSize = MFI->getStackSize();
+ unsigned CSSize = X86FI->getCalleeSavedFrameSize();
+ uint64_t NumBytes = StackSize - CSSize;
+
+ if (hasFP(MF)) {
+ // pop EBP.
+ BuildMI(MBB, MBBI, TII.get(Is64Bit ? X86::POP64r : X86::POP32r), FramePtr);
+ NumBytes -= SlotSize;
+ }
+
+ // Skip the callee-saved pop instructions.
+ while (MBBI != MBB.begin()) {
+ MachineBasicBlock::iterator PI = prior(MBBI);
+ if (PI->getOpcode() != X86::POP32r && PI->getOpcode() != X86::POP64r)
+ break;
+ --MBBI;
+ }
+
+ // If dynamic alloca is used, then reset esp to point to the last
+ // callee-saved slot before popping them off!
+ if (MFI->hasVarSizedObjects()) {
+ unsigned Opc = Is64Bit ? X86::LEA64r : X86::LEA32r;
+ MachineInstr *MI = addRegOffset(BuildMI(TII.get(Opc), StackPtr),
+ FramePtr, -CSSize);
+ MBB.insert(MBBI, MI);
+ NumBytes = 0;
+ }
+
+ if (NumBytes) { // adjust stack pointer back: ESP += numbytes
+ // If there is an ADD32ri or SUB32ri of ESP immediately before this
+ // instruction, merge the two instructions.
+ if (MBBI != MBB.begin()) {
+ MachineBasicBlock::iterator PI = prior(MBBI);
+ unsigned Opc = PI->getOpcode();
+ if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
+ Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
+ PI->getOperand(0).getReg() == StackPtr) {
+ NumBytes += PI->getOperand(2).getImm();
+ MBB.erase(PI);
+ } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+ Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
+ PI->getOperand(0).getReg() == StackPtr) {
+ NumBytes -= PI->getOperand(2).getImm();
+ MBB.erase(PI);
+ }
+ }
+
+ if (NumBytes)
+ emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, TII);
+ }
+
+ // We're returning from function via eh_return.
+ if (RetOpcode == X86::EH_RETURN) {
+ MBBI = prior(MBB.end());
+ MachineOperand &DestAddr = MBBI->getOperand(0);
+ assert(DestAddr.isReg() && "Offset should be in register!");
+ BuildMI(MBB, MBBI, TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr),StackPtr).
+ addReg(DestAddr.getReg());
+ }
+}
+
+unsigned X86RegisterInfo::getRARegister() const {
+ if (Is64Bit)
+ return X86::RIP; // Should have dwarf #16
+ else
+ return X86::EIP; // Should have dwarf #8
+}
+
+unsigned X86RegisterInfo::getFrameRegister(MachineFunction &MF) const {
+ return hasFP(MF) ? FramePtr : StackPtr;
+}
+
+void X86RegisterInfo::getInitialFrameState(std::vector<MachineMove> &Moves)
+ const {
+  // Calculate the number of bytes used for storing the return address.
+ int stackGrowth = (Is64Bit ? -8 : -4);
+
+  // Initial state of the frame pointer is esp+4 (rsp+8 in 64-bit mode).
+ MachineLocation Dst(MachineLocation::VirtualFP);
+ MachineLocation Src(StackPtr, stackGrowth);
+ Moves.push_back(MachineMove(0, Dst, Src));
+
+ // Add return address to move list
+ MachineLocation CSDst(StackPtr, stackGrowth);
+ MachineLocation CSSrc(getRARegister());
+ Moves.push_back(MachineMove(0, CSDst, CSSrc));
+}
+
+unsigned X86RegisterInfo::getEHExceptionRegister() const {
+ assert(0 && "What is the exception register");
+ return 0;
+}
+
+unsigned X86RegisterInfo::getEHHandlerRegister() const {
+ assert(0 && "What is the exception handler register");
+ return 0;
+}
+
+namespace llvm {
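+/// getX86SubSuperRegister - Return the sub- or super-register of Reg that
+/// matches the given value type.  For example, asking for the MVT::i8 part of
+/// X86::EAX returns X86::AH when High is true and X86::AL otherwise, while
+/// asking for the MVT::i64 part of X86::AL returns X86::RAX.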
+unsigned getX86SubSuperRegister(unsigned Reg, MVT::ValueType VT, bool High) {
+ switch (VT) {
+ default: return Reg;
+ case MVT::i8:
+ if (High) {
+ switch (Reg) {
+ default: return 0;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::AH;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::DH;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::CH;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::BH;
+ }
+ } else {
+ switch (Reg) {
+ default: return 0;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::AL;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::DL;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::CL;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::BL;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::SIL;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::DIL;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::BPL;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::SPL;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8B;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9B;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10B;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11B;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12B;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13B;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14B;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15B;
+ }
+ }
+ case MVT::i16:
+ switch (Reg) {
+ default: return Reg;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::AX;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::DX;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::CX;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::BX;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::SI;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::DI;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::BP;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::SP;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8W;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9W;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10W;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11W;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12W;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13W;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14W;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15W;
+ }
+ case MVT::i32:
+ switch (Reg) {
+ default: return Reg;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::EAX;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::EDX;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::ECX;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::EBX;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::ESI;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::EDI;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::EBP;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::ESP;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8D;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9D;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10D;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11D;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12D;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13D;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14D;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15D;
+ }
+ case MVT::i64:
+ switch (Reg) {
+ default: return Reg;
+ case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+ return X86::RAX;
+ case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+ return X86::RDX;
+ case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+ return X86::RCX;
+ case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+ return X86::RBX;
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::RSI;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::RDI;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::RBP;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::RSP;
+ case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+ return X86::R8;
+ case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+ return X86::R9;
+ case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+ return X86::R10;
+ case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+ return X86::R11;
+ case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+ return X86::R12;
+ case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+ return X86::R13;
+ case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+ return X86::R14;
+ case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+ return X86::R15;
+ }
+ }
+
+ return Reg;
+}
+}
+
+#include "X86GenRegisterInfo.inc"
+
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
new file mode 100644
index 0000000..ab9e33f
--- /dev/null
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -0,0 +1,130 @@
+//===- X86RegisterInfo.h - X86 Register Information Impl --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the MRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86REGISTERINFO_H
+#define X86REGISTERINFO_H
+
+#include "llvm/Target/MRegisterInfo.h"
+#include "X86GenRegisterInfo.h.inc"
+
+namespace llvm {
+ class Type;
+ class TargetInstrInfo;
+ class X86TargetMachine;
+
+class X86RegisterInfo : public X86GenRegisterInfo {
+public:
+ X86TargetMachine &TM;
+ const TargetInstrInfo &TII;
+
+private:
+  /// Is64Bit - Is the target 64-bit?
+ bool Is64Bit;
+
+ /// SlotSize - Stack slot size in bytes.
+ unsigned SlotSize;
+
+ /// StackPtr - X86 physical register used as stack ptr.
+ unsigned StackPtr;
+
+ /// FramePtr - X86 physical register used as frame ptr.
+ unsigned FramePtr;
+
+public:
+ X86RegisterInfo(X86TargetMachine &tm, const TargetInstrInfo &tii);
+
+ /// Code Generation virtual methods...
+ bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI) const;
+
+ bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI) const;
+
+ void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned SrcReg, int FrameIndex,
+ const TargetRegisterClass *RC) const;
+
+ void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC) const;
+
+ void copyRegToReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg, unsigned SrcReg,
+ const TargetRegisterClass *RC) const;
+
+ void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ unsigned DestReg, const MachineInstr *Orig) const;
+
+  /// foldMemoryOperand - If this target supports it, fold a load or store of
+  /// the specified stack slot into the specified machine instruction for the
+  /// specified operand. If folding is possible, this returns the new (folded)
+  /// instruction; otherwise it returns null. If it folds the instruction, the
+  /// MachineInstr the iterator references is likely to have been changed.
+ MachineInstr* foldMemoryOperand(MachineInstr* MI,
+ unsigned OpNum,
+ int FrameIndex) const;
+
+ /// getCalleeSavedRegs - Return a null-terminated list of all of the
+ /// callee-save registers on this target.
+ const unsigned *getCalleeSavedRegs(const MachineFunction* MF = 0) const;
+
+ /// getCalleeSavedRegClasses - Return a null-terminated list of the preferred
+ /// register classes to spill each callee-saved register with. The order and
+ /// length of this list match the getCalleeSavedRegs() list.
+ const TargetRegisterClass* const* getCalleeSavedRegClasses(
+ const MachineFunction *MF = 0) const;
+
+  /// getReservedRegs - Returns a bitset indexed by physical register number
+  /// indicating if a register is a special register that has particular uses
+  /// and should be considered unavailable at all times, e.g. SP, RA. This is
+  /// used by the register scavenger to determine which registers are free.
+ BitVector getReservedRegs(const MachineFunction &MF) const;
+
+ bool hasFP(const MachineFunction &MF) const;
+
+ void eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const;
+
+ void eliminateFrameIndex(MachineBasicBlock::iterator MI,
+ int SPAdj, RegScavenger *RS = NULL) const;
+
+ void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
+
+ void emitPrologue(MachineFunction &MF) const;
+ void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+ // Debug information queries.
+ unsigned getRARegister() const;
+ unsigned getFrameRegister(MachineFunction &MF) const;
+ void getInitialFrameState(std::vector<MachineMove> &Moves) const;
+
+ // Exception handling queries.
+ unsigned getEHExceptionRegister() const;
+ unsigned getEHHandlerRegister() const;
+};
+
+// getX86SubSuperRegister - X86 utility function. It returns the sub or super
+// register of a specific X86 register.
+// e.g. getX86SubSuperRegister(X86::EAX, MVT::i16) returns X86::AX.
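+// A few more illustrative mappings (a sketch based on the implementation in
+// X86RegisterInfo.cpp; the i8 forms assume the usual AH/AL-style pairing):
+//   getX86SubSuperRegister(X86::EAX, MVT::i8)        == X86::AL
+//   getX86SubSuperRegister(X86::EAX, MVT::i8, true)  == X86::AH
+//   getX86SubSuperRegister(X86::AX,  MVT::i64)       == X86::RAX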
+unsigned getX86SubSuperRegister(unsigned, MVT::ValueType, bool High=false);
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
new file mode 100644
index 0000000..a1e7bb9
--- /dev/null
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -0,0 +1,468 @@
+//===- X86RegisterInfo.td - Describe the X86 Register File ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 Register file, defining the registers themselves,
+// aliases between the registers, and the register classes built out of the
+// registers.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Register definitions...
+//
+let Namespace = "X86" in {
+
+ // In the register alias definitions below, we define which registers alias
+ // which others. We only specify which registers the small registers alias,
+ // because the register file generator is smart enough to figure out that
+  // AL aliases AX if we tell it that AX aliases AL (for example).
+
+  // FIXME: X86-64 has different Dwarf numbers.
+ // 8-bit registers
+ // Low registers
+ def AL : Register<"AL">, DwarfRegNum<0>;
+ def CL : Register<"CL">, DwarfRegNum<1>;
+ def DL : Register<"DL">, DwarfRegNum<2>;
+ def BL : Register<"BL">, DwarfRegNum<3>;
+
+ // X86-64 only
+ def SIL : Register<"SIL">, DwarfRegNum<4>;
+ def DIL : Register<"DIL">, DwarfRegNum<5>;
+ def BPL : Register<"BPL">, DwarfRegNum<6>;
+ def SPL : Register<"SPL">, DwarfRegNum<7>;
+ def R8B : Register<"R8B">, DwarfRegNum<8>;
+ def R9B : Register<"R9B">, DwarfRegNum<9>;
+ def R10B : Register<"R10B">, DwarfRegNum<10>;
+ def R11B : Register<"R11B">, DwarfRegNum<11>;
+ def R12B : Register<"R12B">, DwarfRegNum<12>;
+ def R13B : Register<"R13B">, DwarfRegNum<13>;
+ def R14B : Register<"R14B">, DwarfRegNum<14>;
+ def R15B : Register<"R15B">, DwarfRegNum<15>;
+
+ // High registers X86-32 only
+ def AH : Register<"AH">, DwarfRegNum<0>;
+ def CH : Register<"CH">, DwarfRegNum<1>;
+ def DH : Register<"DH">, DwarfRegNum<2>;
+ def BH : Register<"BH">, DwarfRegNum<3>;
+
+ // 16-bit registers
+ def AX : RegisterWithSubRegs<"AX", [AH,AL]>, DwarfRegNum<0>;
+ def CX : RegisterWithSubRegs<"CX", [CH,CL]>, DwarfRegNum<1>;
+ def DX : RegisterWithSubRegs<"DX", [DH,DL]>, DwarfRegNum<2>;
+ def BX : RegisterWithSubRegs<"BX", [BH,BL]>, DwarfRegNum<3>;
+ def SP : RegisterWithSubRegs<"SP", [SPL]>, DwarfRegNum<4>;
+ def BP : RegisterWithSubRegs<"BP", [BPL]>, DwarfRegNum<5>;
+ def SI : RegisterWithSubRegs<"SI", [SIL]>, DwarfRegNum<6>;
+ def DI : RegisterWithSubRegs<"DI", [DIL]>, DwarfRegNum<7>;
+ def IP : Register<"IP">, DwarfRegNum<8>;
+
+ // X86-64 only
+ def R8W : RegisterWithSubRegs<"R8W", [R8B]>, DwarfRegNum<8>;
+ def R9W : RegisterWithSubRegs<"R9W", [R9B]>, DwarfRegNum<9>;
+ def R10W : RegisterWithSubRegs<"R10W", [R10B]>, DwarfRegNum<10>;
+ def R11W : RegisterWithSubRegs<"R11W", [R11B]>, DwarfRegNum<11>;
+ def R12W : RegisterWithSubRegs<"R12W", [R12B]>, DwarfRegNum<12>;
+ def R13W : RegisterWithSubRegs<"R13W", [R13B]>, DwarfRegNum<13>;
+ def R14W : RegisterWithSubRegs<"R14W", [R14B]>, DwarfRegNum<14>;
+ def R15W : RegisterWithSubRegs<"R15W", [R15B]>, DwarfRegNum<15>;
+
+ // 32-bit registers
+ def EAX : RegisterWithSubRegs<"EAX", [AX]>, DwarfRegNum<0>;
+ def ECX : RegisterWithSubRegs<"ECX", [CX]>, DwarfRegNum<1>;
+ def EDX : RegisterWithSubRegs<"EDX", [DX]>, DwarfRegNum<2>;
+ def EBX : RegisterWithSubRegs<"EBX", [BX]>, DwarfRegNum<3>;
+ def ESP : RegisterWithSubRegs<"ESP", [SP]>, DwarfRegNum<4>;
+ def EBP : RegisterWithSubRegs<"EBP", [BP]>, DwarfRegNum<5>;
+ def ESI : RegisterWithSubRegs<"ESI", [SI]>, DwarfRegNum<6>;
+ def EDI : RegisterWithSubRegs<"EDI", [DI]>, DwarfRegNum<7>;
+ def EIP : RegisterWithSubRegs<"EIP", [IP]>, DwarfRegNum<8>;
+
+ // X86-64 only
+ def R8D : RegisterWithSubRegs<"R8D", [R8W]>, DwarfRegNum<8>;
+ def R9D : RegisterWithSubRegs<"R9D", [R9W]>, DwarfRegNum<9>;
+ def R10D : RegisterWithSubRegs<"R10D", [R10W]>, DwarfRegNum<10>;
+ def R11D : RegisterWithSubRegs<"R11D", [R11W]>, DwarfRegNum<11>;
+ def R12D : RegisterWithSubRegs<"R12D", [R12W]>, DwarfRegNum<12>;
+ def R13D : RegisterWithSubRegs<"R13D", [R13W]>, DwarfRegNum<13>;
+ def R14D : RegisterWithSubRegs<"R14D", [R14W]>, DwarfRegNum<14>;
+ def R15D : RegisterWithSubRegs<"R15D", [R15W]>, DwarfRegNum<15>;
+
+ // 64-bit registers, X86-64 only
+ def RAX : RegisterWithSubRegs<"RAX", [EAX]>, DwarfRegNum<0>;
+ def RDX : RegisterWithSubRegs<"RDX", [EDX]>, DwarfRegNum<1>;
+ def RCX : RegisterWithSubRegs<"RCX", [ECX]>, DwarfRegNum<2>;
+ def RBX : RegisterWithSubRegs<"RBX", [EBX]>, DwarfRegNum<3>;
+ def RSI : RegisterWithSubRegs<"RSI", [ESI]>, DwarfRegNum<4>;
+ def RDI : RegisterWithSubRegs<"RDI", [EDI]>, DwarfRegNum<5>;
+ def RBP : RegisterWithSubRegs<"RBP", [EBP]>, DwarfRegNum<6>;
+ def RSP : RegisterWithSubRegs<"RSP", [ESP]>, DwarfRegNum<7>;
+
+ def R8 : RegisterWithSubRegs<"R8", [R8D]>, DwarfRegNum<8>;
+ def R9 : RegisterWithSubRegs<"R9", [R9D]>, DwarfRegNum<9>;
+ def R10 : RegisterWithSubRegs<"R10", [R10D]>, DwarfRegNum<10>;
+ def R11 : RegisterWithSubRegs<"R11", [R11D]>, DwarfRegNum<11>;
+ def R12 : RegisterWithSubRegs<"R12", [R12D]>, DwarfRegNum<12>;
+ def R13 : RegisterWithSubRegs<"R13", [R13D]>, DwarfRegNum<13>;
+ def R14 : RegisterWithSubRegs<"R14", [R14D]>, DwarfRegNum<14>;
+ def R15 : RegisterWithSubRegs<"R15", [R15D]>, DwarfRegNum<15>;
+ def RIP : RegisterWithSubRegs<"RIP", [EIP]>, DwarfRegNum<16>;
+
+ // MMX Registers. These are actually aliased to ST0 .. ST7
+ def MM0 : Register<"MM0">, DwarfRegNum<29>;
+ def MM1 : Register<"MM1">, DwarfRegNum<30>;
+ def MM2 : Register<"MM2">, DwarfRegNum<31>;
+ def MM3 : Register<"MM3">, DwarfRegNum<32>;
+ def MM4 : Register<"MM4">, DwarfRegNum<33>;
+ def MM5 : Register<"MM5">, DwarfRegNum<34>;
+ def MM6 : Register<"MM6">, DwarfRegNum<35>;
+ def MM7 : Register<"MM7">, DwarfRegNum<36>;
+
+ // Pseudo Floating Point registers
+ def FP0 : Register<"FP0">, DwarfRegNum<-1>;
+ def FP1 : Register<"FP1">, DwarfRegNum<-1>;
+ def FP2 : Register<"FP2">, DwarfRegNum<-1>;
+ def FP3 : Register<"FP3">, DwarfRegNum<-1>;
+ def FP4 : Register<"FP4">, DwarfRegNum<-1>;
+ def FP5 : Register<"FP5">, DwarfRegNum<-1>;
+ def FP6 : Register<"FP6">, DwarfRegNum<-1>;
+
+ // XMM Registers, used by the various SSE instruction set extensions
+ def XMM0: Register<"XMM0">, DwarfRegNum<17>;
+ def XMM1: Register<"XMM1">, DwarfRegNum<18>;
+ def XMM2: Register<"XMM2">, DwarfRegNum<19>;
+ def XMM3: Register<"XMM3">, DwarfRegNum<20>;
+ def XMM4: Register<"XMM4">, DwarfRegNum<21>;
+ def XMM5: Register<"XMM5">, DwarfRegNum<22>;
+ def XMM6: Register<"XMM6">, DwarfRegNum<23>;
+ def XMM7: Register<"XMM7">, DwarfRegNum<24>;
+
+ // X86-64 only
+ def XMM8: Register<"XMM8">, DwarfRegNum<25>;
+ def XMM9: Register<"XMM9">, DwarfRegNum<26>;
+ def XMM10: Register<"XMM10">, DwarfRegNum<27>;
+ def XMM11: Register<"XMM11">, DwarfRegNum<28>;
+ def XMM12: Register<"XMM12">, DwarfRegNum<29>;
+ def XMM13: Register<"XMM13">, DwarfRegNum<30>;
+ def XMM14: Register<"XMM14">, DwarfRegNum<31>;
+ def XMM15: Register<"XMM15">, DwarfRegNum<32>;
+
+ // Floating point stack registers
+ def ST0 : Register<"ST(0)">, DwarfRegNum<11>;
+ def ST1 : Register<"ST(1)">, DwarfRegNum<12>;
+ def ST2 : Register<"ST(2)">, DwarfRegNum<13>;
+ def ST3 : Register<"ST(3)">, DwarfRegNum<14>;
+ def ST4 : Register<"ST(4)">, DwarfRegNum<15>;
+ def ST5 : Register<"ST(5)">, DwarfRegNum<16>;
+ def ST6 : Register<"ST(6)">, DwarfRegNum<17>;
+ def ST7 : Register<"ST(7)">, DwarfRegNum<18>;
+}
+
+//===----------------------------------------------------------------------===//
+// Register Class Definitions... now that we have all of the pieces, define the
+// top-level register classes. The order specified in the register list is
+// implicitly defined to be the register allocation order.
+//
+
+// List call-clobbered registers before callee-saved registers. RBX and RBP
+// (and R12, R13, R14, and R15 for X86-64) are callee-saved registers.
+// In 64-bit mode, there are 12 additional i8 registers: SIL, DIL, BPL, SPL,
+// and R8B ... R15B.
+// FIXME: Allow AH, CH, DH, BH in 64-bit mode for non-REX instructions.
+def GR8 : RegisterClass<"X86", [i8], 8,
+ [AL, CL, DL, BL, AH, CH, DH, BH, SIL, DIL, BPL, SPL,
+ R8B, R9B, R10B, R11B, R12B, R13B, R14B, R15B]> {
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ // Does the function dedicate RBP / EBP to being a frame ptr?
+ // If so, don't allocate SPL or BPL.
+ static const unsigned X86_GR8_AO_64_fp[] =
+ {X86::AL, X86::CL, X86::DL, X86::SIL, X86::DIL,
+ X86::R8B, X86::R9B, X86::R10B, X86::R11B,
+ X86::BL, X86::R14B, X86::R15B, X86::R12B, X86::R13B};
+ // If not, just don't allocate SPL.
+ static const unsigned X86_GR8_AO_64[] =
+ {X86::AL, X86::CL, X86::DL, X86::SIL, X86::DIL,
+ X86::R8B, X86::R9B, X86::R10B, X86::R11B,
+ X86::BL, X86::R14B, X86::R15B, X86::R12B, X86::R13B, X86::BPL};
+    // In 32-bit mode, none of the 8-bit registers aliases EBP or ESP.
+ static const unsigned X86_GR8_AO_32[] =
+ {X86::AL, X86::CL, X86::DL, X86::AH, X86::CH, X86::DH, X86::BL, X86::BH};
+
+ GR8Class::iterator
+ GR8Class::allocation_order_begin(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const MRegisterInfo *RI = TM.getRegisterInfo();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (!Subtarget.is64Bit())
+ return X86_GR8_AO_32;
+ else if (RI->hasFP(MF))
+ return X86_GR8_AO_64_fp;
+ else
+ return X86_GR8_AO_64;
+ }
+
+ GR8Class::iterator
+ GR8Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const MRegisterInfo *RI = TM.getRegisterInfo();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (!Subtarget.is64Bit())
+ return X86_GR8_AO_32 + (sizeof(X86_GR8_AO_32) / sizeof(unsigned));
+ else if (RI->hasFP(MF))
+ return X86_GR8_AO_64_fp + (sizeof(X86_GR8_AO_64_fp) / sizeof(unsigned));
+ else
+ return X86_GR8_AO_64 + (sizeof(X86_GR8_AO_64) / sizeof(unsigned));
+ }
+ }];
+}
+
+
+def GR16 : RegisterClass<"X86", [i16], 16,
+ [AX, CX, DX, SI, DI, BX, BP, SP,
+ R8W, R9W, R10W, R11W, R12W, R13W, R14W, R15W]> {
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ // Does the function dedicate RBP / EBP to being a frame ptr?
+ // If so, don't allocate SP or BP.
+ static const unsigned X86_GR16_AO_64_fp[] =
+ {X86::AX, X86::CX, X86::DX, X86::SI, X86::DI,
+ X86::R8W, X86::R9W, X86::R10W, X86::R11W,
+ X86::BX, X86::R14W, X86::R15W, X86::R12W, X86::R13W};
+ static const unsigned X86_GR16_AO_32_fp[] =
+ {X86::AX, X86::CX, X86::DX, X86::SI, X86::DI, X86::BX};
+    // If not, just don't allocate SP.
+ static const unsigned X86_GR16_AO_64[] =
+ {X86::AX, X86::CX, X86::DX, X86::SI, X86::DI,
+ X86::R8W, X86::R9W, X86::R10W, X86::R11W,
+ X86::BX, X86::R14W, X86::R15W, X86::R12W, X86::R13W, X86::BP};
+ static const unsigned X86_GR16_AO_32[] =
+ {X86::AX, X86::CX, X86::DX, X86::SI, X86::DI, X86::BX, X86::BP};
+
+ GR16Class::iterator
+ GR16Class::allocation_order_begin(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const MRegisterInfo *RI = TM.getRegisterInfo();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (Subtarget.is64Bit()) {
+ if (RI->hasFP(MF))
+ return X86_GR16_AO_64_fp;
+ else
+ return X86_GR16_AO_64;
+ } else {
+ if (RI->hasFP(MF))
+ return X86_GR16_AO_32_fp;
+ else
+ return X86_GR16_AO_32;
+ }
+ }
+
+ GR16Class::iterator
+ GR16Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const MRegisterInfo *RI = TM.getRegisterInfo();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (Subtarget.is64Bit()) {
+ if (RI->hasFP(MF))
+ return X86_GR16_AO_64_fp+(sizeof(X86_GR16_AO_64_fp)/sizeof(unsigned));
+ else
+ return X86_GR16_AO_64 + (sizeof(X86_GR16_AO_64) / sizeof(unsigned));
+ } else {
+ if (RI->hasFP(MF))
+ return X86_GR16_AO_32_fp+(sizeof(X86_GR16_AO_32_fp)/sizeof(unsigned));
+ else
+ return X86_GR16_AO_32 + (sizeof(X86_GR16_AO_32) / sizeof(unsigned));
+ }
+ }
+ }];
+}
+
+
+def GR32 : RegisterClass<"X86", [i32], 32,
+ [EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP,
+ R8D, R9D, R10D, R11D, R12D, R13D, R14D, R15D]> {
+ let MethodProtos = [{
+ iterator allocation_order_begin(const MachineFunction &MF) const;
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ // Does the function dedicate RBP / EBP to being a frame ptr?
+ // If so, don't allocate ESP or EBP.
+ static const unsigned X86_GR32_AO_64_fp[] =
+ {X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI,
+ X86::R8D, X86::R9D, X86::R10D, X86::R11D,
+ X86::EBX, X86::R14D, X86::R15D, X86::R12D, X86::R13D};
+ static const unsigned X86_GR32_AO_32_fp[] =
+ {X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI, X86::EBX};
+    // If not, just don't allocate ESP.
+ static const unsigned X86_GR32_AO_64[] =
+ {X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI,
+ X86::R8D, X86::R9D, X86::R10D, X86::R11D,
+ X86::EBX, X86::R14D, X86::R15D, X86::R12D, X86::R13D, X86::EBP};
+ static const unsigned X86_GR32_AO_32[] =
+ {X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI, X86::EBX, X86::EBP};
+
+ GR32Class::iterator
+ GR32Class::allocation_order_begin(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const MRegisterInfo *RI = TM.getRegisterInfo();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (Subtarget.is64Bit()) {
+ if (RI->hasFP(MF))
+ return X86_GR32_AO_64_fp;
+ else
+ return X86_GR32_AO_64;
+ } else {
+ if (RI->hasFP(MF))
+ return X86_GR32_AO_32_fp;
+ else
+ return X86_GR32_AO_32;
+ }
+ }
+
+ GR32Class::iterator
+ GR32Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const MRegisterInfo *RI = TM.getRegisterInfo();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (Subtarget.is64Bit()) {
+ if (RI->hasFP(MF))
+ return X86_GR32_AO_64_fp+(sizeof(X86_GR32_AO_64_fp)/sizeof(unsigned));
+ else
+ return X86_GR32_AO_64 + (sizeof(X86_GR32_AO_64) / sizeof(unsigned));
+ } else {
+ if (RI->hasFP(MF))
+ return X86_GR32_AO_32_fp+(sizeof(X86_GR32_AO_32_fp)/sizeof(unsigned));
+ else
+ return X86_GR32_AO_32 + (sizeof(X86_GR32_AO_32) / sizeof(unsigned));
+ }
+ }
+ }];
+}
+
+
+def GR64 : RegisterClass<"X86", [i64], 64,
+ [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+ RBX, R14, R15, R12, R13, RBP, RSP]> {
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ GR64Class::iterator
+ GR64Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const MRegisterInfo *RI = TM.getRegisterInfo();
+ if (RI->hasFP(MF)) // Does the function dedicate RBP to being a frame ptr?
+ return end()-2; // If so, don't allocate RSP or RBP
+ else
+ return end()-1; // If not, just don't allocate RSP
+ }
+ }];
+}
+
+
+// GR16, GR32 subclasses which contain registers that have 8-bit (GR8)
+// sub-registers. These should only be used in 32-bit mode.
+def GR16_ : RegisterClass<"X86", [i16], 16, [AX, CX, DX, BX]>;
+def GR32_ : RegisterClass<"X86", [i32], 32, [EAX, ECX, EDX, EBX]>;
+
+// Scalar SSE2 floating point registers.
+def FR32 : RegisterClass<"X86", [f32], 32,
+ [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11,
+ XMM12, XMM13, XMM14, XMM15]> {
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ FR32Class::iterator
+ FR32Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (!Subtarget.is64Bit())
+ return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode.
+ else
+ return end();
+ }
+ }];
+}
+
+def FR64 : RegisterClass<"X86", [f64], 64,
+ [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11,
+ XMM12, XMM13, XMM14, XMM15]> {
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ FR64Class::iterator
+ FR64Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (!Subtarget.is64Bit())
+ return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode.
+ else
+ return end();
+ }
+ }];
+}
+
+
+// FIXME: This sets up the floating point register files as though they are f64
+// values, though they really are f80 values. This will cause us to spill
+// values as 64-bit quantities instead of 80-bit quantities, which is much much
+// faster on common hardware. In reality, this should be controlled by a
+// command line option or something.
+
+def RFP32 : RegisterClass<"X86", [f32], 32, [FP0, FP1, FP2, FP3, FP4, FP5, FP6]>;
+def RFP64 : RegisterClass<"X86", [f64], 32, [FP0, FP1, FP2, FP3, FP4, FP5, FP6]>;
+
+// Floating point stack registers (these are not allocatable by the
+// register allocator - the floating point stackifier is responsible
+// for transforming FPn allocations to STn registers)
+def RST : RegisterClass<"X86", [f64], 32,
+ [ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7]> {
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ RSTClass::iterator
+ RSTClass::allocation_order_end(const MachineFunction &MF) const {
+ return begin();
+ }
+ }];
+}
+
+// Generic vector registers: VR64 and VR128.
+def VR64 : RegisterClass<"X86", [v8i8, v4i16, v2i32, v1i64], 64,
+ [MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7]>;
+def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],128,
+ [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+ XMM8, XMM9, XMM10, XMM11,
+ XMM12, XMM13, XMM14, XMM15]> {
+ let MethodProtos = [{
+ iterator allocation_order_end(const MachineFunction &MF) const;
+ }];
+ let MethodBodies = [{
+ VR128Class::iterator
+ VR128Class::allocation_order_end(const MachineFunction &MF) const {
+ const TargetMachine &TM = MF.getTarget();
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+ if (!Subtarget.is64Bit())
+ return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode.
+ else
+ return end();
+ }
+ }];
+}
diff --git a/lib/Target/X86/X86Relocations.h b/lib/Target/X86/X86Relocations.h
new file mode 100644
index 0000000..3dd2b24
--- /dev/null
+++ b/lib/Target/X86/X86Relocations.h
@@ -0,0 +1,34 @@
+//===- X86Relocations.h - X86 Code Relocations ------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86 target-specific relocation types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86RELOCATIONS_H
+#define X86RELOCATIONS_H
+
+#include "llvm/CodeGen/MachineRelocation.h"
+
+namespace llvm {
+ namespace X86 {
+ enum RelocationType {
+ // reloc_pcrel_word - PC relative relocation, add the relocated value to
+ // the value already in memory, after we adjust it for where the PC is.
+ reloc_pcrel_word = 0,
+
+ // reloc_absolute_word, reloc_absolute_dword - Absolute relocation, just
+ // add the relocated value to the value already in memory.
+ reloc_absolute_word = 1,
+ reloc_absolute_dword = 2
+ };
+ }
+}
+
+#endif
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
new file mode 100644
index 0000000..1a75e04
--- /dev/null
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -0,0 +1,293 @@
+//===-- X86Subtarget.cpp - X86 Subtarget Information ------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by Nate Begeman and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the X86 specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86Subtarget.h"
+#include "X86GenSubtarget.inc"
+#include "llvm/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+cl::opt<X86Subtarget::AsmWriterFlavorTy>
+AsmWriterFlavor("x86-asm-syntax", cl::init(X86Subtarget::Unset),
+ cl::desc("Choose style of code to emit from X86 backend:"),
+ cl::values(
+ clEnumValN(X86Subtarget::ATT, "att", " Emit AT&T-style assembly"),
+ clEnumValN(X86Subtarget::Intel, "intel", " Emit Intel-style assembly"),
+ clEnumValEnd));
+
+
+/// True if accessing the GV requires an extra load. For Windows, dllimported
+/// symbols are indirect, loading the value at address GV rather than the
+/// value of GV itself. This means that the GlobalAddress must be in the base
+/// or index register of the address, not the GV offset field.
+bool X86Subtarget::GVRequiresExtraLoad(const GlobalValue* GV,
+ const TargetMachine& TM,
+ bool isDirectCall) const
+{
+ // FIXME: PIC
+ if (TM.getRelocationModel() != Reloc::Static)
+ if (isTargetDarwin()) {
+ return (!isDirectCall &&
+ (GV->hasWeakLinkage() || GV->hasLinkOnceLinkage() ||
+ (GV->isDeclaration() && !GV->hasNotBeenReadFromBitcode())));
+ } else if (TM.getRelocationModel() == Reloc::PIC_ && isPICStyleGOT()) {
+ // Extra load is needed for all non-statics.
+ return (!isDirectCall &&
+ (GV->isDeclaration() || !GV->hasInternalLinkage()));
+ } else if (isTargetCygMing() || isTargetWindows()) {
+ return (GV->hasDLLImportLinkage());
+ }
+
+ return false;
+}
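+
+// An illustrative reading of the logic above (when the relocation model is
+// not static): on Darwin, a direct call to an external function needs no
+// extra load, but taking the address of that same external declaration does;
+// on Cygwin/MinGW and Windows, only dllimport'ed globals need the extra load.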
+
+/// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in the
+/// specified arguments. If we can't run cpuid on the host, return true.
+bool X86::GetCpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX,
+ unsigned *rECX, unsigned *rEDX) {
+#if defined(__x86_64__)
+  // gcc doesn't know that cpuid clobbers ebx/rbx. Preserve it manually.
+ asm ("movq\t%%rbx, %%rsi\n\t"
+ "cpuid\n\t"
+ "xchgq\t%%rbx, %%rsi\n\t"
+ : "=a" (*rEAX),
+ "=S" (*rEBX),
+ "=c" (*rECX),
+ "=d" (*rEDX)
+ : "a" (value));
+ return false;
+#elif defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)
+#if defined(__GNUC__)
+ asm ("movl\t%%ebx, %%esi\n\t"
+ "cpuid\n\t"
+ "xchgl\t%%ebx, %%esi\n\t"
+ : "=a" (*rEAX),
+ "=S" (*rEBX),
+ "=c" (*rECX),
+ "=d" (*rEDX)
+ : "a" (value));
+ return false;
+#elif defined(_MSC_VER)
+ __asm {
+ mov eax,value
+ cpuid
+ mov esi,rEAX
+ mov dword ptr [esi],eax
+ mov esi,rEBX
+ mov dword ptr [esi],ebx
+ mov esi,rECX
+ mov dword ptr [esi],ecx
+ mov esi,rEDX
+ mov dword ptr [esi],edx
+ }
+ return false;
+#endif
+#endif
+ return true;
+}
+
+void X86Subtarget::AutoDetectSubtargetFeatures() {
+ unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
+ union {
+ unsigned u[3];
+ char c[12];
+ } text;
+
+ if (X86::GetCpuIDAndInfo(0, &EAX, text.u+0, text.u+2, text.u+1))
+ return;
+
+ X86::GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX);
+
+ if ((EDX >> 23) & 0x1) X86SSELevel = MMX;
+ if ((EDX >> 25) & 0x1) X86SSELevel = SSE1;
+ if ((EDX >> 26) & 0x1) X86SSELevel = SSE2;
+ if (ECX & 0x1) X86SSELevel = SSE3;
+ if ((ECX >> 9) & 0x1) X86SSELevel = SSSE3;
+
+ if (memcmp(text.c, "GenuineIntel", 12) == 0 ||
+ memcmp(text.c, "AuthenticAMD", 12) == 0) {
+ X86::GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
+ HasX86_64 = (EDX >> 29) & 0x1;
+ }
+}
+
+static const char *GetCurrentX86CPU() {
+ unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
+ if (X86::GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX))
+ return "generic";
+ unsigned Family = (EAX >> 8) & 0xf; // Bits 8 - 11
+ unsigned Model = (EAX >> 4) & 0xf; // Bits 4 - 7
+ X86::GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
+ bool Em64T = (EDX >> 29) & 0x1;
+
+ union {
+ unsigned u[3];
+ char c[12];
+ } text;
+
+ X86::GetCpuIDAndInfo(0, &EAX, text.u+0, text.u+2, text.u+1);
+ if (memcmp(text.c, "GenuineIntel", 12) == 0) {
+ switch (Family) {
+ case 3:
+ return "i386";
+ case 4:
+ return "i486";
+ case 5:
+ switch (Model) {
+ case 4: return "pentium-mmx";
+ default: return "pentium";
+ }
+ case 6:
+ switch (Model) {
+ case 1: return "pentiumpro";
+ case 3:
+ case 5:
+ case 6: return "pentium2";
+ case 7:
+ case 8:
+ case 10:
+ case 11: return "pentium3";
+ case 9:
+ case 13: return "pentium-m";
+ case 14: return "yonah";
+ case 15: return "core2";
+ default: return "i686";
+ }
+ case 15: {
+ switch (Model) {
+ case 3:
+ case 4:
+ return (Em64T) ? "nocona" : "prescott";
+ default:
+ return (Em64T) ? "x86-64" : "pentium4";
+ }
+ }
+
+ default:
+ return "generic";
+ }
+ } else if (memcmp(text.c, "AuthenticAMD", 12) == 0) {
+ // FIXME: this poorly matches the generated SubtargetFeatureKV table. There
+ // appears to be no way to generate the wide variety of AMD-specific targets
+ // from the information returned from CPUID.
+ switch (Family) {
+ case 4:
+ return "i486";
+ case 5:
+ switch (Model) {
+ case 6:
+ case 7: return "k6";
+ case 8: return "k6-2";
+ case 9:
+ case 13: return "k6-3";
+ default: return "pentium";
+ }
+ case 6:
+ switch (Model) {
+ case 4: return "athlon-tbird";
+ case 6:
+ case 7:
+ case 8: return "athlon-mp";
+ case 10: return "athlon-xp";
+ default: return "athlon";
+ }
+ case 15:
+ switch (Model) {
+ case 1: return "opteron";
+ case 5: return "athlon-fx"; // also opteron
+ default: return "athlon64";
+ }
+ default:
+ return "generic";
+ }
+ } else {
+ return "generic";
+ }
+}
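+
+// For example (reading the tables above): an Intel part reporting family 6,
+// model 15 is mapped to "core2", and an AMD part reporting family 15, model 5
+// is mapped to "athlon-fx".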
+
+X86Subtarget::X86Subtarget(const Module &M, const std::string &FS, bool is64Bit)
+ : AsmFlavor(AsmWriterFlavor)
+ , PICStyle(PICStyle::None)
+ , X86SSELevel(NoMMXSSE)
+ , HasX86_64(false)
+ , stackAlignment(8)
+ // FIXME: this is a known good value for Yonah. How about others?
+ , MinRepStrSizeThreshold(128)
+ , Is64Bit(is64Bit)
+ , TargetType(isELF) { // Default to ELF unless otherwise specified.
+
+ // Determine default and user specified characteristics
+ if (!FS.empty()) {
+ // If feature string is not empty, parse features string.
+ std::string CPU = GetCurrentX86CPU();
+ ParseSubtargetFeatures(FS, CPU);
+
+ if (Is64Bit && !HasX86_64)
+ cerr << "Warning: Generation of 64-bit code for a 32-bit processor "
+ << "requested.\n";
+ if (Is64Bit && X86SSELevel < SSE2)
+ cerr << "Warning: 64-bit processors all have at least SSE2.\n";
+ } else {
+ // Otherwise, use CPUID to auto-detect feature set.
+ AutoDetectSubtargetFeatures();
+ }
+
+ // If requesting codegen for X86-64, make sure that 64-bit and SSE2 features
+ // are enabled. These are available on all x86-64 CPUs.
+ if (Is64Bit) {
+ HasX86_64 = true;
+ if (X86SSELevel < SSE2)
+ X86SSELevel = SSE2;
+ }
+
+  // Determine TargetType from the module's target triple, falling back to
+  // the host platform if the triple is empty.
+ const std::string& TT = M.getTargetTriple();
+ if (TT.length() > 5) {
+ if (TT.find("cygwin") != std::string::npos)
+ TargetType = isCygwin;
+ else if (TT.find("mingw") != std::string::npos)
+ TargetType = isMingw;
+ else if (TT.find("darwin") != std::string::npos)
+ TargetType = isDarwin;
+ else if (TT.find("win32") != std::string::npos)
+ TargetType = isWindows;
+ } else if (TT.empty()) {
+#if defined(__CYGWIN__)
+ TargetType = isCygwin;
+#elif defined(__MINGW32__)
+ TargetType = isMingw;
+#elif defined(__APPLE__)
+ TargetType = isDarwin;
+#elif defined(_WIN32)
+ TargetType = isWindows;
+#endif
+ }
+
+ // If the asm syntax hasn't been overridden on the command line, use whatever
+ // the target wants.
+ if (AsmFlavor == X86Subtarget::Unset) {
+ if (TargetType == isWindows) {
+ AsmFlavor = X86Subtarget::Intel;
+ } else {
+ AsmFlavor = X86Subtarget::ATT;
+ }
+ }
+
+ if (TargetType == isDarwin ||
+ TargetType == isCygwin ||
+ TargetType == isMingw ||
+ (TargetType == isELF && Is64Bit))
+ stackAlignment = 16;
+}
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
new file mode 100644
index 0000000..2cda970
--- /dev/null
+++ b/lib/Target/X86/X86Subtarget.h
@@ -0,0 +1,156 @@
+//=====---- X86Subtarget.h - Define Subtarget for the X86 -----*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by Nate Begeman and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the X86 specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86SUBTARGET_H
+#define X86SUBTARGET_H
+
+#include "llvm/Target/TargetSubtarget.h"
+
+#include <string>
+
+namespace llvm {
+class Module;
+class GlobalValue;
+class TargetMachine;
+
+namespace PICStyle {
+enum Style {
+ Stub, GOT, RIPRel, WinPIC, None
+};
+}
+
+class X86Subtarget : public TargetSubtarget {
+public:
+ enum AsmWriterFlavorTy {
+ // Note: This numbering has to match the GCC assembler dialects for inline
+ // asm alternatives to work right.
+ ATT = 0, Intel = 1, Unset
+ };
+protected:
+ enum X86SSEEnum {
+ NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3
+ };
+
+ enum X863DNowEnum {
+ NoThreeDNow, ThreeDNow, ThreeDNowA
+ };
+
+ /// AsmFlavor - Which x86 asm dialect to use.
+ AsmWriterFlavorTy AsmFlavor;
+
+ /// PICStyle - Which PIC style to use
+ PICStyle::Style PICStyle;
+
+ /// X86SSELevel - MMX, SSE1, SSE2, SSE3, SSSE3, or none supported.
+ X86SSEEnum X86SSELevel;
+
+ /// X863DNowLevel - 3DNow or 3DNow Athlon, or none supported.
+ X863DNowEnum X863DNowLevel;
+
+ /// HasX86_64 - True if the processor supports X86-64 instructions.
+ bool HasX86_64;
+
+  /// stackAlignment - The minimum alignment known to hold for the stack frame
+  /// on entry to the function, which must be maintained by every function.
+ unsigned stackAlignment;
+
+ /// Min. memset / memcpy size that is turned into rep/movs, rep/stos ops.
+ unsigned MinRepStrSizeThreshold;
+
+private:
+  /// Is64Bit - True if the processor supports 64-bit instructions and the
+  /// module pointer size is 64 bits.
+ bool Is64Bit;
+
+public:
+ enum {
+ isELF, isCygwin, isDarwin, isWindows, isMingw
+ } TargetType;
+
+  /// This constructor initializes the data members to match those
+  /// of the specified module.
+ ///
+ X86Subtarget(const Module &M, const std::string &FS, bool is64Bit);
+
+  /// getStackAlignment - Returns the minimum alignment known to hold for the
+  /// stack frame on entry to the function, which must be maintained by every
+  /// function for this subtarget.
+ unsigned getStackAlignment() const { return stackAlignment; }
+
+ /// getMinRepStrSizeThreshold - Returns the minimum memset / memcpy size
+  /// required to turn the operation into an X86 rep/movs or rep/stos
+ /// instruction. This is only used if the src / dst alignment is not DWORD
+ /// aligned.
+ unsigned getMinRepStrSizeThreshold() const { return MinRepStrSizeThreshold; }
+
+  /// ParseSubtargetFeatures - Parses the features string, setting the
+  /// specified subtarget options. The definition of this function is
+  /// auto-generated by tblgen.
+ void ParseSubtargetFeatures(const std::string &FS, const std::string &CPU);
+
+ /// AutoDetectSubtargetFeatures - Auto-detect CPU features using CPUID
+ /// instruction.
+ void AutoDetectSubtargetFeatures();
+
+ bool is64Bit() const { return Is64Bit; }
+
+ PICStyle::Style getPICStyle() const { return PICStyle; }
+ void setPICStyle(PICStyle::Style Style) { PICStyle = Style; }
+
+ bool hasMMX() const { return X86SSELevel >= MMX; }
+ bool hasSSE1() const { return X86SSELevel >= SSE1; }
+ bool hasSSE2() const { return X86SSELevel >= SSE2; }
+ bool hasSSE3() const { return X86SSELevel >= SSE3; }
+ bool hasSSSE3() const { return X86SSELevel >= SSSE3; }
+ bool has3DNow() const { return X863DNowLevel >= ThreeDNow; }
+ bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; }
+
+ unsigned getAsmFlavor() const {
+ return AsmFlavor != Unset ? unsigned(AsmFlavor) : 0;
+ }
+
+ bool isFlavorAtt() const { return AsmFlavor == ATT; }
+ bool isFlavorIntel() const { return AsmFlavor == Intel; }
+
+ bool isTargetDarwin() const { return TargetType == isDarwin; }
+ bool isTargetELF() const { return TargetType == isELF; }
+ bool isTargetWindows() const { return TargetType == isWindows; }
+ bool isTargetMingw() const { return TargetType == isMingw; }
+ bool isTargetCygMing() const { return (TargetType == isMingw ||
+ TargetType == isCygwin); }
+ bool isTargetCygwin() const { return TargetType == isCygwin; }
+
+ bool isPICStyleSet() const { return PICStyle != PICStyle::None; }
+ bool isPICStyleGOT() const { return PICStyle == PICStyle::GOT; }
+ bool isPICStyleStub() const { return PICStyle == PICStyle::Stub; }
+ bool isPICStyleRIPRel() const { return PICStyle == PICStyle::RIPRel; }
+  bool isPICStyleWinPIC() const { return PICStyle == PICStyle::WinPIC; }
+
+ /// True if accessing the GV requires an extra load. For Windows, dllimported
+  /// symbols are indirect, loading the value at address GV rather than the
+ /// value of GV itself. This means that the GlobalAddress must be in the base
+ /// or index register of the address, not the GV offset field.
+ bool GVRequiresExtraLoad(const GlobalValue* GV, const TargetMachine& TM,
+ bool isDirectCall) const;
+
+};
+
+namespace X86 {
+ /// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in
+ /// the specified arguments. If we can't run cpuid on the host, return true.
+ bool GetCpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX,
+ unsigned *rECX, unsigned *rEDX);
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/X86/X86TargetAsmInfo.cpp b/lib/Target/X86/X86TargetAsmInfo.cpp
new file mode 100644
index 0000000..4bb854e
--- /dev/null
+++ b/lib/Target/X86/X86TargetAsmInfo.cpp
@@ -0,0 +1,280 @@
+//===-- X86TargetAsmInfo.cpp - X86 asm properties ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by James M. Laskey and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the X86TargetAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetAsmInfo.h"
+#include "X86TargetMachine.h"
+#include "X86Subtarget.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/InlineAsm.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Module.h"
+#include "llvm/ADT/StringExtras.h"
+using namespace llvm;
+
+static const char* x86_asm_table[] = {"{si}", "S",
+ "{di}", "D",
+ "{ax}", "a",
+ "{cx}", "c",
+ "{memory}", "memory",
+ "{flags}", "",
+ "{dirflag}", "",
+ "{fpsr}", "",
+ "{cc}", "cc",
+ 0,0};
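+
+// (Each pair above maps an explicit register reference such as "{ax}" to the
+// corresponding single-letter inline-asm constraint "a"; entries with an
+// empty right-hand side, such as "{flags}", are presumably just dropped.)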
+
+X86TargetAsmInfo::X86TargetAsmInfo(const X86TargetMachine &TM) {
+ const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
+
+ // FIXME - Should be simplified.
+
+ AsmTransCBE = x86_asm_table;
+
+ switch (Subtarget->TargetType) {
+ case X86Subtarget::isDarwin:
+ AlignmentIsInBytes = false;
+ GlobalPrefix = "_";
+ if (!Subtarget->is64Bit())
+ Data64bitsDirective = 0; // we can't emit a 64-bit unit
+ ZeroDirective = "\t.space\t"; // ".space N" emits N zeros.
+ PrivateGlobalPrefix = "L"; // Marker for constant pool idxs
+ BSSSection = 0; // no BSS section.
+ ZeroFillDirective = "\t.zerofill\t"; // Uses .zerofill
+ ConstantPoolSection = "\t.const\n";
+ JumpTableDataSection = "\t.const\n";
+ CStringSection = "\t.cstring";
+ FourByteConstantSection = "\t.literal4\n";
+ EightByteConstantSection = "\t.literal8\n";
+ if (Subtarget->is64Bit())
+ SixteenByteConstantSection = "\t.literal16\n";
+ ReadOnlySection = "\t.const\n";
+ LCOMMDirective = "\t.lcomm\t";
+ COMMDirectiveTakesAlignment = false;
+ HasDotTypeDotSizeDirective = false;
+ if (TM.getRelocationModel() == Reloc::Static) {
+ StaticCtorsSection = ".constructor";
+ StaticDtorsSection = ".destructor";
+ } else {
+ StaticCtorsSection = ".mod_init_func";
+ StaticDtorsSection = ".mod_term_func";
+ }
+ InlineAsmStart = "# InlineAsm Start";
+ InlineAsmEnd = "# InlineAsm End";
+ SetDirective = "\t.set";
+ UsedDirective = "\t.no_dead_strip\t";
+ WeakRefDirective = "\t.weak_reference\t";
+ HiddenDirective = "\t.private_extern\t";
+
+ // In non-PIC modes, emit a special label before jump tables so that the
+ // linker can perform more accurate dead code stripping.
+ if (TM.getRelocationModel() != Reloc::PIC_) {
+ // Emit a local label that is preserved until the linker runs.
+ JumpTableSpecialLabelPrefix = "l";
+ }
+
+ SupportsDebugInformation = true;
+ NeedsSet = true;
+ DwarfAbbrevSection = ".section __DWARF,__debug_abbrev,regular,debug";
+ DwarfInfoSection = ".section __DWARF,__debug_info,regular,debug";
+ DwarfLineSection = ".section __DWARF,__debug_line,regular,debug";
+ DwarfFrameSection = ".section __DWARF,__debug_frame,regular,debug";
+ DwarfPubNamesSection = ".section __DWARF,__debug_pubnames,regular,debug";
+ DwarfPubTypesSection = ".section __DWARF,__debug_pubtypes,regular,debug";
+ DwarfStrSection = ".section __DWARF,__debug_str,regular,debug";
+ DwarfLocSection = ".section __DWARF,__debug_loc,regular,debug";
+ DwarfARangesSection = ".section __DWARF,__debug_aranges,regular,debug";
+ DwarfRangesSection = ".section __DWARF,__debug_ranges,regular,debug";
+ DwarfMacInfoSection = ".section __DWARF,__debug_macinfo,regular,debug";
+ break;
+
+ case X86Subtarget::isELF:
+ ReadOnlySection = "\t.section\t.rodata";
+ FourByteConstantSection = "\t.section\t.rodata.cst4,\"aM\",@progbits,4";
+ EightByteConstantSection = "\t.section\t.rodata.cst8,\"aM\",@progbits,8";
+ SixteenByteConstantSection = "\t.section\t.rodata.cst16,\"aM\",@progbits,16";
+ CStringSection = "\t.section\t.rodata.str1.1,\"aMS\",@progbits,1";
+ PrivateGlobalPrefix = ".L";
+ WeakRefDirective = "\t.weak\t";
+ SetDirective = "\t.set\t";
+ PCSymbol = ".";
+
+ // Set up DWARF directives
+ HasLEB128 = true; // Target asm supports leb128 directives (little-endian)
+ AbsoluteDebugSectionOffsets = true;
+ AbsoluteEHSectionOffsets = false;
+ SupportsDebugInformation = true;
+ DwarfAbbrevSection = "\t.section\t.debug_abbrev,\"\",@progbits";
+ DwarfInfoSection = "\t.section\t.debug_info,\"\",@progbits";
+ DwarfLineSection = "\t.section\t.debug_line,\"\",@progbits";
+ DwarfFrameSection = "\t.section\t.debug_frame,\"\",@progbits";
+ DwarfPubNamesSection ="\t.section\t.debug_pubnames,\"\",@progbits";
+ DwarfPubTypesSection ="\t.section\t.debug_pubtypes,\"\",@progbits";
+ DwarfStrSection = "\t.section\t.debug_str,\"\",@progbits";
+ DwarfLocSection = "\t.section\t.debug_loc,\"\",@progbits";
+ DwarfARangesSection = "\t.section\t.debug_aranges,\"\",@progbits";
+ DwarfRangesSection = "\t.section\t.debug_ranges,\"\",@progbits";
+ DwarfMacInfoSection = "\t.section\t.debug_macinfo,\"\",@progbits";
+
+ if (!Subtarget->is64Bit())
+ SupportsExceptionHandling = true;
+ DwarfEHFrameSection = "\t.section\t.eh_frame,\"aw\",@progbits";
+ DwarfExceptionSection = "\t.section\t.gcc_except_table,\"a\",@progbits";
+ break;
+
+ case X86Subtarget::isCygwin:
+ case X86Subtarget::isMingw:
+ GlobalPrefix = "_";
+ LCOMMDirective = "\t.lcomm\t";
+ COMMDirectiveTakesAlignment = false;
+ HasDotTypeDotSizeDirective = false;
+ StaticCtorsSection = "\t.section .ctors,\"aw\"";
+ StaticDtorsSection = "\t.section .dtors,\"aw\"";
+ HiddenDirective = NULL;
+ PrivateGlobalPrefix = "L"; // Prefix for private global symbols
+ WeakRefDirective = "\t.weak\t";
+ SetDirective = "\t.set\t";
+
+ // Set up DWARF directives
+ HasLEB128 = true; // Target asm supports leb128 directives (little-endian)
+ AbsoluteDebugSectionOffsets = true;
+ AbsoluteEHSectionOffsets = false;
+ SupportsDebugInformation = true;
+ DwarfSectionOffsetDirective = "\t.secrel32\t";
+ DwarfAbbrevSection = "\t.section\t.debug_abbrev,\"dr\"";
+ DwarfInfoSection = "\t.section\t.debug_info,\"dr\"";
+ DwarfLineSection = "\t.section\t.debug_line,\"dr\"";
+ DwarfFrameSection = "\t.section\t.debug_frame,\"dr\"";
+ DwarfPubNamesSection ="\t.section\t.debug_pubnames,\"dr\"";
+ DwarfPubTypesSection ="\t.section\t.debug_pubtypes,\"dr\"";
+ DwarfStrSection = "\t.section\t.debug_str,\"dr\"";
+ DwarfLocSection = "\t.section\t.debug_loc,\"dr\"";
+ DwarfARangesSection = "\t.section\t.debug_aranges,\"dr\"";
+ DwarfRangesSection = "\t.section\t.debug_ranges,\"dr\"";
+ DwarfMacInfoSection = "\t.section\t.debug_macinfo,\"dr\"";
+ break;
+
+ case X86Subtarget::isWindows:
+ GlobalPrefix = "_";
+ HasDotTypeDotSizeDirective = false;
+ break;
+
+ default: break;
+ }
+
+ if (Subtarget->isFlavorIntel()) {
+ GlobalPrefix = "_";
+ CommentString = ";";
+
+ PrivateGlobalPrefix = "$";
+ AlignDirective = "\talign\t";
+ ZeroDirective = "\tdb\t";
+ ZeroDirectiveSuffix = " dup(0)";
+ AsciiDirective = "\tdb\t";
+ AscizDirective = 0;
+ Data8bitsDirective = "\tdb\t";
+ Data16bitsDirective = "\tdw\t";
+ Data32bitsDirective = "\tdd\t";
+ Data64bitsDirective = "\tdq\t";
+ HasDotTypeDotSizeDirective = false;
+
+ TextSection = "_text";
+ DataSection = "_data";
+ JumpTableDataSection = NULL;
+ SwitchToSectionDirective = "";
+ TextSectionStartSuffix = "\tsegment 'CODE'";
+ DataSectionStartSuffix = "\tsegment 'DATA'";
+ SectionEndDirectiveSuffix = "\tends\n";
+ }
+
+ AssemblerDialect = Subtarget->getAsmFlavor();
+}
+
+bool X86TargetAsmInfo::LowerToBSwap(CallInst *CI) const {
+  // FIXME: this should verify that we are targeting a 486 or better. If not,
+ // we will turn this bswap into something that will be lowered to logical ops
+ // instead of emitting the bswap asm. For now, we don't support 486 or lower
+ // so don't worry about this.
+
+ // Verify this is a simple bswap.
+ if (CI->getNumOperands() != 2 ||
+ CI->getType() != CI->getOperand(1)->getType() ||
+ !CI->getType()->isInteger())
+ return false;
+
+ const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+ if (!Ty || Ty->getBitWidth() % 16 != 0)
+ return false;
+
+ // Okay, we can do this xform, do so now.
+ const Type *Tys[] = { Ty, Ty };
+ Module *M = CI->getParent()->getParent()->getParent();
+ Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 2);
+
+ Value *Op = CI->getOperand(1);
+ Op = new CallInst(Int, Op, CI->getName(), CI);
+
+ CI->replaceAllUsesWith(Op);
+ CI->eraseFromParent();
+ return true;
+}
+
+
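+// Illustrative inline-asm forms recognized below (a sketch; the operand types
+// and constraints must match the checks in the code):
+//   asm("bswap $0" : "=r"(x) : "0"(x))                 --> llvm.bswap.i32
+//   asm("bswap %eax\n\tbswap %edx\n\txchgl %eax, %edx"
+//       : "=A"(v) : "0"(v))                            --> llvm.bswap.i64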
+bool X86TargetAsmInfo::ExpandInlineAsm(CallInst *CI) const {
+ InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
+ std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints();
+
+ std::string AsmStr = IA->getAsmString();
+
+ // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
+ std::vector<std::string> AsmPieces;
+ SplitString(AsmStr, AsmPieces, "\n"); // ; as separator?
+
+ switch (AsmPieces.size()) {
+ default: return false;
+ case 1:
+ AsmStr = AsmPieces[0];
+ AsmPieces.clear();
+ SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace.
+
+ // bswap $0
+ if (AsmPieces.size() == 2 &&
+ AsmPieces[0] == "bswap" && AsmPieces[1] == "$0") {
+ // No need to check constraints, nothing other than the equivalent of
+ // "=r,0" would be valid here.
+ return LowerToBSwap(CI);
+ }
+ break;
+ case 3:
+ if (CI->getType() == Type::Int64Ty && Constraints.size() >= 2 &&
+ Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
+ Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
+ // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
+ std::vector<std::string> Words;
+ SplitString(AsmPieces[0], Words, " \t");
+ if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") {
+ Words.clear();
+ SplitString(AsmPieces[1], Words, " \t");
+ if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") {
+ Words.clear();
+ SplitString(AsmPieces[2], Words, " \t,");
+ if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" &&
+ Words[2] == "%edx") {
+ return LowerToBSwap(CI);
+ }
+ }
+ }
+ }
+ break;
+ }
+ return false;
+}
diff --git a/lib/Target/X86/X86TargetAsmInfo.h b/lib/Target/X86/X86TargetAsmInfo.h
new file mode 100644
index 0000000..cc509d1
--- /dev/null
+++ b/lib/Target/X86/X86TargetAsmInfo.h
@@ -0,0 +1,33 @@
+//=====-- X86TargetAsmInfo.h - X86 asm properties -------------*- C++ -*--====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by James M. Laskey and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the X86TargetAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86TARGETASMINFO_H
+#define X86TARGETASMINFO_H
+
+#include "llvm/Target/TargetAsmInfo.h"
+
+namespace llvm {
+
+ // Forward declaration.
+ class X86TargetMachine;
+
+ struct X86TargetAsmInfo : public TargetAsmInfo {
+ X86TargetAsmInfo(const X86TargetMachine &TM);
+
+ virtual bool ExpandInlineAsm(CallInst *CI) const;
+ private:
+ bool LowerToBSwap(CallInst *CI) const;
+ };
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
new file mode 100644
index 0000000..4d4bd3f
--- /dev/null
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -0,0 +1,190 @@
+//===-- X86TargetMachine.cpp - Define TargetMachine for the X86 -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetAsmInfo.h"
+#include "X86TargetMachine.h"
+#include "X86.h"
+#include "llvm/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetMachineRegistry.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+/// X86TargetMachineModule - Note that this is used on hosts that cannot link
+/// in a library unless there are references into the library. In particular,
+/// it seems that it is not possible to get things to work on Win32 without
+/// this. Though it is unused, do not remove it.
+extern "C" int X86TargetMachineModule;
+int X86TargetMachineModule = 0;
+
+namespace {
+ // Register the target.
+ RegisterTarget<X86_32TargetMachine>
+ X("x86", " 32-bit X86: Pentium-Pro and above");
+ RegisterTarget<X86_64TargetMachine>
+ Y("x86-64", " 64-bit X86: EM64T and AMD64");
+}
+
+const TargetAsmInfo *X86TargetMachine::createTargetAsmInfo() const {
+ return new X86TargetAsmInfo(*this);
+}
+
+unsigned X86_32TargetMachine::getJITMatchQuality() {
+#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)
+ return 10;
+#endif
+ return 0;
+}
+
+unsigned X86_64TargetMachine::getJITMatchQuality() {
+#if defined(__x86_64__)
+ return 10;
+#endif
+ return 0;
+}
+
+unsigned X86_32TargetMachine::getModuleMatchQuality(const Module &M) {
+ // We strongly match "i[3-9]86-*".
+ std::string TT = M.getTargetTriple();
+ if (TT.size() >= 5 && TT[0] == 'i' && TT[2] == '8' && TT[3] == '6' &&
+ TT[4] == '-' && TT[1] - '3' < 6)
+ return 20;
+ // If the target triple is something non-X86, we don't match.
+ if (!TT.empty()) return 0;
+
+ if (M.getEndianness() == Module::LittleEndian &&
+ M.getPointerSize() == Module::Pointer32)
+ return 10; // Weak match
+ else if (M.getEndianness() != Module::AnyEndianness ||
+ M.getPointerSize() != Module::AnyPointerSize)
+ return 0; // Match for some other target
+
+ return getJITMatchQuality()/2;
+}
+
+unsigned X86_64TargetMachine::getModuleMatchQuality(const Module &M) {
+ // We strongly match "x86_64-*".
+ std::string TT = M.getTargetTriple();
+ if (TT.size() >= 7 && TT[0] == 'x' && TT[1] == '8' && TT[2] == '6' &&
+ TT[3] == '_' && TT[4] == '6' && TT[5] == '4' && TT[6] == '-')
+ return 20;
+
+ // We strongly match "amd64-*".
+ if (TT.size() >= 6 && TT[0] == 'a' && TT[1] == 'm' && TT[2] == 'd' &&
+ TT[3] == '6' && TT[4] == '4' && TT[5] == '-')
+ return 20;
+
+ // If the target triple is something non-X86-64, we don't match.
+ if (!TT.empty()) return 0;
+
+ if (M.getEndianness() == Module::LittleEndian &&
+ M.getPointerSize() == Module::Pointer64)
+ return 10; // Weak match
+ else if (M.getEndianness() != Module::AnyEndianness ||
+ M.getPointerSize() != Module::AnyPointerSize)
+ return 0; // Match for some other target
+
+ return getJITMatchQuality()/2;
+}
+
+X86_32TargetMachine::X86_32TargetMachine(const Module &M, const std::string &FS)
+ : X86TargetMachine(M, FS, false) {
+}
+
+
+X86_64TargetMachine::X86_64TargetMachine(const Module &M, const std::string &FS)
+ : X86TargetMachine(M, FS, true) {
+}
+
+/// X86TargetMachine ctor - Create an X86 architecture model; is64Bit selects
+/// between the ILP32 and LP64 variants.
+///
+X86TargetMachine::X86TargetMachine(const Module &M, const std::string &FS,
+ bool is64Bit)
+ : Subtarget(M, FS, is64Bit),
+ DataLayout(Subtarget.is64Bit() ?
+ std::string("e-p:64:64-f64:32:64-i64:32:64") :
+ std::string("e-p:32:32-f64:32:64-i64:32:64")),
+ FrameInfo(TargetFrameInfo::StackGrowsDown,
+ Subtarget.getStackAlignment(), Subtarget.is64Bit() ? -8 : -4),
+ InstrInfo(*this), JITInfo(*this), TLInfo(*this) {
+ if (getRelocationModel() == Reloc::Default)
+ if (Subtarget.isTargetDarwin() || Subtarget.isTargetCygMing())
+ setRelocationModel(Reloc::DynamicNoPIC);
+ else
+ setRelocationModel(Reloc::Static);
+ if (Subtarget.is64Bit()) {
+ // No DynamicNoPIC support under X86-64.
+ if (getRelocationModel() == Reloc::DynamicNoPIC)
+ setRelocationModel(Reloc::PIC_);
+ // Default X86-64 code model is small.
+ if (getCodeModel() == CodeModel::Default)
+ setCodeModel(CodeModel::Small);
+ }
+
+ if (Subtarget.isTargetCygMing())
+ Subtarget.setPICStyle(PICStyle::WinPIC);
+ else if (Subtarget.isTargetDarwin())
+ if (Subtarget.is64Bit())
+ Subtarget.setPICStyle(PICStyle::RIPRel);
+ else
+ Subtarget.setPICStyle(PICStyle::Stub);
+ else if (Subtarget.isTargetELF())
+ if (Subtarget.is64Bit())
+ Subtarget.setPICStyle(PICStyle::RIPRel);
+ else
+ Subtarget.setPICStyle(PICStyle::GOT);
+}
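+
+// For example, with the defaults above a 32-bit Darwin target ends up with
+// DynamicNoPIC relocation and the Stub PIC style, while an x86-64 ELF target
+// ends up with Static relocation, the small code model, and RIPRel.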
+
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+bool X86TargetMachine::addInstSelector(FunctionPassManager &PM, bool Fast) {
+ // Install an instruction selector.
+ PM.add(createX86ISelDag(*this, Fast));
+ return false;
+}
+
+bool X86TargetMachine::addPostRegAlloc(FunctionPassManager &PM, bool Fast) {
+ PM.add(createX86FloatingPointStackifierPass());
+ return true; // -print-machineinstr should print after this.
+}
+
+bool X86TargetMachine::addAssemblyEmitter(FunctionPassManager &PM, bool Fast,
+ std::ostream &Out) {
+ PM.add(createX86CodePrinterPass(Out, *this));
+ return false;
+}
+
+bool X86TargetMachine::addCodeEmitter(FunctionPassManager &PM, bool Fast,
+ MachineCodeEmitter &MCE) {
+ // FIXME: Move this to TargetJITInfo!
+ setRelocationModel(Reloc::Static);
+ Subtarget.setPICStyle(PICStyle::None);
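+  // (The JIT resolves and patches absolute addresses while emitting, so
+  // PIC-style indirection is unnecessary here.)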
+
+  // The JIT cannot guarantee that globals land in the low 4GB of the address
+  // space, so switch to the large code model, which materializes full 64-bit
+  // addresses.
+ if (Subtarget.is64Bit())
+ setCodeModel(CodeModel::Large);
+
+ PM.add(createX86CodeEmitterPass(*this, MCE));
+ return false;
+}
+
+bool X86TargetMachine::addSimpleCodeEmitter(FunctionPassManager &PM, bool Fast,
+ MachineCodeEmitter &MCE) {
+ PM.add(createX86CodeEmitterPass(*this, MCE));
+ return false;
+}
diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
new file mode 100644
index 0000000..0a4f1b5
--- /dev/null
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -0,0 +1,95 @@
+//===-- X86TargetMachine.h - Define TargetMachine for the X86 ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the X86 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86TARGETMACHINE_H
+#define X86TARGETMACHINE_H
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "X86.h"
+#include "X86ELFWriterInfo.h"
+#include "X86InstrInfo.h"
+#include "X86JITInfo.h"
+#include "X86Subtarget.h"
+#include "X86ISelLowering.h"
+
+namespace llvm {
+
+class X86TargetMachine : public LLVMTargetMachine {
+ X86Subtarget Subtarget;
+ const TargetData DataLayout; // Calculates type size & alignment
+ TargetFrameInfo FrameInfo;
+ X86InstrInfo InstrInfo;
+ X86JITInfo JITInfo;
+ X86TargetLowering TLInfo;
+ X86ELFWriterInfo ELFWriterInfo;
+
+protected:
+ virtual const TargetAsmInfo *createTargetAsmInfo() const;
+
+public:
+ X86TargetMachine(const Module &M, const std::string &FS, bool is64Bit);
+
+ virtual const X86InstrInfo *getInstrInfo() const { return &InstrInfo; }
+ virtual const TargetFrameInfo *getFrameInfo() const { return &FrameInfo; }
+ virtual TargetJITInfo *getJITInfo() { return &JITInfo; }
+  virtual const TargetSubtarget *getSubtargetImpl() const { return &Subtarget; }
+ virtual X86TargetLowering *getTargetLowering() const {
+ return const_cast<X86TargetLowering*>(&TLInfo);
+ }
+ virtual const MRegisterInfo *getRegisterInfo() const {
+ return &InstrInfo.getRegisterInfo();
+ }
+ virtual const TargetData *getTargetData() const { return &DataLayout; }
+ virtual const X86ELFWriterInfo *getELFWriterInfo() const {
+ return Subtarget.isTargetELF() ? &ELFWriterInfo : 0;
+ }
+
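+  // Queried by the target registry when selecting the best backend for a
+  // module or for the JIT host.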
+ static unsigned getModuleMatchQuality(const Module &M);
+ static unsigned getJITMatchQuality();
+
+ // Set up the pass pipeline.
+ virtual bool addInstSelector(FunctionPassManager &PM, bool Fast);
+ virtual bool addPostRegAlloc(FunctionPassManager &PM, bool Fast);
+ virtual bool addAssemblyEmitter(FunctionPassManager &PM, bool Fast,
+ std::ostream &Out);
+ virtual bool addCodeEmitter(FunctionPassManager &PM, bool Fast,
+ MachineCodeEmitter &MCE);
+ virtual bool addSimpleCodeEmitter(FunctionPassManager &PM, bool Fast,
+ MachineCodeEmitter &MCE);
+};
+
+/// X86_32TargetMachine - X86 32-bit target machine.
+///
+class X86_32TargetMachine : public X86TargetMachine {
+public:
+ X86_32TargetMachine(const Module &M, const std::string &FS);
+
+ static unsigned getJITMatchQuality();
+ static unsigned getModuleMatchQuality(const Module &M);
+};
+
+/// X86_64TargetMachine - X86 64-bit target machine.
+///
+class X86_64TargetMachine : public X86TargetMachine {
+public:
+ X86_64TargetMachine(const Module &M, const std::string &FS);
+
+ static unsigned getJITMatchQuality();
+ static unsigned getModuleMatchQuality(const Module &M);
+};
+
+} // End llvm namespace
+
+#endif