It's not necessary to do rounding for alloca operations when the requested
alignment is equal to the stack alignment.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@40004 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Target/X86/Makefile b/lib/Target/X86/Makefile
new file mode 100644
index 0000000..5416cdb
--- /dev/null
+++ b/lib/Target/X86/Makefile
@@ -0,0 +1,20 @@
+##===- lib/Target/X86/Makefile -----------------------------*- Makefile -*-===##
+# 
+#                     The LLVM Compiler Infrastructure
+#
+# This file was developed by the LLVM research group and is distributed under
+# the University of Illinois Open Source License. See LICENSE.TXT for details.
+# 
+##===----------------------------------------------------------------------===##
+LEVEL = ../../..
+LIBRARYNAME = LLVMX86
+TARGET = X86
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = X86GenRegisterInfo.h.inc X86GenRegisterNames.inc \
+                X86GenRegisterInfo.inc X86GenInstrNames.inc \
+                X86GenInstrInfo.inc X86GenAsmWriter.inc \
+                X86GenAsmWriter1.inc X86GenDAGISel.inc  \
+                X86GenCallingConv.inc X86GenSubtarget.inc
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/X86/README-FPStack.txt b/lib/Target/X86/README-FPStack.txt
new file mode 100644
index 0000000..d94fa02
--- /dev/null
+++ b/lib/Target/X86/README-FPStack.txt
@@ -0,0 +1,99 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the X86 backend: FP stack related stuff
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+
+Some targets (e.g. Athlons) prefer ffreep to fstp ST(0):
+http://gcc.gnu.org/ml/gcc-patches/2004-04/msg00659.html
+
+//===---------------------------------------------------------------------===//
+
+On darwin/x86, we should codegen:
+
+        ret double 0.000000e+00
+
+as fld0/ret, not as:
+
+        movl $0, 4(%esp)
+        movl $0, (%esp)
+        fldl (%esp)
+	...
+        ret
+
+//===---------------------------------------------------------------------===//
+
+This should use fiadd on chips where it is profitable:
+double foo(double P, int *I) { return P+*I; }
+
+We have fiadd patterns now, but the following two have the same cost and
+complexity. We need a way to specify that the latter is more profitable.
+
+def FpADD32m  : FpI<(ops RFP:$dst, RFP:$src1, f32mem:$src2), OneArgFPRW,
+                    [(set RFP:$dst, (fadd RFP:$src1,
+                                     (extloadf64f32 addr:$src2)))]>;
+                // ST(0) = ST(0) + [mem32]
+
+def FpIADD32m : FpI<(ops RFP:$dst, RFP:$src1, i32mem:$src2), OneArgFPRW,
+                    [(set RFP:$dst, (fadd RFP:$src1,
+                                     (X86fild addr:$src2, i32)))]>;
+                // ST(0) = ST(0) + [mem32int]
+
+//===---------------------------------------------------------------------===//
+
+The FP stackifier needs to be global.  Also, it should handle simple permutations
+to reduce the number of shuffle instructions, e.g. turning:
+
+fld P	->		fld Q
+fld Q			fld P
+fxch
+
+or:
+
+fxch	->		fucomi
+fucomi			jl X
+jg X
+
+Ideas:
+http://gcc.gnu.org/ml/gcc-patches/2004-11/msg02410.html
+
+
+//===---------------------------------------------------------------------===//
+
+Add a target specific hook to DAG combiner to handle SINT_TO_FP and
+FP_TO_SINT when the source operand is already in memory.
+
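+A minimal C illustration of the case (function names are hypothetical): the
+integer operand already lives in memory, so the conversion could fold the
+load (e.g. into fild / cvtsi2sd) instead of going through an integer
+register first.
+
+double int_to_double(int *p) { return (double)*p; }    // SINT_TO_FP
+int    double_to_int(double *p) { return (int)*p; }    // FP_TO_SINT
+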
+//===---------------------------------------------------------------------===//
+
+Open code rint, floor, ceil, trunc:
+http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02006.html
+http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02011.html
+
+Opencode the sincos[f] libcall.
+
+//===---------------------------------------------------------------------===//
+
+None of the FPStack instructions are handled in
+X86RegisterInfo::foldMemoryOperand, which prevents the spiller from
+folding spill code into the instructions.
+
+//===---------------------------------------------------------------------===//
+
+Currently the x86 codegen isn't very good at mixing SSE and FPStack
+code:
+
+unsigned int foo(double x) { return x; }
+
+foo:
+	subl $20, %esp
+	movsd 24(%esp), %xmm0
+	movsd %xmm0, 8(%esp)
+	fldl 8(%esp)
+	fisttpll (%esp)
+	movl (%esp), %eax
+	addl $20, %esp
+	ret
+
+This will be solved when we go to a dynamic programming based isel.
+
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/X86/README-MMX.txt b/lib/Target/X86/README-MMX.txt
new file mode 100644
index 0000000..57c7c3f
--- /dev/null
+++ b/lib/Target/X86/README-MMX.txt
@@ -0,0 +1,69 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the X86 backend: MMX-specific stuff.
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+
+This:
+
+#include <mmintrin.h>
+
+__v2si qux(int A) {
+  return (__v2si){ 0, A };
+}
+
+is compiled into:
+
+_qux:
+        subl $28, %esp
+        movl 32(%esp), %eax
+        movd %eax, %mm0
+        movq %mm0, (%esp)
+        movl (%esp), %eax
+        movl %eax, 20(%esp)
+        movq %mm0, 8(%esp)
+        movl 12(%esp), %eax
+        movl %eax, 16(%esp)
+        movq 16(%esp), %mm0
+        addl $28, %esp
+        ret
+
+Yuck!
+
+GCC gives us:
+
+_qux:
+        subl    $12, %esp
+        movl    16(%esp), %eax
+        movl    20(%esp), %edx
+        movl    $0, (%eax)
+        movl    %edx, 4(%eax)
+        addl    $12, %esp
+        ret     $4
+
+//===---------------------------------------------------------------------===//
+
+int main() {
+  __m64 A[1] = { _mm_cvtsi32_si64(1)  };
+  __m64 B[1] = { _mm_cvtsi32_si64(10) };
+  __m64 sum = _mm_cvtsi32_si64(0);
+
+  sum = __builtin_ia32_paddq(__builtin_ia32_paddq(A[0], B[0]), sum);
+
+  printf("Sum = %d\n", _mm_cvtsi64_si32(sum));
+  return 0;
+}
+
+Generates:
+
+        movl $11, %eax
+###     movd %eax, %mm0
+###     movq %mm0, 8(%esp)
+###     movl 8(%esp), %eax
+        movl %eax, 4(%esp)
+        movl $_str, (%esp)
+        call L_printf$stub
+        xorl %eax, %eax
+        addl $28, %esp
+
+The instructions marked ### are unnecessary.
diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt
new file mode 100644
index 0000000..f4b54c4
--- /dev/null
+++ b/lib/Target/X86/README-SSE.txt
@@ -0,0 +1,629 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the X86 backend: SSE-specific stuff.
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+
+Expand libm rounding functions inline:  Significant speedups possible.
+http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
+
+//===---------------------------------------------------------------------===//
+
+When compiled with unsafe math enabled, "main" should enable SSE DAZ mode and
+other fast SSE modes.
+
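+A sketch of what that could look like at the C level, assuming the standard
+MXCSR bit layout (bit 15 = flush-to-zero, bit 6 = denormals-are-zero); the
+function name is made up for illustration:
+
+#include <xmmintrin.h>
+
+static void enable_fast_sse_modes(void) {
+  /* Set FTZ (0x8000) and DAZ (0x0040) in MXCSR. */
+  _mm_setcsr(_mm_getcsr() | 0x8040);
+}
+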
+//===---------------------------------------------------------------------===//
+
+Think about doing i64 math in SSE regs.
+
+//===---------------------------------------------------------------------===//
+
+This testcase should have no SSE instructions in it, and only one load from
+a constant pool:
+
+double %test3(bool %B) {
+        %C = select bool %B, double 123.412, double 523.01123123
+        ret double %C
+}
+
+Currently, the select is being lowered, which prevents the dag combiner from
+turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
+
+The pattern isel got this one right.
+
+//===---------------------------------------------------------------------===//
+
+SSE doesn't have [mem] op= reg instructions.  If we have an SSE instruction
+like this:
+
+  X += y
+
+and the register allocator decides to spill X, it is cheaper to emit this as:
+
+Y += [xslot]
+store Y -> [xslot]
+
+than as:
+
+tmp = [xslot]
+tmp += y
+store tmp -> [xslot]
+
+...and this uses one fewer register (so this should be done at load folding
+time, not at spiller time).  *Note* however that this can only be done
+if Y is dead.  Here's a testcase:
+
+%.str_3 = external global [15 x sbyte]          ; <[15 x sbyte]*> [#uses=0]
+implementation   ; Functions:
+declare void %printf(int, ...)
+void %main() {
+build_tree.exit:
+        br label %no_exit.i7
+no_exit.i7:             ; preds = %no_exit.i7, %build_tree.exit
+        %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.34.i18, %no_exit.i7 ]      ; <double> [#uses=1]
+        %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.28.i16, %no_exit.i7 ]     ; <double> [#uses=1]
+        %tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00
+        %tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00
+        br bool false, label %Compute_Tree.exit23, label %no_exit.i7
+Compute_Tree.exit23:            ; preds = %no_exit.i7
+        tail call void (int, ...)* %printf( int 0 )
+        store double %tmp.34.i18, double* null
+        ret void
+}
+
+We currently emit:
+
+.BBmain_1:
+        xorpd %XMM1, %XMM1
+        addsd %XMM0, %XMM1
+***     movsd %XMM2, QWORD PTR [%ESP + 8]
+***     addsd %XMM2, %XMM1
+***     movsd QWORD PTR [%ESP + 8], %XMM2
+        jmp .BBmain_1   # no_exit.i7
+
+This is a bugpoint-reduced testcase, which is why it doesn't make much sense
+(e.g. it's an infinite loop). :)
+
+//===---------------------------------------------------------------------===//
+
+SSE should implement 'select_cc' using 'emulated conditional moves' that use
+pcmp/pand/pandn/por to do a selection instead of a conditional branch:
+
+double %X(double %Y, double %Z, double %A, double %B) {
+        %C = setlt double %A, %B
+        %z = add double %Z, 0.0    ;; select operand is not a load
+        %D = select bool %C, double %Y, double %z
+        ret double %D
+}
+
+We currently emit:
+
+_X:
+        subl $12, %esp
+        xorpd %xmm0, %xmm0
+        addsd 24(%esp), %xmm0
+        movsd 32(%esp), %xmm1
+        movsd 16(%esp), %xmm2
+        ucomisd 40(%esp), %xmm1
+        jb LBB_X_2
+LBB_X_1:
+        movsd %xmm0, %xmm2
+LBB_X_2:
+        movsd %xmm2, (%esp)
+        fldl (%esp)
+        addl $12, %esp
+        ret
+
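+A scalar C sketch of the "emulated conditional move" idiom (helper and
+function names are made up; the SSE form would use cmpltsd / andpd / andnpd /
+orpd on the same pattern):
+
+#include <string.h>
+
+static unsigned long long dbits(double v) {
+  unsigned long long b; memcpy(&b, &v, sizeof b); return b;
+}
+
+double select_lt(double a, double b, double y, double z) {
+  unsigned long long mask = (a < b) ? ~0ULL : 0ULL;              // cmpltsd
+  unsigned long long r = (mask & dbits(y)) | (~mask & dbits(z)); // and/andn/or
+  double out; memcpy(&out, &r, sizeof out);
+  return out;
+}
+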
+//===---------------------------------------------------------------------===//
+
+It's not clear whether we should use pxor or xorps / xorpd to clear XMM
+registers. The choice may depend on subtarget information. We should do some
+more experiments on different x86 machines.
+
+//===---------------------------------------------------------------------===//
+
+Currently the x86 codegen isn't very good at mixing SSE and FPStack
+code:
+
+unsigned int foo(double x) { return x; }
+
+foo:
+	subl $20, %esp
+	movsd 24(%esp), %xmm0
+	movsd %xmm0, 8(%esp)
+	fldl 8(%esp)
+	fisttpll (%esp)
+	movl (%esp), %eax
+	addl $20, %esp
+	ret
+
+This will be solved when we go to a dynamic programming based isel.
+
+//===---------------------------------------------------------------------===//
+
+Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
+feasible.
+
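+For example, a small fixed-size copy like the one below could become a couple
+of 128-bit SSE loads and stores rather than a call or a rep/movsl loop (the
+struct and function are hypothetical):
+
+struct blob { char b[32]; };
+void copy_blob(struct blob *d, const struct blob *s) { *d = *s; }
+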
+//===---------------------------------------------------------------------===//
+
+Teach the coalescer to commute 2-addr instructions, allowing us to eliminate
+the reg-reg copy in this example:
+
+float foo(int *x, float *y, unsigned c) {
+  float res = 0.0;
+  unsigned i;
+  for (i = 0; i < c; i++) {
+    float xx = (float)x[i];
+    xx = xx * y[i];
+    xx += res;
+    res = xx;
+  }
+  return res;
+}
+
+LBB_foo_3:      # no_exit
+        cvtsi2ss %XMM0, DWORD PTR [%EDX + 4*%ESI]
+        mulss %XMM0, DWORD PTR [%EAX + 4*%ESI]
+        addss %XMM0, %XMM1
+        inc %ESI
+        cmp %ESI, %ECX
+****    movaps %XMM1, %XMM0
+        jb LBB_foo_3    # no_exit
+
+//===---------------------------------------------------------------------===//
+
+Codegen:
+  if (copysign(1.0, x) == copysign(1.0, y))
+into:
+  if (x^y & mask)
+when using SSE.
+
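+A scalar sketch of the sign test described above (function name is made up;
+the SSE version would do the xor/and on the raw bit patterns with
+xorpd/andpd or movmskpd):
+
+#include <string.h>
+
+int same_sign(double x, double y) {
+  unsigned long long xb, yb;
+  memcpy(&xb, &x, sizeof xb);
+  memcpy(&yb, &y, sizeof yb);
+  // Signs are equal exactly when the xor of the bit patterns has a clear
+  // sign bit.
+  return ((xb ^ yb) & 0x8000000000000000ULL) == 0;
+}
+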
+//===---------------------------------------------------------------------===//
+
+Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
+of a v4sf value.
+
+//===---------------------------------------------------------------------===//
+
+Better codegen for vector_shuffles like { x, 0, 0, 0 } or { x, 0, x, 0 }.
+Perhaps use pxor / xorp* to clear an XMM register first?
+
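+One common source of the { x, 0, 0, 0 } case, for reference (illustrative
+only):
+
+#include <xmmintrin.h>
+
+__m128 low_only(float x) { return _mm_set_ss(x); }  // { x, 0, 0, 0 }
+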
+//===---------------------------------------------------------------------===//
+
+How to decide when to use the "floating point version" of logical ops? Here are
+some code fragments:
+
+	movaps LCPI5_5, %xmm2
+	divps %xmm1, %xmm2
+	mulps %xmm2, %xmm3
+	mulps 8656(%ecx), %xmm3
+	addps 8672(%ecx), %xmm3
+	andps LCPI5_6, %xmm2
+	andps LCPI5_1, %xmm3
+	por %xmm2, %xmm3
+	movdqa %xmm3, (%edi)
+
+	movaps LCPI5_5, %xmm1
+	divps %xmm0, %xmm1
+	mulps %xmm1, %xmm3
+	mulps 8656(%ecx), %xmm3
+	addps 8672(%ecx), %xmm3
+	andps LCPI5_6, %xmm1
+	andps LCPI5_1, %xmm3
+	orps %xmm1, %xmm3
+	movaps %xmm3, 112(%esp)
+	movaps %xmm3, (%ebx)
+
+Due to some minor source change, the latter case ended up using orps and movaps
+instead of por and movdqa. Does it matter?
+
+//===---------------------------------------------------------------------===//
+
+X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
+to choose between movaps, movapd, and movdqa based on types of source and
+destination?
+
+How about andps, andpd, and pand? Do we really care about the type of the packed
+elements? If not, why not always use the "ps" variants, which are likely to be
+shorter?
+
+//===---------------------------------------------------------------------===//
+
+External test Nurbs exposed some problems. Look for
+__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
+emits:
+
+        movaps    (%edx), %xmm2                                 #59.21
+        movaps    (%edx), %xmm5                                 #60.21
+        movaps    (%edx), %xmm4                                 #61.21
+        movaps    (%edx), %xmm3                                 #62.21
+        movl      40(%ecx), %ebp                                #69.49
+        shufps    $0, %xmm2, %xmm5                              #60.21
+        movl      100(%esp), %ebx                               #69.20
+        movl      (%ebx), %edi                                  #69.20
+        imull     %ebp, %edi                                    #69.49
+        addl      (%eax), %edi                                  #70.33
+        shufps    $85, %xmm2, %xmm4                             #61.21
+        shufps    $170, %xmm2, %xmm3                            #62.21
+        shufps    $255, %xmm2, %xmm2                            #63.21
+        lea       (%ebp,%ebp,2), %ebx                           #69.49
+        negl      %ebx                                          #69.49
+        lea       -3(%edi,%ebx), %ebx                           #70.33
+        shll      $4, %ebx                                      #68.37
+        addl      32(%ecx), %ebx                                #68.37
+        testb     $15, %bl                                      #91.13
+        jne       L_B1.24       # Prob 5%                       #91.13
+
+This is the llvm code after instruction scheduling:
+
+cond_next140 (0xa910740, LLVM BB @0xa90beb0):
+	%reg1078 = MOV32ri -3
+	%reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
+	%reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
+	%reg1080 = IMUL32rr %reg1079, %reg1037
+	%reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
+	%reg1038 = LEA32r %reg1081, 1, %reg1080, -3
+	%reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
+	%reg1082 = SHL32ri %reg1038, 4
+	%reg1039 = ADD32rr %reg1036, %reg1082
+	%reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
+	%reg1034 = SHUFPSrr %reg1083, %reg1083, 170
+	%reg1032 = SHUFPSrr %reg1083, %reg1083, 0
+	%reg1035 = SHUFPSrr %reg1083, %reg1083, 255
+	%reg1033 = SHUFPSrr %reg1083, %reg1083, 85
+	%reg1040 = MOV32rr %reg1039
+	%reg1084 = AND32ri8 %reg1039, 15
+	CMP32ri8 %reg1084, 0
+	JE mbb<cond_next204,0xa914d30>
+
+Still ok. After register allocation:
+
+cond_next140 (0xa910740, LLVM BB @0xa90beb0):
+	%EAX = MOV32ri -3
+	%EDX = MOV32rm <fi#3>, 1, %NOREG, 0
+	ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
+	%EDX = MOV32rm <fi#7>, 1, %NOREG, 0
+	%EDX = MOV32rm %EDX, 1, %NOREG, 40
+	IMUL32rr %EAX<def&use>, %EDX
+	%ESI = MOV32rm <fi#5>, 1, %NOREG, 0
+	%ESI = MOV32rm %ESI, 1, %NOREG, 0
+	MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
+	%EAX = LEA32r %ESI, 1, %EAX, -3
+	%ESI = MOV32rm <fi#7>, 1, %NOREG, 0
+	%ESI = MOV32rm %ESI, 1, %NOREG, 32
+	%EDI = MOV32rr %EAX
+	SHL32ri %EDI<def&use>, 4
+	ADD32rr %EDI<def&use>, %ESI
+	%XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
+	%XMM1 = MOVAPSrr %XMM0
+	SHUFPSrr %XMM1<def&use>, %XMM1, 170
+	%XMM2 = MOVAPSrr %XMM0
+	SHUFPSrr %XMM2<def&use>, %XMM2, 0
+	%XMM3 = MOVAPSrr %XMM0
+	SHUFPSrr %XMM3<def&use>, %XMM3, 255
+	SHUFPSrr %XMM0<def&use>, %XMM0, 85
+	%EBX = MOV32rr %EDI
+	AND32ri8 %EBX<def&use>, 15
+	CMP32ri8 %EBX, 0
+	JE mbb<cond_next204,0xa914d30>
+
+This looks really bad. The problem is that shufps is a destructive opcode: since
+it appears as operand two in more than one shufps op, it results in a number of
+copies. Note that icc also suffers from the same problem. Either the instruction
+selector should select pshufd instead, or the register allocator could make the
+two-address to three-address transformation.
+
+It also exposes some other problems. See MOV32ri -3 and the spills.
+
+//===---------------------------------------------------------------------===//
+
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500
+
+LLVM is producing bad code.
+
+LBB_main_4:	# cond_true44
+	addps %xmm1, %xmm2
+	subps %xmm3, %xmm2
+	movaps (%ecx), %xmm4
+	movaps %xmm2, %xmm1
+	addps %xmm4, %xmm1
+	addl $16, %ecx
+	incl %edx
+	cmpl $262144, %edx
+	movaps %xmm3, %xmm2
+	movaps %xmm4, %xmm3
+	jne LBB_main_4	# cond_true44
+
+There are two problems. 1) There is no need for two loop induction variables; we
+can compare against 262144 * 16. 2) A known register coalescer issue: we should
+be able to eliminate one of the movaps:
+
+	addps %xmm2, %xmm1    <=== Commute!
+	subps %xmm3, %xmm1
+	movaps (%ecx), %xmm4
+	movaps %xmm1, %xmm1   <=== Eliminate!
+	addps %xmm4, %xmm1
+	addl $16, %ecx
+	incl %edx
+	cmpl $262144, %edx
+	movaps %xmm3, %xmm2
+	movaps %xmm4, %xmm3
+	jne LBB_main_4	# cond_true44
+
+//===---------------------------------------------------------------------===//
+
+Consider:
+
+__m128 test(float a) {
+  return _mm_set_ps(0.0, 0.0, 0.0, a*a);
+}
+
+This compiles into:
+
+movss 4(%esp), %xmm1
+mulss %xmm1, %xmm1
+xorps %xmm0, %xmm0
+movss %xmm1, %xmm0
+ret
+
+Because mulss doesn't modify the top 3 elements, the top elements of 
+xmm1 are already zero'd.  We could compile this to:
+
+movss 4(%esp), %xmm0
+mulss %xmm0, %xmm0
+ret
+
+//===---------------------------------------------------------------------===//
+
+Here's a sick and twisted idea.  Consider code like this:
+
+__m128 test(__m128 a) {
+  float b = *(float*)&a;
+  ...
+  return _mm_set_ps(0.0, 0.0, 0.0, b);
+}
+
+This might compile to this code:
+
+movaps c(%esp), %xmm1
+xorps %xmm0, %xmm0
+movss %xmm1, %xmm0
+ret
+
+Now consider if the ... code caused xmm1 to get spilled.  This might produce
+this code:
+
+movaps c(%esp), %xmm1
+movaps %xmm1, c2(%esp)
+...
+
+xorps %xmm0, %xmm0
+movaps c2(%esp), %xmm1
+movss %xmm1, %xmm0
+ret
+
+However, since the reload is only used by these instructions, we could 
+"fold" it into the uses, producing something like this:
+
+movaps c(%esp), %xmm1
+movaps %xmm1, c2(%esp)
+...
+
+movss c2(%esp), %xmm0
+ret
+
+... saving two instructions.
+
+The basic idea is that a reload from a spill slot can, if only one 4-byte
+chunk is used, bring in 3 zeros and the one element instead of all 4 elements.
+This can be used to simplify a variety of shuffle operations, where the
+elements are fixed zeros.
+
+//===---------------------------------------------------------------------===//
+
+For this:
+
+#include <emmintrin.h>
+void test(__m128d *r, __m128d *A, double B) {
+  *r = _mm_loadl_pd(*A, &B);
+}
+
+We generate:
+
+	subl $12, %esp
+	movsd 24(%esp), %xmm0
+	movsd %xmm0, (%esp)
+	movl 20(%esp), %eax
+	movapd (%eax), %xmm0
+	movlpd (%esp), %xmm0
+	movl 16(%esp), %eax
+	movapd %xmm0, (%eax)
+	addl $12, %esp
+	ret
+
+icc generates:
+
+        movl      4(%esp), %edx                                 #3.6
+        movl      8(%esp), %eax                                 #3.6
+        movapd    (%eax), %xmm0                                 #4.22
+        movlpd    12(%esp), %xmm0                               #4.8
+        movapd    %xmm0, (%edx)                                 #4.3
+        ret                                                     #5.1
+
+So icc is smart enough to know that B is already in memory, so it doesn't load
+it into a register and store it back to the stack.
+
+//===---------------------------------------------------------------------===//
+
+__m128d test1( __m128d A, __m128d B) {
+  return _mm_shuffle_pd(A, B, 0x3);
+}
+
+compiles to
+
+shufpd $3, %xmm1, %xmm0
+
+Perhaps it's better to use unpckhpd instead?
+
+unpckhpd %xmm1, %xmm0
+
+Don't know if unpckhpd is faster. But it is shorter.
+
+//===---------------------------------------------------------------------===//
+
+This code generates ugly code, probably due to costs being off or something:
+
+void %test(float* %P, <4 x float>* %P2 ) {
+        %xFloat0.688 = load float* %P
+        %loadVector37.712 = load <4 x float>* %P2
+        %inFloat3.713 = insertelement <4 x float> %loadVector37.712, float 0.000000e+00, uint 3
+        store <4 x float> %inFloat3.713, <4 x float>* %P2
+        ret void
+}
+
+Generates:
+
+_test:
+        pxor %xmm0, %xmm0
+        movd %xmm0, %eax        ;; EAX = 0!
+        movl 8(%esp), %ecx
+        movaps (%ecx), %xmm0
+        pinsrw $6, %eax, %xmm0
+        shrl $16, %eax          ;; EAX = 0 again!
+        pinsrw $7, %eax, %xmm0
+        movaps %xmm0, (%ecx)
+        ret
+
+It would be better to generate:
+
+_test:
+        movl 8(%esp), %ecx
+        movaps (%ecx), %xmm0
+	xor %eax, %eax
+        pinsrw $6, %eax, %xmm0
+        pinsrw $7, %eax, %xmm0
+        movaps %xmm0, (%ecx)
+        ret
+
+or use pxor (to make a zero vector) and shuffle (to insert it).
+
+//===---------------------------------------------------------------------===//
+
+Some useful information in the Apple Altivec / SSE Migration Guide:
+
+http://developer.apple.com/documentation/Performance/Conceptual/
+Accelerate_sse_migration/index.html
+
+e.g. SSE select using and, andnot, or. Various SSE compare translations.
+
+//===---------------------------------------------------------------------===//
+
+Add hooks to commute some CMPP operations.
+
+//===---------------------------------------------------------------------===//
+
+Apply the same transformation that merged four float into a single 128-bit load
+to loads from constant pool.
+
+//===---------------------------------------------------------------------===//
+
+Floating point max / min are commutable when -enable-unsafe-fp-math is
+specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
+nodes which are selected to max / min instructions that are marked commutable.
+
+//===---------------------------------------------------------------------===//
+
+We should compile this:
+#include <xmmintrin.h>
+typedef union {
+  int i[4];
+  float f[4];
+  __m128 v;
+} vector4_t;
+void swizzle (const void *a, vector4_t * b, vector4_t * c) {
+  b->v = _mm_loadl_pi (b->v, (__m64 *) a);
+  c->v = _mm_loadl_pi (c->v, ((__m64 *) a) + 1);
+}
+
+to:
+
+_swizzle:
+        movl    4(%esp), %eax
+        movl    8(%esp), %edx
+        movl    12(%esp), %ecx
+        movlps  (%eax), %xmm0
+        movlps  %xmm0, (%edx)
+        movlps  8(%eax), %xmm0
+        movlps  %xmm0, (%ecx)
+        ret
+
+not:
+
+swizzle:
+        movl 8(%esp), %eax
+        movaps (%eax), %xmm0
+        movl 4(%esp), %ecx
+        movlps (%ecx), %xmm0
+        movaps %xmm0, (%eax)
+        movl 12(%esp), %eax
+        movaps (%eax), %xmm0
+        movlps 8(%ecx), %xmm0
+        movaps %xmm0, (%eax)
+        ret
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+#include <emmintrin.h>
+__m128i test(long long i) { return _mm_cvtsi64x_si128(i); }
+
+Should turn into a single 'movq %rdi, %xmm0' instruction.  Instead, we 
+get this (on x86-64):
+
+_test:
+	movd %rdi, %xmm1
+	xorps %xmm0, %xmm0
+	movsd %xmm1, %xmm0
+	ret
+
+The LLVM IR is:
+
+target triple = "x86_64-apple-darwin8"
+define <2 x i64> @test(i64 %i) {
+entry:
+	%tmp10 = insertelement <2 x i64> undef, i64 %i, i32 0	
+	%tmp11 = insertelement <2 x i64> %tmp10, i64 0, i32 1
+	ret <2 x i64> %tmp11
+}
+
+//===---------------------------------------------------------------------===//
+
+These functions should produce the same code:
+
+#include <emmintrin.h>
+
+typedef long long __m128i __attribute__ ((__vector_size__ (16)));
+
+int foo(__m128i* val) {
+  return __builtin_ia32_vec_ext_v4si(*val, 1);
+}
+int bar(__m128i* val) {
+  union vs {
+    __m128i *_v;
+    int* _s;
+  } v = {val};
+  return v._s[1];
+}
+
+We currently produce (with -m64):
+
+_foo:
+        pshufd $1, (%rdi), %xmm0
+        movd %xmm0, %eax
+        ret
+_bar:
+        movl 4(%rdi), %eax
+        ret
+
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/X86/README-X86-64.txt b/lib/Target/X86/README-X86-64.txt
new file mode 100644
index 0000000..191904a
--- /dev/null
+++ b/lib/Target/X86/README-X86-64.txt
@@ -0,0 +1,223 @@
+//===- README-X86-64.txt - Notes for X86-64 code gen ----------------------===//
+
+Implement different PIC models? Right now we only support Mac OS X with small
+PIC code model.
+
+//===---------------------------------------------------------------------===//
+
+Make use of "Red Zone".
+
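+For reference, the x86-64 SysV ABI guarantees 128 bytes below %rsp that a leaf
+function may use without adjusting the stack pointer. A hypothetical case
+where the prologue/epilogue sub/add on %rsp could be dropped:
+
+long sum3(long a, long b, long c) {
+  long tmp[3] = { a, b, c };   // small locals, well under 128 bytes
+  return tmp[0] + tmp[1] + tmp[2];
+}
+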
+//===---------------------------------------------------------------------===//
+
+Implement __int128 and long double support.
+
+//===---------------------------------------------------------------------===//
+
+For this:
+
+extern void xx(void);
+void bar(void) {
+  xx();
+}
+
+gcc compiles to:
+
+.globl _bar
+_bar:
+	jmp	_xx
+
+We need to do the tailcall optimization as well.
+
+//===---------------------------------------------------------------------===//
+
+AMD64 Optimization Manual 8.2 has some nice information about optimizing integer
+multiplication by a constant. How much of it applies to Intel's X86-64
+implementation? There are definite trade-offs to consider: latency vs. register
+pressure vs. code size.
+
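+A small example of the trade-off (function name and choice of constant are
+illustrative): multiplying by 10 can be a single imul, or an lea/add
+decomposition that may have lower latency on some cores but takes more
+instructions:
+
+long times10(long x) { return x * 10; }
+// one option:   imulq $10, %rdi, %rax
+// another:      leaq (%rdi,%rdi,4), %rax ; addq %rax, %rax
+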
+//===---------------------------------------------------------------------===//
+
+Are we better off using branches instead of cmove to implement FP to
+unsigned i64?
+
+_conv:
+	ucomiss	LC0(%rip), %xmm0
+	cvttss2siq	%xmm0, %rdx
+	jb	L3
+	subss	LC0(%rip), %xmm0
+	movabsq	$-9223372036854775808, %rax
+	cvttss2siq	%xmm0, %rdx
+	xorq	%rax, %rdx
+L3:
+	movq	%rdx, %rax
+	ret
+
+instead of
+
+_conv:
+	movss LCPI1_0(%rip), %xmm1
+	cvttss2siq %xmm0, %rcx
+	movaps %xmm0, %xmm2
+	subss %xmm1, %xmm2
+	cvttss2siq %xmm2, %rax
+	movabsq $-9223372036854775808, %rdx
+	xorq %rdx, %rax
+	ucomiss %xmm1, %xmm0
+	cmovb %rcx, %rax
+	ret
+
+It seems like the jb branch has a high likelihood of being taken. It would have
+saved a few instructions.
+
+//===---------------------------------------------------------------------===//
+
+Poor codegen:
+
+int X[2];
+int b;
+void test(void) {
+  memset(X, b, 2*sizeof(X[0]));
+}
+
+llc:
+	movq _b@GOTPCREL(%rip), %rax
+	movzbq (%rax), %rax
+	movq %rax, %rcx
+	shlq $8, %rcx
+	orq %rax, %rcx
+	movq %rcx, %rax
+	shlq $16, %rax
+	orq %rcx, %rax
+	movq %rax, %rcx
+	shlq $32, %rcx
+	movq _X@GOTPCREL(%rip), %rdx
+	orq %rax, %rcx
+	movq %rcx, (%rdx)
+	ret
+
+gcc:
+	movq	_b@GOTPCREL(%rip), %rax
+	movabsq	$72340172838076673, %rdx
+	movzbq	(%rax), %rax
+	imulq	%rdx, %rax
+	movq	_X@GOTPCREL(%rip), %rdx
+	movq	%rax, (%rdx)
+	ret
+
+//===---------------------------------------------------------------------===//
+
+The vararg function prologue can be further optimized. Currently all XMM
+registers are stored into the register save area. Most of these stores can be
+eliminated since the upper bound on the number of XMM registers used is passed
+in %al. gcc produces something like the following:
+
+	movzbl	%al, %edx
+	leaq	0(,%rdx,4), %rax
+	leaq	4+L2(%rip), %rdx
+	leaq	239(%rsp), %rax
+       	jmp	*%rdx
+	movaps	%xmm7, -15(%rax)
+	movaps	%xmm6, -31(%rax)
+	movaps	%xmm5, -47(%rax)
+	movaps	%xmm4, -63(%rax)
+	movaps	%xmm3, -79(%rax)
+	movaps	%xmm2, -95(%rax)
+	movaps	%xmm1, -111(%rax)
+	movaps	%xmm0, -127(%rax)
+L2:
+
+It jumps over the movaps stores that are not needed. It is hard to see this
+being significant, as it adds 5 instructions (including an indirect branch) to
+avoid executing 0 to 8 stores in the function prologue.
+
+Perhaps we can optimize for the common case where no XMM registers are used for
+parameter passing, i.e. if %al == 0, jump over all the stores. Or, in the case
+of a leaf function where we can determine that no XMM input parameter is needed,
+avoid emitting the stores at all.
+
+//===---------------------------------------------------------------------===//
+
+AMD64 has a complex calling convention for aggregate passing by value:
+
+1. If the size of an object is larger than two eightbytes, or in C++, is a non- 
+   POD structure or union type, or contains unaligned fields, it has class 
+   MEMORY.
+2. Both eightbytes get initialized to class NO_CLASS. 
+3. Each field of an object is classified recursively so that always two fields
+   are considered. The resulting class is calculated according to the classes
+   of the fields in the eightbyte: 
+   (a) If both classes are equal, this is the resulting class. 
+   (b) If one of the classes is NO_CLASS, the resulting class is the other 
+       class. 
+   (c) If one of the classes is MEMORY, the result is the MEMORY class. 
+   (d) If one of the classes is INTEGER, the result is the INTEGER. 
+   (e) If one of the classes is X87, X87UP, COMPLEX_X87 class, MEMORY is used as
+      class. 
+   (f) Otherwise class SSE is used. 
+4. Then a post merger cleanup is done: 
+   (a) If one of the classes is MEMORY, the whole argument is passed in memory. 
+   (b) If SSEUP is not preceded by SSE, it is converted to SSE.
+
+Currently the llvm frontend does not handle this correctly.
+
+Problem 1:
+    typedef struct { int i; double d; } QuadWordS;
+It is currently passed in two i64 integer registers. However, a gcc-compiled
+callee expects the second element 'd' to be passed in XMM0.
+
+Problem 2:
+    typedef struct { int32_t i; float j; double d; } QuadWordS;
+The first two fields together are the size of an i64, so they will be combined
+and passed in the integer register RDI. The third field is still passed in XMM0.
+
+Problem 3:
+    typedef struct { int64_t i; int8_t j; int64_t d; } S;
+    void test(S s)
+The size of this aggregate is greater than two i64s, so it should be passed in
+memory. Currently llvm breaks it down and passes it in three integer
+registers.
+
+Problem 4:
+Taking problem 3 one step further: a function expects an aggregate value
+in memory followed by more parameter(s) passed in register(s).
+    void test(S s, int b)
+
+LLVM IR does not allow passing aggregates by value, so the frontend must break
+the aggregate value (in problems 3 and 4) into a number of scalar values:
+    void %test(long %s.i, byte %s.j, long %s.d);
+
+However, if the backend were to lower this code literally, it would pass the 3
+values in integer registers. To force them to be passed in memory, the frontend
+should change the function signature to:
+    void %test(long %undef1, long %undef2, long %undef3, long %undef4, 
+               long %undef5, long %undef6,
+               long %s.i, byte %s.j, long %s.d);
+And the call site would look something like this:
+    call void %test( undef, undef, undef, undef, undef, undef,
+                     %tmp.s.i, %tmp.s.j, %tmp.s.d );
+The first 6 undef parameters would exhaust the 6 integer registers used for
+parameter passing. The following three integer values would then be forced into
+memory.
+
+For problem 4, the parameter 'b' would be moved to the front of the parameter
+list so it will be passed in a register:
+    void %test(int %b,
+               long %undef1, long %undef2, long %undef3, long %undef4, 
+               long %undef5, long %undef6,
+               long %s.i, byte %s.j, long %s.d);
+
+//===---------------------------------------------------------------------===//
+
+Right now the asm printer assumes GlobalAddresses are accessed via RIP-relative
+addressing. Therefore, it is not possible to generate this:
+        movabsq $__ZTV10polynomialIdE+16, %rax
+
+That is ok for now since we currently only support the small code model, so the
+above is selected as:
+        leaq __ZTV10polynomialIdE+16(%rip), %rax
+
+This is probably slightly slower but is much shorter than movabsq. However, if
+we were to support medium or larger code models, we would need to use the movabs
+instruction. We should probably introduce something like AbsoluteAddress to
+distinguish it from GlobalAddress so the asm printer and JIT code emitter can
+do the right thing.
diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt
new file mode 100644
index 0000000..f15090a
--- /dev/null
+++ b/lib/Target/X86/README.txt
@@ -0,0 +1,1150 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the X86 backend.
+//===---------------------------------------------------------------------===//
+
+Missing features:
+  - Support for SSE4: http://www.intel.com/software/penryn
+http://softwarecommunity.intel.com/isn/Downloads/Intel%20SSE4%20Programming%20Reference.pdf
+  - Support for 3DNow!
+  - Weird ABIs?
+
+//===---------------------------------------------------------------------===//
+
+Add MUL2U and MUL2S nodes to represent a multiply that returns both the
+Hi and Lo parts (a combination of MUL and MULH[SU] into one node).  Add this to
+X86, and make the dag combiner produce it when needed.  This will eliminate one
+imul from the code generated for:
+
+long long test(long long X, long long Y) { return X*Y; }
+
+by using the EAX result from the mul.  We should add a similar node for
+DIVREM.
+
+another case is:
+
+long long test(int X, int Y) { return (long long)X*Y; }
+
+... which should only be one imul instruction.
+
+or:
+
+unsigned long long int t2(unsigned int a, unsigned int b) {
+       return (unsigned long long)a * b;
+}
+
+... which should be one mul instruction.
+
+
+This can be done with a custom expander, but it would be nice to move this to
+generic code.
+
+//===---------------------------------------------------------------------===//
+
+CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move.  The X86
+backend knows how to three-addressify this shift, but it appears the register
+allocator isn't even asking it to do so in this case.  We should investigate
+why this isn't happening, it could have significant impact on other important
+cases for X86 as well.
+
+//===---------------------------------------------------------------------===//
+
+This should be one DIV/IDIV instruction, not a libcall:
+
+unsigned test(unsigned long long X, unsigned Y) {
+        return X/Y;
+}
+
+This can be done trivially with a custom legalizer.  What about overflow 
+though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
+
+//===---------------------------------------------------------------------===//
+
+Improvements to the multiply -> shift/add algorithm:
+http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
+
+//===---------------------------------------------------------------------===//
+
+Improve code like this (occurs fairly frequently, e.g. in LLVM):
+long long foo(int x) { return 1LL << x; }
+
+http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
+http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
+http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
+
+Another useful one would be  ~0ULL >> X and ~0ULL << X.
+
+One better solution for 1LL << x is:
+        xorl    %eax, %eax
+        xorl    %edx, %edx
+        testb   $32, %cl
+        sete    %al
+        setne   %dl
+        sall    %cl, %eax
+        sall    %cl, %edx
+
+But that requires good 8-bit subreg support.
+
+64-bit shifts (in general) expand to really bad code.  Instead of using
+cmovs, we should expand to a conditional branch like GCC produces.
+
+//===---------------------------------------------------------------------===//
+
+Compile this:
+_Bool f(_Bool a) { return a!=1; }
+
+into:
+        movzbl  %dil, %eax
+        xorl    $1, %eax
+        ret
+
+//===---------------------------------------------------------------------===//
+
+Some isel ideas:
+
+1. A dynamic programming based approach when compile time is not an
+   issue.
+2. Code duplication (addressing mode) during isel.
+3. Other ideas from "Register-Sensitive Selection, Duplication, and
+   Sequencing of Instructions".
+4. Scheduling for reduced register pressure.  E.g. "Minimum Register 
+   Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs" 
+   and other related papers.
+   http://citeseer.ist.psu.edu/govindarajan01minimum.html
+
+//===---------------------------------------------------------------------===//
+
+Should we promote i16 to i32 to avoid partial register update stalls?
+
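+A hypothetical example of the concern: the 16-bit add below writes only %ax,
+which can stall on processors that track partial-register writes; doing the
+arithmetic in 32 bits and truncating at the end would avoid it.
+
+unsigned short inc16(unsigned short x) { return (unsigned short)(x + 1); }
+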
+//===---------------------------------------------------------------------===//
+
+Leave any_extend as a pseudo instruction and hint to the register
+allocator. Delay codegen until after register allocation.
+
+//===---------------------------------------------------------------------===//
+
+Count leading zeros and count trailing zeros:
+
+int clz(int X) { return __builtin_clz(X); }
+int ctz(int X) { return __builtin_ctz(X); }
+
+$ gcc t.c -S -o - -O3  -fomit-frame-pointer -masm=intel
+clz:
+        bsr     %eax, DWORD PTR [%esp+4]
+        xor     %eax, 31
+        ret
+ctz:
+        bsf     %eax, DWORD PTR [%esp+4]
+        ret
+
+however, check that these are defined for 0 and 32.  Our intrinsics are, GCC's
+aren't.
+
+Another example (use predsimplify to eliminate a select):
+
+int foo (unsigned long j) {
+  if (j)
+    return __builtin_ffs (j) - 1;
+  else
+    return 0;
+}
+
+//===---------------------------------------------------------------------===//
+
+It appears icc uses push for parameter passing. Need to investigate.
+
+//===---------------------------------------------------------------------===//
+
+Only use inc/neg/not instructions on processors where they are faster than
+add/sub/xor.  They are slower on the P4 due to only updating some processor
+flags.
+
+//===---------------------------------------------------------------------===//
+
+The instruction selector sometimes misses folding a load into a compare.  The
+pattern is written as (cmp reg, (load p)).  Because the compare isn't 
+commutative, it is not matched with the load on both sides.  The dag combiner
+should be made smart enough to canonicalize the load into the RHS of a compare
+when it can invert the result of the compare for free.
+
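+A source-level illustration (hypothetical): when the load ends up as the LHS
+of the compare, it is not folded even though flipping the compare and
+inverting the condition would make folding possible.
+
+int eq(int x, int *p) { return *p == x; }
+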
+//===---------------------------------------------------------------------===//
+
+How about intrinsics? An example is:
+  *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));
+
+compiles to
+	pmuludq (%eax), %xmm0
+	movl 8(%esp), %eax
+	movdqa (%eax), %xmm1
+	pmulhuw %xmm0, %xmm1
+
+The transformation probably requires an X86-specific pass or a DAG combiner
+target-specific hook.
+
+//===---------------------------------------------------------------------===//
+
+In many cases, LLVM generates code like this:
+
+_test:
+        movl 8(%esp), %eax
+        cmpl %eax, 4(%esp)
+        setl %al
+        movzbl %al, %eax
+        ret
+
+on some processors (which ones?), it is more efficient to do this:
+
+_test:
+        movl 8(%esp), %ebx
+        xor  %eax, %eax
+        cmpl %ebx, 4(%esp)
+        setl %al
+        ret
+
+Doing this correctly is tricky though, as the xor clobbers the flags.
+
+//===---------------------------------------------------------------------===//
+
+We should generate bts/btr/etc instructions on targets where they are cheap or
+when codesize is important.  e.g., for:
+
+void setbit(int *target, int bit) {
+    *target |= (1 << bit);
+}
+void clearbit(int *target, int bit) {
+    *target &= ~(1 << bit);
+}
+
+//===---------------------------------------------------------------------===//
+
+Instead of the following for memset char*, 1, 10:
+
+	movl $16843009, 4(%edx)
+	movl $16843009, (%edx)
+	movw $257, 8(%edx)
+
+It might be better to generate
+
+	movl $16843009, %eax
+	movl %eax, 4(%edx)
+	movl %eax, (%edx)
+	movw %ax, 8(%edx)
+	
+when we can spare a register. It reduces code size.
+
+//===---------------------------------------------------------------------===//
+
+Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
+get this:
+
+int %test1(int %X) {
+        %Y = div int %X, 8
+        ret int %Y
+}
+
+_test1:
+        movl 4(%esp), %eax
+        movl %eax, %ecx
+        sarl $31, %ecx
+        shrl $29, %ecx
+        addl %ecx, %eax
+        sarl $3, %eax
+        ret
+
+GCC knows several different ways to codegen it, one of which is this:
+
+_test1:
+        movl    4(%esp), %eax
+        cmpl    $-1, %eax
+        leal    7(%eax), %ecx
+        cmovle  %ecx, %eax
+        sarl    $3, %eax
+        ret
+
+which is probably slower, but it's interesting at least :)
+
+//===---------------------------------------------------------------------===//
+
+The first BB of this code:
+
+declare bool %foo()
+int %bar() {
+        %V = call bool %foo()
+        br bool %V, label %T, label %F
+T:
+        ret int 1
+F:
+        call bool %foo()
+        ret int 12
+}
+
+compiles to:
+
+_bar:
+        subl $12, %esp
+        call L_foo$stub
+        xorb $1, %al
+        testb %al, %al
+        jne LBB_bar_2   # F
+
+It would be better to emit "cmp %al, 1" than an xor and test.
+
+//===---------------------------------------------------------------------===//
+
+Enable X86InstrInfo::convertToThreeAddress().
+
+//===---------------------------------------------------------------------===//
+
+We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl.
+We should leave these as libcalls for everything over a much lower threshold,
+since libc is hand tuned for medium and large mem ops (avoiding RFO for large
+stores, TLB preheating, etc.).
+
+//===---------------------------------------------------------------------===//
+
+Optimize this into something reasonable:
+ x * copysign(1.0, y) * copysign(1.0, z)
+
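+A hedged sketch of one "reasonable" form (function name is made up, and NaN
+sign behavior is assumed not to matter): the two copysigns only decide whether
+x's sign flips, so the whole expression reduces to xor-ing the sign bits of y
+and z into x.
+
+#include <string.h>
+
+double scaled_sign(double x, double y, double z) {
+  unsigned long long xb, yb, zb;
+  memcpy(&xb, &x, 8); memcpy(&yb, &y, 8); memcpy(&zb, &z, 8);
+  xb ^= (yb ^ zb) & 0x8000000000000000ULL;   // flip x's sign iff signs differ
+  memcpy(&x, &xb, 8);
+  return x;
+}
+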
+//===---------------------------------------------------------------------===//
+
+Optimize copysign(x, *y) to use an integer load from y.
+
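+The case in question, for reference: only the sign bit of *y matters, so an
+integer load of its high word plus bit ops could replace the FP load.
+
+#include <math.h>
+
+double apply_sign(double x, double *y) { return copysign(x, *y); }
+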
+//===---------------------------------------------------------------------===//
+
+%X = weak global int 0
+
+void %foo(int %N) {
+	%N = cast int %N to uint
+	%tmp.24 = setgt int %N, 0
+	br bool %tmp.24, label %no_exit, label %return
+
+no_exit:
+	%indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
+	%i.0.0 = cast uint %indvar to int
+	volatile store int %i.0.0, int* %X
+	%indvar.next = add uint %indvar, 1
+	%exitcond = seteq uint %indvar.next, %N
+	br bool %exitcond, label %return, label %no_exit
+
+return:
+	ret void
+}
+
+compiles into:
+
+	.text
+	.align	4
+	.globl	_foo
+_foo:
+	movl 4(%esp), %eax
+	cmpl $1, %eax
+	jl LBB_foo_4	# return
+LBB_foo_1:	# no_exit.preheader
+	xorl %ecx, %ecx
+LBB_foo_2:	# no_exit
+	movl L_X$non_lazy_ptr, %edx
+	movl %ecx, (%edx)
+	incl %ecx
+	cmpl %eax, %ecx
+	jne LBB_foo_2	# no_exit
+LBB_foo_3:	# return.loopexit
+LBB_foo_4:	# return
+	ret
+
+We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
+rematerialization is implemented. This can be accomplished with 1) a target
+dependent LICM pass or 2) making the SelectionDAG represent the whole function.
+
+//===---------------------------------------------------------------------===//
+
+The following tests perform worse with LSR:
+
+lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
+
+//===---------------------------------------------------------------------===//
+
+We are generating far worse code than gcc:
+
+volatile short X, Y;
+
+void foo(int N) {
+  int i;
+  for (i = 0; i < N; i++) { X = i; Y = i*4; }
+}
+
+LBB1_1:	#bb.preheader
+	xorl %ecx, %ecx
+	xorw %dx, %dx
+LBB1_2:	#bb
+	movl L_X$non_lazy_ptr, %esi
+	movw %dx, (%esi)
+	movw %dx, %si
+	shlw $2, %si
+	movl L_Y$non_lazy_ptr, %edi
+	movw %si, (%edi)
+	incl %ecx
+	incw %dx
+	cmpl %eax, %ecx
+	jne LBB1_2	#bb
+
+vs.
+
+	xorl	%edx, %edx
+	movl	L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi
+	movl	L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx
+L4:
+	movw	%dx, (%esi)
+	leal	0(,%edx,4), %eax
+	movw	%ax, (%ecx)
+	addl	$1, %edx
+	cmpl	%edx, %edi
+	jne	L4
+
+There are 3 issues:
+
+1. Lack of post regalloc LICM.
+2. Poor sub-regclass support, which leads to an inability to promote the 16-bit
+   arithmetic ops to 32-bit and make use of leal.
+3. LSR is unable to reuse the IV for a different type (i16 vs. i32) even though
+   the cast would be free.
+
+//===---------------------------------------------------------------------===//
+
+Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
+FR64 to VR128.
+
+//===---------------------------------------------------------------------===//
+
+mov $reg, 48(%esp)
+...
+leal 48(%esp), %eax
+mov %eax, (%esp)
+call _foo
+
+Obviously it would have been better for the first mov (or any op) to store
+directly to %esp[0] if there are no other uses.
+
+//===---------------------------------------------------------------------===//
+
+Adding to the list of cmp / test poor codegen issues:
+
+int test(__m128 *A, __m128 *B) {
+  if (_mm_comige_ss(*A, *B))
+    return 3;
+  else
+    return 4;
+}
+
+_test:
+	movl 8(%esp), %eax
+	movaps (%eax), %xmm0
+	movl 4(%esp), %eax
+	movaps (%eax), %xmm1
+	comiss %xmm0, %xmm1
+	setae %al
+	movzbl %al, %ecx
+	movl $3, %eax
+	movl $4, %edx
+	cmpl $0, %ecx
+	cmove %edx, %eax
+	ret
+
+Note that the setae, movzbl, cmpl, and cmove can be replaced with a single
+cmovae. There are a number of issues: 1) we are introducing a setcc between the
+result of the intrinsic call and the select; 2) the intrinsic is expected to
+produce an i32 value, so an any_extend (which becomes a zero extend) is added.
+
+We probably need some kind of target DAG combine hook to fix this.
+
+//===---------------------------------------------------------------------===//
+
+We generate significantly worse code for this than GCC:
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
+http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
+
+There is also one case where we do worse on PPC.
+
+//===---------------------------------------------------------------------===//
+
+If shorter, we should use things like:
+movzwl %ax, %eax
+instead of:
+andl $65535, %EAX
+
+The former can also be used when the two-addressy nature of the 'and' would
+require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
+
+//===---------------------------------------------------------------------===//
+
+Bad codegen:
+
+char foo(int x) { return x; }
+
+_foo:
+	movl 4(%esp), %eax
+	shll $24, %eax
+	sarl $24, %eax
+	ret
+
+SIGN_EXTEND_INREG can be implemented as (sext (trunc)) to take advantage of 
+sub-registers.
+
+//===---------------------------------------------------------------------===//
+
+Consider this:
+
+typedef struct pair { float A, B; } pair;
+void pairtest(pair P, float *FP) {
+        *FP = P.A+P.B;
+}
+
+We currently generate this code with llvmgcc4:
+
+_pairtest:
+        movl 8(%esp), %eax
+        movl 4(%esp), %ecx
+        movd %eax, %xmm0
+        movd %ecx, %xmm1
+        addss %xmm0, %xmm1
+        movl 12(%esp), %eax
+        movss %xmm1, (%eax)
+        ret
+
+we should be able to generate:
+_pairtest:
+        movss 4(%esp), %xmm0
+        movl 12(%esp), %eax
+        addss 8(%esp), %xmm0
+        movss %xmm0, (%eax)
+        ret
+
+The issue is that llvmgcc4 is forcing the struct to memory, then passing it as
+integer chunks.  It does this so that structs like {short,short} are passed in
+a single 32-bit integer stack slot.  We should handle the safe cases above much
+more nicely, while still handling the hard cases.
+
+While true in general, in this specific case we could do better by promoting
+load int + bitcast to float -> load float.  This basically needs alignment info;
+the code is already implemented (but disabled) in the dag combiner.
+
+//===---------------------------------------------------------------------===//
+
+Another instruction selector deficiency:
+
+void %bar() {
+	%tmp = load int (int)** %foo
+	%tmp = tail call int %tmp( int 3 )
+	ret void
+}
+
+_bar:
+	subl $12, %esp
+	movl L_foo$non_lazy_ptr, %eax
+	movl (%eax), %eax
+	call *%eax
+	addl $12, %esp
+	ret
+
+The current isel scheme will not allow the load to be folded into the call since
+the load's chain result is read by the callseq_start.
+
+//===---------------------------------------------------------------------===//
+
+Don't forget to find a way to squash noop truncates in the JIT environment.
+
+//===---------------------------------------------------------------------===//
+
+Implement anyext in the same manner as truncate that would allow them to be
+eliminated.
+
+//===---------------------------------------------------------------------===//
+
+How about implementing truncate / anyext as a property of machine instruction
+operand? i.e. Print as 32-bit super-class register / 16-bit sub-class register.
+Do this for the cases where a truncate / anyext is guaranteed to be eliminated.
+For IA32 that is truncate from 32 to 16 and anyext from 16 to 32.
+
+//===---------------------------------------------------------------------===//
+
+For this:
+
+int test(int a)
+{
+  return a * 3;
+}
+
+We currently emit:
+	imull $3, 4(%esp), %eax
+
+Perhaps this is what we really should generate? Is imull three or four
+cycles? Note: ICC generates this:
+	movl	4(%esp), %eax
+	leal	(%eax,%eax,2), %eax
+
+The current instruction priority is based on pattern complexity. The former is
+more "complex" because it folds a load so the latter will not be emitted.
+
+Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
+should always try to match LEA first since the LEA matching code does some
+estimate to determine whether the match is profitable.
+
+However, if we care more about code size, then imull is better. It's two bytes
+shorter than movl + leal.
+
+//===---------------------------------------------------------------------===//
+
+Implement CTTZ, CTLZ with bsf and bsr.
+
+//===---------------------------------------------------------------------===//
+
+It appears gcc places string data with linkonce linkage in
+.section __TEXT,__const_coal,coalesced instead of
+.section __DATA,__const_coal,coalesced.
+Take a look at darwin.h; there are other Darwin assembler directives that we
+do not make use of.
+
+//===---------------------------------------------------------------------===//
+
+int %foo(int* %a, int %t) {
+entry:
+        br label %cond_true
+
+cond_true:              ; preds = %cond_true, %entry
+        %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ]  
+        %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ]
+        %tmp2 = getelementptr int* %a, int %x.0.0              
+        %tmp3 = load int* %tmp2         ; <int> [#uses=1]
+        %tmp5 = add int %t_addr.0.0, %x.0.0             ; <int> [#uses=1]
+        %tmp7 = add int %tmp5, %tmp3            ; <int> [#uses=2]
+        %tmp9 = add int %x.0.0, 1               ; <int> [#uses=2]
+        %tmp = setgt int %tmp9, 39              ; <bool> [#uses=1]
+        br bool %tmp, label %bb12, label %cond_true
+
+bb12:           ; preds = %cond_true
+        ret int %tmp7
+}
+
+is pessimized by -loop-reduce and -indvars
+
+//===---------------------------------------------------------------------===//
+
+u32 to float conversion improvement:
+
+float uint32_2_float( unsigned u ) {
+  float fl = (int) (u & 0xffff);
+  float fh = (int) (u >> 16);
+  fh *= 0x1.0p16f;
+  return fh + fl;
+}
+
+00000000        subl    $0x04,%esp
+00000003        movl    0x08(%esp,1),%eax
+00000007        movl    %eax,%ecx
+00000009        shrl    $0x10,%ecx
+0000000c        cvtsi2ss        %ecx,%xmm0
+00000010        andl    $0x0000ffff,%eax
+00000015        cvtsi2ss        %eax,%xmm1
+00000019        mulss   0x00000078,%xmm0
+00000021        addss   %xmm1,%xmm0
+00000025        movss   %xmm0,(%esp,1)
+0000002a        flds    (%esp,1)
+0000002d        addl    $0x04,%esp
+00000030        ret
+
+//===---------------------------------------------------------------------===//
+
+When using the fastcc ABI, align the stack slot of a double argument on an
+8-byte boundary to improve performance.
+
+//===---------------------------------------------------------------------===//
+
+Codegen:
+
+int f(int a, int b) {
+  if (a == 4 || a == 6)
+    b++;
+  return b;
+}
+
+
+as:
+
+or eax, 2
+cmp eax, 6
+jz label
+
+//===---------------------------------------------------------------------===//
+
+GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
+simplifications for integer "x cmp y ? a : b".  For example, instead of:
+
+int G;
+void f(int X, int Y) {
+  G = X < 0 ? 14 : 13;
+}
+
+compiling to:
+
+_f:
+        movl $14, %eax
+        movl $13, %ecx
+        movl 4(%esp), %edx
+        testl %edx, %edx
+        cmovl %eax, %ecx
+        movl %ecx, _G
+        ret
+
+it could be:
+_f:
+        movl    4(%esp), %eax
+        sarl    $31, %eax
+        notl    %eax
+        addl    $14, %eax
+        movl    %eax, _G
+        ret
+
+etc.
+
+//===---------------------------------------------------------------------===//
+
+Currently we don't have elimination of redundant stack manipulations. Consider
+the code:
+
+int %main() {
+entry:
+	call fastcc void %test1( )
+	call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
+	ret int 0
+}
+
+declare fastcc void %test1()
+
+declare fastcc void %test2(sbyte*)
+
+
+This currently compiles to:
+
+	subl $16, %esp
+	call _test5
+	addl $12, %esp
+	subl $16, %esp
+	movl $_test5, (%esp)
+	call _test6
+	addl $12, %esp
+
+The add/sub pair is really unneeded here.
+
+//===---------------------------------------------------------------------===//
+
+We currently compile sign_extend_inreg into two shifts:
+
+long foo(long X) {
+  return (long)(signed char)X;
+}
+
+becomes:
+
+_foo:
+        movl 4(%esp), %eax
+        shll $24, %eax
+        sarl $24, %eax
+        ret
+
+This could be:
+
+_foo:
+        movsbl  4(%esp),%eax
+        ret
+
+//===---------------------------------------------------------------------===//
+
+Consider the expansion of:
+
+uint %test3(uint %X) {
+        %tmp1 = rem uint %X, 255
+        ret uint %tmp1
+}
+
+Currently it compiles to:
+
+...
+        movl $2155905153, %ecx
+        movl 8(%esp), %esi
+        movl %esi, %eax
+        mull %ecx
+...
+
+This could be "reassociated" into:
+
+        movl $2155905153, %eax
+        movl 8(%esp), %ecx
+        mull %ecx
+
+to avoid the copy.  In fact, the existing two-address stuff would do this
+except that mul isn't a commutative 2-addr instruction.  I guess this has
+to be done at isel time based on the #uses of the mul?
+
+//===---------------------------------------------------------------------===//
+
+Make sure the instruction which starts a loop does not cross a cacheline
+boundary. This requires knowing the exact length of each machine instruction.
+That is somewhat complicated, but doable. Example 256.bzip2:
+
+In the new trace, the hot loop has an instruction which crosses a cacheline
+boundary.  In addition to potential cache misses, this can't help decoding as I
+imagine there has to be some kind of complicated decoder reset and realignment
+to grab the bytes from the next cacheline.
+
+532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
+942  942 0x3d03 movl     %dh, (1809(%esp, %esi)                                                                          
+937  937 0x3d0a incl     %esi                           
+3    3   0x3d0b cmpb     %bl, %dl                                               
+27   27  0x3d0d jnz      0x000062db <main+11707>
+
+//===---------------------------------------------------------------------===//
+
+In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
+
+//===---------------------------------------------------------------------===//
+
+This could be a single 16-bit load.
+
+int f(char *p) {
+    if ((p[0] == 1) & (p[1] == 2)) return 1;
+    return 0;
+}
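+
+Assuming the two byte loads can legally be merged (both are executed
+unconditionally because of the non-short-circuiting &), the desired codegen
+would presumably look something like:
+
+_f:
+        movl 4(%esp), %eax
+        cmpw $0x0201, (%eax)      # p[1]:p[0] == 0x02:0x01, little endian
+        sete %al
+        movzbl %al, %eax
+        ret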
+
+//===---------------------------------------------------------------------===//
+
+We should inline lrintf and probably other libc functions.
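+
+On SSE-capable targets, for example, lrintf presumably boils down to a single
+cvtss2si (which rounds according to the current rounding mode, matching lrint
+semantics), i.e. roughly:
+
+        cvtss2si 4(%esp), %eax    # f at 4(%esp), result in %eax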
+
+//===---------------------------------------------------------------------===//
+
+Start using the flags more.  For example, compile:
+
+int add_zf(int *x, int y, int a, int b) {
+     if ((*x += y) == 0)
+          return a;
+     else
+          return b;
+}
+
+to:
+       addl    %esi, (%rdi)
+       movl    %edx, %eax
+       cmovne  %ecx, %eax
+       ret
+instead of:
+
+_add_zf:
+        addl (%rdi), %esi
+        movl %esi, (%rdi)
+        testl %esi, %esi
+        cmove %edx, %ecx
+        movl %ecx, %eax
+        ret
+
+and:
+
+int add_zf(int *x, int y, int a, int b) {
+     if ((*x + y) < 0)
+          return a;
+     else
+          return b;
+}
+
+to:
+
+add_zf:
+        addl    (%rdi), %esi
+        movl    %edx, %eax
+        cmovns  %ecx, %eax
+        ret
+
+instead of:
+
+_add_zf:
+        addl (%rdi), %esi
+        testl %esi, %esi
+        cmovs %edx, %ecx
+        movl %ecx, %eax
+        ret
+
+//===---------------------------------------------------------------------===//
+
+This:
+#include <math.h>
+int foo(double X) { return isnan(X); }
+
+compiles to (-m64):
+
+_foo:
+        pxor %xmm1, %xmm1
+        ucomisd %xmm1, %xmm0
+        setp %al
+        movzbl %al, %eax
+        ret
+
+the pxor is not needed, we could compare the value against itself.
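+
+i.e. something like:
+
+_foo:
+        ucomisd %xmm0, %xmm0      # PF is set iff %xmm0 is unordered with itself
+        setp %al
+        movzbl %al, %eax
+        ret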
+
+//===---------------------------------------------------------------------===//
+
+These two functions have identical effects:
+
+unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
+unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
+
+We currently compile them to:
+
+_f:
+        movl 4(%esp), %eax
+        movl %eax, %ecx
+        incl %ecx
+        movl 8(%esp), %edx
+        cmpl %edx, %ecx
+        jne LBB1_2      #UnifiedReturnBlock
+LBB1_1: #cond_true
+        addl $2, %eax
+        ret
+LBB1_2: #UnifiedReturnBlock
+        movl %ecx, %eax
+        ret
+_f2:
+        movl 4(%esp), %eax
+        movl %eax, %ecx
+        incl %ecx
+        cmpl 8(%esp), %ecx
+        sete %cl
+        movzbl %cl, %ecx
+        leal 1(%ecx,%eax), %eax
+        ret
+
+both of which are inferior to GCC's:
+
+_f:
+        movl    4(%esp), %edx
+        leal    1(%edx), %eax
+        addl    $2, %edx
+        cmpl    8(%esp), %eax
+        cmove   %edx, %eax
+        ret
+_f2:
+        movl    4(%esp), %eax
+        addl    $1, %eax
+        xorl    %edx, %edx
+        cmpl    8(%esp), %eax
+        sete    %dl
+        addl    %edx, %eax
+        ret
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+void test(int X) {
+  if (X) abort();
+}
+
+is currently compiled to:
+
+_test:
+        subl $12, %esp
+        cmpl $0, 16(%esp)
+        jne LBB1_1
+        addl $12, %esp
+        ret
+LBB1_1:
+        call L_abort$stub
+
+It would be better to produce:
+
+_test:
+        subl $12, %esp
+        cmpl $0, 16(%esp)
+        jne L_abort$stub
+        addl $12, %esp
+        ret
+
+This can be applied to any call to a no-return function that takes no arguments, etc.
+Alternatively, the stack save/restore logic could be shrink-wrapped, producing
+something like this:
+
+_test:
+        cmpl $0, 4(%esp)
+        jne LBB1_1
+        ret
+LBB1_1:
+        subl $12, %esp
+        call L_abort$stub
+
+Both are useful in different situations.  Finally, it could be shrink-wrapped
+and tail called, like this:
+
+_test:
+        cmpl $0, 4(%esp)
+        jne LBB1_1
+        ret
+LBB1_1:
+        pop %eax   # realign stack.
+        call L_abort$stub
+
+Though this probably isn't worth it.
+
+//===---------------------------------------------------------------------===//
+
+We need to teach the codegen to convert two-address INC instructions to LEA
+when the flags are dead.  For example, on X86-64, compile:
+
+int foo(int A, int B) {
+  return A+1;
+}
+
+to:
+
+_foo:
+        leal    1(%edi), %eax
+        ret
+
+instead of:
+
+_foo:
+        incl %edi
+        movl %edi, %eax
+        ret
+
+Another example is:
+
+;; X's live range extends beyond the shift, so the register allocator
+;; cannot coalesce it with Y.  Because of this, a copy needs to be
+;; emitted before the shift to save the register value before it is
+;; clobbered.  However, this copy is not needed if the register
+;; allocator turns the shift into an LEA.  This also occurs for ADD.
+
+; Check that the shift gets turned into an LEA.
+; RUN: llvm-upgrade < %s | llvm-as | llc -march=x86 -x86-asm-syntax=intel | \
+; RUN:   not grep {mov E.X, E.X}
+
+%G = external global int
+
+int %test1(int %X, int %Y) {
+        %Z = add int %X, %Y
+        volatile store int %Y, int* %G
+        volatile store int %Z, int* %G
+        ret int %X
+}
+
+int %test2(int %X) {
+        %Z = add int %X, 1  ;; inc
+        volatile store int %Z, int* %G
+        ret int %X
+}
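+
+For %test2, the kind of output being asked for looks roughly like this (AT&T
+syntax): the increment goes through lea, so %X stays live in its own register
+and no copy is needed.
+
+_test2:
+        movl 4(%esp), %eax
+        leal 1(%eax), %ecx
+        movl %ecx, _G
+        ret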
+
+//===---------------------------------------------------------------------===//
+
+We allocate and free stack space around calls (the subl/addl pair below) in
+situations where we don't have to.  The call to f below produces:
+        subl $16, %esp      <<<<<
+        movl %eax, (%esp)
+        call L_f$stub
+        addl $16, %esp     <<<<<
+This stack adjustment could be moved into the prolog/epilog.  The backend
+currently emits it around each call because it is building a frame pointer,
+but that alone should not be sufficient; only the use of alloca should force
+this behavior.
+(There are other issues shown by this code, but this is one.)
+
+typedef struct _range_t {
+    float fbias;
+    float fscale;
+    int ibias;
+    int iscale;
+    int ishift;
+    unsigned char lut[];
+} range_t;
+
+struct _decode_t {
+    int type:4;
+    int unit:4;
+    int alpha:8;
+    int N:8;
+    int bpc:8;
+    int bpp:16;
+    int skip:8;
+    int swap:8;
+    const range_t*const*range;
+};
+
+typedef struct _decode_t decode_t;
+
+extern int f(const decode_t* decode);
+
+int decode_byte (const decode_t* decode) {
+  if (decode->swap != 0)
+    return f(decode);
+  return 0;
+}
+
+
+//===---------------------------------------------------------------------===//
+
+This:
+#include <xmmintrin.h>
+unsigned test(float f) {
+ return _mm_cvtsi128_si32( (__m128i) _mm_set_ss( f ));
+}
+
+Compiles to:
+_test:
+        movss 4(%esp), %xmm0
+        movd %xmm0, %eax
+        ret
+
+it should compile to a move from the stack slot directly into eax.  DAGCombine
+has this xform, but it is currently disabled until the alignment fields of 
+the load/store nodes are trustworthy.
+
+//===---------------------------------------------------------------------===//
+
+Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
+a neg instead of a sub instruction.  Consider:
+
+int test(char X) { return 7-X; }
+
+we currently produce:
+_test:
+        movl $7, %eax
+        movsbl 4(%esp), %ecx
+        subl %ecx, %eax
+        ret
+
+We would use one fewer register if codegen'd as:
+
+        movsbl 4(%esp), %eax
+        neg %eax
+        add $7, %eax
+        ret
+
+Note that this isn't beneficial if the load can be folded into the sub.  In
+this case, we want a sub:
+
+int test(int X) { return 7-X; }
+_test:
+        movl $7, %eax
+        subl 4(%esp), %eax
+        ret
+
+//===---------------------------------------------------------------------===//
+
+For code like:
+phi (undef, x)
+
+We get an implicit def on the undef side. If the phi is spilled, we then get:
+implicitdef xmm1
+store xmm1 -> stack
+
+It should be possible to teach the x86 backend to "fold" the store into the
+implicitdef, which just deletes the implicit def.
+
+These instructions should go away:
+#IMPLICIT_DEF %xmm1 
+movaps %xmm1, 192(%esp) 
+movaps %xmm1, 224(%esp) 
+movaps %xmm1, 176(%esp)
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
new file mode 100644
index 0000000..c7663be
--- /dev/null
+++ b/lib/Target/X86/X86.h
@@ -0,0 +1,66 @@
+//===-- X86.h - Top-level interface for X86 representation ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the x86
+// target library, as used by the LLVM JIT.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_X86_H
+#define TARGET_X86_H
+
+#include <iosfwd>
+
+namespace llvm {
+
+class X86TargetMachine;
+class FunctionPassManager;
+class FunctionPass;
+class MachineCodeEmitter;
+
+/// createX86ISelDag - This pass converts a legalized DAG into a 
+/// X86-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *createX86ISelDag(X86TargetMachine &TM, bool Fast);
+
+/// createX86FloatingPointStackifierPass - This function returns a pass which
+/// converts floating point register references and pseudo instructions into
+/// floating point stack references and physical instructions.
+///
+FunctionPass *createX86FloatingPointStackifierPass();
+
+/// createX86CodePrinterPass - Returns a pass that prints the X86
+/// assembly code for a MachineFunction to the given output stream,
+/// using the given target machine description.
+///
+FunctionPass *createX86CodePrinterPass(std::ostream &o, X86TargetMachine &tm);
+
+/// createX86CodeEmitterPass - Return a pass that emits the collected X86 code
+/// to the specified MCE object.
+FunctionPass *createX86CodeEmitterPass(X86TargetMachine &TM,
+                                       MachineCodeEmitter &MCE);
+
+/// createX86EmitCodeToMemory - Returns a pass that converts a register
+/// allocated function into raw machine code in a dynamically
+/// allocated chunk of memory.
+///
+FunctionPass *createEmitX86CodeToMemory();
+
+} // End llvm namespace
+
+// Defines symbolic names for X86 registers.  This defines a mapping from
+// register name to register number.
+//
+#include "X86GenRegisterNames.inc"
+
+// Defines symbolic names for the X86 instructions.
+//
+#include "X86GenInstrNames.inc"
+
+#endif
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
new file mode 100644
index 0000000..98362c8
--- /dev/null
+++ b/lib/Target/X86/X86.td
@@ -0,0 +1,150 @@
+//===- X86.td - Target definition file for the Intel X86 arch ---*- C++ -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This is a target description file for the Intel i386 architecture, referred to
+// here as the "X86" architecture.
+//
+//===----------------------------------------------------------------------===//
+
+// Get the target-independent interfaces which we are implementing...
+//
+include "../Target.td"
+
+//===----------------------------------------------------------------------===//
+// X86 Subtarget features.
+//===----------------------------------------------------------------------===//
+ 
+def FeatureMMX     : SubtargetFeature<"mmx","X86SSELevel", "MMX",
+                                      "Enable MMX instructions">;
+def FeatureSSE1    : SubtargetFeature<"sse", "X86SSELevel", "SSE1",
+                                      "Enable SSE instructions",
+                                      [FeatureMMX]>;
+def FeatureSSE2    : SubtargetFeature<"sse2", "X86SSELevel", "SSE2",
+                                      "Enable SSE2 instructions",
+                                      [FeatureSSE1]>;
+def FeatureSSE3    : SubtargetFeature<"sse3", "X86SSELevel", "SSE3",
+                                      "Enable SSE3 instructions",
+                                      [FeatureSSE2]>;
+def FeatureSSSE3   : SubtargetFeature<"ssse3", "X86SSELevel", "SSSE3",
+                                      "Enable SSSE3 instructions",
+                                      [FeatureSSE3]>;
+def Feature3DNow   : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow",
+                                      "Enable 3DNow! instructions">;
+def Feature3DNowA  : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA",
+                                      "Enable 3DNow! Athlon instructions",
+                                      [Feature3DNow]>;
+def Feature64Bit   : SubtargetFeature<"64bit", "HasX86_64", "true",
+                                      "Support 64-bit instructions",
+                                      [FeatureSSE2]>;
+
+//===----------------------------------------------------------------------===//
+// X86 processors supported.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"generic",         []>;
+def : Proc<"i386",            []>;
+def : Proc<"i486",            []>;
+def : Proc<"pentium",         []>;
+def : Proc<"pentium-mmx",     [FeatureMMX]>;
+def : Proc<"i686",            []>;
+def : Proc<"pentiumpro",      []>;
+def : Proc<"pentium2",        [FeatureMMX]>;
+def : Proc<"pentium3",        [FeatureSSE1]>;
+def : Proc<"pentium-m",       [FeatureSSE2]>;
+def : Proc<"pentium4",        [FeatureSSE2]>;
+def : Proc<"x86-64",          [Feature64Bit]>;
+def : Proc<"yonah",           [FeatureSSE3]>;
+def : Proc<"prescott",        [FeatureSSE3]>;
+def : Proc<"nocona",          [FeatureSSE3]>;
+def : Proc<"core2",           [FeatureSSSE3]>;
+
+def : Proc<"k6",              [FeatureMMX]>;
+def : Proc<"k6-2",            [FeatureMMX,    Feature3DNow]>;
+def : Proc<"k6-3",            [FeatureMMX,    Feature3DNow]>;
+def : Proc<"athlon",          [FeatureMMX,    Feature3DNowA]>;
+def : Proc<"athlon-tbird",    [FeatureMMX,    Feature3DNowA]>;
+def : Proc<"athlon-4",        [FeatureSSE1,   Feature3DNowA]>;
+def : Proc<"athlon-xp",       [FeatureSSE1,   Feature3DNowA]>;
+def : Proc<"athlon-mp",       [FeatureSSE1,   Feature3DNowA]>;
+def : Proc<"k8",              [Feature3DNowA, Feature64Bit]>;
+def : Proc<"opteron",         [Feature3DNowA, Feature64Bit]>;
+def : Proc<"athlon64",        [Feature3DNowA, Feature64Bit]>;
+def : Proc<"athlon-fx",       [Feature3DNowA, Feature64Bit]>;
+
+def : Proc<"winchip-c6",      [FeatureMMX]>;
+def : Proc<"winchip2",        [FeatureMMX, Feature3DNow]>;
+def : Proc<"c3",              [FeatureMMX, Feature3DNow]>;
+def : Proc<"c3-2",            [FeatureSSE1]>;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "X86RegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "X86InstrInfo.td"
+
+def X86InstrInfo : InstrInfo {
+
+  // Define how we want to lay out our TargetSpecific information field... This
+  // should be kept up-to-date with the fields in the X86InstrInfo.h file.
+  let TSFlagsFields = ["FormBits",
+                       "hasOpSizePrefix",
+                       "hasAdSizePrefix",
+                       "Prefix",
+                       "hasREX_WPrefix",
+                       "ImmTypeBits",
+                       "FPFormBits",
+                       "Opcode"];
+  let TSFlagsShifts = [0,
+                       6,
+                       7,
+                       8,
+                       12,
+                       13,
+                       16,
+                       24];
+}
+
+//===----------------------------------------------------------------------===//
+// Calling Conventions
+//===----------------------------------------------------------------------===//
+
+include "X86CallingConv.td"
+
+
+//===----------------------------------------------------------------------===//
+// Assembly Printers
+//===----------------------------------------------------------------------===//
+
+// The X86 target supports two different syntaxes for emitting machine code.
+// This is controlled by the -x86-asm-syntax={att|intel} flag.
+def ATTAsmWriter : AsmWriter {
+  string AsmWriterClassName  = "ATTAsmPrinter";
+  int Variant = 0;
+}
+def IntelAsmWriter : AsmWriter {
+  string AsmWriterClassName  = "IntelAsmPrinter";
+  int Variant = 1;
+}
+
+
+def X86 : Target {
+  // Information about the instructions...
+  let InstructionSet = X86InstrInfo;
+
+  let AssemblyWriters = [ATTAsmWriter, IntelAsmWriter];
+}
diff --git a/lib/Target/X86/X86ATTAsmPrinter.cpp b/lib/Target/X86/X86ATTAsmPrinter.cpp
new file mode 100755
index 0000000..e97babe
--- /dev/null
+++ b/lib/Target/X86/X86ATTAsmPrinter.cpp
@@ -0,0 +1,607 @@
+//===-- X86ATTAsmPrinter.cpp - Convert X86 LLVM code to AT&T assembly -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to AT&T format assembly
+// language. This printer is the output mechanism used by `llc'.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "X86ATTAsmPrinter.h"
+#include "X86.h"
+#include "X86COFF.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86TargetMachine.h"
+#include "X86TargetAsmInfo.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Module.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(EmittedInsts, "Number of machine instrs printed");
+
+static std::string computePICLabel(unsigned FnNum,
+                                   const TargetAsmInfo *TAI,
+                                   const X86Subtarget* Subtarget)  {
+  std::string label;
+  if (Subtarget->isTargetDarwin())
+    label =  "\"L" + utostr_32(FnNum) + "$pb\"";
+  else if (Subtarget->isTargetELF())
+    label = ".Lllvm$" + utostr_32(FnNum) + "$piclabel";
+  else
+    assert(0 && "Don't know how to print PIC label!\n");
+
+  return label;
+}
+
+/// getSectionForFunction - Return the section that we should emit the
+/// specified function body into.
+std::string X86ATTAsmPrinter::getSectionForFunction(const Function &F) const {
+  switch (F.getLinkage()) {
+  default: assert(0 && "Unknown linkage type!");
+  case Function::InternalLinkage: 
+  case Function::DLLExportLinkage:
+  case Function::ExternalLinkage:
+    return TAI->getTextSection();
+  case Function::WeakLinkage:
+  case Function::LinkOnceLinkage:
+    if (Subtarget->isTargetDarwin()) {
+      return ".section __TEXT,__textcoal_nt,coalesced,pure_instructions";
+    } else if (Subtarget->isTargetCygMing()) {
+      return "\t.section\t.text$linkonce." + CurrentFnName + ",\"ax\"";
+    } else {
+      return "\t.section\t.llvm.linkonce.t." + CurrentFnName +
+             ",\"ax\",@progbits";
+    }
+  }
+}
+
+/// runOnMachineFunction - This uses the printMachineInstruction()
+/// method to print assembly for each instruction.
+///
+bool X86ATTAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  if (TAI->doesSupportDebugInformation()) {
+    // Let PassManager know we need debug information and relay
+    // the MachineModuleInfo address on to DwarfWriter.
+    DW.SetModuleInfo(&getAnalysis<MachineModuleInfo>());
+  }
+
+  SetupMachineFunction(MF);
+  O << "\n\n";
+
+  // Print out constants referenced by the function
+  EmitConstantPool(MF.getConstantPool());
+
+  // Print out labels for the function.
+  const Function *F = MF.getFunction();
+  unsigned CC = F->getCallingConv();
+
+  // Populate the function information map.  Actually, we don't want to populate
+  // non-stdcall or non-fastcall functions' information right now.
+  if (CC == CallingConv::X86_StdCall || CC == CallingConv::X86_FastCall)
+    FunctionInfoMap[F] = *MF.getInfo<X86MachineFunctionInfo>();
+
+  X86SharedAsmPrinter::decorateName(CurrentFnName, F);
+
+  SwitchToTextSection(getSectionForFunction(*F).c_str(), F);
+    
+  switch (F->getLinkage()) {
+  default: assert(0 && "Unknown linkage type!");
+  case Function::InternalLinkage:  // Symbols default to internal.
+    EmitAlignment(4, F);     // FIXME: This should be parameterized somewhere.
+    break;
+  case Function::DLLExportLinkage:
+    DLLExportedFns.insert(Mang->makeNameProper(F->getName(), ""));
+    //FALLS THROUGH
+  case Function::ExternalLinkage:
+    EmitAlignment(4, F);     // FIXME: This should be parameterized somewhere.
+    O << "\t.globl\t" << CurrentFnName << "\n";    
+    break;
+  case Function::LinkOnceLinkage:
+  case Function::WeakLinkage:
+    if (Subtarget->isTargetDarwin()) {
+      O << "\t.globl\t" << CurrentFnName << "\n";
+      O << "\t.weak_definition\t" << CurrentFnName << "\n";
+    } else if (Subtarget->isTargetCygMing()) {
+      EmitAlignment(4, F);     // FIXME: This should be parameterized somewhere.
+      O << "\t.globl " << CurrentFnName << "\n";
+      O << "\t.linkonce discard\n";
+    } else {
+      EmitAlignment(4, F);     // FIXME: This should be parameterized somewhere.
+      O << "\t.weak " << CurrentFnName << "\n";
+    }
+    break;
+  }
+  if (F->hasHiddenVisibility()) {
+    if (const char *Directive = TAI->getHiddenDirective())
+      O << Directive << CurrentFnName << "\n";
+  } else if (F->hasProtectedVisibility()) {
+    if (const char *Directive = TAI->getProtectedDirective())
+      O << Directive << CurrentFnName << "\n";
+  }
+
+  if (Subtarget->isTargetELF())
+    O << "\t.type " << CurrentFnName << ",@function\n";
+  else if (Subtarget->isTargetCygMing()) {
+    O << "\t.def\t " << CurrentFnName
+      << ";\t.scl\t" <<
+      (F->getLinkage() == Function::InternalLinkage ? COFF::C_STAT : COFF::C_EXT)
+      << ";\t.type\t" << (COFF::DT_FCN << COFF::N_BTSHFT)
+      << ";\t.endef\n";
+  }
+
+  O << CurrentFnName << ":\n";
+  // Add a workaround for linkonce linkage on Cygwin/MinGW
+  if (Subtarget->isTargetCygMing() &&
+      (F->getLinkage() == Function::LinkOnceLinkage ||
+       F->getLinkage() == Function::WeakLinkage))
+    O << "Lllvm$workaround$fake$stub$" << CurrentFnName << ":\n";
+
+  if (TAI->doesSupportDebugInformation()) {
+    // Emit pre-function debug information.
+    DW.BeginFunction(&MF);
+  }
+
+  // Print out code for the function.
+  for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+       I != E; ++I) {
+    // Print a label for the basic block.
+    if (I->pred_begin() != I->pred_end()) {
+      printBasicBlockLabel(I, true);
+      O << '\n';
+    }
+    for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
+         II != E; ++II) {
+      // Print the assembly for the instruction.
+      O << "\t";
+      printMachineInstruction(II);
+    }
+  }
+
+  if (TAI->hasDotTypeDotSizeDirective())
+    O << "\t.size " << CurrentFnName << ", .-" << CurrentFnName << "\n";
+
+  if (TAI->doesSupportDebugInformation()) {
+    // Emit post-function debug information.
+    DW.EndFunction();
+  }
+
+  // Print out jump tables referenced by the function.
+  EmitJumpTableInfo(MF.getJumpTableInfo(), MF);
+  
+  // We didn't modify anything.
+  return false;
+}
+
+static inline bool printGOT(TargetMachine &TM, const X86Subtarget* ST) {
+  return ST->isPICStyleGOT() && TM.getRelocationModel() == Reloc::PIC_;
+}
+
+static inline bool printStub(TargetMachine &TM, const X86Subtarget* ST) {
+  return ST->isPICStyleStub() && TM.getRelocationModel() != Reloc::Static;
+}
+
+void X86ATTAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
+                                    const char *Modifier, bool NotRIPRel) {
+  const MachineOperand &MO = MI->getOperand(OpNo);
+  const MRegisterInfo &RI = *TM.getRegisterInfo();
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register: {
+    assert(MRegisterInfo::isPhysicalRegister(MO.getReg()) &&
+           "Virtual registers should not make it this far!");
+    O << '%';
+    unsigned Reg = MO.getReg();
+    if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) {
+      MVT::ValueType VT = (strcmp(Modifier+6,"64") == 0) ?
+        MVT::i64 : ((strcmp(Modifier+6, "32") == 0) ? MVT::i32 :
+                    ((strcmp(Modifier+6,"16") == 0) ? MVT::i16 : MVT::i8));
+      Reg = getX86SubSuperRegister(Reg, VT);
+    }
+    for (const char *Name = RI.get(Reg).Name; *Name; ++Name)
+      O << (char)tolower(*Name);
+    return;
+  }
+
+  case MachineOperand::MO_Immediate:
+    if (!Modifier ||
+        (strcmp(Modifier, "debug") && strcmp(Modifier, "mem")))
+      O << '$';
+    O << MO.getImmedValue();
+    return;
+  case MachineOperand::MO_MachineBasicBlock:
+    printBasicBlockLabel(MO.getMachineBasicBlock());
+    return;
+  case MachineOperand::MO_JumpTableIndex: {
+    bool isMemOp  = Modifier && !strcmp(Modifier, "mem");
+    if (!isMemOp) O << '$';
+    O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber() << "_"
+      << MO.getJumpTableIndex();
+
+    if (TM.getRelocationModel() == Reloc::PIC_) {
+      if (Subtarget->isPICStyleStub())
+        O << "-\"" << TAI->getPrivateGlobalPrefix() << getFunctionNumber()
+          << "$pb\"";
+      else if (Subtarget->isPICStyleGOT())
+        O << "@GOTOFF";
+    }
+    
+    if (isMemOp && Subtarget->isPICStyleRIPRel() && !NotRIPRel)
+      O << "(%rip)";
+    return;
+  }
+  case MachineOperand::MO_ConstantPoolIndex: {
+    bool isMemOp  = Modifier && !strcmp(Modifier, "mem");
+    if (!isMemOp) O << '$';
+    O << TAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_"
+      << MO.getConstantPoolIndex();
+
+    if (TM.getRelocationModel() == Reloc::PIC_) {
+      if (Subtarget->isPICStyleStub())
+        O << "-\"" << TAI->getPrivateGlobalPrefix() << getFunctionNumber()
+          << "$pb\"";
+      else if (Subtarget->isPICStyleGOT())
+        O << "@GOTOFF";
+    }
+    
+    int Offset = MO.getOffset();
+    if (Offset > 0)
+      O << "+" << Offset;
+    else if (Offset < 0)
+      O << Offset;
+
+    if (isMemOp && Subtarget->isPICStyleRIPRel() && !NotRIPRel)
+      O << "(%rip)";
+    return;
+  }
+  case MachineOperand::MO_GlobalAddress: {
+    bool isCallOp = Modifier && !strcmp(Modifier, "call");
+    bool isMemOp  = Modifier && !strcmp(Modifier, "mem");
+    bool needCloseParen = false;
+
+    GlobalValue *GV = MO.getGlobal();
+    GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+    bool isThreadLocal = GVar && GVar->isThreadLocal();
+
+    std::string Name = Mang->getValueName(GV);
+    X86SharedAsmPrinter::decorateName(Name, GV);
+    
+    if (!isMemOp && !isCallOp)
+      O << '$';
+    else if (Name[0] == '$') {
+      // The name begins with a dollar-sign. In order to avoid having it look
+      // like an integer immediate to the assembler, enclose it in parens.
+      O << '(';
+      needCloseParen = true;
+    }
+
+    if (printStub(TM, Subtarget)) {
+      // Link-once, declared, or weakly-linked global variables need
+      // non-lazily-resolved stubs.
+      if (GV->isDeclaration() ||
+          GV->hasWeakLinkage() ||
+          GV->hasLinkOnceLinkage()) {
+        // Dynamically-resolved functions need a stub for the function.
+        if (isCallOp && isa<Function>(GV)) {
+          FnStubs.insert(Name);
+          O << TAI->getPrivateGlobalPrefix() << Name << "$stub";
+        } else {
+          GVStubs.insert(Name);
+          O << TAI->getPrivateGlobalPrefix() << Name << "$non_lazy_ptr";
+        }
+      } else {
+        if (GV->hasDLLImportLinkage())
+          O << "__imp_";          
+        O << Name;
+      }
+      
+      if (!isCallOp && TM.getRelocationModel() == Reloc::PIC_)
+        O << "-\"" << TAI->getPrivateGlobalPrefix() << getFunctionNumber()
+          << "$pb\"";
+    } else {
+      if (GV->hasDLLImportLinkage()) {
+        O << "__imp_";          
+      }       
+      O << Name;
+
+      if (isCallOp && isa<Function>(GV)) {
+        if (printGOT(TM, Subtarget)) {
+          // Assemble call via PLT for non-local symbols
+          if (!(GV->hasHiddenVisibility() || GV->hasProtectedVisibility()) ||
+              GV->isDeclaration())
+            O << "@PLT";
+        }
+        if (Subtarget->isTargetCygMing() && GV->isDeclaration())
+          // Save function name for later type emission
+          FnStubs.insert(Name);
+      }
+    }
+
+    if (GV->hasExternalWeakLinkage())
+      ExtWeakSymbols.insert(GV);
+    
+    int Offset = MO.getOffset();
+    if (Offset > 0)
+      O << "+" << Offset;
+    else if (Offset < 0)
+      O << Offset;
+
+    if (isThreadLocal) {
+      if (TM.getRelocationModel() == Reloc::PIC_)
+        O << "@TLSGD"; // general dynamic TLS model
+      else
+        if (GV->isDeclaration())
+          O << "@INDNTPOFF"; // initial exec TLS model
+        else
+          O << "@NTPOFF"; // local exec TLS model
+    } else if (isMemOp) {
+      if (printGOT(TM, Subtarget)) {
+        if (Subtarget->GVRequiresExtraLoad(GV, TM, false))
+          O << "@GOT";
+        else
+          O << "@GOTOFF";
+      } else if (Subtarget->isPICStyleRIPRel() && !NotRIPRel) {
+        if ((GV->isDeclaration() ||
+             GV->hasWeakLinkage() ||
+             GV->hasLinkOnceLinkage()) &&
+            TM.getRelocationModel() != Reloc::Static)
+          O << "@GOTPCREL";
+
+        if (needCloseParen) {
+          needCloseParen = false;
+          O << ')';
+        }
+
+        // Use %rip when possible to reduce code size, except when an index or
+        // base register is also part of the address, e.g.
+        // foo(%rip)(%rcx,%rax,4) is not legal.
+        O << "(%rip)";
+      }
+    }
+
+    if (needCloseParen)
+      O << ')';
+
+    return;
+  }
+  case MachineOperand::MO_ExternalSymbol: {
+    bool isCallOp = Modifier && !strcmp(Modifier, "call");
+    bool needCloseParen = false;
+    std::string Name(TAI->getGlobalPrefix());
+    Name += MO.getSymbolName();
+    if (isCallOp && printStub(TM, Subtarget)) {
+      FnStubs.insert(Name);
+      O << TAI->getPrivateGlobalPrefix() << Name << "$stub";
+      return;
+    }
+    if (!isCallOp)
+      O << '$';
+    else if (Name[0] == '$') {
+      // The name begins with a dollar-sign. In order to avoid having it look
+      // like an integer immediate to the assembler, enclose it in parens.
+      O << '(';
+      needCloseParen = true;
+    }
+
+    O << Name;
+
+    if (printGOT(TM, Subtarget)) {
+      std::string GOTName(TAI->getGlobalPrefix());
+      GOTName+="_GLOBAL_OFFSET_TABLE_";
+      if (Name == GOTName)
+        // HACK! Emit extra offset to PC during printing GOT offset to
+        // compensate for the size of popl instruction. The resulting code
+        // should look like:
+        //   call .piclabel
+        // piclabel:
+        //   popl %some_register
+        //   addl $_GLOBAL_OFFSET_TABLE_ + [.-piclabel], %some_register
+        O << " + [.-"
+          << computePICLabel(getFunctionNumber(), TAI, Subtarget) << "]";
+
+      if (isCallOp)
+        O << "@PLT";
+    }
+
+    if (needCloseParen)
+      O << ')';
+
+    if (!isCallOp && Subtarget->isPICStyleRIPRel())
+      O << "(%rip)";
+
+    return;
+  }
+  default:
+    O << "<unknown operand type>"; return;
+  }
+}
+
+void X86ATTAsmPrinter::printSSECC(const MachineInstr *MI, unsigned Op) {
+  unsigned char value = MI->getOperand(Op).getImmedValue();
+  assert(value <= 7 && "Invalid ssecc argument!");
+  switch (value) {
+  case 0: O << "eq"; break;
+  case 1: O << "lt"; break;
+  case 2: O << "le"; break;
+  case 3: O << "unord"; break;
+  case 4: O << "neq"; break;
+  case 5: O << "nlt"; break;
+  case 6: O << "nle"; break;
+  case 7: O << "ord"; break;
+  }
+}
+
+void X86ATTAsmPrinter::printMemReference(const MachineInstr *MI, unsigned Op,
+                                         const char *Modifier){
+  assert(isMem(MI, Op) && "Invalid memory reference!");
+  MachineOperand BaseReg  = MI->getOperand(Op);
+  MachineOperand IndexReg = MI->getOperand(Op+2);
+  const MachineOperand &DispSpec = MI->getOperand(Op+3);
+
+  bool NotRIPRel = IndexReg.getReg() || BaseReg.getReg();
+  if (DispSpec.isGlobalAddress() ||
+      DispSpec.isConstantPoolIndex() ||
+      DispSpec.isJumpTableIndex()) {
+    printOperand(MI, Op+3, "mem", NotRIPRel);
+  } else {
+    int DispVal = DispSpec.getImmedValue();
+    if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg()))
+      O << DispVal;
+  }
+
+  if (IndexReg.getReg() || BaseReg.getReg()) {
+    unsigned ScaleVal = MI->getOperand(Op+1).getImmedValue();
+    unsigned BaseRegOperand = 0, IndexRegOperand = 2;
+      
+    // There are cases where we can end up with ESP/RSP in the indexreg slot.
+    // If this happens, swap the base/index register to support assemblers that
+    // don't work when the index is *SP.
+    if (IndexReg.getReg() == X86::ESP || IndexReg.getReg() == X86::RSP) {
+      assert(ScaleVal == 1 && "Scale not supported for stack pointer!");
+      std::swap(BaseReg, IndexReg);
+      std::swap(BaseRegOperand, IndexRegOperand);
+    }
+    
+    O << "(";
+    if (BaseReg.getReg())
+      printOperand(MI, Op+BaseRegOperand, Modifier);
+
+    if (IndexReg.getReg()) {
+      O << ",";
+      printOperand(MI, Op+IndexRegOperand, Modifier);
+      if (ScaleVal != 1)
+        O << "," << ScaleVal;
+    }
+    O << ")";
+  }
+}
+
+void X86ATTAsmPrinter::printPICLabel(const MachineInstr *MI, unsigned Op) {
+  std::string label = computePICLabel(getFunctionNumber(), TAI, Subtarget);
+  O << label << "\n" << label << ":";
+}
+
+
+bool X86ATTAsmPrinter::printAsmMRegister(const MachineOperand &MO,
+                                         const char Mode) {
+  const MRegisterInfo &RI = *TM.getRegisterInfo();
+  unsigned Reg = MO.getReg();
+  switch (Mode) {
+  default: return true;  // Unknown mode.
+  case 'b': // Print QImode register
+    Reg = getX86SubSuperRegister(Reg, MVT::i8);
+    break;
+  case 'h': // Print QImode high register
+    Reg = getX86SubSuperRegister(Reg, MVT::i8, true);
+    break;
+  case 'w': // Print HImode register
+    Reg = getX86SubSuperRegister(Reg, MVT::i16);
+    break;
+  case 'k': // Print SImode register
+    Reg = getX86SubSuperRegister(Reg, MVT::i32);
+    break;
+  }
+
+  O << '%';
+  for (const char *Name = RI.get(Reg).Name; *Name; ++Name)
+    O << (char)tolower(*Name);
+  return false;
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool X86ATTAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                       unsigned AsmVariant, 
+                                       const char *ExtraCode) {
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+    
+    switch (ExtraCode[0]) {
+    default: return true;  // Unknown modifier.
+    case 'c': // Don't print "$" before a global var name or constant.
+      printOperand(MI, OpNo, "mem");
+      return false;
+    case 'b': // Print QImode register
+    case 'h': // Print QImode high register
+    case 'w': // Print HImode register
+    case 'k': // Print SImode register
+      if (MI->getOperand(OpNo).isReg())
+        return printAsmMRegister(MI->getOperand(OpNo), ExtraCode[0]);
+      printOperand(MI, OpNo);
+      return false;
+      
+    case 'P': // Don't print @PLT, but do print as memory.
+      printOperand(MI, OpNo, "mem");
+      return false;
+    }
+  }
+  
+  printOperand(MI, OpNo);
+  return false;
+}
+
+bool X86ATTAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                             unsigned OpNo,
+                                             unsigned AsmVariant, 
+                                             const char *ExtraCode) {
+  if (ExtraCode && ExtraCode[0])
+    return true; // Unknown modifier.
+  printMemReference(MI, OpNo);
+  return false;
+}
+
+/// printMachineInstruction -- Print out a single X86 LLVM instruction
+/// MI in AT&T syntax to the current output stream.
+///
+void X86ATTAsmPrinter::printMachineInstruction(const MachineInstr *MI) {
+  ++EmittedInsts;
+
+  // See if a truncate instruction can be turned into a nop.
+  switch (MI->getOpcode()) {
+  default: break;
+  case X86::TRUNC_64to32:
+  case X86::TRUNC_64to16:
+  case X86::TRUNC_32to16:
+  case X86::TRUNC_32to8:
+  case X86::TRUNC_16to8:
+  case X86::TRUNC_32_to8:
+  case X86::TRUNC_16_to8: {
+    const MachineOperand &MO0 = MI->getOperand(0);
+    const MachineOperand &MO1 = MI->getOperand(1);
+    unsigned Reg0 = MO0.getReg();
+    unsigned Reg1 = MO1.getReg();
+    unsigned Opc = MI->getOpcode();
+    if (Opc == X86::TRUNC_64to32)
+      Reg1 = getX86SubSuperRegister(Reg1, MVT::i32);
+    else if (Opc == X86::TRUNC_32to16 || Opc == X86::TRUNC_64to16)
+      Reg1 = getX86SubSuperRegister(Reg1, MVT::i16);
+    else
+      Reg1 = getX86SubSuperRegister(Reg1, MVT::i8);
+    O << TAI->getCommentString() << " TRUNCATE ";
+    if (Reg0 != Reg1)
+      O << "\n\t";
+    break;
+  }
+  case X86::PsMOVZX64rr32:
+    O << TAI->getCommentString() << " ZERO-EXTEND " << "\n\t";
+    break;
+  }
+
+  // Call the autogenerated instruction printer routines.
+  printInstruction(MI);
+}
+
+// Include the auto-generated portion of the assembly writer.
+#include "X86GenAsmWriter.inc"
+
diff --git a/lib/Target/X86/X86ATTAsmPrinter.h b/lib/Target/X86/X86ATTAsmPrinter.h
new file mode 100755
index 0000000..a3bdce9
--- /dev/null
+++ b/lib/Target/X86/X86ATTAsmPrinter.h
@@ -0,0 +1,87 @@
+//===-- X86ATTAsmPrinter.h - Convert X86 LLVM code to AT&T assembly -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// AT&T assembly code printer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86ATTASMPRINTER_H
+#define X86ATTASMPRINTER_H
+
+#include "X86AsmPrinter.h"
+#include "llvm/CodeGen/ValueTypes.h"
+
+namespace llvm {
+
+struct X86ATTAsmPrinter : public X86SharedAsmPrinter {
+ X86ATTAsmPrinter(std::ostream &O, X86TargetMachine &TM, const TargetAsmInfo *T)
+    : X86SharedAsmPrinter(O, TM, T) { }
+
+  virtual const char *getPassName() const {
+    return "X86 AT&T-Style Assembly Printer";
+  }
+
+  /// printInstruction - This method is automatically generated by tablegen
+  /// from the instruction set description.  This method returns true if the
+  /// machine instruction was sufficiently described to print it, otherwise it
+  /// returns false.
+  bool printInstruction(const MachineInstr *MI);
+
+  // These methods are used by the tablegen'erated instruction printer.
+  void printOperand(const MachineInstr *MI, unsigned OpNo,
+                    const char *Modifier = 0, bool NotRIPRel = false);
+  void printi8mem(const MachineInstr *MI, unsigned OpNo) {
+    printMemReference(MI, OpNo);
+  }
+  void printi16mem(const MachineInstr *MI, unsigned OpNo) {
+    printMemReference(MI, OpNo);
+  }
+  void printi32mem(const MachineInstr *MI, unsigned OpNo) {
+    printMemReference(MI, OpNo);
+  }
+  void printi64mem(const MachineInstr *MI, unsigned OpNo) {
+    printMemReference(MI, OpNo);
+  }
+  void printi128mem(const MachineInstr *MI, unsigned OpNo) {
+    printMemReference(MI, OpNo);
+  }
+  void printf32mem(const MachineInstr *MI, unsigned OpNo) {
+    printMemReference(MI, OpNo);
+  }
+  void printf64mem(const MachineInstr *MI, unsigned OpNo) {
+    printMemReference(MI, OpNo);
+  }
+  void printf128mem(const MachineInstr *MI, unsigned OpNo) {
+    printMemReference(MI, OpNo);
+  }
+  void printlea64_32mem(const MachineInstr *MI, unsigned OpNo) {
+    printMemReference(MI, OpNo, "subreg64");
+  }
+  
+  bool printAsmMRegister(const MachineOperand &MO, const char Mode);
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                       unsigned AsmVariant, const char *ExtraCode);
+  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                             unsigned AsmVariant, const char *ExtraCode);
+  
+  void printMachineInstruction(const MachineInstr *MI);
+  void printSSECC(const MachineInstr *MI, unsigned Op);
+  void printMemReference(const MachineInstr *MI, unsigned Op,
+                         const char *Modifier=NULL);
+  void printPICLabel(const MachineInstr *MI, unsigned Op);
+  bool runOnMachineFunction(MachineFunction &F);
+  
+  /// getSectionForFunction - Return the section that we should emit the
+  /// specified function body into.
+  virtual std::string getSectionForFunction(const Function &F) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
new file mode 100644
index 0000000..59b9b1f
--- /dev/null
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -0,0 +1,409 @@
+//===-- X86AsmPrinter.cpp - Convert X86 LLVM IR to X86 assembly -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the shared superclass printer that converts from our
+// internal representation of machine-dependent LLVM code to Intel- and
+// AT&T-format assembly language.  This printer is the output mechanism used
+// by `llc'.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86AsmPrinter.h"
+#include "X86ATTAsmPrinter.h"
+#include "X86COFF.h"
+#include "X86IntelAsmPrinter.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/Module.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Type.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+static X86MachineFunctionInfo calculateFunctionInfo(const Function *F,
+                                                    const TargetData *TD) {
+  X86MachineFunctionInfo Info;
+  uint64_t Size = 0;
+  
+  switch (F->getCallingConv()) {
+  case CallingConv::X86_StdCall:
+    Info.setDecorationStyle(StdCall);
+    break;
+  case CallingConv::X86_FastCall:
+    Info.setDecorationStyle(FastCall);
+    break;
+  default:
+    return Info;
+  }
+
+  for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+       AI != AE; ++AI)
+    // Size should be aligned to DWORD boundary
+    Size += ((TD->getTypeSize(AI->getType()) + 3)/4)*4;
+  
+  // We're not supporting tooooo huge arguments :)
+  Info.setBytesToPopOnReturn((unsigned int)Size);
+  return Info;
+}
+
+
+/// decorateName - Query FunctionInfoMap and use this information for various
+/// name decoration.
+void X86SharedAsmPrinter::decorateName(std::string &Name,
+                                       const GlobalValue *GV) {
+  const Function *F = dyn_cast<Function>(GV);
+  if (!F) return;
+
+  // We don't want to decorate non-stdcall or non-fastcall functions right now
+  unsigned CC = F->getCallingConv();
+  if (CC != CallingConv::X86_StdCall && CC != CallingConv::X86_FastCall)
+    return;
+
+  // Decorate names only when we're targeting Cygwin/Mingw32 targets
+  if (!Subtarget->isTargetCygMing())
+    return;
+    
+  FMFInfoMap::const_iterator info_item = FunctionInfoMap.find(F);
+
+  const X86MachineFunctionInfo *Info;
+  if (info_item == FunctionInfoMap.end()) {
+    // Calculate appropriate function info and populate the map
+    FunctionInfoMap[F] = calculateFunctionInfo(F, TM.getTargetData());
+    Info = &FunctionInfoMap[F];
+  } else {
+    Info = &info_item->second;
+  }
+  
+  const FunctionType *FT = F->getFunctionType();
+  switch (Info->getDecorationStyle()) {
+  case None:
+    break;
+  case StdCall:
+    // "Pure" variadic functions do not receive @0 suffix.
+    if (!FT->isVarArg() || (FT->getNumParams() == 0) ||
+        (FT->getNumParams() == 1 && FT->isStructReturn())) 
+      Name += '@' + utostr_32(Info->getBytesToPopOnReturn());
+    break;
+  case FastCall:
+    // "Pure" variadic functions do not receive @0 suffix.
+    if (!FT->isVarArg() || (FT->getNumParams() == 0) ||
+        (FT->getNumParams() == 1 && FT->isStructReturn())) 
+      Name += '@' + utostr_32(Info->getBytesToPopOnReturn());
+
+    if (Name[0] == '_') {
+      Name[0] = '@';
+    } else {
+      Name = '@' + Name;
+    }    
+    break;
+  default:
+    assert(0 && "Unsupported DecorationStyle");
+  }
+}
+
+/// doInitialization
+bool X86SharedAsmPrinter::doInitialization(Module &M) {
+  if (TAI->doesSupportDebugInformation()) {
+    // Emit initial debug information.
+    DW.BeginModule(&M);
+  }
+
+  AsmPrinter::doInitialization(M);
+
+  // Darwin wants symbols to be quoted if they have complex names.
+  if (Subtarget->isTargetDarwin())
+    Mang->setUseQuotes(true);
+
+  return false;
+}
+
+bool X86SharedAsmPrinter::doFinalization(Module &M) {
+  // Note: this code is not shared by the Intel printer as it is too different
+  // from how MASM does things.  When making changes here don't forget to look
+  // at X86IntelAsmPrinter::doFinalization().
+  const TargetData *TD = TM.getTargetData();
+  
+  // Print out module-level global variables here.
+  for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+       I != E; ++I) {
+    if (!I->hasInitializer())
+      continue;   // External globals require no code
+    
+    // Check to see if this is a special global used by LLVM, if so, emit it.
+    if (EmitSpecialLLVMGlobal(I)) {
+      if (Subtarget->isTargetDarwin() &&
+          TM.getRelocationModel() == Reloc::Static) {
+        if (I->getName() == "llvm.global_ctors")
+          O << ".reference .constructors_used\n";
+        else if (I->getName() == "llvm.global_dtors")
+          O << ".reference .destructors_used\n";
+      }
+      continue;
+    }
+    
+    std::string name = Mang->getValueName(I);
+    Constant *C = I->getInitializer();
+    const Type *Type = C->getType();
+    unsigned Size = TD->getTypeSize(Type);
+    unsigned Align = TD->getPreferredAlignmentLog(I);
+
+    if (I->hasHiddenVisibility()) {
+      if (const char *Directive = TAI->getHiddenDirective())
+        O << Directive << name << "\n";
+    } else if (I->hasProtectedVisibility()) {
+      if (const char *Directive = TAI->getProtectedDirective())
+        O << Directive << name << "\n";
+    }
+    
+    if (Subtarget->isTargetELF())
+      O << "\t.type " << name << ",@object\n";
+    
+    if (C->isNullValue()) {
+      if (I->hasExternalLinkage()) {
+        if (const char *Directive = TAI->getZeroFillDirective()) {
+          O << "\t.globl\t" << name << "\n";
+          O << Directive << "__DATA__, __common, " << name << ", "
+            << Size << ", " << Align << "\n";
+          continue;
+        }
+      }
+      
+      if (!I->hasSection() && !I->isThreadLocal() &&
+          (I->hasInternalLinkage() || I->hasWeakLinkage() ||
+           I->hasLinkOnceLinkage())) {
+        if (Size == 0) Size = 1;   // .comm Foo, 0 is undefined, avoid it.
+        if (!NoZerosInBSS && TAI->getBSSSection())
+          SwitchToDataSection(TAI->getBSSSection(), I);
+        else
+          SwitchToDataSection(TAI->getDataSection(), I);
+        if (TAI->getLCOMMDirective() != NULL) {
+          if (I->hasInternalLinkage()) {
+            O << TAI->getLCOMMDirective() << name << "," << Size;
+            if (Subtarget->isTargetDarwin())
+              O << "," << Align;
+          } else
+            O << TAI->getCOMMDirective()  << name << "," << Size;
+        } else {
+          if (!Subtarget->isTargetCygMing()) {
+            if (I->hasInternalLinkage())
+              O << "\t.local\t" << name << "\n";
+          }
+          O << TAI->getCOMMDirective()  << name << "," << Size;
+          if (TAI->getCOMMDirectiveTakesAlignment())
+            O << "," << (TAI->getAlignmentIsInBytes() ? (1 << Align) : Align);
+        }
+        O << "\t\t" << TAI->getCommentString() << " " << I->getName() << "\n";
+        continue;
+      }
+    }
+
+    switch (I->getLinkage()) {
+    case GlobalValue::LinkOnceLinkage:
+    case GlobalValue::WeakLinkage:
+      if (Subtarget->isTargetDarwin()) {
+        O << "\t.globl " << name << "\n"
+          << "\t.weak_definition " << name << "\n";
+        SwitchToDataSection(".section __DATA,__const_coal,coalesced", I);
+      } else if (Subtarget->isTargetCygMing()) {
+        std::string SectionName(".section\t.data$linkonce." +
+                                name +
+                                ",\"aw\"");
+        SwitchToDataSection(SectionName.c_str(), I);
+        O << "\t.globl " << name << "\n"
+          << "\t.linkonce same_size\n";
+      } else {
+        std::string SectionName("\t.section\t.llvm.linkonce.d." +
+                                name +
+                                ",\"aw\",@progbits");
+        SwitchToDataSection(SectionName.c_str(), I);
+        O << "\t.weak " << name << "\n";
+      }
+      break;
+    case GlobalValue::AppendingLinkage:
+      // FIXME: appending linkage variables should go into a section of
+      // their name or something.  For now, just emit them as external.
+    case GlobalValue::DLLExportLinkage:
+      DLLExportedGVs.insert(Mang->makeNameProper(I->getName(),""));
+      // FALL THROUGH
+    case GlobalValue::ExternalLinkage:
+      // If external or appending, declare as a global symbol
+      O << "\t.globl " << name << "\n";
+      // FALL THROUGH
+    case GlobalValue::InternalLinkage: {
+      if (I->isConstant()) {
+        const ConstantArray *CVA = dyn_cast<ConstantArray>(C);
+        if (TAI->getCStringSection() && CVA && CVA->isCString()) {
+          SwitchToDataSection(TAI->getCStringSection(), I);
+          break;
+        }
+      }
+      // FIXME: special handling for ".ctors" & ".dtors" sections
+      if (I->hasSection() &&
+          (I->getSection() == ".ctors" ||
+           I->getSection() == ".dtors")) {
+        std::string SectionName = ".section " + I->getSection();
+        
+        if (Subtarget->isTargetCygMing()) {
+          SectionName += ",\"aw\"";
+        } else {
+          assert(!Subtarget->isTargetDarwin());
+          SectionName += ",\"aw\",@progbits";
+        }
+
+        SwitchToDataSection(SectionName.c_str());
+      } else {
+        if (C->isNullValue() && !NoZerosInBSS && TAI->getBSSSection())
+          SwitchToDataSection(I->isThreadLocal() ? TAI->getTLSBSSSection() :
+                              TAI->getBSSSection(), I);
+        else if (!I->isConstant())
+          SwitchToDataSection(I->isThreadLocal() ? TAI->getTLSDataSection() :
+                              TAI->getDataSection(), I);
+        else if (I->isThreadLocal())
+          SwitchToDataSection(TAI->getTLSDataSection());
+        else {
+          // Read-only data.
+          bool HasReloc = C->ContainsRelocations();
+          if (HasReloc &&
+              Subtarget->isTargetDarwin() &&
+              TM.getRelocationModel() != Reloc::Static)
+            SwitchToDataSection("\t.const_data\n");
+          else if (!HasReloc && Size == 4 &&
+                   TAI->getFourByteConstantSection())
+            SwitchToDataSection(TAI->getFourByteConstantSection(), I);
+          else if (!HasReloc && Size == 8 &&
+                   TAI->getEightByteConstantSection())
+            SwitchToDataSection(TAI->getEightByteConstantSection(), I);
+          else if (!HasReloc && Size == 16 &&
+                   TAI->getSixteenByteConstantSection())
+            SwitchToDataSection(TAI->getSixteenByteConstantSection(), I);
+          else if (TAI->getReadOnlySection())
+            SwitchToDataSection(TAI->getReadOnlySection(), I);
+          else
+            SwitchToDataSection(TAI->getDataSection(), I);
+        }
+      }
+      
+      break;
+    }
+    default:
+      assert(0 && "Unknown linkage type!");
+    }
+
+    EmitAlignment(Align, I);
+    O << name << ":\t\t\t\t" << TAI->getCommentString() << " " << I->getName()
+      << "\n";
+    if (TAI->hasDotTypeDotSizeDirective())
+      O << "\t.size " << name << ", " << Size << "\n";
+    // If the initializer is an extern weak symbol, remember to emit the weak
+    // reference!
+    if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+      if (GV->hasExternalWeakLinkage())
+        ExtWeakSymbols.insert(GV);
+
+    EmitGlobalConstant(C);
+  }
+  
+  // Output linker support code for dllexported globals
+  if (DLLExportedGVs.begin() != DLLExportedGVs.end()) {
+    SwitchToDataSection(".section .drectve");
+  }
+
+  for (std::set<std::string>::iterator i = DLLExportedGVs.begin(),
+         e = DLLExportedGVs.end();
+         i != e; ++i) {
+    O << "\t.ascii \" -export:" << *i << ",data\"\n";
+  }    
+
+  if (DLLExportedFns.begin() != DLLExportedFns.end()) {
+    SwitchToDataSection(".section .drectve");
+  }
+
+  for (std::set<std::string>::iterator i = DLLExportedFns.begin(),
+         e = DLLExportedFns.end();
+         i != e; ++i) {
+    O << "\t.ascii \" -export:" << *i << "\"\n";
+  }    
+
+  if (Subtarget->isTargetDarwin()) {
+    SwitchToDataSection("");
+
+    // Output stubs for dynamically-linked functions
+    unsigned j = 1;
+    for (std::set<std::string>::iterator i = FnStubs.begin(), e = FnStubs.end();
+         i != e; ++i, ++j) {
+      SwitchToDataSection(".section __IMPORT,__jump_table,symbol_stubs,"
+                          "self_modifying_code+pure_instructions,5", 0);
+      O << "L" << *i << "$stub:\n";
+      O << "\t.indirect_symbol " << *i << "\n";
+      O << "\thlt ; hlt ; hlt ; hlt ; hlt\n";
+    }
+
+    O << "\n";
+
+    // Output stubs for external and common global variables.
+    if (GVStubs.begin() != GVStubs.end())
+      SwitchToDataSection(
+                    ".section __IMPORT,__pointers,non_lazy_symbol_pointers");
+    for (std::set<std::string>::iterator i = GVStubs.begin(), e = GVStubs.end();
+         i != e; ++i) {
+      O << "L" << *i << "$non_lazy_ptr:\n";
+      O << "\t.indirect_symbol " << *i << "\n";
+      O << "\t.long\t0\n";
+    }
+
+    // Emit final debug information.
+    DW.EndModule();
+
+    // Funny Darwin hack: This flag tells the linker that no global symbols
+    // contain code that falls through to other global symbols (e.g. the obvious
+    // implementation of multiple entry points).  If this doesn't occur, the
+    // linker can safely perform dead code stripping.  Since LLVM never
+    // generates code that does this, it is always safe to set.
+    O << "\t.subsections_via_symbols\n";
+  } else if (Subtarget->isTargetCygMing()) {
+    // Emit type information for external functions
+    for (std::set<std::string>::iterator i = FnStubs.begin(), e = FnStubs.end();
+         i != e; ++i) {
+      O << "\t.def\t " << *i
+        << ";\t.scl\t" << COFF::C_EXT
+        << ";\t.type\t" << (COFF::DT_FCN << COFF::N_BTSHFT)
+        << ";\t.endef\n";
+    }
+    
+    // Emit final debug information.
+    DW.EndModule();    
+  } else if (Subtarget->isTargetELF()) {
+    // Emit final debug information.
+    DW.EndModule();
+  }
+
+  AsmPrinter::doFinalization(M);
+  return false; // success
+}
+
+/// createX86CodePrinterPass - Returns a pass that prints the X86 assembly code
+/// for a MachineFunction to the given output stream, using the given target
+/// machine description.
+///
+FunctionPass *llvm::createX86CodePrinterPass(std::ostream &o,
+                                             X86TargetMachine &tm) {
+  const X86Subtarget *Subtarget = &tm.getSubtarget<X86Subtarget>();
+
+  if (Subtarget->isFlavorIntel()) {
+    return new X86IntelAsmPrinter(o, tm, tm.getTargetAsmInfo());
+  } else {
+    return new X86ATTAsmPrinter(o, tm, tm.getTargetAsmInfo());
+  }
+}
diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h
new file mode 100755
index 0000000..45be89e
--- /dev/null
+++ b/lib/Target/X86/X86AsmPrinter.h
@@ -0,0 +1,97 @@
+//===-- X86AsmPrinter.h - Convert X86 LLVM code to Intel assembly ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the shared superclass printer that converts our internal
+// representation of machine-dependent LLVM code to Intel- and AT&T-format
+// assembly language.  This printer is the output mechanism used by `llc'.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86ASMPRINTER_H
+#define X86ASMPRINTER_H
+
+#include "X86.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86TargetMachine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/Support/Compiler.h"
+#include <set>
+
+
+namespace llvm {
+
+struct VISIBILITY_HIDDEN X86SharedAsmPrinter : public AsmPrinter {
+  DwarfWriter DW;
+
+  X86SharedAsmPrinter(std::ostream &O, X86TargetMachine &TM,
+                      const TargetAsmInfo *T)
+    : AsmPrinter(O, TM, T), DW(O, this, T) {
+    Subtarget = &TM.getSubtarget<X86Subtarget>();
+  }
+
+  // We have to propagate some information about the MachineFunction to the
+  // AsmPrinter. That is fine while we are printing the function itself, since
+  // we have access to the MachineFunction and can get the appropriate
+  // X86MachineFunctionInfo. Unfortunately, it is not possible when we are only
+  // printing a reference to a Function (e.g. when calling it): there is no way
+  // to reach the corresponding MachineFunction, which may not even have been
+  // created yet. That's why we collect all of the necessary information up
+  // front in an additional map.
+  //
+  // This map is used, e.g., for name decoration of stdcall and fastcall
+  // functions, since the decoration depends on the arguments' size.
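+  // For example, with the usual Win32 mangling, a stdcall function taking 8
+  // bytes of arguments is emitted as "_foo@8" and a fastcall one as "@foo@8".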
+  typedef std::map<const Function*, X86MachineFunctionInfo> FMFInfoMap;
+  FMFInfoMap FunctionInfoMap;
+
+  void decorateName(std::string& Name, const GlobalValue* GV);
+
+  bool doInitialization(Module &M);
+  bool doFinalization(Module &M);
+
+  void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.setPreservesAll();
+    if (Subtarget->isTargetDarwin() ||
+        Subtarget->isTargetELF() ||
+        Subtarget->isTargetCygMing()) {
+      AU.addRequired<MachineModuleInfo>();
+    }
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  const X86Subtarget *Subtarget;
+
+  // Necessary for Darwin to print out the appropriate types of linker stubs.
+  std::set<std::string> FnStubs, GVStubs, LinkOnceStubs;
+
+  // Necessary for dllexport support
+  std::set<std::string> DLLExportedFns, DLLExportedGVs;
+
+  inline static bool isScale(const MachineOperand &MO) {
+    return MO.isImmediate() &&
+          (MO.getImmedValue() == 1 || MO.getImmedValue() == 2 ||
+          MO.getImmedValue() == 4 || MO.getImmedValue() == 8);
+  }
+
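+  // A memory reference is the 4-operand tuple (Base reg, Scale imm, Index reg,
+  // Disp), e.g. [EAX + 4*ECX + 12], or a single frame index that stands for
+  // all of them.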
+  inline static bool isMem(const MachineInstr *MI, unsigned Op) {
+    if (MI->getOperand(Op).isFrameIndex()) return true;
+    return Op+4 <= MI->getNumOperands() &&
+      MI->getOperand(Op  ).isRegister() && isScale(MI->getOperand(Op+1)) &&
+      MI->getOperand(Op+2).isRegister() &&
+      (MI->getOperand(Op+3).isImmediate() ||
+       MI->getOperand(Op+3).isGlobalAddress() ||
+       MI->getOperand(Op+3).isConstantPoolIndex() ||
+       MI->getOperand(Op+3).isJumpTableIndex());
+  }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/X86/X86COFF.h b/lib/Target/X86/X86COFF.h
new file mode 100644
index 0000000..75892ef
--- /dev/null
+++ b/lib/Target/X86/X86COFF.h
@@ -0,0 +1,95 @@
+//===--- X86COFF.h - Some definitions from the COFF documentation ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Anton Korobeynikov and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file just defines some symbols found in COFF documentation. They are
+// used to emit function type information for COFF targets (Cygwin/Mingw32).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86COFF_H
+#define X86COFF_H
+
+namespace COFF 
+{
+/// Storage class tells where and what the symbol represents
+enum StorageClass {
+  C_EFCN =   -1,  ///< Physical end of function
+  C_NULL    = 0,  ///< No symbol
+  C_AUTO    = 1,  ///< Automatic variable
+  C_EXT     = 2,  ///< External symbol
+  C_STAT    = 3,  ///< Static
+  C_REG     = 4,  ///< Register variable
+  C_EXTDEF  = 5,  ///< External definition
+  C_LABEL   = 6,  ///< Label
+  C_ULABEL  = 7,  ///< Undefined label
+  C_MOS     = 8,  ///< Member of structure
+  C_ARG     = 9,  ///< Function argument
+  C_STRTAG  = 10, ///< Structure tag
+  C_MOU     = 11, ///< Member of union
+  C_UNTAG   = 12, ///< Union tag
+  C_TPDEF   = 13, ///< Type definition
+  C_USTATIC = 14, ///< Undefined static
+  C_ENTAG   = 15, ///< Enumeration tag
+  C_MOE     = 16, ///< Member of enumeration
+  C_REGPARM = 17, ///< Register parameter
+  C_FIELD   = 18, ///< Bit field
+
+  C_BLOCK  = 100, ///< ".bb" or ".eb" - beginning or end of block
+  C_FCN    = 101, ///< ".bf" or ".ef" - beginning or end of function
+  C_EOS    = 102, ///< End of structure
+  C_FILE   = 103, ///< File name
+  C_LINE   = 104, ///< Line number, reformatted as symbol
+  C_ALIAS  = 105, ///< Duplicate tag
+  C_HIDDEN = 106  ///< External symbol in dmert public lib
+};
+
+/// The type of the symbol. This is made up of a base type and a derived type.
+/// For example, a pointer to int has base type "int" and derived type
+/// "pointer to T".
+enum SymbolType {
+  T_NULL   = 0,  ///< No type info
+  T_ARG    = 1,  ///< Void function argument (only used by compiler)
+  T_VOID   = 1,  ///< The same as above. Just named differently in some specs.
+  T_CHAR   = 2,  ///< Character
+  T_SHORT  = 3,  ///< Short integer
+  T_INT    = 4,  ///< Integer
+  T_LONG   = 5,  ///< Long integer
+  T_FLOAT  = 6,  ///< Floating point
+  T_DOUBLE = 7,  ///< Double precision floating point
+  T_STRUCT = 8,  ///< Structure
+  T_UNION  = 9,  ///< Union
+  T_ENUM   = 10, ///< Enumeration
+  T_MOE    = 11, ///< Member of enumeration
+  T_UCHAR  = 12, ///< Unsigned character
+  T_USHORT = 13, ///< Unsigned short
+  T_UINT   = 14, ///< Unsigned integer
+  T_ULONG  = 15  ///< Unsigned long
+};
+
+/// Derived type of symbol
+enum SymbolDerivedType {
+  DT_NON = 0, ///< No derived type
+  DT_PTR = 1, ///< Pointer to T
+  DT_FCN = 2, ///< Function returning T
+  DT_ARY = 3  ///< Array of T
+};
+
+/// Masks for extracting parts of type
+enum SymbolTypeMasks {
+  N_BTMASK = 017, ///< Mask for base type
+  N_TMASK  = 060  ///< Mask for derived type
+};
+
+/// Offsets of parts of type
+enum Shifts {
+  N_BTSHFT = 4 ///< Type is formed as base + (derived << N_BTSHFT)
+};
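+
+// For example, the full type of a symbol for a function returning int would be
+// T_INT | (DT_FCN << N_BTSHFT) == 4 | (2 << 4) == 0x24; the Cygwin/MinGW asm
+// printer only emits the derived part, DT_FCN << N_BTSHFT == 0x20.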
+
+}
+
+#endif // X86COFF_H
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
new file mode 100644
index 0000000..39811bd7
--- /dev/null
+++ b/lib/Target/X86/X86CallingConv.td
@@ -0,0 +1,172 @@
+//===- X86CallingConv.td - Calling Conventions for X86 32/64 ----*- C++ -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the X86-32 and X86-64
+// architectures.
+//
+//===----------------------------------------------------------------------===//
+
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+class CCIfSubtarget<string F, CCAction A>
+ : CCIf<!strconcat("State.getTarget().getSubtarget<X86Subtarget>().", F), A>;
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// Return-value conventions common to all X86 CC's.
+def RetCC_X86Common : CallingConv<[
+  // Scalar values are returned in AX first, then DX.
+  CCIfType<[i8] , CCAssignToReg<[AL]>>,
+  CCIfType<[i16], CCAssignToReg<[AX]>>,
+  CCIfType<[i32], CCAssignToReg<[EAX, EDX]>>,
+  CCIfType<[i64], CCAssignToReg<[RAX, RDX]>>,
+  
+  // Vector types are returned in XMM0 and XMM1, when they fit.  If the target
+  // doesn't have XMM registers, it won't have vector types.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+            CCAssignToReg<[XMM0,XMM1]>>,
+
+  // MMX vector types are always returned in MM0. If the target doesn't have
+  // MM0, it doesn't support these vector types.
+  CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToReg<[MM0]>>
+]>;
+
+// X86-32 C return-value convention.
+def RetCC_X86_32_C : CallingConv<[
+  // The X86-32 calling convention returns FP values in ST0; everything else is
+  // the same as the common X86 calling conv.
+  CCIfType<[f32], CCAssignToReg<[ST0]>>,
+  CCIfType<[f64], CCAssignToReg<[ST0]>>,
+  CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-32 FastCC return-value convention.
+def RetCC_X86_32_Fast : CallingConv<[
+  // The X86-32 fastcc returns FP values in XMM0 if the target has SSE2;
+  // otherwise it is the same as the C calling convention.
+  CCIfType<[f32], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0]>>>,
+  CCIfType<[f64], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0]>>>,
+  CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-64 C return-value convention.
+def RetCC_X86_64_C : CallingConv<[
+  // The X86-64 calling convention always returns FP values in XMM0.
+  CCIfType<[f32], CCAssignToReg<[XMM0]>>,
+  CCIfType<[f64], CCAssignToReg<[XMM0]>>,
+  CCDelegateTo<RetCC_X86Common>
+]>;
+
+
+
+// This is the root return-value convention for the X86-32 backend.
+def RetCC_X86_32 : CallingConv<[
+  // If FastCC, use RetCC_X86_32_Fast.
+  CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>,
+  // Otherwise, use RetCC_X86_32_C.
+  CCDelegateTo<RetCC_X86_32_C>
+]>;
+
+// This is the root return-value convention for the X86-64 backend.
+def RetCC_X86_64 : CallingConv<[
+  // Always just the same as C calling conv for X86-64.
+  CCDelegateTo<RetCC_X86_64_C>
+]>;
+
+// This is the return-value convention used for the entire X86 backend.
+def RetCC_X86 : CallingConv<[
+  CCIfSubtarget<"is64Bit()", CCDelegateTo<RetCC_X86_64>>,
+  CCDelegateTo<RetCC_X86_32>
+]>;
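+
+// For example, a function returning f64 gets XMM0 on X86-64 (via
+// RetCC_X86_64_C) but ST0 under the X86-32 C convention (via RetCC_X86_32_C).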
+
+//===----------------------------------------------------------------------===//
+// X86-64 Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+
+def CC_X86_64_C : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+  
+  CCIfStruct<CCStructAssign<[RDI, RSI, RDX, RCX, R8, R9 ]>>,
+
+  // The first 6 integer arguments are passed in integer registers.
+  CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D, R9D]>>,
+  CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>,
+  
+  // The first 8 FP/Vector arguments are passed in XMM registers.
+  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+              CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>,
+
+  // The first 6 MMX vector arguments are passed in GPRs.
+  CCIfType<[v8i8, v4i16, v2i32, v1i64],
+              CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>,
+
+  // Integer/FP values get stored in stack slots that are 8 bytes in size and
+  // 8-byte aligned if there are no more registers to hold them.
+  CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+  
+  // Vectors get 16-byte stack slots that are 16-byte aligned.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+  // __m64 vectors get 8-byte stack slots that are 8-byte aligned.
+  CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 8>>
+]>;
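+
+// For example, under CC_X86_64_C a call to "void f(i32 %a, double %b,
+// <4 x float> %c)" assigns %a to EDI, %b to XMM0, and %c to XMM1.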
+
+
+//===----------------------------------------------------------------------===//
+// X86 C Calling Convention
+//===----------------------------------------------------------------------===//
+
+/// CC_X86_32_Common - In all X86-32 calling conventions, extra integers and FP
+/// values are spilled on the stack, and the first 4 vector values go in XMM
+/// regs.
+def CC_X86_32_Common : CallingConv<[
+  // Integer/Float values get stored in stack slots that are 4 bytes in
+  // size and 4-byte aligned.
+  CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+  
+  // Doubles get 8-byte slots that are 4-byte aligned.
+  CCIfType<[f64], CCAssignToStack<8, 4>>,
+  
+  // The first 4 vector arguments are passed in XMM registers.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+              CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+
+  // Other vectors get 16-byte stack slots that are 16-byte aligned.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+  // __m64 vectors get 8-byte stack slots that are 8-byte aligned. They are
+  // passed in the parameter area.
+  CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 8>>
+]>;
+
+def CC_X86_32_C : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+  
+  // The first 3 integer arguments, if marked 'inreg' and if the call is not
+  // a vararg call, are passed in integer registers.
+  CCIfNotVarArg<CCIfInReg<CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>>>,
+  
+  // Otherwise, same as everything else.
+  CCDelegateTo<CC_X86_32_Common>
+]>;
+
+
+def CC_X86_32_FastCall : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+  
+  // The first 2 integer arguments are passed in ECX/EDX
+  CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>,
+  
+  // Otherwise, same as everything else.
+  CCDelegateTo<CC_X86_32_Common>
+]>;
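+
+// For example, under CC_X86_32_FastCall an i32 function taking three i32
+// arguments gets the first in ECX, the second in EDX, and the third in a
+// 4-byte stack slot via CC_X86_32_Common.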
diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp
new file mode 100644
index 0000000..8b22634
--- /dev/null
+++ b/lib/Target/X86/X86CodeEmitter.cpp
@@ -0,0 +1,824 @@
+//===-- X86/X86CodeEmitter.cpp - Convert X86 code to machine code ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the pass that transforms the X86 machine instructions into
+// relocatable machine code.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-emitter"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "X86Relocations.h"
+#include "X86.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/MachineCodeEmitter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Function.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+STATISTIC(NumEmitted, "Number of machine instructions emitted");
+
+namespace {
+  class VISIBILITY_HIDDEN Emitter : public MachineFunctionPass {
+    const X86InstrInfo  *II;
+    const TargetData    *TD;
+    TargetMachine       &TM;
+    MachineCodeEmitter  &MCE;
+    bool Is64BitMode;
+  public:
+    static char ID;
+    explicit Emitter(TargetMachine &tm, MachineCodeEmitter &mce)
+      : MachineFunctionPass((intptr_t)&ID), II(0), TD(0), TM(tm), 
+      MCE(mce), Is64BitMode(false) {}
+    Emitter(TargetMachine &tm, MachineCodeEmitter &mce,
+            const X86InstrInfo &ii, const TargetData &td, bool is64)
+      : MachineFunctionPass((intptr_t)&ID), II(&ii), TD(&td), TM(tm), 
+      MCE(mce), Is64BitMode(is64) {}
+
+    bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual const char *getPassName() const {
+      return "X86 Machine Code Emitter";
+    }
+
+    void emitInstruction(const MachineInstr &MI);
+
+  private:
+    void emitPCRelativeBlockAddress(MachineBasicBlock *MBB);
+    void emitPCRelativeValue(intptr_t Address);
+    void emitGlobalAddressForCall(GlobalValue *GV, bool DoesntNeedStub);
+    void emitGlobalAddressForPtr(GlobalValue *GV, unsigned Reloc,
+                                 int Disp = 0, unsigned PCAdj = 0);
+    void emitExternalSymbolAddress(const char *ES, unsigned Reloc);
+    void emitConstPoolAddress(unsigned CPI, unsigned Reloc, int Disp = 0,
+                              unsigned PCAdj = 0);
+    void emitJumpTableAddress(unsigned JTI, unsigned Reloc, unsigned PCAdj = 0);
+
+    void emitDisplacementField(const MachineOperand *RelocOp, int DispVal,
+                               unsigned PCAdj = 0);
+
+    void emitRegModRMByte(unsigned ModRMReg, unsigned RegOpcodeField);
+    void emitSIBByte(unsigned SS, unsigned Index, unsigned Base);
+    void emitConstant(uint64_t Val, unsigned Size);
+
+    void emitMemModRMByte(const MachineInstr &MI,
+                          unsigned Op, unsigned RegOpcodeField,
+                          unsigned PCAdj = 0);
+
+    unsigned getX86RegNum(unsigned RegNo);
+    bool isX86_64ExtendedReg(const MachineOperand &MO);
+    unsigned determineREX(const MachineInstr &MI);
+  };
+  char Emitter::ID = 0;
+}
+
+/// createX86CodeEmitterPass - Return a pass that emits the collected X86 code
+/// to the specified MCE object.
+FunctionPass *llvm::createX86CodeEmitterPass(X86TargetMachine &TM,
+                                             MachineCodeEmitter &MCE) {
+  return new Emitter(TM, MCE);
+}
+
+bool Emitter::runOnMachineFunction(MachineFunction &MF) {
+  assert((MF.getTarget().getRelocationModel() == Reloc::Default ||
+          MF.getTarget().getRelocationModel() == Reloc::Static) &&
+         "JIT relocation model must be set to static or default!");
+  II = ((X86TargetMachine&)MF.getTarget()).getInstrInfo();
+  TD = ((X86TargetMachine&)MF.getTarget()).getTargetData();
+  Is64BitMode =
+    ((X86TargetMachine&)MF.getTarget()).getSubtarget<X86Subtarget>().is64Bit();
+
+  do {
+    MCE.startFunction(MF);
+    for (MachineFunction::iterator MBB = MF.begin(), E = MF.end(); 
+         MBB != E; ++MBB) {
+      MCE.StartMachineBasicBlock(MBB);
+      for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end();
+           I != E; ++I)
+        emitInstruction(*I);
+    }
+  } while (MCE.finishFunction(MF));
+
+  return false;
+}
+
+/// emitPCRelativeValue - Emit a PC relative address.
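+/// For example, if the 4-byte field starts at PC 0x1000 and the target address
+/// is 0x1020, the value emitted is 0x1020 - 0x1000 - 4 == 0x1c, i.e. relative
+/// to the end of the field, as the processor expects.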
+///
+void Emitter::emitPCRelativeValue(intptr_t Address) {
+  MCE.emitWordLE(Address-MCE.getCurrentPCValue()-4);
+}
+
+/// emitPCRelativeBlockAddress - This method keeps track of the information
+/// necessary to resolve the address of this block later and emits a dummy
+/// value.
+///
+void Emitter::emitPCRelativeBlockAddress(MachineBasicBlock *MBB) {
+  // Remember where this reference was and where it is to so we can
+  // deal with it later.
+  MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(),
+                                             X86::reloc_pcrel_word, MBB));
+  MCE.emitWordLE(0);
+}
+
+/// emitGlobalAddressForCall - Emit the specified address to the code stream
+/// assuming this is part of a function call, which is PC relative.
+///
+void Emitter::emitGlobalAddressForCall(GlobalValue *GV, bool DoesntNeedStub) {
+  MCE.addRelocation(MachineRelocation::getGV(MCE.getCurrentPCOffset(),
+                                      X86::reloc_pcrel_word, GV, 0,
+                                      DoesntNeedStub));
+  MCE.emitWordLE(0);
+}
+
+/// emitGlobalAddressForPtr - Emit the specified address to the code stream
+/// assuming this is part of a "take the address of a global" instruction.
+///
+void Emitter::emitGlobalAddressForPtr(GlobalValue *GV, unsigned Reloc,
+                                      int Disp /* = 0 */,
+                                      unsigned PCAdj /* = 0 */) {
+  MCE.addRelocation(MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
+                                             GV, PCAdj));
+  if (Reloc == X86::reloc_absolute_dword)
+    MCE.emitWordLE(0);
+  MCE.emitWordLE(Disp); // The relocated value will be added to the displacement
+}
+
+/// emitExternalSymbolAddress - Arrange for the address of an external symbol to
+/// be emitted to the current location in the function, and allow it to be PC
+/// relative.
+void Emitter::emitExternalSymbolAddress(const char *ES, unsigned Reloc) {
+  MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
+                                                 Reloc, ES));
+  if (Reloc == X86::reloc_absolute_dword)
+    MCE.emitWordLE(0);
+  MCE.emitWordLE(0);
+}
+
+/// emitConstPoolAddress - Arrange for the address of a constant pool entry
+/// to be emitted to the current location in the function, and allow it to be PC
+/// relative.
+void Emitter::emitConstPoolAddress(unsigned CPI, unsigned Reloc,
+                                   int Disp /* = 0 */,
+                                   unsigned PCAdj /* = 0 */) {
+  MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
+                                                    Reloc, CPI, PCAdj));
+  if (Reloc == X86::reloc_absolute_dword)
+    MCE.emitWordLE(0);
+  MCE.emitWordLE(Disp); // The relocated value will be added to the displacement
+}
+
+/// emitJumpTableAddress - Arrange for the address of a jump table to
+/// be emitted to the current location in the function, and allow it to be PC
+/// relative.
+void Emitter::emitJumpTableAddress(unsigned JTI, unsigned Reloc,
+                                   unsigned PCAdj /* = 0 */) {
+  MCE.addRelocation(MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(),
+                                                    Reloc, JTI, PCAdj));
+  if (Reloc == X86::reloc_absolute_dword)
+    MCE.emitWordLE(0);
+  MCE.emitWordLE(0); // The relocated value will be added to the displacement
+}
+
+/// N86 namespace - Native X86 Register numbers... used by X86 backend.
+///
+namespace N86 {
+  enum {
+    EAX = 0, ECX = 1, EDX = 2, EBX = 3, ESP = 4, EBP = 5, ESI = 6, EDI = 7
+  };
+}
+
+// getX86RegNum - This function maps LLVM register identifiers to their X86
+// specific numbering, which is used in various places encoding instructions.
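+// For example, getX86RegNum(X86::R10) == N86::EDX (2); the fact that R10 is an
+// extended register is encoded separately, via the REX prefix.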
+//
+unsigned Emitter::getX86RegNum(unsigned RegNo) {
+  switch(RegNo) {
+  case X86::RAX: case X86::EAX: case X86::AX: case X86::AL: return N86::EAX;
+  case X86::RCX: case X86::ECX: case X86::CX: case X86::CL: return N86::ECX;
+  case X86::RDX: case X86::EDX: case X86::DX: case X86::DL: return N86::EDX;
+  case X86::RBX: case X86::EBX: case X86::BX: case X86::BL: return N86::EBX;
+  case X86::RSP: case X86::ESP: case X86::SP: case X86::SPL: case X86::AH:
+    return N86::ESP;
+  case X86::RBP: case X86::EBP: case X86::BP: case X86::BPL: case X86::CH:
+    return N86::EBP;
+  case X86::RSI: case X86::ESI: case X86::SI: case X86::SIL: case X86::DH:
+    return N86::ESI;
+  case X86::RDI: case X86::EDI: case X86::DI: case X86::DIL: case X86::BH:
+    return N86::EDI;
+
+  case X86::R8:  case X86::R8D:  case X86::R8W:  case X86::R8B:
+    return N86::EAX;
+  case X86::R9:  case X86::R9D:  case X86::R9W:  case X86::R9B:
+    return N86::ECX;
+  case X86::R10: case X86::R10D: case X86::R10W: case X86::R10B:
+    return N86::EDX;
+  case X86::R11: case X86::R11D: case X86::R11W: case X86::R11B:
+    return N86::EBX;
+  case X86::R12: case X86::R12D: case X86::R12W: case X86::R12B:
+    return N86::ESP;
+  case X86::R13: case X86::R13D: case X86::R13W: case X86::R13B:
+    return N86::EBP;
+  case X86::R14: case X86::R14D: case X86::R14W: case X86::R14B:
+    return N86::ESI;
+  case X86::R15: case X86::R15D: case X86::R15W: case X86::R15B:
+    return N86::EDI;
+
+  case X86::ST0: case X86::ST1: case X86::ST2: case X86::ST3:
+  case X86::ST4: case X86::ST5: case X86::ST6: case X86::ST7:
+    return RegNo-X86::ST0;
+
+  case X86::XMM0:  case X86::XMM1:  case X86::XMM2:  case X86::XMM3:
+  case X86::XMM4:  case X86::XMM5:  case X86::XMM6:  case X86::XMM7:
+    return II->getRegisterInfo().getDwarfRegNum(RegNo) -
+           II->getRegisterInfo().getDwarfRegNum(X86::XMM0);
+  case X86::XMM8:  case X86::XMM9:  case X86::XMM10: case X86::XMM11:
+  case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15:
+    return II->getRegisterInfo().getDwarfRegNum(RegNo) -
+           II->getRegisterInfo().getDwarfRegNum(X86::XMM8);
+
+  default:
+    assert(MRegisterInfo::isVirtualRegister(RegNo) &&
+           "Unknown physical register!");
+    assert(0 && "Register allocator hasn't allocated reg correctly yet!");
+    return 0;
+  }
+}
+
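+/// ModRMByte - Pack the Mod, RegOpcode and RM fields into a ModR/M byte. For
+/// example, a register-direct form with EAX in r/m and ECX in the reg field is
+/// ModRMByte(3, 1, 0) == (3 << 6) | (1 << 3) | 0 == 0xc8.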
+inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode,
+                                      unsigned RM) {
+  assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
+  return RM | (RegOpcode << 3) | (Mod << 6);
+}
+
+void Emitter::emitRegModRMByte(unsigned ModRMReg, unsigned RegOpcodeFld){
+  MCE.emitByte(ModRMByte(3, RegOpcodeFld, getX86RegNum(ModRMReg)));
+}
+
+void Emitter::emitSIBByte(unsigned SS, unsigned Index, unsigned Base) {
+  // SIB byte is in the same format as the ModRMByte...
+  MCE.emitByte(ModRMByte(SS, Index, Base));
+}
+
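+/// emitConstant - Emit Val as a Size-byte little-endian constant; for example,
+/// emitConstant(0x12345678, 4) emits the bytes 78 56 34 12.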
+void Emitter::emitConstant(uint64_t Val, unsigned Size) {
+  // Output the constant in little endian byte order...
+  for (unsigned i = 0; i != Size; ++i) {
+    MCE.emitByte(Val & 255);
+    Val >>= 8;
+  }
+}
+
+/// isDisp8 - Return true if this signed displacement fits in an 8-bit
+/// sign-extended field.
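+/// For example, isDisp8(-128) and isDisp8(127) are true, but isDisp8(128) is
+/// not.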
+static bool isDisp8(int Value) {
+  return Value == (signed char)Value;
+}
+
+void Emitter::emitDisplacementField(const MachineOperand *RelocOp,
+                                    int DispVal, unsigned PCAdj) {
+  // If this is a simple integer displacement that doesn't require a relocation,
+  // emit it now.
+  if (!RelocOp) {
+    emitConstant(DispVal, 4);
+    return;
+  }
+  
+  // Otherwise, this is something that requires a relocation.  Emit it as such
+  // now.
+  if (RelocOp->isGlobalAddress()) {
+    // In 64-bit static small code model, we could potentially emit absolute.
+    // But it's probably not beneficial.
+    //  89 05 00 00 00 00    	mov    %eax,0(%rip)  # PC-relative
+    //	89 04 25 00 00 00 00 	mov    %eax,0x0      # Absolute
+    unsigned rt= Is64BitMode ? X86::reloc_pcrel_word : X86::reloc_absolute_word;
+    emitGlobalAddressForPtr(RelocOp->getGlobal(), rt,
+                            RelocOp->getOffset(), PCAdj);
+  } else if (RelocOp->isConstantPoolIndex()) {
+    // Must be in 64-bit mode.
+    emitConstPoolAddress(RelocOp->getConstantPoolIndex(), X86::reloc_pcrel_word,
+                         RelocOp->getOffset(), PCAdj);
+  } else if (RelocOp->isJumpTableIndex()) {
+    // Must be in 64-bit mode.
+    emitJumpTableAddress(RelocOp->getJumpTableIndex(), X86::reloc_pcrel_word,
+                         PCAdj);
+  } else {
+    assert(0 && "Unknown value to relocate!");
+  }
+}
+
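+/// emitMemModRMByte - Emit the ModR/M byte and any SIB byte / displacement for
+/// the memory reference starting at operand Op. For example, [ESP + 8] needs a
+/// SIB byte: ModRM(Mod=1, RM=4) is emitted, then SIB(SS=0, Index=4, Base=ESP),
+/// then the 8-bit displacement 0x08.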
+void Emitter::emitMemModRMByte(const MachineInstr &MI,
+                               unsigned Op, unsigned RegOpcodeField,
+                               unsigned PCAdj) {
+  const MachineOperand &Op3 = MI.getOperand(Op+3);
+  int DispVal = 0;
+  const MachineOperand *DispForReloc = 0;
+  
+  // Figure out what sort of displacement we have to handle here.
+  if (Op3.isGlobalAddress()) {
+    DispForReloc = &Op3;
+  } else if (Op3.isConstantPoolIndex()) {
+    if (Is64BitMode) {
+      DispForReloc = &Op3;
+    } else {
+      DispVal += MCE.getConstantPoolEntryAddress(Op3.getConstantPoolIndex());
+      DispVal += Op3.getOffset();
+    }
+  } else if (Op3.isJumpTableIndex()) {
+    if (Is64BitMode) {
+      DispForReloc = &Op3;
+    } else {
+      DispVal += MCE.getJumpTableEntryAddress(Op3.getJumpTableIndex());
+    }
+  } else {
+    DispVal = Op3.getImm();
+  }
+
+  const MachineOperand &Base     = MI.getOperand(Op);
+  const MachineOperand &Scale    = MI.getOperand(Op+1);
+  const MachineOperand &IndexReg = MI.getOperand(Op+2);
+
+  unsigned BaseReg = Base.getReg();
+
+  // Is a SIB byte needed?
+  if (IndexReg.getReg() == 0 &&
+      (BaseReg == 0 || getX86RegNum(BaseReg) != N86::ESP)) {
+    if (BaseReg == 0) {  // Just a displacement?
+      // Emit special case [disp32] encoding
+      MCE.emitByte(ModRMByte(0, RegOpcodeField, 5));
+      
+      emitDisplacementField(DispForReloc, DispVal, PCAdj);
+    } else {
+      unsigned BaseRegNo = getX86RegNum(BaseReg);
+      if (!DispForReloc && DispVal == 0 && BaseRegNo != N86::EBP) {
+        // Emit simple indirect register encoding... [EAX] f.e.
+        MCE.emitByte(ModRMByte(0, RegOpcodeField, BaseRegNo));
+      } else if (!DispForReloc && isDisp8(DispVal)) {
+        // Emit the disp8 encoding... [REG+disp8]
+        MCE.emitByte(ModRMByte(1, RegOpcodeField, BaseRegNo));
+        emitConstant(DispVal, 1);
+      } else {
+        // Emit the most general non-SIB encoding: [REG+disp32]
+        MCE.emitByte(ModRMByte(2, RegOpcodeField, BaseRegNo));
+        emitDisplacementField(DispForReloc, DispVal, PCAdj);
+      }
+    }
+
+  } else {  // We need a SIB byte, so start by outputting the ModR/M byte first
+    assert(IndexReg.getReg() != X86::ESP &&
+           IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!");
+
+    bool ForceDisp32 = false;
+    bool ForceDisp8  = false;
+    if (BaseReg == 0) {
+      // If there is no base register, we emit the special case SIB byte with
+      // MOD=0, BASE=5, to JUST get the index, scale, and displacement.
+      MCE.emitByte(ModRMByte(0, RegOpcodeField, 4));
+      ForceDisp32 = true;
+    } else if (DispForReloc) {
+      // Emit the normal disp32 encoding.
+      MCE.emitByte(ModRMByte(2, RegOpcodeField, 4));
+      ForceDisp32 = true;
+    } else if (DispVal == 0 && getX86RegNum(BaseReg) != N86::EBP) {
+      // Emit no displacement ModR/M byte
+      MCE.emitByte(ModRMByte(0, RegOpcodeField, 4));
+    } else if (isDisp8(DispVal)) {
+      // Emit the disp8 encoding...
+      MCE.emitByte(ModRMByte(1, RegOpcodeField, 4));
+      ForceDisp8 = true;           // Make sure to force 8 bit disp if Base=EBP
+    } else {
+      // Emit the normal disp32 encoding...
+      MCE.emitByte(ModRMByte(2, RegOpcodeField, 4));
+    }
+
+    // Calculate what the SS field value should be...
+    static const unsigned SSTable[] = { ~0, 0, 1, ~0, 2, ~0, ~0, ~0, 3 };
+    unsigned SS = SSTable[Scale.getImm()];
+
+    if (BaseReg == 0) {
+      // Handle the SIB byte for the case where there is no base.  The
+      // displacement has already been output.
+      assert(IndexReg.getReg() && "Index register must be specified!");
+      emitSIBByte(SS, getX86RegNum(IndexReg.getReg()), 5);
+    } else {
+      unsigned BaseRegNo = getX86RegNum(BaseReg);
+      unsigned IndexRegNo;
+      if (IndexReg.getReg())
+        IndexRegNo = getX86RegNum(IndexReg.getReg());
+      else
+        IndexRegNo = 4;   // For example [ESP+1*<noreg>+4]
+      emitSIBByte(SS, IndexRegNo, BaseRegNo);
+    }
+
+    // Do we need to output a displacement?
+    if (ForceDisp8) {
+      emitConstant(DispVal, 1);
+    } else if (DispVal != 0 || ForceDisp32) {
+      emitDisplacementField(DispForReloc, DispVal, PCAdj);
+    }
+  }
+}
+
+static unsigned sizeOfImm(const TargetInstrDescriptor *Desc) {
+  switch (Desc->TSFlags & X86II::ImmMask) {
+  case X86II::Imm8:   return 1;
+  case X86II::Imm16:  return 2;
+  case X86II::Imm32:  return 4;
+  case X86II::Imm64:  return 8;
+  default: assert(0 && "Immediate size not set!");
+    return 0;
+  }
+}
+
+/// isX86_64ExtendedReg - Is the MachineOperand an x86-64 extended register?
+/// e.g. r8, xmm8, etc.
+bool Emitter::isX86_64ExtendedReg(const MachineOperand &MO) {
+  if (!MO.isRegister()) return false;
+  unsigned RegNo = MO.getReg();
+  int DWNum = II->getRegisterInfo().getDwarfRegNum(RegNo);
+  if (DWNum >= II->getRegisterInfo().getDwarfRegNum(X86::R8) &&
+      DWNum <= II->getRegisterInfo().getDwarfRegNum(X86::R15))
+    return true;
+  if (DWNum >= II->getRegisterInfo().getDwarfRegNum(X86::XMM8) &&
+      DWNum <= II->getRegisterInfo().getDwarfRegNum(X86::XMM15))
+    return true;
+  return false;
+}
+
+inline static bool isX86_64TruncToByte(unsigned oc) {
+  return (oc == X86::TRUNC_64to8 || oc == X86::TRUNC_32to8 ||
+          oc == X86::TRUNC_16to8);
+}
+
+
+inline static bool isX86_64NonExtLowByteReg(unsigned reg) {
+  return (reg == X86::SPL || reg == X86::BPL ||
+          reg == X86::SIL || reg == X86::DIL);
+}
+
+/// determineREX - Determine if the MachineInstr has to be encoded with an
+/// X86-64 REX prefix, which specifies 1) 64-bit instructions, 2) non-default
+/// operand size, and 3) use of X86-64 extended registers.
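+/// For example, a 64-bit (REX.W) MRMSrcReg instruction whose destination is R9
+/// and whose source is RAX gets REX == (1 << 3) | (1 << 2) == 0xc, which is
+/// emitted as the prefix byte 0x4c.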
+unsigned Emitter::determineREX(const MachineInstr &MI) {
+  unsigned REX = 0;
+  const TargetInstrDescriptor *Desc = MI.getInstrDescriptor();
+  unsigned Opcode = Desc->Opcode;
+
+  // Pseudo instructions do not need REX prefix byte.
+  if ((Desc->TSFlags & X86II::FormMask) == X86II::Pseudo)
+    return 0;
+  if (Desc->TSFlags & X86II::REX_W)
+    REX |= 1 << 3;
+
+  unsigned NumOps = Desc->numOperands;
+  if (NumOps) {
+    bool isTwoAddr = NumOps > 1 &&
+      Desc->getOperandConstraint(1, TOI::TIED_TO) != -1;
+
+    // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix.
+    bool isTrunc8 = isX86_64TruncToByte(Opcode);
+    unsigned i = isTwoAddr ? 1 : 0;
+    for (unsigned e = NumOps; i != e; ++i) {
+      const MachineOperand& MO = MI.getOperand(i);
+      if (MO.isRegister()) {
+        unsigned Reg = MO.getReg();
+        // Truncates to byte are actually movb. The real source operand is the
+        // low byte of the register.
+        if (isTrunc8 && i == 1)
+          Reg = getX86SubSuperRegister(Reg, MVT::i8);
+        if (isX86_64NonExtLowByteReg(Reg))
+          REX |= 0x40;
+      }
+    }
+
+    switch (Desc->TSFlags & X86II::FormMask) {
+    case X86II::MRMInitReg:
+      if (isX86_64ExtendedReg(MI.getOperand(0)))
+        REX |= (1 << 0) | (1 << 2);
+      break;
+    case X86II::MRMSrcReg: {
+      if (isX86_64ExtendedReg(MI.getOperand(0)))
+        REX |= 1 << 2;
+      i = isTwoAddr ? 2 : 1;
+      for (unsigned e = NumOps; i != e; ++i) {
+        const MachineOperand& MO = MI.getOperand(i);
+        if (isX86_64ExtendedReg(MO))
+          REX |= 1 << 0;
+      }
+      break;
+    }
+    case X86II::MRMSrcMem: {
+      if (isX86_64ExtendedReg(MI.getOperand(0)))
+        REX |= 1 << 2;
+      unsigned Bit = 0;
+      i = isTwoAddr ? 2 : 1;
+      for (; i != NumOps; ++i) {
+        const MachineOperand& MO = MI.getOperand(i);
+        if (MO.isRegister()) {
+          if (isX86_64ExtendedReg(MO))
+            REX |= 1 << Bit;
+          Bit++;
+        }
+      }
+      break;
+    }
+    case X86II::MRM0m: case X86II::MRM1m:
+    case X86II::MRM2m: case X86II::MRM3m:
+    case X86II::MRM4m: case X86II::MRM5m:
+    case X86II::MRM6m: case X86II::MRM7m:
+    case X86II::MRMDestMem: {
+      unsigned e = isTwoAddr ? 5 : 4;
+      i = isTwoAddr ? 1 : 0;
+      if (NumOps > e && isX86_64ExtendedReg(MI.getOperand(e)))
+        REX |= 1 << 2;
+      unsigned Bit = 0;
+      for (; i != e; ++i) {
+        const MachineOperand& MO = MI.getOperand(i);
+        if (MO.isRegister()) {
+          if (isX86_64ExtendedReg(MO))
+            REX |= 1 << Bit;
+          Bit++;
+        }
+      }
+      break;
+    }
+    default: {
+      if (isX86_64ExtendedReg(MI.getOperand(0)))
+        REX |= 1 << 0;
+      i = isTwoAddr ? 2 : 1;
+      for (unsigned e = NumOps; i != e; ++i) {
+        const MachineOperand& MO = MI.getOperand(i);
+        if (isX86_64ExtendedReg(MO))
+          REX |= 1 << 2;
+      }
+      break;
+    }
+    }
+  }
+  return REX;
+}
+
+void Emitter::emitInstruction(const MachineInstr &MI) {
+  NumEmitted++;  // Keep track of the # of mi's emitted
+
+  const TargetInstrDescriptor *Desc = MI.getInstrDescriptor();
+  unsigned Opcode = Desc->Opcode;
+
+  // Emit the repeat opcode prefix as needed.
+  if ((Desc->TSFlags & X86II::Op0Mask) == X86II::REP) MCE.emitByte(0xF3);
+
+  // Emit the operand size opcode prefix as needed.
+  if (Desc->TSFlags & X86II::OpSize) MCE.emitByte(0x66);
+
+  // Emit the address size opcode prefix as needed.
+  if (Desc->TSFlags & X86II::AdSize) MCE.emitByte(0x67);
+
+  bool Need0FPrefix = false;
+  switch (Desc->TSFlags & X86II::Op0Mask) {
+  case X86II::TB:
+    Need0FPrefix = true;   // Two-byte opcode prefix
+    break;
+  case X86II::T8:
+    MCE.emitByte(0x0F);
+    MCE.emitByte(0x38);
+    break;
+  case X86II::TA:
+    MCE.emitByte(0x0F);
+    MCE.emitByte(0x3A);
+    break;
+  case X86II::REP: break; // already handled.
+  case X86II::XS:   // F3 0F
+    MCE.emitByte(0xF3);
+    Need0FPrefix = true;
+    break;
+  case X86II::XD:   // F2 0F
+    MCE.emitByte(0xF2);
+    Need0FPrefix = true;
+    break;
+  case X86II::D8: case X86II::D9: case X86II::DA: case X86II::DB:
+  case X86II::DC: case X86II::DD: case X86II::DE: case X86II::DF:
+    MCE.emitByte(0xD8+
+                 (((Desc->TSFlags & X86II::Op0Mask)-X86II::D8)
+                                   >> X86II::Op0Shift));
+    break; // Two-byte opcode prefix
+  default: assert(0 && "Invalid prefix!");
+  case 0: break;  // No prefix!
+  }
+
+  if (Is64BitMode) {
+    // REX prefix
+    unsigned REX = determineREX(MI);
+    if (REX)
+      MCE.emitByte(0x40 | REX);
+  }
+
+  // 0x0F escape code must be emitted just before the opcode.
+  if (Need0FPrefix)
+    MCE.emitByte(0x0F);
+
+  // If this is a two-address instruction, skip one of the register operands.
+  unsigned NumOps = Desc->numOperands;
+  unsigned CurOp = 0;
+  if (NumOps > 1 && Desc->getOperandConstraint(1, TOI::TIED_TO) != -1)
+    CurOp++;
+
+  unsigned char BaseOpcode = II->getBaseOpcodeFor(Desc);
+  switch (Desc->TSFlags & X86II::FormMask) {
+  default: assert(0 && "Unknown FormMask value in X86 MachineCodeEmitter!");
+  case X86II::Pseudo:
+#ifndef NDEBUG
+    switch (Opcode) {
+    default: 
+      assert(0 && "pseudo instructions should be removed before code emission");
+    case TargetInstrInfo::INLINEASM:
+      assert(0 && "JIT does not support inline asm!\n");
+    case TargetInstrInfo::LABEL:
+      assert(0 && "JIT does not support meta labels!\n");
+    case X86::IMPLICIT_USE:
+    case X86::IMPLICIT_DEF:
+    case X86::IMPLICIT_DEF_GR8:
+    case X86::IMPLICIT_DEF_GR16:
+    case X86::IMPLICIT_DEF_GR32:
+    case X86::IMPLICIT_DEF_GR64:
+    case X86::IMPLICIT_DEF_FR32:
+    case X86::IMPLICIT_DEF_FR64:
+    case X86::IMPLICIT_DEF_VR64:
+    case X86::IMPLICIT_DEF_VR128:
+    case X86::FP_REG_KILL:
+      break;
+    }
+#endif
+    CurOp = NumOps;
+    break;
+
+  case X86II::RawFrm:
+    MCE.emitByte(BaseOpcode);
+    if (CurOp != NumOps) {
+      const MachineOperand &MO = MI.getOperand(CurOp++);
+      if (MO.isMachineBasicBlock()) {
+        emitPCRelativeBlockAddress(MO.getMachineBasicBlock());
+      } else if (MO.isGlobalAddress()) {
+        bool NeedStub = Is64BitMode ||
+                        Opcode == X86::TAILJMPd ||
+                        Opcode == X86::TAILJMPr || Opcode == X86::TAILJMPm;
+        emitGlobalAddressForCall(MO.getGlobal(), !NeedStub);
+      } else if (MO.isExternalSymbol()) {
+        emitExternalSymbolAddress(MO.getSymbolName(), X86::reloc_pcrel_word);
+      } else if (MO.isImmediate()) {
+        emitConstant(MO.getImm(), sizeOfImm(Desc));
+      } else {
+        assert(0 && "Unknown RawFrm operand!");
+      }
+    }
+    break;
+
+  case X86II::AddRegFrm:
+    MCE.emitByte(BaseOpcode + getX86RegNum(MI.getOperand(CurOp++).getReg()));
+    
+    if (CurOp != NumOps) {
+      const MachineOperand &MO1 = MI.getOperand(CurOp++);
+      unsigned Size = sizeOfImm(Desc);
+      if (MO1.isImmediate())
+        emitConstant(MO1.getImm(), Size);
+      else {
+        unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
+          : X86::reloc_absolute_word;
+        if (Opcode == X86::MOV64ri)
+          rt = X86::reloc_absolute_dword;  // FIXME: add X86II flag?
+        if (MO1.isGlobalAddress())
+          emitGlobalAddressForPtr(MO1.getGlobal(), rt, MO1.getOffset());
+        else if (MO1.isExternalSymbol())
+          emitExternalSymbolAddress(MO1.getSymbolName(), rt);
+        else if (MO1.isConstantPoolIndex())
+          emitConstPoolAddress(MO1.getConstantPoolIndex(), rt);
+        else if (MO1.isJumpTableIndex())
+          emitJumpTableAddress(MO1.getJumpTableIndex(), rt);
+      }
+    }
+    break;
+
+  case X86II::MRMDestReg: {
+    MCE.emitByte(BaseOpcode);
+    emitRegModRMByte(MI.getOperand(CurOp).getReg(),
+                     getX86RegNum(MI.getOperand(CurOp+1).getReg()));
+    CurOp += 2;
+    if (CurOp != NumOps)
+      emitConstant(MI.getOperand(CurOp++).getImm(), sizeOfImm(Desc));
+    break;
+  }
+  case X86II::MRMDestMem: {
+    MCE.emitByte(BaseOpcode);
+    emitMemModRMByte(MI, CurOp, getX86RegNum(MI.getOperand(CurOp+4).getReg()));
+    CurOp += 5;
+    if (CurOp != NumOps)
+      emitConstant(MI.getOperand(CurOp++).getImm(), sizeOfImm(Desc));
+    break;
+  }
+
+  case X86II::MRMSrcReg:
+    MCE.emitByte(BaseOpcode);
+    emitRegModRMByte(MI.getOperand(CurOp+1).getReg(),
+                     getX86RegNum(MI.getOperand(CurOp).getReg()));
+    CurOp += 2;
+    if (CurOp != NumOps)
+      emitConstant(MI.getOperand(CurOp++).getImm(), sizeOfImm(Desc));
+    break;
+
+  case X86II::MRMSrcMem: {
+    unsigned PCAdj = (CurOp+5 != NumOps) ? sizeOfImm(Desc) : 0;
+
+    MCE.emitByte(BaseOpcode);
+    emitMemModRMByte(MI, CurOp+1, getX86RegNum(MI.getOperand(CurOp).getReg()),
+                     PCAdj);
+    CurOp += 5;
+    if (CurOp != NumOps)
+      emitConstant(MI.getOperand(CurOp++).getImm(), sizeOfImm(Desc));
+    break;
+  }
+
+  case X86II::MRM0r: case X86II::MRM1r:
+  case X86II::MRM2r: case X86II::MRM3r:
+  case X86II::MRM4r: case X86II::MRM5r:
+  case X86II::MRM6r: case X86II::MRM7r:
+    MCE.emitByte(BaseOpcode);
+    emitRegModRMByte(MI.getOperand(CurOp++).getReg(),
+                     (Desc->TSFlags & X86II::FormMask)-X86II::MRM0r);
+
+    if (CurOp != NumOps) {
+      const MachineOperand &MO1 = MI.getOperand(CurOp++);
+      unsigned Size = sizeOfImm(Desc);
+      if (MO1.isImmediate())
+        emitConstant(MO1.getImm(), Size);
+      else {
+        unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
+          : X86::reloc_absolute_word;
+        if (Opcode == X86::MOV64ri32)
+          rt = X86::reloc_absolute_word;  // FIXME: add X86II flag?
+        if (MO1.isGlobalAddress())
+          emitGlobalAddressForPtr(MO1.getGlobal(), rt, MO1.getOffset());
+        else if (MO1.isExternalSymbol())
+          emitExternalSymbolAddress(MO1.getSymbolName(), rt);
+        else if (MO1.isConstantPoolIndex())
+          emitConstPoolAddress(MO1.getConstantPoolIndex(), rt);
+        else if (MO1.isJumpTableIndex())
+          emitJumpTableAddress(MO1.getJumpTableIndex(), rt);
+      }
+    }
+    break;
+
+  case X86II::MRM0m: case X86II::MRM1m:
+  case X86II::MRM2m: case X86II::MRM3m:
+  case X86II::MRM4m: case X86II::MRM5m:
+  case X86II::MRM6m: case X86II::MRM7m: {
+    unsigned PCAdj = (CurOp+4 != NumOps) ?
+      (MI.getOperand(CurOp+4).isImmediate() ? sizeOfImm(Desc) : 4) : 0;
+
+    MCE.emitByte(BaseOpcode);
+    emitMemModRMByte(MI, CurOp, (Desc->TSFlags & X86II::FormMask)-X86II::MRM0m,
+                     PCAdj);
+    CurOp += 4;
+
+    if (CurOp != NumOps) {
+      const MachineOperand &MO = MI.getOperand(CurOp++);
+      unsigned Size = sizeOfImm(Desc);
+      if (MO.isImmediate())
+        emitConstant(MO.getImm(), Size);
+      else {
+        unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
+          : X86::reloc_absolute_word;
+        if (Opcode == X86::MOV64mi32)
+          rt = X86::reloc_absolute_word;  // FIXME: add X86II flag?
+        if (MO.isGlobalAddress())
+          emitGlobalAddressForPtr(MO.getGlobal(), rt, MO.getOffset());
+        else if (MO.isExternalSymbol())
+          emitExternalSymbolAddress(MO.getSymbolName(), rt);
+        else if (MO.isConstantPoolIndex())
+          emitConstPoolAddress(MO.getConstantPoolIndex(), rt);
+        else if (MO.isJumpTableIndex())
+          emitJumpTableAddress(MO.getJumpTableIndex(), rt);
+      }
+    }
+    break;
+  }
+
+  case X86II::MRMInitReg:
+    MCE.emitByte(BaseOpcode);
+    // Duplicate register, used by things like MOV8r0 (aka xor reg,reg).
+    emitRegModRMByte(MI.getOperand(CurOp).getReg(),
+                     getX86RegNum(MI.getOperand(CurOp).getReg()));
+    ++CurOp;
+    break;
+  }
+
+  assert(((Desc->Flags & M_VARIABLE_OPS) != 0 || CurOp == NumOps) &&
+         "Unknown encoding!");
+}
diff --git a/lib/Target/X86/X86ELFWriterInfo.cpp b/lib/Target/X86/X86ELFWriterInfo.cpp
new file mode 100644
index 0000000..f8f8d48
--- /dev/null
+++ b/lib/Target/X86/X86ELFWriterInfo.cpp
@@ -0,0 +1,18 @@
+//===-- X86ELFWriterInfo.cpp - ELF Writer Info for the X86 backend --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Bill Wendling and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements ELF writer information for the X86 backend.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ELFWriterInfo.h"
+using namespace llvm;
+
+X86ELFWriterInfo::X86ELFWriterInfo() : TargetELFWriterInfo(EM_386) {}
+X86ELFWriterInfo::~X86ELFWriterInfo() {}
diff --git a/lib/Target/X86/X86ELFWriterInfo.h b/lib/Target/X86/X86ELFWriterInfo.h
new file mode 100644
index 0000000..eb564fb
--- /dev/null
+++ b/lib/Target/X86/X86ELFWriterInfo.h
@@ -0,0 +1,29 @@
+//===-- X86ELFWriterInfo.h - ELF Writer Info for X86 ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Bill Wendling and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements ELF writer information for the X86 backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_ELF_WRITER_INFO_H
+#define X86_ELF_WRITER_INFO_H
+
+#include "llvm/Target/TargetELFWriterInfo.h"
+
+namespace llvm {
+
+  class X86ELFWriterInfo : public TargetELFWriterInfo {
+  public:
+    X86ELFWriterInfo();
+    virtual ~X86ELFWriterInfo();
+  };
+
+} // end llvm namespace
+
+#endif // X86_ELF_WRITER_INFO_H
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
new file mode 100644
index 0000000..c293a32
--- /dev/null
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -0,0 +1,882 @@
+//===-- X86FloatingPoint.cpp - Floating point Reg -> Stack converter ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which converts floating point instructions from
+// virtual registers into register stack instructions.  This pass uses live
+// variable information to indicate where the FPn registers are used and their
+// lifetimes.
+//
+// This pass is hampered by the lack of decent CFG manipulation routines for
+// machine code.  In particular, this wants to be able to split critical edges
+// as necessary, traverse the machine basic block CFG in depth-first order, and
+// allow there to be multiple machine basic blocks for each LLVM basicblock
+// (needed for critical edge splitting).
+//
+// In particular, this pass currently barfs on critical edges.  Because of this,
+// it requires the instruction selector to insert FP_REG_KILL instructions on
+// the exits of any basic block that has critical edges going from it, or which
+// branches to a critical basic block.
+//
+// FIXME: this is not implemented yet.  The stackifier pass only works on local
+// basic blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-codegen"
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include <algorithm>
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumFXCH, "Number of fxch instructions inserted");
+STATISTIC(NumFP  , "Number of floating point instructions");
+
+namespace {
+  struct VISIBILITY_HIDDEN FPS : public MachineFunctionPass {
+    static char ID;
+    FPS() : MachineFunctionPass((intptr_t)&ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual const char *getPassName() const { return "X86 FP Stackifier"; }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<LiveVariables>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+  private:
+    const TargetInstrInfo *TII; // Machine instruction info.
+    LiveVariables     *LV;      // Live variable info for current function...
+    MachineBasicBlock *MBB;     // Current basic block
+    unsigned Stack[8];          // FP<n> Registers in each stack slot...
+    unsigned RegMap[8];         // Track which stack slot contains each register
+    unsigned StackTop;          // The current top of the FP stack.
+
+    void dumpStack() const {
+      cerr << "Stack contents:";
+      for (unsigned i = 0; i != StackTop; ++i) {
+        cerr << " FP" << Stack[i];
+        assert(RegMap[Stack[i]] == i && "Stack[] doesn't match RegMap[]!");
+      }
+      cerr << "\n";
+    }
+  private:
+    // getSlot - Return the stack slot number a particular register number is
+    // in...
+    unsigned getSlot(unsigned RegNo) const {
+      assert(RegNo < 8 && "Regno out of range!");
+      return RegMap[RegNo];
+    }
+
+    // getStackEntry - Return the X86::FP<n> register in register ST(i)
+    unsigned getStackEntry(unsigned STi) const {
+      assert(STi < StackTop && "Access past stack top!");
+      return Stack[StackTop-1-STi];
+    }
+
+    // getSTReg - Return the X86::ST(i) register which contains the specified
+    // FP<RegNo> register
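+    // For example, with StackTop == 3 and FP2 in stack slot 0 (the bottom of
+    // the stack), getSTReg(2) returns ST0 + (3 - 1 - 0), i.e. X86::ST2.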
+    unsigned getSTReg(unsigned RegNo) const {
+      return StackTop - 1 - getSlot(RegNo) + llvm::X86::ST0;
+    }
+
+    // pushReg - Push the specified FP<n> register onto the stack
+    void pushReg(unsigned Reg) {
+      assert(Reg < 8 && "Register number out of range!");
+      assert(StackTop < 8 && "Stack overflow!");
+      Stack[StackTop] = Reg;
+      RegMap[Reg] = StackTop++;
+    }
+
+    bool isAtTop(unsigned RegNo) const { return getSlot(RegNo) == StackTop-1; }
+    void moveToTop(unsigned RegNo, MachineBasicBlock::iterator &I) {
+      if (!isAtTop(RegNo)) {
+        unsigned STReg = getSTReg(RegNo);
+        unsigned RegOnTop = getStackEntry(0);
+
+        // Swap the slots the regs are in
+        std::swap(RegMap[RegNo], RegMap[RegOnTop]);
+
+        // Swap stack slot contents
+        assert(RegMap[RegOnTop] < StackTop);
+        std::swap(Stack[RegMap[RegOnTop]], Stack[StackTop-1]);
+
+        // Emit an fxch to update the runtime processor's version of the state.
+        BuildMI(*MBB, I, TII->get(X86::XCH_F)).addReg(STReg);
+        NumFXCH++;
+      }
+    }
+
+    void duplicateToTop(unsigned RegNo, unsigned AsReg, MachineInstr *I) {
+      unsigned STReg = getSTReg(RegNo);
+      pushReg(AsReg);   // New register on top of stack
+
+      BuildMI(*MBB, I, TII->get(X86::LD_Frr)).addReg(STReg);
+    }
+
+    // popStackAfter - Pop the current value off of the top of the FP stack
+    // after the specified instruction.
+    void popStackAfter(MachineBasicBlock::iterator &I);
+
+    // freeStackSlotAfter - Free the specified register from the register stack,
+    // so that it is no longer in a register.  If the register is currently at
+    // the top of the stack, we just pop it; otherwise we
+    // store the current top-of-stack into the specified slot, then pop the top
+    // of stack.
+    void freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned Reg);
+
+    bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
+
+    void handleZeroArgFP(MachineBasicBlock::iterator &I);
+    void handleOneArgFP(MachineBasicBlock::iterator &I);
+    void handleOneArgFPRW(MachineBasicBlock::iterator &I);
+    void handleTwoArgFP(MachineBasicBlock::iterator &I);
+    void handleCompareFP(MachineBasicBlock::iterator &I);
+    void handleCondMovFP(MachineBasicBlock::iterator &I);
+    void handleSpecialFP(MachineBasicBlock::iterator &I);
+  };
+  char FPS::ID = 0;
+}
+
+FunctionPass *llvm::createX86FloatingPointStackifierPass() { return new FPS(); }
+
+/// runOnMachineFunction - Loop over all of the basic blocks, transforming FP
+/// register references into FP stack references.
+///
+bool FPS::runOnMachineFunction(MachineFunction &MF) {
+  // We only need to run this pass if there are any FP registers used in this
+  // function.  If it is all integer, there is nothing for us to do!
+  bool FPIsUsed = false;
+
+  assert(X86::FP6 == X86::FP0+6 && "Register enums aren't sorted right!");
+  for (unsigned i = 0; i <= 6; ++i)
+    if (MF.isPhysRegUsed(X86::FP0+i)) {
+      FPIsUsed = true;
+      break;
+    }
+
+  // Early exit.
+  if (!FPIsUsed) return false;
+
+  TII = MF.getTarget().getInstrInfo();
+  LV = &getAnalysis<LiveVariables>();
+  StackTop = 0;
+
+  // Process the function in depth first order so that we process at least one
+  // of the predecessors for every reachable block in the function.
+  std::set<MachineBasicBlock*> Processed;
+  MachineBasicBlock *Entry = MF.begin();
+
+  bool Changed = false;
+  for (df_ext_iterator<MachineBasicBlock*, std::set<MachineBasicBlock*> >
+         I = df_ext_begin(Entry, Processed), E = df_ext_end(Entry, Processed);
+       I != E; ++I)
+    Changed |= processBasicBlock(MF, **I);
+
+  return Changed;
+}
+
+/// processBasicBlock - Loop over all of the instructions in the basic block,
+/// transforming FP instructions into their stack form.
+///
+bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
+  bool Changed = false;
+  MBB = &BB;
+
+  for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
+    MachineInstr *MI = I;
+    unsigned Flags = MI->getInstrDescriptor()->TSFlags;
+    if ((Flags & X86II::FPTypeMask) == X86II::NotFP)
+      continue;  // Efficiently ignore non-fp insts!
+
+    MachineInstr *PrevMI = 0;
+    if (I != BB.begin())
+        PrevMI = prior(I);
+
+    ++NumFP;  // Keep track of # of pseudo instrs
+    DOUT << "\nFPInst:\t" << *MI;
+
+    // Get dead variables list now because the MI pointer may be deleted as part
+    // of processing!
+    SmallVector<unsigned, 8> DeadRegs;
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = MI->getOperand(i);
+      if (MO.isReg() && MO.isDead())
+        DeadRegs.push_back(MO.getReg());
+    }
+
+    switch (Flags & X86II::FPTypeMask) {
+    case X86II::ZeroArgFP:  handleZeroArgFP(I); break;
+    case X86II::OneArgFP:   handleOneArgFP(I);  break;  // fstp ST(0)
+    case X86II::OneArgFPRW: handleOneArgFPRW(I); break; // ST(0) = fsqrt(ST(0))
+    case X86II::TwoArgFP:   handleTwoArgFP(I);  break;
+    case X86II::CompareFP:  handleCompareFP(I); break;
+    case X86II::CondMovFP:  handleCondMovFP(I); break;
+    case X86II::SpecialFP:  handleSpecialFP(I); break;
+    default: assert(0 && "Unknown FP Type!");
+    }
+
+    // Check to see if any of the values defined by this instruction are dead
+    // after definition.  If so, pop them.
+    for (unsigned i = 0, e = DeadRegs.size(); i != e; ++i) {
+      unsigned Reg = DeadRegs[i];
+      if (Reg >= X86::FP0 && Reg <= X86::FP6) {
+        DOUT << "Register FP#" << Reg-X86::FP0 << " is dead!\n";
+        freeStackSlotAfter(I, Reg-X86::FP0);
+      }
+    }
+
+    // Print out all of the instructions expanded to, if -debug is specified.
+    DEBUG(
+      MachineBasicBlock::iterator PrevI(PrevMI);
+      if (I == PrevI) {
+        cerr << "Just deleted pseudo instruction\n";
+      } else {
+        MachineBasicBlock::iterator Start = I;
+        // Rewind to first instruction newly inserted.
+        while (Start != BB.begin() && prior(Start) != PrevI) --Start;
+        cerr << "Inserted instructions:\n\t";
+        Start->print(*cerr.stream(), &MF.getTarget());
+        while (++Start != next(I));
+      }
+      dumpStack();
+    );
+
+    Changed = true;
+  }
+
+  assert(StackTop == 0 && "Stack not empty at end of basic block?");
+  return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// Efficient Lookup Table Support
+//===----------------------------------------------------------------------===//
+
+namespace {
+  struct TableEntry {
+    unsigned from;
+    unsigned to;
+    bool operator<(const TableEntry &TE) const { return from < TE.from; }
+    friend bool operator<(const TableEntry &TE, unsigned V) {
+      return TE.from < V;
+    }
+    friend bool operator<(unsigned V, const TableEntry &TE) {
+      return V < TE.from;
+    }
+  };
+}
+
+static bool TableIsSorted(const TableEntry *Table, unsigned NumEntries) {
+  for (unsigned i = 0; i != NumEntries-1; ++i)
+    if (!(Table[i] < Table[i+1])) return false;
+  return true;
+}
+
+static int Lookup(const TableEntry *Table, unsigned N, unsigned Opcode) {
+  const TableEntry *I = std::lower_bound(Table, Table+N, Opcode);
+  if (I != Table+N && I->from == Opcode)
+    return I->to;
+  return -1;
+}
+
+#define ARRAY_SIZE(TABLE)  \
+   (sizeof(TABLE)/sizeof(TABLE[0]))
+
+#ifdef NDEBUG
+#define ASSERT_SORTED(TABLE)
+#else
+#define ASSERT_SORTED(TABLE)                                              \
+  { static bool TABLE##Checked = false;                                   \
+    if (!TABLE##Checked) {                                                \
+       assert(TableIsSorted(TABLE, ARRAY_SIZE(TABLE)) &&                  \
+              "All lookup tables must be sorted for efficient access!");  \
+       TABLE##Checked = true;                                             \
+    }                                                                     \
+  }
+#endif
+
+//===----------------------------------------------------------------------===//
+// Register File -> Register Stack Mapping Methods
+//===----------------------------------------------------------------------===//
+
+// OpcodeTable - Sorted map of register instructions to their stack version.
+// The first element is a register file pseudo instruction, the second is the
+// concrete X86 instruction which uses the register stack.
+//
+static const TableEntry OpcodeTable[] = {
+  { X86::ABS_Fp32     , X86::ABS_F     },
+  { X86::ABS_Fp64     , X86::ABS_F     },
+  { X86::ADD_Fp32m    , X86::ADD_F32m  },
+  { X86::ADD_Fp64m    , X86::ADD_F64m  },
+  { X86::ADD_Fp64m32  , X86::ADD_F32m  },
+  { X86::ADD_FpI16m32 , X86::ADD_FI16m },
+  { X86::ADD_FpI16m64 , X86::ADD_FI16m },
+  { X86::ADD_FpI32m32 , X86::ADD_FI32m },
+  { X86::ADD_FpI32m64 , X86::ADD_FI32m },
+  { X86::CHS_Fp32     , X86::CHS_F     },
+  { X86::CHS_Fp64     , X86::CHS_F     },
+  { X86::CMOVBE_Fp32  , X86::CMOVBE_F  },
+  { X86::CMOVBE_Fp64  , X86::CMOVBE_F  },
+  { X86::CMOVB_Fp32   , X86::CMOVB_F   },
+  { X86::CMOVB_Fp64   , X86::CMOVB_F   },
+  { X86::CMOVE_Fp32   , X86::CMOVE_F   },
+  { X86::CMOVE_Fp64   , X86::CMOVE_F   },
+  { X86::CMOVNBE_Fp32 , X86::CMOVNBE_F },
+  { X86::CMOVNBE_Fp64 , X86::CMOVNBE_F },
+  { X86::CMOVNB_Fp32  , X86::CMOVNB_F  },
+  { X86::CMOVNB_Fp64  , X86::CMOVNB_F  },
+  { X86::CMOVNE_Fp32  , X86::CMOVNE_F  },
+  { X86::CMOVNE_Fp64  , X86::CMOVNE_F  },
+  { X86::CMOVNP_Fp32  , X86::CMOVNP_F  },
+  { X86::CMOVNP_Fp64  , X86::CMOVNP_F  },
+  { X86::CMOVP_Fp32   , X86::CMOVP_F   },
+  { X86::CMOVP_Fp64   , X86::CMOVP_F   },
+  { X86::COS_Fp32     , X86::COS_F     },
+  { X86::COS_Fp64     , X86::COS_F     },
+  { X86::DIVR_Fp32m   , X86::DIVR_F32m },
+  { X86::DIVR_Fp64m   , X86::DIVR_F64m },
+  { X86::DIVR_Fp64m32 , X86::DIVR_F32m },
+  { X86::DIVR_FpI16m32, X86::DIVR_FI16m},
+  { X86::DIVR_FpI16m64, X86::DIVR_FI16m},
+  { X86::DIVR_FpI32m32, X86::DIVR_FI32m},
+  { X86::DIVR_FpI32m64, X86::DIVR_FI32m},
+  { X86::DIV_Fp32m    , X86::DIV_F32m  },
+  { X86::DIV_Fp64m    , X86::DIV_F64m  },
+  { X86::DIV_Fp64m32  , X86::DIV_F32m  },
+  { X86::DIV_FpI16m32 , X86::DIV_FI16m },
+  { X86::DIV_FpI16m64 , X86::DIV_FI16m },
+  { X86::DIV_FpI32m32 , X86::DIV_FI32m },
+  { X86::DIV_FpI32m64 , X86::DIV_FI32m },
+  { X86::ILD_Fp16m32  , X86::ILD_F16m  },
+  { X86::ILD_Fp16m64  , X86::ILD_F16m  },
+  { X86::ILD_Fp32m32  , X86::ILD_F32m  },
+  { X86::ILD_Fp32m64  , X86::ILD_F32m  },
+  { X86::ILD_Fp64m32  , X86::ILD_F64m  },
+  { X86::ILD_Fp64m64  , X86::ILD_F64m  },
+  { X86::ISTT_Fp16m32 , X86::ISTT_FP16m},
+  { X86::ISTT_Fp16m64 , X86::ISTT_FP16m},
+  { X86::ISTT_Fp32m32 , X86::ISTT_FP32m},
+  { X86::ISTT_Fp32m64 , X86::ISTT_FP32m},
+  { X86::ISTT_Fp64m32 , X86::ISTT_FP64m},
+  { X86::ISTT_Fp64m64 , X86::ISTT_FP64m},
+  { X86::IST_Fp16m32  , X86::IST_F16m  },
+  { X86::IST_Fp16m64  , X86::IST_F16m  },
+  { X86::IST_Fp32m32  , X86::IST_F32m  },
+  { X86::IST_Fp32m64  , X86::IST_F32m  },
+  { X86::IST_Fp64m32  , X86::IST_FP64m },
+  { X86::IST_Fp64m64  , X86::IST_FP64m },
+  { X86::LD_Fp032     , X86::LD_F0     },
+  { X86::LD_Fp064     , X86::LD_F0     },
+  { X86::LD_Fp132     , X86::LD_F1     },
+  { X86::LD_Fp164     , X86::LD_F1     },
+  { X86::LD_Fp32m     , X86::LD_F32m   },
+  { X86::LD_Fp64m     , X86::LD_F64m   },
+  { X86::MUL_Fp32m    , X86::MUL_F32m  },
+  { X86::MUL_Fp64m    , X86::MUL_F64m  },
+  { X86::MUL_Fp64m32  , X86::MUL_F32m  },
+  { X86::MUL_FpI16m32 , X86::MUL_FI16m },
+  { X86::MUL_FpI16m64 , X86::MUL_FI16m },
+  { X86::MUL_FpI32m32 , X86::MUL_FI32m },
+  { X86::MUL_FpI32m64 , X86::MUL_FI32m },
+  { X86::SIN_Fp32     , X86::SIN_F     },
+  { X86::SIN_Fp64     , X86::SIN_F     },
+  { X86::SQRT_Fp32    , X86::SQRT_F    },
+  { X86::SQRT_Fp64    , X86::SQRT_F    },
+  { X86::ST_Fp32m     , X86::ST_F32m   },
+  { X86::ST_Fp64m     , X86::ST_F64m   },
+  { X86::ST_Fp64m32   , X86::ST_F32m   },
+  { X86::SUBR_Fp32m   , X86::SUBR_F32m },
+  { X86::SUBR_Fp64m   , X86::SUBR_F64m },
+  { X86::SUBR_Fp64m32 , X86::SUBR_F32m },
+  { X86::SUBR_FpI16m32, X86::SUBR_FI16m},
+  { X86::SUBR_FpI16m64, X86::SUBR_FI16m},
+  { X86::SUBR_FpI32m32, X86::SUBR_FI32m},
+  { X86::SUBR_FpI32m64, X86::SUBR_FI32m},
+  { X86::SUB_Fp32m    , X86::SUB_F32m  },
+  { X86::SUB_Fp64m    , X86::SUB_F64m  },
+  { X86::SUB_Fp64m32  , X86::SUB_F32m  },
+  { X86::SUB_FpI16m32 , X86::SUB_FI16m },
+  { X86::SUB_FpI16m64 , X86::SUB_FI16m },
+  { X86::SUB_FpI32m32 , X86::SUB_FI32m },
+  { X86::SUB_FpI32m64 , X86::SUB_FI32m },
+  { X86::TST_Fp32     , X86::TST_F     },
+  { X86::TST_Fp64     , X86::TST_F     },
+  { X86::UCOM_FpIr32  , X86::UCOM_FIr  },
+  { X86::UCOM_FpIr64  , X86::UCOM_FIr  },
+  { X86::UCOM_Fpr32   , X86::UCOM_Fr   },
+  { X86::UCOM_Fpr64   , X86::UCOM_Fr   },
+};
+
+static unsigned getConcreteOpcode(unsigned Opcode) {
+  ASSERT_SORTED(OpcodeTable);
+  int Opc = Lookup(OpcodeTable, ARRAY_SIZE(OpcodeTable), Opcode);
+  assert(Opc != -1 && "FP Stack instruction not in OpcodeTable!");
+  return Opc;
+}
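+
+// Illustrative note (editor's sketch, not part of the original logic): the
+// lookup is a binary search over the sorted OpcodeTable above, keyed by the
+// pseudo opcode.  For example, given the entries shown here:
+//
+//   getConcreteOpcode(X86::ADD_Fp32m)   // yields X86::ADD_F32m
+//   getConcreteOpcode(X86::UCOM_Fpr64)  // yields X86::UCOM_Fr
+//
+// An opcode missing from OpcodeTable makes Lookup return -1 and trips the
+// assertion above.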
+
+//===----------------------------------------------------------------------===//
+// Helper Methods
+//===----------------------------------------------------------------------===//
+
+// PopTable - Sorted map of instructions to their popping version.  The first
+// element is an instruction, the second is the version which pops.
+//
+static const TableEntry PopTable[] = {
+  { X86::ADD_FrST0 , X86::ADD_FPrST0  },
+
+  { X86::DIVR_FrST0, X86::DIVR_FPrST0 },
+  { X86::DIV_FrST0 , X86::DIV_FPrST0  },
+
+  { X86::IST_F16m  , X86::IST_FP16m   },
+  { X86::IST_F32m  , X86::IST_FP32m   },
+
+  { X86::MUL_FrST0 , X86::MUL_FPrST0  },
+
+  { X86::ST_F32m   , X86::ST_FP32m    },
+  { X86::ST_F64m   , X86::ST_FP64m    },
+  { X86::ST_Frr    , X86::ST_FPrr     },
+
+  { X86::SUBR_FrST0, X86::SUBR_FPrST0 },
+  { X86::SUB_FrST0 , X86::SUB_FPrST0  },
+
+  { X86::UCOM_FIr  , X86::UCOM_FIPr   },
+
+  { X86::UCOM_FPr  , X86::UCOM_FPPr   },
+  { X86::UCOM_Fr   , X86::UCOM_FPr    },
+};
+
+/// popStackAfter - Pop the current value off of the top of the FP stack after
+/// the specified instruction.  This attempts to be sneaky and combine the pop
+/// into the instruction itself if possible.  The iterator is left pointing to
+/// the last instruction, be it a new pop instruction inserted, or the old
+/// instruction if it was modified in place.
+///
+void FPS::popStackAfter(MachineBasicBlock::iterator &I) {
+  ASSERT_SORTED(PopTable);
+  assert(StackTop > 0 && "Cannot pop empty stack!");
+  RegMap[Stack[--StackTop]] = ~0;     // Update state
+
+  // Check to see if there is a popping version of this instruction...
+  int Opcode = Lookup(PopTable, ARRAY_SIZE(PopTable), I->getOpcode());
+  if (Opcode != -1) {
+    I->setInstrDescriptor(TII->get(Opcode));
+    if (Opcode == X86::UCOM_FPPr)
+      I->RemoveOperand(0);
+  } else {    // Insert an explicit pop
+    I = BuildMI(*MBB, ++I, TII->get(X86::ST_FPrr)).addReg(X86::ST0);
+  }
+}
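+
+// For illustration (editor's note): with the PopTable above, an ST_F32m
+// ("fst m32") whose value dies is rewritten in place to ST_FP32m ("fstp m32"),
+// folding the pop into the store itself.  An instruction with no popping form
+// instead gets an explicit "fstp %st(0)" (X86::ST_FPrr of ST0) inserted right
+// after it, which simply discards the top of the stack.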
+
+/// freeStackSlotAfter - Free the specified register from the register stack, so
+/// that it is no longer in a register.  If the register is currently at the top
+/// of the stack, we just pop it; otherwise we store the current top-of-stack
+/// into the specified slot and then pop the top of the stack.
+void FPS::freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned FPRegNo) {
+  if (getStackEntry(0) == FPRegNo) {  // already at the top of stack? easy.
+    popStackAfter(I);
+    return;
+  }
+
+  // Otherwise, store the top of stack into the dead slot, killing the operand
+  // without having to add an explicit xchg and pop.
+  //
+  unsigned STReg    = getSTReg(FPRegNo);
+  unsigned OldSlot  = getSlot(FPRegNo);
+  unsigned TopReg   = Stack[StackTop-1];
+  Stack[OldSlot]    = TopReg;
+  RegMap[TopReg]    = OldSlot;
+  RegMap[FPRegNo]   = ~0;
+  Stack[--StackTop] = ~0;
+  I = BuildMI(*MBB, ++I, TII->get(X86::ST_FPrr)).addReg(STReg);
+}
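+
+// Worked example (editor's sketch): suppose the stack holds FP3, FP1, FP2 from
+// bottom to top (StackTop == 3) and FP1 becomes dead.  FP1 lives in slot 1,
+// i.e. ST(1), so the code above emits "fstp %st(1)": the old top (FP2) is
+// stored into FP1's physical slot and the stack is popped, leaving FP3, FP2 on
+// the stack with the Stack/RegMap bookkeeping updated to match.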
+
+
+static unsigned getFPReg(const MachineOperand &MO) {
+  assert(MO.isRegister() && "Expected an FP register!");
+  unsigned Reg = MO.getReg();
+  assert(Reg >= X86::FP0 && Reg <= X86::FP6 && "Expected FP register!");
+  return Reg - X86::FP0;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Instruction transformation implementation
+//===----------------------------------------------------------------------===//
+
+/// handleZeroArgFP - ST(0) = fld0    ST(0) = flds <mem>
+///
+void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) {
+  MachineInstr *MI = I;
+  unsigned DestReg = getFPReg(MI->getOperand(0));
+
+  // Change from the pseudo instruction to the concrete instruction.
+  MI->RemoveOperand(0);   // Remove the explicit ST(0) operand
+  MI->setInstrDescriptor(TII->get(getConcreteOpcode(MI->getOpcode())));
+  
+  // Result gets pushed on the stack.
+  pushReg(DestReg);
+}
+
+/// handleOneArgFP - fst <mem>, ST(0)
+///
+void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) {
+  MachineInstr *MI = I;
+  unsigned NumOps = MI->getInstrDescriptor()->numOperands;
+  assert((NumOps == 5 || NumOps == 1) &&
+         "Can only handle fst* & ftst instructions!");
+
+  // Is this the last use of the source register?
+  unsigned Reg = getFPReg(MI->getOperand(NumOps-1));
+  bool KillsSrc = LV->KillsRegister(MI, X86::FP0+Reg);
+
+  // FISTP64m is strange because there isn't a non-popping version.
+  // If we have one _and_ we don't want to pop the operand, duplicate the value
+  // on the stack instead of moving it.  This ensures that popping the value is
+  // always ok.
+  // Ditto FISTTP16m, FISTTP32m, FISTTP64m.
+  //
+  if (!KillsSrc &&
+      (MI->getOpcode() == X86::IST_Fp64m32 ||
+       MI->getOpcode() == X86::ISTT_Fp16m32 ||
+       MI->getOpcode() == X86::ISTT_Fp32m32 ||
+       MI->getOpcode() == X86::ISTT_Fp64m32 ||
+       MI->getOpcode() == X86::IST_Fp64m64 ||
+       MI->getOpcode() == X86::ISTT_Fp16m64 ||
+       MI->getOpcode() == X86::ISTT_Fp32m64 ||
+       MI->getOpcode() == X86::ISTT_Fp64m64)) {
+    duplicateToTop(Reg, 7 /*temp register*/, I);
+  } else {
+    moveToTop(Reg, I);            // Move to the top of the stack...
+  }
+  
+  // Convert from the pseudo instruction to the concrete instruction.
+  MI->RemoveOperand(NumOps-1);    // Remove explicit ST(0) operand
+  MI->setInstrDescriptor(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+  if (MI->getOpcode() == X86::IST_FP64m ||
+      MI->getOpcode() == X86::ISTT_FP16m ||
+      MI->getOpcode() == X86::ISTT_FP32m ||
+      MI->getOpcode() == X86::ISTT_FP64m) {
+    assert(StackTop > 0 && "Stack empty??");
+    --StackTop;
+  } else if (KillsSrc) { // Last use of operand?
+    popStackAfter(I);
+  }
+}
+
+
+/// handleOneArgFPRW - Handle instructions that read from the top of stack and
+/// replace the value with a newly computed value.  These instructions may have
+/// non-fp operands after their FP operands.
+///
+///  Examples:
+///     R1 = fchs R2
+///     R1 = fadd R2, [mem]
+///
+void FPS::handleOneArgFPRW(MachineBasicBlock::iterator &I) {
+  MachineInstr *MI = I;
+  unsigned NumOps = MI->getInstrDescriptor()->numOperands;
+  assert(NumOps >= 2 && "FPRW instructions must have 2 ops!!");
+
+  // Is this the last use of the source register?
+  unsigned Reg = getFPReg(MI->getOperand(1));
+  bool KillsSrc = LV->KillsRegister(MI, X86::FP0+Reg);
+
+  if (KillsSrc) {
+    // If this is the last use of the source register, just make sure it's on
+    // the top of the stack.
+    moveToTop(Reg, I);
+    assert(StackTop > 0 && "Stack cannot be empty!");
+    --StackTop;
+    pushReg(getFPReg(MI->getOperand(0)));
+  } else {
+    // If this is not the last use of the source register, _copy_ it to the top
+    // of the stack.
+    duplicateToTop(Reg, getFPReg(MI->getOperand(0)), I);
+  }
+
+  // Change from the pseudo instruction to the concrete instruction.
+  MI->RemoveOperand(1);   // Drop the source operand.
+  MI->RemoveOperand(0);   // Drop the destination operand.
+  MI->setInstrDescriptor(TII->get(getConcreteOpcode(MI->getOpcode())));
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define tables of various ways to map pseudo instructions
+//
+
+// ForwardST0Table - Map: A = B op C  into: ST(0) = ST(0) op ST(i)
+static const TableEntry ForwardST0Table[] = {
+  { X86::ADD_Fp32  , X86::ADD_FST0r },
+  { X86::ADD_Fp64  , X86::ADD_FST0r },
+  { X86::DIV_Fp32  , X86::DIV_FST0r },
+  { X86::DIV_Fp64  , X86::DIV_FST0r },
+  { X86::MUL_Fp32  , X86::MUL_FST0r },
+  { X86::MUL_Fp64  , X86::MUL_FST0r },
+  { X86::SUB_Fp32  , X86::SUB_FST0r },
+  { X86::SUB_Fp64  , X86::SUB_FST0r },
+};
+
+// ReverseST0Table - Map: A = B op C  into: ST(0) = ST(i) op ST(0)
+static const TableEntry ReverseST0Table[] = {
+  { X86::ADD_Fp32  , X86::ADD_FST0r  },   // commutative
+  { X86::ADD_Fp64  , X86::ADD_FST0r  },   // commutative
+  { X86::DIV_Fp32  , X86::DIVR_FST0r },
+  { X86::DIV_Fp64  , X86::DIVR_FST0r },
+  { X86::MUL_Fp32  , X86::MUL_FST0r  },   // commutative
+  { X86::MUL_Fp64  , X86::MUL_FST0r  },   // commutative
+  { X86::SUB_Fp32  , X86::SUBR_FST0r },
+  { X86::SUB_Fp64  , X86::SUBR_FST0r },
+};
+
+// ForwardSTiTable - Map: A = B op C  into: ST(i) = ST(0) op ST(i)
+static const TableEntry ForwardSTiTable[] = {
+  { X86::ADD_Fp32  , X86::ADD_FrST0  },   // commutative
+  { X86::ADD_Fp64  , X86::ADD_FrST0  },   // commutative
+  { X86::DIV_Fp32  , X86::DIVR_FrST0 },
+  { X86::DIV_Fp64  , X86::DIVR_FrST0 },
+  { X86::MUL_Fp32  , X86::MUL_FrST0  },   // commutative
+  { X86::MUL_Fp64  , X86::MUL_FrST0  },   // commutative
+  { X86::SUB_Fp32  , X86::SUBR_FrST0 },
+  { X86::SUB_Fp64  , X86::SUBR_FrST0 },
+};
+
+// ReverseSTiTable - Map: A = B op C  into: ST(i) = ST(i) op ST(0)
+static const TableEntry ReverseSTiTable[] = {
+  { X86::ADD_Fp32  , X86::ADD_FrST0 },
+  { X86::ADD_Fp64  , X86::ADD_FrST0 },
+  { X86::DIV_Fp32  , X86::DIV_FrST0 },
+  { X86::DIV_Fp64  , X86::DIV_FrST0 },
+  { X86::MUL_Fp32  , X86::MUL_FrST0 },
+  { X86::MUL_Fp64  , X86::MUL_FrST0 },
+  { X86::SUB_Fp32  , X86::SUB_FrST0 },
+  { X86::SUB_Fp64  , X86::SUB_FrST0 },
+};
+
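+// A worked example of the table choice below (editor's note): for
+// "A = B - C" with B on the top of the stack (the "forward" orientation):
+//   * if C is still live afterwards, the result must land in ST(0), so
+//     ForwardST0Table maps SUB_Fp* to SUB_FST0r  (ST(0) = ST(0) - ST(i));
+//   * if C is killed, the result can overwrite C's slot, so
+//     ForwardSTiTable maps SUB_Fp* to SUBR_FrST0 (ST(i) = ST(0) - ST(i)).
+// The Reverse tables cover the symmetric case where B is the ST(i) operand.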
+
+/// handleTwoArgFP - Handle instructions like FADD and friends which are virtual
+/// instructions that need to be simplified and possibly transformed.
+///
+/// Result: ST(0) = fsub  ST(0), ST(i)
+///         ST(i) = fsub  ST(0), ST(i)
+///         ST(0) = fsubr ST(0), ST(i)
+///         ST(i) = fsubr ST(0), ST(i)
+///
+void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) {
+  ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table);
+  ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable);
+  MachineInstr *MI = I;
+
+  unsigned NumOperands = MI->getInstrDescriptor()->numOperands;
+  assert(NumOperands == 3 && "Illegal TwoArgFP instruction!");
+  unsigned Dest = getFPReg(MI->getOperand(0));
+  unsigned Op0 = getFPReg(MI->getOperand(NumOperands-2));
+  unsigned Op1 = getFPReg(MI->getOperand(NumOperands-1));
+  bool KillsOp0 = LV->KillsRegister(MI, X86::FP0+Op0);
+  bool KillsOp1 = LV->KillsRegister(MI, X86::FP0+Op1);
+
+  unsigned TOS = getStackEntry(0);
+
+  // One of our operands must be on the top of the stack.  If neither is yet, we
+  // need to move one.
+  if (Op0 != TOS && Op1 != TOS) {   // No operand at TOS?
+    // We can choose to move either operand to the top of the stack.  If one of
+    // the operands is killed by this instruction, we want that one so that we
+    // can update right on top of the old version.
+    if (KillsOp0) {
+      moveToTop(Op0, I);         // Move dead operand to TOS.
+      TOS = Op0;
+    } else if (KillsOp1) {
+      moveToTop(Op1, I);
+      TOS = Op1;
+    } else {
+      // All of the operands are live after this instruction executes, so we
+      // cannot update on top of any operand.  Because of this, we must
+      // duplicate one of the stack elements to the top.  It doesn't matter
+      // which one we pick.
+      //
+      duplicateToTop(Op0, Dest, I);
+      Op0 = TOS = Dest;
+      KillsOp0 = true;
+    }
+  } else if (!KillsOp0 && !KillsOp1) {
+    // If we DO have one of our operands at the top of the stack, but we don't
+    // have a dead operand, we must duplicate one of the operands to a new slot
+    // on the stack.
+    duplicateToTop(Op0, Dest, I);
+    Op0 = TOS = Dest;
+    KillsOp0 = true;
+  }
+
+  // Now we know that one of our operands is on the top of the stack, and at
+  // least one of our operands is killed by this instruction.
+  assert((TOS == Op0 || TOS == Op1) && (KillsOp0 || KillsOp1) &&
+         "Stack conditions not set up right!");
+
+  // We decide which form to use based on what is on the top of the stack, and
+  // which operand is killed by this instruction.
+  const TableEntry *InstTable;
+  bool isForward = TOS == Op0;
+  bool updateST0 = (TOS == Op0 && !KillsOp1) || (TOS == Op1 && !KillsOp0);
+  if (updateST0) {
+    if (isForward)
+      InstTable = ForwardST0Table;
+    else
+      InstTable = ReverseST0Table;
+  } else {
+    if (isForward)
+      InstTable = ForwardSTiTable;
+    else
+      InstTable = ReverseSTiTable;
+  }
+
+  int Opcode = Lookup(InstTable, ARRAY_SIZE(ForwardST0Table), MI->getOpcode());
+  assert(Opcode != -1 && "Unknown TwoArgFP pseudo instruction!");
+
+  // NotTOS - The register which is not on the top of stack...
+  unsigned NotTOS = (TOS == Op0) ? Op1 : Op0;
+
+  // Replace the old instruction with a new instruction
+  MBB->remove(I++);
+  I = BuildMI(*MBB, I, TII->get(Opcode)).addReg(getSTReg(NotTOS));
+
+  // If both operands are killed, pop one off of the stack in addition to
+  // overwriting the other one.
+  if (KillsOp0 && KillsOp1 && Op0 != Op1) {
+    assert(!updateST0 && "Should have updated other operand!");
+    popStackAfter(I);   // Pop the top of stack
+  }
+
+  // Update stack information so that we know the destination register is now on
+  // the stack.
+  unsigned UpdatedSlot = getSlot(updateST0 ? TOS : NotTOS);
+  assert(UpdatedSlot < StackTop && Dest < 7);
+  Stack[UpdatedSlot]   = Dest;
+  RegMap[Dest]         = UpdatedSlot;
+  delete MI;   // Remove the old instruction
+}
+
+/// handleCompareFP - Handle FUCOM and FUCOMI instructions, which have two FP
+/// register arguments and no explicit destinations.
+///
+void FPS::handleCompareFP(MachineBasicBlock::iterator &I) {
+  ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table);
+  ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable);
+  MachineInstr *MI = I;
+
+  unsigned NumOperands = MI->getInstrDescriptor()->numOperands;
+  assert(NumOperands == 2 && "Illegal FUCOM* instruction!");
+  unsigned Op0 = getFPReg(MI->getOperand(NumOperands-2));
+  unsigned Op1 = getFPReg(MI->getOperand(NumOperands-1));
+  bool KillsOp0 = LV->KillsRegister(MI, X86::FP0+Op0);
+  bool KillsOp1 = LV->KillsRegister(MI, X86::FP0+Op1);
+
+  // Make sure the first operand is on the top of stack, the other one can be
+  // anywhere.
+  moveToTop(Op0, I);
+
+  // Change from the pseudo instruction to the concrete instruction.
+  MI->getOperand(0).setReg(getSTReg(Op1));
+  MI->RemoveOperand(1);
+  MI->setInstrDescriptor(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+  // If any of the operands are killed by this instruction, free them.
+  if (KillsOp0) freeStackSlotAfter(I, Op0);
+  if (KillsOp1 && Op0 != Op1) freeStackSlotAfter(I, Op1);
+}
+
+/// handleCondMovFP - Handle two address conditional move instructions.  These
+/// instructions move a st(i) register to st(0) iff a condition is true.  These
+/// instructions require that the first operand is at the top of the stack, but
+/// otherwise don't modify the stack at all.
+void FPS::handleCondMovFP(MachineBasicBlock::iterator &I) {
+  MachineInstr *MI = I;
+
+  unsigned Op0 = getFPReg(MI->getOperand(0));
+  unsigned Op1 = getFPReg(MI->getOperand(2));
+  bool KillsOp1 = LV->KillsRegister(MI, X86::FP0+Op1);
+
+  // The first operand *must* be on the top of the stack.
+  moveToTop(Op0, I);
+
+  // Change the second operand to the stack register that the operand is in.
+  // Change from the pseudo instruction to the concrete instruction.
+  MI->RemoveOperand(0);
+  MI->RemoveOperand(1);
+  MI->getOperand(0).setReg(getSTReg(Op1));
+  MI->setInstrDescriptor(TII->get(getConcreteOpcode(MI->getOpcode())));
+  
+  // If we kill the second operand, make sure to pop it from the stack.
+  if (Op0 != Op1 && KillsOp1) {
+    // Get this value off of the register stack.
+    freeStackSlotAfter(I, Op1);
+  }
+}
+
+
+/// handleSpecialFP - Handle special instructions which behave unlike other
+/// floating point instructions.  This is primarily intended for use by pseudo
+/// instructions.
+///
+void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
+  MachineInstr *MI = I;
+  switch (MI->getOpcode()) {
+  default: assert(0 && "Unknown SpecialFP instruction!");
+  case X86::FpGETRESULT32:  // Appears immediately after a call returning FP type!
+  case X86::FpGETRESULT64:  // Appears immediately after a call returning FP type!
+    assert(StackTop == 0 && "Stack should be empty after a call!");
+    pushReg(getFPReg(MI->getOperand(0)));
+    break;
+  case X86::FpSETRESULT32:
+  case X86::FpSETRESULT64:
+    assert(StackTop == 1 && "Stack should have one element on it to return!");
+    --StackTop;   // "Forget" we have something on the top of stack!
+    break;
+  case X86::MOV_Fp3232:
+  case X86::MOV_Fp3264:
+  case X86::MOV_Fp6432:
+  case X86::MOV_Fp6464: {
+    unsigned SrcReg = getFPReg(MI->getOperand(1));
+    unsigned DestReg = getFPReg(MI->getOperand(0));
+
+    if (LV->KillsRegister(MI, X86::FP0+SrcReg)) {
+      // If the input operand is killed, we can just change the owner of the
+      // incoming stack slot into the result.
+      unsigned Slot = getSlot(SrcReg);
+      assert(Slot < 7 && DestReg < 7 && "FpMOV operands invalid!");
+      Stack[Slot] = DestReg;
+      RegMap[DestReg] = Slot;
+
+    } else {
+      // For FMOV we just duplicate the specified value to a new stack slot.
+      // This could be made better, but would require substantial changes.
+      duplicateToTop(SrcReg, DestReg, I);
+    }
+    break;
+  }
+  }
+
+  I = MBB->erase(I);  // Remove the pseudo instruction
+  --I;
+}
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
new file mode 100644
index 0000000..8b1690c
--- /dev/null
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -0,0 +1,1342 @@
+//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Evan Cheng and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a DAG pattern matching instruction selector for X86,
+// converting from a legalized dag to a X86 dag.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-isel"
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86ISelLowering.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Type.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/SSARegMap.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include <queue>
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumFPKill   , "Number of FP_REG_KILL instructions added");
+STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
+
+
+//===----------------------------------------------------------------------===//
+//                      Pattern Matcher Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+  /// X86ISelAddressMode - This corresponds to X86AddressMode, but uses
+  /// SDOperand's instead of register numbers for the leaves of the matched
+  /// tree.
+  struct X86ISelAddressMode {
+    enum {
+      RegBase,
+      FrameIndexBase
+    } BaseType;
+
+    struct {            // This is really a union, discriminated by BaseType!
+      SDOperand Reg;
+      int FrameIndex;
+    } Base;
+
+    bool isRIPRel;     // RIP relative?
+    unsigned Scale;
+    SDOperand IndexReg; 
+    unsigned Disp;
+    GlobalValue *GV;
+    Constant *CP;
+    const char *ES;
+    int JT;
+    unsigned Align;    // CP alignment.
+
+    X86ISelAddressMode()
+      : BaseType(RegBase), isRIPRel(false), Scale(1), IndexReg(), Disp(0),
+        GV(0), CP(0), ES(0), JT(-1), Align(0) {
+    }
+  };
+}
+
+namespace {
+  //===--------------------------------------------------------------------===//
+  /// ISel - X86 specific code to select X86 machine instructions for
+  /// SelectionDAG operations.
+  ///
+  class VISIBILITY_HIDDEN X86DAGToDAGISel : public SelectionDAGISel {
+    /// ContainsFPCode - Every instruction we select that uses or defines a FP
+    /// register should set this to true.
+    bool ContainsFPCode;
+
+    /// FastISel - Enable fast(er) instruction selection.
+    ///
+    bool FastISel;
+
+    /// TM - Keep a reference to X86TargetMachine.
+    ///
+    X86TargetMachine &TM;
+
+    /// X86Lowering - This object fully describes how to lower LLVM code to an
+    /// X86-specific SelectionDAG.
+    X86TargetLowering X86Lowering;
+
+    /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+    /// make the right decision when generating code for different targets.
+    const X86Subtarget *Subtarget;
+
+    /// GlobalBaseReg - keeps track of the virtual register mapped onto global
+    /// base register.
+    unsigned GlobalBaseReg;
+
+  public:
+    X86DAGToDAGISel(X86TargetMachine &tm, bool fast)
+      : SelectionDAGISel(X86Lowering),
+        ContainsFPCode(false), FastISel(fast), TM(tm),
+        X86Lowering(*TM.getTargetLowering()),
+        Subtarget(&TM.getSubtarget<X86Subtarget>()) {}
+
+    virtual bool runOnFunction(Function &Fn) {
+      // Make sure we re-emit a set of the global base reg if necessary
+      GlobalBaseReg = 0;
+      return SelectionDAGISel::runOnFunction(Fn);
+    }
+   
+    virtual const char *getPassName() const {
+      return "X86 DAG->DAG Instruction Selection";
+    }
+
+    /// InstructionSelectBasicBlock - This callback is invoked by
+    /// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+    virtual void InstructionSelectBasicBlock(SelectionDAG &DAG);
+
+    virtual void EmitFunctionEntryCode(Function &Fn, MachineFunction &MF);
+
+    virtual bool CanBeFoldedBy(SDNode *N, SDNode *U, SDNode *Root);
+
+// Include the pieces autogenerated from the target description.
+#include "X86GenDAGISel.inc"
+
+  private:
+    SDNode *Select(SDOperand N);
+
+    bool MatchAddress(SDOperand N, X86ISelAddressMode &AM,
+                      bool isRoot = true, unsigned Depth = 0);
+    bool SelectAddr(SDOperand Op, SDOperand N, SDOperand &Base,
+                    SDOperand &Scale, SDOperand &Index, SDOperand &Disp);
+    bool SelectLEAAddr(SDOperand Op, SDOperand N, SDOperand &Base,
+                       SDOperand &Scale, SDOperand &Index, SDOperand &Disp);
+    bool SelectScalarSSELoad(SDOperand Op, SDOperand Pred,
+                             SDOperand N, SDOperand &Base, SDOperand &Scale,
+                             SDOperand &Index, SDOperand &Disp,
+                             SDOperand &InChain, SDOperand &OutChain);
+    bool TryFoldLoad(SDOperand P, SDOperand N,
+                     SDOperand &Base, SDOperand &Scale,
+                     SDOperand &Index, SDOperand &Disp);
+    void InstructionSelectPreprocess(SelectionDAG &DAG);
+
+    /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+    /// inline asm expressions.
+    virtual bool SelectInlineAsmMemoryOperand(const SDOperand &Op,
+                                              char ConstraintCode,
+                                              std::vector<SDOperand> &OutOps,
+                                              SelectionDAG &DAG);
+    
+    void EmitSpecialCodeForMain(MachineBasicBlock *BB, MachineFrameInfo *MFI);
+
+    inline void getAddressOperands(X86ISelAddressMode &AM, SDOperand &Base, 
+                                   SDOperand &Scale, SDOperand &Index,
+                                   SDOperand &Disp) {
+      Base  = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) ?
+        CurDAG->getTargetFrameIndex(AM.Base.FrameIndex, TLI.getPointerTy()) :
+        AM.Base.Reg;
+      Scale = getI8Imm(AM.Scale);
+      Index = AM.IndexReg;
+      // These are 32-bit even in 64-bit mode since RIP relative offset
+      // is 32-bit.
+      if (AM.GV)
+        Disp = CurDAG->getTargetGlobalAddress(AM.GV, MVT::i32, AM.Disp);
+      else if (AM.CP)
+        Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Align, AM.Disp);
+      else if (AM.ES)
+        Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32);
+      else if (AM.JT != -1)
+        Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32);
+      else
+        Disp = getI32Imm(AM.Disp);
+    }
+
+    /// getI8Imm - Return a target constant with the specified value, of type
+    /// i8.
+    inline SDOperand getI8Imm(unsigned Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i8);
+    }
+
+    /// getI16Imm - Return a target constant with the specified value, of type
+    /// i16.
+    inline SDOperand getI16Imm(unsigned Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i16);
+    }
+
+    /// getI32Imm - Return a target constant with the specified value, of type
+    /// i32.
+    inline SDOperand getI32Imm(unsigned Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i32);
+    }
+
+    /// getGlobalBaseReg - insert code into the entry mbb to materialize the PIC
+    /// base register.  Return the virtual register that holds this value.
+    SDNode *getGlobalBaseReg();
+
+#ifndef NDEBUG
+    unsigned Indent;
+#endif
+  };
+}
+
+static SDNode *findFlagUse(SDNode *N) {
+  unsigned FlagResNo = N->getNumValues()-1;
+  for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
+    SDNode *User = *I;
+    for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
+      SDOperand Op = User->getOperand(i);
+      if (Op.Val == N && Op.ResNo == FlagResNo)
+        return User;
+    }
+  }
+  return NULL;
+}
+
+static void findNonImmUse(SDNode *Use, SDNode* Def, SDNode *ImmedUse,
+                          SDNode *Root, SDNode *Skip, bool &found,
+                          std::set<SDNode *> &Visited) {
+  if (found ||
+      Use->getNodeId() > Def->getNodeId() ||
+      !Visited.insert(Use).second)
+    return;
+
+  for (unsigned i = 0, e = Use->getNumOperands(); !found && i != e; ++i) {
+    SDNode *N = Use->getOperand(i).Val;
+    if (N == Skip)
+      continue;
+    if (N == Def) {
+      if (Use == ImmedUse)
+        continue; // Immediate use is ok.
+      if (Use == Root) {
+        assert(Use->getOpcode() == ISD::STORE ||
+               Use->getOpcode() == X86ISD::CMP);
+        continue;
+      }
+      found = true;
+      break;
+    }
+    findNonImmUse(N, Def, ImmedUse, Root, Skip, found, Visited);
+  }
+}
+
+/// isNonImmUse - Start searching from Root up the DAG to check if Def can
+/// be reached. Return true if that's the case. However, ignore direct uses
+/// by ImmedUse (which would be U in the example illustrated in
+/// CanBeFoldedBy) and by Root (which can happen in the store case).
+/// FIXME: to be really generic, we should allow direct use by any node
+/// that is being folded. But realistically, since we only fold loads which
+/// have one non-chain use, we only need to watch out for load/op/store
+/// and load/op/cmp case where the root (store / cmp) may reach the load via
+/// its chain operand.
+static inline bool isNonImmUse(SDNode *Root, SDNode *Def, SDNode *ImmedUse,
+                               SDNode *Skip = NULL) {
+  std::set<SDNode *> Visited;
+  bool found = false;
+  findNonImmUse(Root, Def, ImmedUse, Root, Skip, found, Visited);
+  return found;
+}
+
+
+bool X86DAGToDAGISel::CanBeFoldedBy(SDNode *N, SDNode *U, SDNode *Root) {
+  if (FastISel) return false;
+
+  // If U can somehow reach N through another path, then U can't fold N or
+  // it will create a cycle.  E.g., in the following diagram, U can reach N
+  // through X. If N is folded into U, then X is both a predecessor and
+  // a successor of U.
+  //
+  //         [ N ]
+  //         ^  ^
+  //         |  |
+  //        /   \---
+  //      /        [X]
+  //      |         ^
+  //     [U]--------|
+
+  if (isNonImmUse(Root, N, U))
+    return false;
+
+  // If U produces a flag, then it gets (even more) interesting. Since it
+  // would have been "glued" together with its flag use, we need to check if
+  // it might reach N:
+  //
+  //       [ N ]
+  //        ^ ^
+  //        | |
+  //       [U] \--
+  //        ^   [TF]
+  //        |    ^
+  //        |    |
+  //         \  /
+  //          [FU]
+  //
+  // If FU (flag use) indirectly reaches N (the load), and U folds N (call it
+  // NU), then TF is a predecessor of FU and a successor of NU. But since
+  // NU and FU are flagged together, this effectively creates a cycle.
+  bool HasFlagUse = false;
+  MVT::ValueType VT = Root->getValueType(Root->getNumValues()-1);
+  while ((VT == MVT::Flag && !Root->use_empty())) {
+    SDNode *FU = findFlagUse(Root);
+    if (FU == NULL)
+      break;
+    else {
+      Root = FU;
+      HasFlagUse = true;
+    }
+    VT = Root->getValueType(Root->getNumValues()-1);
+  }
+
+  if (HasFlagUse)
+    return !isNonImmUse(Root, N, Root, U);
+  return true;
+}
+
+/// MoveBelowTokenFactor - Replace the TokenFactor's load operand with the
+/// load's chain operand and move the load below the TokenFactor. Replace the
+/// store's chain operand with the load's chain result.
+static void MoveBelowTokenFactor(SelectionDAG &DAG, SDOperand Load,
+                                 SDOperand Store, SDOperand TF) {
+  std::vector<SDOperand> Ops;
+  for (unsigned i = 0, e = TF.Val->getNumOperands(); i != e; ++i)
+    if (Load.Val == TF.Val->getOperand(i).Val)
+      Ops.push_back(Load.Val->getOperand(0));
+    else
+      Ops.push_back(TF.Val->getOperand(i));
+  DAG.UpdateNodeOperands(TF, &Ops[0], Ops.size());
+  DAG.UpdateNodeOperands(Load, TF, Load.getOperand(1), Load.getOperand(2));
+  DAG.UpdateNodeOperands(Store, Load.getValue(1), Store.getOperand(1),
+                         Store.getOperand(2), Store.getOperand(3));
+}
+
+/// InstructionSelectPreprocess - Preprocess the DAG to allow the instruction
+/// selector to pick more load-modify-store instructions. This is a common
+/// case:
+///
+///     [Load chain]
+///         ^
+///         |
+///       [Load]
+///       ^    ^
+///       |    |
+///      /      \-
+///     /         |
+/// [TokenFactor] [Op]
+///     ^          ^
+///     |          |
+///      \        /
+///       \      /
+///       [Store]
+///
+/// The fact that the store's chain operand != the load's chain will prevent
+/// the (store (op (load))) instruction from being selected. We can transform
+/// it to:
+///
+///     [Load chain]
+///         ^
+///         |
+///    [TokenFactor]
+///         ^
+///         |
+///       [Load]
+///       ^    ^
+///       |    |
+///       |     \- 
+///       |       | 
+///       |     [Op]
+///       |       ^
+///       |       |
+///       \      /
+///        \    /
+///       [Store]
+void X86DAGToDAGISel::InstructionSelectPreprocess(SelectionDAG &DAG) {
+  for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
+         E = DAG.allnodes_end(); I != E; ++I) {
+    if (!ISD::isNON_TRUNCStore(I))
+      continue;
+    SDOperand Chain = I->getOperand(0);
+    if (Chain.Val->getOpcode() != ISD::TokenFactor)
+      continue;
+
+    SDOperand N1 = I->getOperand(1);
+    SDOperand N2 = I->getOperand(2);
+    if (MVT::isFloatingPoint(N1.getValueType()) ||
+        MVT::isVector(N1.getValueType()) ||
+        !N1.hasOneUse())
+      continue;
+
+    bool RModW = false;
+    SDOperand Load;
+    unsigned Opcode = N1.Val->getOpcode();
+    switch (Opcode) {
+      case ISD::ADD:
+      case ISD::MUL:
+      case ISD::AND:
+      case ISD::OR:
+      case ISD::XOR:
+      case ISD::ADDC:
+      case ISD::ADDE: {
+        SDOperand N10 = N1.getOperand(0);
+        SDOperand N11 = N1.getOperand(1);
+        if (ISD::isNON_EXTLoad(N10.Val))
+          RModW = true;
+        else if (ISD::isNON_EXTLoad(N11.Val)) {
+          RModW = true;
+          std::swap(N10, N11);
+        }
+        RModW = RModW && N10.Val->isOperand(Chain.Val) && N10.hasOneUse() &&
+          (N10.getOperand(1) == N2) &&
+          (N10.Val->getValueType(0) == N1.getValueType());
+        if (RModW)
+          Load = N10;
+        break;
+      }
+      case ISD::SUB:
+      case ISD::SHL:
+      case ISD::SRA:
+      case ISD::SRL:
+      case ISD::ROTL:
+      case ISD::ROTR:
+      case ISD::SUBC:
+      case ISD::SUBE:
+      case X86ISD::SHLD:
+      case X86ISD::SHRD: {
+        SDOperand N10 = N1.getOperand(0);
+        if (ISD::isNON_EXTLoad(N10.Val))
+          RModW = N10.Val->isOperand(Chain.Val) && N10.hasOneUse() &&
+            (N10.getOperand(1) == N2) &&
+            (N10.Val->getValueType(0) == N1.getValueType());
+        if (RModW)
+          Load = N10;
+        break;
+      }
+    }
+
+    if (RModW) {
+      MoveBelowTokenFactor(DAG, Load, SDOperand(I, 0), Chain);
+      ++NumLoadMoved;
+    }
+  }
+}
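+
+// For illustration (editor's sketch): C code such as "*p += x" legalizes to
+// roughly store(add(load(p), x), p).  When the store's chain operand is a
+// TokenFactor that also feeds the load, the rewrite above moves the load
+// below the TokenFactor and chains the store to the load's output chain, so
+// the selector can match (store (op (load))) as a single read-modify-write
+// memory instruction instead of a separate load, op and store.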
+
+/// InstructionSelectBasicBlock - This callback is invoked by SelectionDAGISel
+/// when it has created a SelectionDAG for us to codegen.
+void X86DAGToDAGISel::InstructionSelectBasicBlock(SelectionDAG &DAG) {
+  DEBUG(BB->dump());
+  MachineFunction::iterator FirstMBB = BB;
+
+  if (!FastISel)
+    InstructionSelectPreprocess(DAG);
+
+  // Codegen the basic block.
+#ifndef NDEBUG
+  DOUT << "===== Instruction selection begins:\n";
+  Indent = 0;
+#endif
+  DAG.setRoot(SelectRoot(DAG.getRoot()));
+#ifndef NDEBUG
+  DOUT << "===== Instruction selection ends:\n";
+#endif
+
+  DAG.RemoveDeadNodes();
+
+  // Emit machine code to BB. 
+  ScheduleAndEmitDAG(DAG);
+  
+  // If we are emitting FP stack code, scan the basic block to determine if this
+  // block defines any FP values.  If so, put an FP_REG_KILL instruction before
+  // the terminator of the block.
+  if (!Subtarget->hasSSE2()) {
+    // Note that FP stack instructions *are* used in SSE code when returning
+    // values, but these are not live out of the basic block, so we don't need
+    // an FP_REG_KILL in this case either.
+    bool ContainsFPCode = false;
+    
+    // Scan all of the machine instructions in these MBBs, checking for
+    // definitions of FP stack virtual registers.
+    MachineFunction::iterator MBBI = FirstMBB;
+    do {
+      for (MachineBasicBlock::iterator I = MBBI->begin(), E = MBBI->end();
+           !ContainsFPCode && I != E; ++I) {
+        if (I->getNumOperands() != 0 && I->getOperand(0).isRegister()) {
+          const TargetRegisterClass *clas;
+          for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op) {
+            if (I->getOperand(op).isRegister() && I->getOperand(op).isDef() &&
+                MRegisterInfo::isVirtualRegister(I->getOperand(op).getReg()) &&
+                ((clas = RegMap->getRegClass(I->getOperand(0).getReg())) == 
+                   X86::RFP32RegisterClass ||
+                 clas == X86::RFP64RegisterClass)) {
+              ContainsFPCode = true;
+              break;
+            }
+          }
+        }
+      }
+    } while (!ContainsFPCode && &*(MBBI++) != BB);
+    
+    // Check PHI nodes in successor blocks.  These PHI's will be lowered to have
+    // a copy of the input value in this block.
+    if (!ContainsFPCode) {
+      // Final check, check LLVM BB's that are successors to the LLVM BB
+      // corresponding to BB for FP PHI nodes.
+      const BasicBlock *LLVMBB = BB->getBasicBlock();
+      const PHINode *PN;
+      for (succ_const_iterator SI = succ_begin(LLVMBB), E = succ_end(LLVMBB);
+           !ContainsFPCode && SI != E; ++SI) {
+        for (BasicBlock::const_iterator II = SI->begin();
+             (PN = dyn_cast<PHINode>(II)); ++II) {
+          if (PN->getType()->isFloatingPoint()) {
+            ContainsFPCode = true;
+            break;
+          }
+        }
+      }
+    }
+
+    // Finally, if we found any FP code, emit the FP_REG_KILL instruction.
+    if (ContainsFPCode) {
+      BuildMI(*BB, BB->getFirstTerminator(),
+              TM.getInstrInfo()->get(X86::FP_REG_KILL));
+      ++NumFPKill;
+    }
+  }
+}
+
+/// EmitSpecialCodeForMain - Emit any code that needs to be executed only in
+/// the main function.
+void X86DAGToDAGISel::EmitSpecialCodeForMain(MachineBasicBlock *BB,
+                                             MachineFrameInfo *MFI) {
+  const TargetInstrInfo *TII = TM.getInstrInfo();
+  if (Subtarget->isTargetCygMing())
+    BuildMI(BB, TII->get(X86::CALLpcrel32)).addExternalSymbol("__main");
+
+  // Switch the FPU to 64-bit precision mode for better compatibility and speed.
+  int CWFrameIdx = MFI->CreateStackObject(2, 2);
+  addFrameReference(BuildMI(BB, TII->get(X86::FNSTCW16m)), CWFrameIdx);
+
+  // Set the high part to be 64-bit precision.
+  addFrameReference(BuildMI(BB, TII->get(X86::MOV8mi)),
+                    CWFrameIdx, 1).addImm(2);
+
+  // Reload the modified control word now.
+  addFrameReference(BuildMI(BB, TII->get(X86::FLDCW16m)), CWFrameIdx);
+}
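+
+// Editor's note on the control-word write above (an assumption about intent,
+// not from the original source): the x87 precision-control field occupies
+// bits 8-9 of the 16-bit control word, so storing 2 into the high byte
+// selects the 53-bit significand ("double", i.e. 64-bit IEEE) mode rather
+// than the default 80-bit extended precision, while leaving the exception
+// masks in the low byte untouched.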
+
+void X86DAGToDAGISel::EmitFunctionEntryCode(Function &Fn, MachineFunction &MF) {
+  // If this is main, emit special code for main.
+  MachineBasicBlock *BB = MF.begin();
+  if (Fn.hasExternalLinkage() && Fn.getName() == "main")
+    EmitSpecialCodeForMain(BB, MF.getFrameInfo());
+}
+
+/// MatchAddress - Add the specified node to the specified addressing mode,
+/// returning true if it cannot be done.  This just pattern matches for the
+/// addressing mode.
+bool X86DAGToDAGISel::MatchAddress(SDOperand N, X86ISelAddressMode &AM,
+                                   bool isRoot, unsigned Depth) {
+  if (Depth > 5) {
+    // Default, generate it as a register.
+    AM.BaseType = X86ISelAddressMode::RegBase;
+    AM.Base.Reg = N;
+    return false;
+  }
+  
+  // RIP relative addressing: %rip + 32-bit displacement!
+  if (AM.isRIPRel) {
+    if (!AM.ES && AM.JT != -1 && N.getOpcode() == ISD::Constant) {
+      int64_t Val = cast<ConstantSDNode>(N)->getSignExtended();
+      if (isInt32(AM.Disp + Val)) {
+        AM.Disp += Val;
+        return false;
+      }
+    }
+    return true;
+  }
+
+  int id = N.Val->getNodeId();
+  bool Available = isSelected(id);
+
+  switch (N.getOpcode()) {
+  default: break;
+  case ISD::Constant: {
+    int64_t Val = cast<ConstantSDNode>(N)->getSignExtended();
+    if (isInt32(AM.Disp + Val)) {
+      AM.Disp += Val;
+      return false;
+    }
+    break;
+  }
+
+  case X86ISD::Wrapper: {
+    bool is64Bit = Subtarget->is64Bit();
+    // Under X86-64 non-small code model, GV (and friends) are 64-bits.
+    if (is64Bit && TM.getCodeModel() != CodeModel::Small)
+      break;
+    if (AM.GV != 0 || AM.CP != 0 || AM.ES != 0 || AM.JT != -1)
+      break;
+    // If the value is already available in a register but both the base and
+    // index components have been picked, we can't fit it into the addressing
+    // mode as a register, so duplicate the GlobalAddress or ConstantPool as a
+    // displacement instead.
+    if (!Available || (AM.Base.Reg.Val && AM.IndexReg.Val)) {
+      bool isStatic = TM.getRelocationModel() == Reloc::Static;
+      SDOperand N0 = N.getOperand(0);
+      if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
+        GlobalValue *GV = G->getGlobal();
+        bool isAbs32 = !is64Bit || isStatic;
+        if (isAbs32 || isRoot) {
+          AM.GV = GV;
+          AM.Disp += G->getOffset();
+          AM.isRIPRel = !isAbs32;
+          return false;
+        }
+      } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
+        if (!is64Bit || isStatic || isRoot) {
+          AM.CP = CP->getConstVal();
+          AM.Align = CP->getAlignment();
+          AM.Disp += CP->getOffset();
+          AM.isRIPRel = !isStatic;
+          return false;
+        }
+      } else if (ExternalSymbolSDNode *S =dyn_cast<ExternalSymbolSDNode>(N0)) {
+        if (isStatic || isRoot) {
+          AM.ES = S->getSymbol();
+          AM.isRIPRel = !isStatic;
+          return false;
+        }
+      } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
+        if (isStatic || isRoot) {
+          AM.JT = J->getIndex();
+          AM.isRIPRel = !isStatic;
+          return false;
+        }
+      }
+    }
+    break;
+  }
+
+  case ISD::FrameIndex:
+    if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base.Reg.Val == 0) {
+      AM.BaseType = X86ISelAddressMode::FrameIndexBase;
+      AM.Base.FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
+      return false;
+    }
+    break;
+
+  case ISD::SHL:
+    if (!Available && AM.IndexReg.Val == 0 && AM.Scale == 1)
+      if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.Val->getOperand(1))) {
+        unsigned Val = CN->getValue();
+        if (Val == 1 || Val == 2 || Val == 3) {
+          AM.Scale = 1 << Val;
+          SDOperand ShVal = N.Val->getOperand(0);
+
+          // Okay, we know that we have a scale by now.  However, if the scaled
+          // value is an add of something and a constant, we can fold the
+          // constant into the disp field here.
+          if (ShVal.Val->getOpcode() == ISD::ADD && ShVal.hasOneUse() &&
+              isa<ConstantSDNode>(ShVal.Val->getOperand(1))) {
+            AM.IndexReg = ShVal.Val->getOperand(0);
+            ConstantSDNode *AddVal =
+              cast<ConstantSDNode>(ShVal.Val->getOperand(1));
+            uint64_t Disp = AM.Disp + (AddVal->getValue() << Val);
+            if (isInt32(Disp))
+              AM.Disp = Disp;
+            else
+              AM.IndexReg = ShVal;
+          } else {
+            AM.IndexReg = ShVal;
+          }
+          return false;
+        }
+      }
+    break;
+
+  case ISD::MUL:
+    // X*[3,5,9] -> X+X*[2,4,8]
+    if (!Available &&
+        AM.BaseType == X86ISelAddressMode::RegBase &&
+        AM.Base.Reg.Val == 0 &&
+        AM.IndexReg.Val == 0) {
+      if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.Val->getOperand(1)))
+        if (CN->getValue() == 3 || CN->getValue() == 5 || CN->getValue() == 9) {
+          AM.Scale = unsigned(CN->getValue())-1;
+
+          SDOperand MulVal = N.Val->getOperand(0);
+          SDOperand Reg;
+
+          // Okay, we know that we have a scale by now.  However, if the scaled
+          // value is an add of something and a constant, we can fold the
+          // constant into the disp field here.
+          if (MulVal.Val->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
+              isa<ConstantSDNode>(MulVal.Val->getOperand(1))) {
+            Reg = MulVal.Val->getOperand(0);
+            ConstantSDNode *AddVal =
+              cast<ConstantSDNode>(MulVal.Val->getOperand(1));
+            uint64_t Disp = AM.Disp + AddVal->getValue() * CN->getValue();
+            if (isInt32(Disp))
+              AM.Disp = Disp;
+            else
+              Reg = N.Val->getOperand(0);
+          } else {
+            Reg = N.Val->getOperand(0);
+          }
+
+          AM.IndexReg = AM.Base.Reg = Reg;
+          return false;
+        }
+    }
+    break;
+
+  case ISD::ADD:
+    if (!Available) {
+      X86ISelAddressMode Backup = AM;
+      if (!MatchAddress(N.Val->getOperand(0), AM, false, Depth+1) &&
+          !MatchAddress(N.Val->getOperand(1), AM, false, Depth+1))
+        return false;
+      AM = Backup;
+      if (!MatchAddress(N.Val->getOperand(1), AM, false, Depth+1) &&
+          !MatchAddress(N.Val->getOperand(0), AM, false, Depth+1))
+        return false;
+      AM = Backup;
+    }
+    break;
+
+  case ISD::OR:
+    // Handle "X | C" as "X + C" iff X is known to have C bits clear.
+    if (!Available) {
+      if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+        X86ISelAddressMode Backup = AM;
+        // Start with the LHS as an addr mode.
+        if (!MatchAddress(N.getOperand(0), AM, false) &&
+            // Address could not have picked a GV address for the displacement.
+            AM.GV == NULL &&
+            // On x86-64, the resultant disp must fit in 32-bits.
+            isInt32(AM.Disp + CN->getSignExtended()) &&
+            // Check to see if the LHS & C is zero.
+            CurDAG->MaskedValueIsZero(N.getOperand(0), CN->getValue())) {
+          AM.Disp += CN->getValue();
+          return false;
+        }
+        AM = Backup;
+      }
+    }
+    break;
+  }
+
+  // Is the base register already occupied?
+  if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base.Reg.Val) {
+    // If so, check to see if the scale index register is set.
+    if (AM.IndexReg.Val == 0) {
+      AM.IndexReg = N;
+      AM.Scale = 1;
+      return false;
+    }
+
+    // Otherwise, we cannot select it.
+    return true;
+  }
+
+  // Default, generate it as a register.
+  AM.BaseType = X86ISelAddressMode::RegBase;
+  AM.Base.Reg = N;
+  return false;
+}
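+
+// Worked example (editor's sketch): matching the address (add (shl X, 1), 40)
+// walks the ADD case above into both operands.  Assuming the shifted node has
+// not already been selected (the "Available" check), the SHL with a shift
+// amount of 1 sets Scale = 2 and IndexReg = X, and the constant is folded
+// into Disp, giving the x86 form [X*2 + 40] with no base register.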
+
+/// SelectAddr - Returns true if it is able to pattern match an addressing
+/// mode.  It returns by reference the operands which make up the maximal
+/// addressing mode it can match.
+bool X86DAGToDAGISel::SelectAddr(SDOperand Op, SDOperand N, SDOperand &Base,
+                                 SDOperand &Scale, SDOperand &Index,
+                                 SDOperand &Disp) {
+  X86ISelAddressMode AM;
+  if (MatchAddress(N, AM))
+    return false;
+
+  MVT::ValueType VT = N.getValueType();
+  if (AM.BaseType == X86ISelAddressMode::RegBase) {
+    if (!AM.Base.Reg.Val)
+      AM.Base.Reg = CurDAG->getRegister(0, VT);
+  }
+
+  if (!AM.IndexReg.Val)
+    AM.IndexReg = CurDAG->getRegister(0, VT);
+
+  getAddressOperands(AM, Base, Scale, Index, Disp);
+  return true;
+}
+
+/// isZeroNode - Returns true if Elt is a constant zero or a floating point
+/// constant +0.0.
+static inline bool isZeroNode(SDOperand Elt) {
+  return ((isa<ConstantSDNode>(Elt) &&
+           cast<ConstantSDNode>(Elt)->getValue() == 0) ||
+          (isa<ConstantFPSDNode>(Elt) &&
+           cast<ConstantFPSDNode>(Elt)->isExactlyValue(0.0)));
+}
+
+
+/// SelectScalarSSELoad - Match a scalar SSE load.  In particular, we want to
+/// match a load whose top elements are either undef or zeros.  The load flavor
+/// is derived from the type of N, which is either v4f32 or v2f64.
+bool X86DAGToDAGISel::SelectScalarSSELoad(SDOperand Op, SDOperand Pred,
+                                          SDOperand N, SDOperand &Base,
+                                          SDOperand &Scale, SDOperand &Index,
+                                          SDOperand &Disp, SDOperand &InChain,
+                                          SDOperand &OutChain) {
+  if (N.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+    InChain = N.getOperand(0).getValue(1);
+    if (ISD::isNON_EXTLoad(InChain.Val) &&
+        InChain.getValue(0).hasOneUse() &&
+        N.hasOneUse() &&
+        CanBeFoldedBy(N.Val, Pred.Val, Op.Val)) {
+      LoadSDNode *LD = cast<LoadSDNode>(InChain);
+      if (!SelectAddr(Op, LD->getBasePtr(), Base, Scale, Index, Disp))
+        return false;
+      OutChain = LD->getChain();
+      return true;
+    }
+  }
+
+  // Also handle the case where we explicitly require zeros in the top
+  // elements.  This is a vector shuffle from the zero vector.
+  if (N.getOpcode() == ISD::VECTOR_SHUFFLE && N.Val->hasOneUse() &&
+      N.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
+      N.getOperand(1).getOpcode() == ISD::SCALAR_TO_VECTOR && 
+      N.getOperand(1).Val->hasOneUse() &&
+      ISD::isNON_EXTLoad(N.getOperand(1).getOperand(0).Val) &&
+      N.getOperand(1).getOperand(0).hasOneUse()) {
+    // Check to see if the BUILD_VECTOR is building a zero vector.
+    SDOperand BV = N.getOperand(0);
+    for (unsigned i = 0, e = BV.getNumOperands(); i != e; ++i)
+      if (!isZeroNode(BV.getOperand(i)) &&
+          BV.getOperand(i).getOpcode() != ISD::UNDEF)
+        return false;  // Not a zero/undef vector.
+    // Check to see if the shuffle mask is 4/L/L/L or 2/L, where L is something
+    // from the LHS.
+    unsigned VecWidth = BV.getNumOperands();
+    SDOperand ShufMask = N.getOperand(2);
+    assert(ShufMask.getOpcode() == ISD::BUILD_VECTOR && "Invalid shuf mask!");
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(ShufMask.getOperand(0))) {
+      if (C->getValue() == VecWidth) {
+        for (unsigned i = 1; i != VecWidth; ++i) {
+          if (ShufMask.getOperand(i).getOpcode() == ISD::UNDEF) {
+            // ok.
+          } else {
+            ConstantSDNode *C = cast<ConstantSDNode>(ShufMask.getOperand(i));
+            if (C->getValue() >= VecWidth) return false;
+          }
+        }
+      }
+      
+      // Okay, this is a zero extending load.  Fold it.
+      LoadSDNode *LD = cast<LoadSDNode>(N.getOperand(1).getOperand(0));
+      if (!SelectAddr(Op, LD->getBasePtr(), Base, Scale, Index, Disp))
+        return false;
+      OutChain = LD->getChain();
+      InChain = SDOperand(LD, 1);
+      return true;
+    }
+  }
+  return false;
+}
+
+
+/// SelectLEAAddr - It calls SelectAddr and determines if the maximal addressing
+/// mode it matches can be cost-effectively emitted as an LEA instruction.
+bool X86DAGToDAGISel::SelectLEAAddr(SDOperand Op, SDOperand N,
+                                    SDOperand &Base, SDOperand &Scale,
+                                    SDOperand &Index, SDOperand &Disp) {
+  X86ISelAddressMode AM;
+  if (MatchAddress(N, AM))
+    return false;
+
+  MVT::ValueType VT = N.getValueType();
+  unsigned Complexity = 0;
+  if (AM.BaseType == X86ISelAddressMode::RegBase)
+    if (AM.Base.Reg.Val)
+      Complexity = 1;
+    else
+      AM.Base.Reg = CurDAG->getRegister(0, VT);
+  else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
+    Complexity = 4;
+
+  if (AM.IndexReg.Val)
+    Complexity++;
+  else
+    AM.IndexReg = CurDAG->getRegister(0, VT);
+
+  // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg or a
+  // simple shift.
+  if (AM.Scale > 1)
+    Complexity++;
+
+  // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
+  // into an LEA. This was determined by some experimentation but is by no
+  // means optimal (especially regarding code size). LEA is nice because of
+  // its three-address nature. Tweak the cost function again when we can run
+  // convertToThreeAddress() at register allocation time.
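+  // For example, leal 4(%eax,%ebx,2), %ecx computes %ecx = %eax + 2*%ebx + 4
+  // in a single instruction without clobbering its sources; the equivalent
+  // mov/shl/add sequence is longer.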
+  if (AM.GV || AM.CP || AM.ES || AM.JT != -1) {
+    // For X86-64, we should always use lea to materialize RIP relative
+    // addresses.
+    if (Subtarget->is64Bit())
+      Complexity = 4;
+    else
+      Complexity += 2;
+  }
+
+  if (AM.Disp && (AM.Base.Reg.Val || AM.IndexReg.Val))
+    Complexity++;
+
+  if (Complexity > 2) {
+    getAddressOperands(AM, Base, Scale, Index, Disp);
+    return true;
+  }
+  return false;
+}
+
+bool X86DAGToDAGISel::TryFoldLoad(SDOperand P, SDOperand N,
+                                  SDOperand &Base, SDOperand &Scale,
+                                  SDOperand &Index, SDOperand &Disp) {
+  if (ISD::isNON_EXTLoad(N.Val) &&
+      N.hasOneUse() &&
+      CanBeFoldedBy(N.Val, P.Val, P.Val))
+    return SelectAddr(P, N.getOperand(1), Base, Scale, Index, Disp);
+  return false;
+}
+
+/// getGlobalBaseReg - Output the instructions required to put the
+/// base address to use for accessing globals into a register.
+///
+SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
+  assert(!Subtarget->is64Bit() && "X86-64 PIC uses RIP relative addressing");
+  if (!GlobalBaseReg) {
+    // Insert the set of GlobalBaseReg into the first MBB of the function
+    MachineBasicBlock &FirstMBB = BB->getParent()->front();
+    MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+    SSARegMap *RegMap = BB->getParent()->getSSARegMap();
+    unsigned PC = RegMap->createVirtualRegister(X86::GR32RegisterClass);
+    
+    const TargetInstrInfo *TII = TM.getInstrInfo();
+    BuildMI(FirstMBB, MBBI, TII->get(X86::MovePCtoStack));
+    BuildMI(FirstMBB, MBBI, TII->get(X86::POP32r), PC);
+    
+    // If we're using vanilla 'GOT' PIC style, addressing should be relative
+    // not to the PC, but to the _GLOBAL_OFFSET_TABLE_ external symbol.
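+    // Roughly, the sequence built here looks like (label name illustrative):
+    //   call  .Lpc
+    // .Lpc:
+    //   popl  %reg                          ; %reg now holds the PC
+    //   addl  $_GLOBAL_OFFSET_TABLE_, %reg  ; GOT-style PIC only; exact
+    //                                       ; relocation syntax varies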
+    if (TM.getRelocationModel() == Reloc::PIC_ &&
+        Subtarget->isPICStyleGOT()) {
+      GlobalBaseReg = RegMap->createVirtualRegister(X86::GR32RegisterClass);
+      BuildMI(FirstMBB, MBBI, TII->get(X86::ADD32ri), GlobalBaseReg).
+        addReg(PC).
+        addExternalSymbol("_GLOBAL_OFFSET_TABLE_");
+    } else {
+      GlobalBaseReg = PC;
+    }
+    
+  }
+  return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).Val;
+}
+
+static SDNode *FindCallStartFromCall(SDNode *Node) {
+  if (Node->getOpcode() == ISD::CALLSEQ_START) return Node;
+  assert(Node->getOperand(0).getValueType() == MVT::Other &&
+         "Node doesn't have a token chain argument!");
+  return FindCallStartFromCall(Node->getOperand(0).Val);
+}
+
+SDNode *X86DAGToDAGISel::Select(SDOperand N) {
+  SDNode *Node = N.Val;
+  MVT::ValueType NVT = Node->getValueType(0);
+  unsigned Opc, MOpc;
+  unsigned Opcode = Node->getOpcode();
+
+#ifndef NDEBUG
+  DOUT << std::string(Indent, ' ') << "Selecting: ";
+  DEBUG(Node->dump(CurDAG));
+  DOUT << "\n";
+  Indent += 2;
+#endif
+
+  if (Opcode >= ISD::BUILTIN_OP_END && Opcode < X86ISD::FIRST_NUMBER) {
+#ifndef NDEBUG
+    DOUT << std::string(Indent-2, ' ') << "== ";
+    DEBUG(Node->dump(CurDAG));
+    DOUT << "\n";
+    Indent -= 2;
+#endif
+    return NULL;   // Already selected.
+  }
+
+  switch (Opcode) {
+    default: break;
+    case X86ISD::GlobalBaseReg: 
+      return getGlobalBaseReg();
+
+    case ISD::ADD: {
+      // Turn ADD X, c into MOV32ri X+c. This cannot be done with tblgen'd
+      // code and is matched first so as to prevent it from being turned into
+      // LEA32r X+c.
+      // In 64-bit mode, use LEA to take advantage of RIP-relative addressing.
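+      // For example, (add (X86Wrapper (tglobaladdr @G)), 8) can become
+      // movl $G+8, %reg on 32-bit targets, or a RIP-relative LEA on x86-64.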
+      MVT::ValueType PtrVT = TLI.getPointerTy();
+      SDOperand N0 = N.getOperand(0);
+      SDOperand N1 = N.getOperand(1);
+      if (N.Val->getValueType(0) == PtrVT &&
+          N0.getOpcode() == X86ISD::Wrapper &&
+          N1.getOpcode() == ISD::Constant) {
+        unsigned Offset = (unsigned)cast<ConstantSDNode>(N1)->getValue();
+        SDOperand C(0, 0);
+        // TODO: handle ExternalSymbolSDNode.
+        if (GlobalAddressSDNode *G =
+            dyn_cast<GlobalAddressSDNode>(N0.getOperand(0))) {
+          C = CurDAG->getTargetGlobalAddress(G->getGlobal(), PtrVT,
+                                             G->getOffset() + Offset);
+        } else if (ConstantPoolSDNode *CP =
+                   dyn_cast<ConstantPoolSDNode>(N0.getOperand(0))) {
+          C = CurDAG->getTargetConstantPool(CP->getConstVal(), PtrVT,
+                                            CP->getAlignment(),
+                                            CP->getOffset()+Offset);
+        }
+
+        if (C.Val) {
+          if (Subtarget->is64Bit()) {
+            SDOperand Ops[] = { CurDAG->getRegister(0, PtrVT), getI8Imm(1),
+                                CurDAG->getRegister(0, PtrVT), C };
+            return CurDAG->SelectNodeTo(N.Val, X86::LEA64r, MVT::i64, Ops, 4);
+          } else
+            return CurDAG->SelectNodeTo(N.Val, X86::MOV32ri, PtrVT, C);
+        }
+      }
+
+      // Other cases are handled by auto-generated code.
+      break;
+    }
+
+    case ISD::MULHU:
+    case ISD::MULHS: {
+      if (Opcode == ISD::MULHU)
+        switch (NVT) {
+        default: assert(0 && "Unsupported VT!");
+        case MVT::i8:  Opc = X86::MUL8r;  MOpc = X86::MUL8m;  break;
+        case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break;
+        case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break;
+        case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break;
+        }
+      else
+        switch (NVT) {
+        default: assert(0 && "Unsupported VT!");
+        case MVT::i8:  Opc = X86::IMUL8r;  MOpc = X86::IMUL8m;  break;
+        case MVT::i16: Opc = X86::IMUL16r; MOpc = X86::IMUL16m; break;
+        case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
+        case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
+        }
+
+      unsigned LoReg, HiReg;
+      switch (NVT) {
+      default: assert(0 && "Unsupported VT!");
+      case MVT::i8:  LoReg = X86::AL;  HiReg = X86::AH;  break;
+      case MVT::i16: LoReg = X86::AX;  HiReg = X86::DX;  break;
+      case MVT::i32: LoReg = X86::EAX; HiReg = X86::EDX; break;
+      case MVT::i64: LoReg = X86::RAX; HiReg = X86::RDX; break;
+      }
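+      // For example, a 32-bit MULHU becomes 'mull %src' (or a folded
+      // 'mull mem'): the other operand is placed in EAX below, the full
+      // product lands in EDX:EAX, and the high half is copied out of EDX.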
+
+      SDOperand N0 = Node->getOperand(0);
+      SDOperand N1 = Node->getOperand(1);
+
+      bool foldedLoad = false;
+      SDOperand Tmp0, Tmp1, Tmp2, Tmp3;
+      foldedLoad = TryFoldLoad(N, N1, Tmp0, Tmp1, Tmp2, Tmp3);
+      // MULHU and MULHS are commutative.
+      if (!foldedLoad) {
+        foldedLoad = TryFoldLoad(N, N0, Tmp0, Tmp1, Tmp2, Tmp3);
+        if (foldedLoad) {
+          N0 = Node->getOperand(1);
+          N1 = Node->getOperand(0);
+        }
+      }
+
+      SDOperand Chain;
+      if (foldedLoad) {
+        Chain = N1.getOperand(0);
+        AddToISelQueue(Chain);
+      } else
+        Chain = CurDAG->getEntryNode();
+
+      SDOperand InFlag(0, 0);
+      AddToISelQueue(N0);
+      Chain  = CurDAG->getCopyToReg(Chain, CurDAG->getRegister(LoReg, NVT),
+                                    N0, InFlag);
+      InFlag = Chain.getValue(1);
+
+      if (foldedLoad) {
+        AddToISelQueue(Tmp0);
+        AddToISelQueue(Tmp1);
+        AddToISelQueue(Tmp2);
+        AddToISelQueue(Tmp3);
+        SDOperand Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Chain, InFlag };
+        SDNode *CNode =
+          CurDAG->getTargetNode(MOpc, MVT::Other, MVT::Flag, Ops, 6);
+        Chain  = SDOperand(CNode, 0);
+        InFlag = SDOperand(CNode, 1);
+      } else {
+        AddToISelQueue(N1);
+        InFlag =
+          SDOperand(CurDAG->getTargetNode(Opc, MVT::Flag, N1, InFlag), 0);
+      }
+
+      SDOperand Result = CurDAG->getCopyFromReg(Chain, HiReg, NVT, InFlag);
+      ReplaceUses(N.getValue(0), Result);
+      if (foldedLoad)
+        ReplaceUses(N1.getValue(1), Result.getValue(1));
+
+#ifndef NDEBUG
+      DOUT << std::string(Indent-2, ' ') << "=> ";
+      DEBUG(Result.Val->dump(CurDAG));
+      DOUT << "\n";
+      Indent -= 2;
+#endif
+      return NULL;
+    }
+      
+    case ISD::SDIV:
+    case ISD::UDIV:
+    case ISD::SREM:
+    case ISD::UREM: {
+      bool isSigned = Opcode == ISD::SDIV || Opcode == ISD::SREM;
+      bool isDiv    = Opcode == ISD::SDIV || Opcode == ISD::UDIV;
+      if (!isSigned)
+        switch (NVT) {
+        default: assert(0 && "Unsupported VT!");
+        case MVT::i8:  Opc = X86::DIV8r;  MOpc = X86::DIV8m;  break;
+        case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
+        case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
+        case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
+        }
+      else
+        switch (NVT) {
+        default: assert(0 && "Unsupported VT!");
+        case MVT::i8:  Opc = X86::IDIV8r;  MOpc = X86::IDIV8m;  break;
+        case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
+        case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
+        case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
+        }
+
+      unsigned LoReg, HiReg;
+      unsigned ClrOpcode, SExtOpcode;
+      switch (NVT) {
+      default: assert(0 && "Unsupported VT!");
+      case MVT::i8:
+        LoReg = X86::AL;  HiReg = X86::AH;
+        ClrOpcode  = 0;
+        SExtOpcode = X86::CBW;
+        break;
+      case MVT::i16:
+        LoReg = X86::AX;  HiReg = X86::DX;
+        ClrOpcode  = X86::MOV16r0;
+        SExtOpcode = X86::CWD;
+        break;
+      case MVT::i32:
+        LoReg = X86::EAX; HiReg = X86::EDX;
+        ClrOpcode  = X86::MOV32r0;
+        SExtOpcode = X86::CDQ;
+        break;
+      case MVT::i64:
+        LoReg = X86::RAX; HiReg = X86::RDX;
+        ClrOpcode  = X86::MOV64r0;
+        SExtOpcode = X86::CQO;
+        break;
+      }
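+      // For example, a signed 32-bit division places the dividend in EAX,
+      // sign-extends it into EDX with 'cdq', and issues 'idivl %src'; the
+      // quotient ends up in EAX and the remainder in EDX.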
+
+      SDOperand N0 = Node->getOperand(0);
+      SDOperand N1 = Node->getOperand(1);
+      SDOperand InFlag(0, 0);
+      if (NVT == MVT::i8 && !isSigned) {
+        // Special case for div8: just use a move with zero extension to AX to
+        // clear the upper 8 bits (AH).
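+        // For example, for an unsigned i8 divide: 'movzbw %src0, %ax' clears
+        // AH, then 'divb %src1' leaves the quotient in AL and the remainder
+        // in AH.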
+        SDOperand Tmp0, Tmp1, Tmp2, Tmp3, Move, Chain;
+        if (TryFoldLoad(N, N0, Tmp0, Tmp1, Tmp2, Tmp3)) {
+          SDOperand Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, N0.getOperand(0) };
+          AddToISelQueue(N0.getOperand(0));
+          AddToISelQueue(Tmp0);
+          AddToISelQueue(Tmp1);
+          AddToISelQueue(Tmp2);
+          AddToISelQueue(Tmp3);
+          Move =
+            SDOperand(CurDAG->getTargetNode(X86::MOVZX16rm8, MVT::i16, MVT::Other,
+                                            Ops, 5), 0);
+          Chain = Move.getValue(1);
+          ReplaceUses(N0.getValue(1), Chain);
+        } else {
+          AddToISelQueue(N0);
+          Move =
+            SDOperand(CurDAG->getTargetNode(X86::MOVZX16rr8, MVT::i16, N0), 0);
+          Chain = CurDAG->getEntryNode();
+        }
+        Chain  = CurDAG->getCopyToReg(Chain, X86::AX, Move, InFlag);
+        InFlag = Chain.getValue(1);
+      } else {
+        AddToISelQueue(N0);
+        InFlag =
+          CurDAG->getCopyToReg(CurDAG->getEntryNode(), LoReg, N0,
+                               InFlag).getValue(1);
+        if (isSigned) {
+          // Sign extend the low part into the high part.
+          InFlag =
+            SDOperand(CurDAG->getTargetNode(SExtOpcode, MVT::Flag, InFlag), 0);
+        } else {
+          // Zero out the high part, effectively zero extending the input.
+          SDOperand ClrNode = SDOperand(CurDAG->getTargetNode(ClrOpcode, NVT), 0);
+          InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), HiReg, ClrNode,
+                                        InFlag).getValue(1);
+        }
+      }
+
+      SDOperand Tmp0, Tmp1, Tmp2, Tmp3, Chain;
+      bool foldedLoad = TryFoldLoad(N, N1, Tmp0, Tmp1, Tmp2, Tmp3);
+      if (foldedLoad) {
+        AddToISelQueue(N1.getOperand(0));
+        AddToISelQueue(Tmp0);
+        AddToISelQueue(Tmp1);
+        AddToISelQueue(Tmp2);
+        AddToISelQueue(Tmp3);
+        SDOperand Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, N1.getOperand(0), InFlag };
+        SDNode *CNode =
+          CurDAG->getTargetNode(MOpc, MVT::Other, MVT::Flag, Ops, 6);
+        Chain  = SDOperand(CNode, 0);
+        InFlag = SDOperand(CNode, 1);
+      } else {
+        AddToISelQueue(N1);
+        Chain = CurDAG->getEntryNode();
+        InFlag =
+          SDOperand(CurDAG->getTargetNode(Opc, MVT::Flag, N1, InFlag), 0);
+      }
+
+      SDOperand Result =
+        CurDAG->getCopyFromReg(Chain, isDiv ? LoReg : HiReg, NVT, InFlag);
+      ReplaceUses(N.getValue(0), Result);
+      if (foldedLoad)
+        ReplaceUses(N1.getValue(1), Result.getValue(1));
+
+#ifndef NDEBUG
+      DOUT << std::string(Indent-2, ' ') << "=> ";
+      DEBUG(Result.Val->dump(CurDAG));
+      DOUT << "\n";
+      Indent -= 2;
+#endif
+
+      return NULL;
+    }
+
+    case ISD::TRUNCATE: {
+      if (!Subtarget->is64Bit() && NVT == MVT::i8) {
+        unsigned Opc2;
+        MVT::ValueType VT;
+        switch (Node->getOperand(0).getValueType()) {
+        default: assert(0 && "Unknown truncate!");
+        case MVT::i16:
+          Opc = X86::MOV16to16_;
+          VT = MVT::i16;
+          Opc2 = X86::TRUNC_16_to8;
+          break;
+        case MVT::i32:
+          Opc = X86::MOV32to32_;
+          VT = MVT::i32;
+          Opc2 = X86::TRUNC_32_to8;
+          break;
+        }
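+        // In effect, the value is first copied into a register whose low byte
+        // is addressable on 32-bit targets (EAX/EBX/ECX/EDX), and the i8
+        // result is then taken from that low byte.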
+
+        AddToISelQueue(Node->getOperand(0));
+        SDOperand Tmp =
+          SDOperand(CurDAG->getTargetNode(Opc, VT, Node->getOperand(0)), 0);
+        SDNode *ResNode = CurDAG->getTargetNode(Opc2, NVT, Tmp);
+      
+#ifndef NDEBUG
+        DOUT << std::string(Indent-2, ' ') << "=> ";
+        DEBUG(ResNode->dump(CurDAG));
+        DOUT << "\n";
+        Indent -= 2;
+#endif
+        return ResNode;
+      }
+
+      break;
+    }
+  }
+
+  SDNode *ResNode = SelectCode(N);
+
+#ifndef NDEBUG
+  DOUT << std::string(Indent-2, ' ') << "=> ";
+  if (ResNode == NULL || ResNode == N.Val)
+    DEBUG(N.Val->dump(CurDAG));
+  else
+    DEBUG(ResNode->dump(CurDAG));
+  DOUT << "\n";
+  Indent -= 2;
+#endif
+
+  return ResNode;
+}
+
+bool X86DAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDOperand &Op, char ConstraintCode,
+                             std::vector<SDOperand> &OutOps, SelectionDAG &DAG){
+  SDOperand Op0, Op1, Op2, Op3;
+  switch (ConstraintCode) {
+  case 'o':   // offsetable        ??
+  case 'v':   // not offsetable    ??
+  default: return true;
+  case 'm':   // memory
+    if (!SelectAddr(Op, Op, Op0, Op1, Op2, Op3))
+      return true;
+    break;
+  }
+  
+  OutOps.push_back(Op0);
+  OutOps.push_back(Op1);
+  OutOps.push_back(Op2);
+  OutOps.push_back(Op3);
+  AddToISelQueue(Op0);
+  AddToISelQueue(Op1);
+  AddToISelQueue(Op2);
+  AddToISelQueue(Op3);
+  return false;
+}
+
+/// createX86ISelDag - This pass converts a legalized DAG into a 
+/// X86-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM, bool Fast) {
+  return new X86DAGToDAGISel(TM, Fast);
+}
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
new file mode 100644
index 0000000..37dea79
--- /dev/null
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -0,0 +1,5094 @@
+//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that X86 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86ISelLowering.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86TargetMachine.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Function.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/ADT/VectorExtras.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SSARegMap.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/StringExtras.h"
+using namespace llvm;
+
+X86TargetLowering::X86TargetLowering(TargetMachine &TM)
+  : TargetLowering(TM) {
+  Subtarget = &TM.getSubtarget<X86Subtarget>();
+  X86ScalarSSE = Subtarget->hasSSE2();
+  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
+
+  RegInfo = TM.getRegisterInfo();
+
+  // Set up the TargetLowering object.
+
+  // X86 is weird: it always uses i8 for shift amounts and setcc results.
+  setShiftAmountType(MVT::i8);
+  setSetCCResultType(MVT::i8);
+  setSetCCResultContents(ZeroOrOneSetCCResult);
+  setSchedulingPreference(SchedulingForRegPressure);
+  setShiftAmountFlavor(Mask);   // shl X, 32 == shl X, 0
+  setStackPointerRegisterToSaveRestore(X86StackPtr);
+
+  if (Subtarget->isTargetDarwin()) {
+    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
+    setUseUnderscoreSetJmp(false);
+    setUseUnderscoreLongJmp(false);
+  } else if (Subtarget->isTargetMingw()) {
+    // MS runtime is weird: it exports _setjmp, but plain longjmp!
+    setUseUnderscoreSetJmp(true);
+    setUseUnderscoreLongJmp(false);
+  } else {
+    setUseUnderscoreSetJmp(true);
+    setUseUnderscoreLongJmp(true);
+  }
+  
+  // Set up the register classes.
+  addRegisterClass(MVT::i8, X86::GR8RegisterClass);
+  addRegisterClass(MVT::i16, X86::GR16RegisterClass);
+  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
+  if (Subtarget->is64Bit())
+    addRegisterClass(MVT::i64, X86::GR64RegisterClass);
+
+  setLoadXAction(ISD::SEXTLOAD, MVT::i1, Expand);
+
+  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
+  // operation.
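+  // For example, an unsigned i8 operand is zero-extended to i32 (where it
+  // still fits in the signed range) and converted with a signed i32-to-fp
+  // conversion.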
+  setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
+  setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
+  setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
+
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Expand);
+    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
+  } else {
+    if (X86ScalarSSE)
+      // If SSE i64 SINT_TO_FP is not available, expand i32 UINT_TO_FP.
+      setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Expand);
+    else
+      setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Promote);
+  }
+
+  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
+  // this operation.
+  setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
+  setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
+  // SSE has no i16 to fp conversion, only i32
+  if (X86ScalarSSE)
+    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
+  else {
+    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
+    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
+  }
+
+  if (!Subtarget->is64Bit()) {
+    // Custom lower SINT_TO_FP and FP_TO_SINT from/to i64 in 32-bit mode.
+    setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
+    setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
+  }
+
+  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
+  // this operation.
+  setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
+  setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
+
+  if (X86ScalarSSE) {
+    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
+  } else {
+    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
+    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
+  }
+
+  // Handle FP_TO_UINT by promoting the destination to a larger signed
+  // conversion.
+  setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
+  setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
+  setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
+
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
+    setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
+  } else {
+    if (X86ScalarSSE && !Subtarget->hasSSE3())
+      // Expand FP_TO_UINT into a select.
+      // FIXME: We would like to use a Custom expander here eventually to do
+      // the optimal thing for SSE vs. the default expansion in the legalizer.
+      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
+    else
+      // With SSE3 we can use fisttpll to convert to a signed i64.
+      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Promote);
+  }
+
+  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
+  if (!X86ScalarSSE) {
+    setOperationAction(ISD::BIT_CONVERT      , MVT::f32  , Expand);
+    setOperationAction(ISD::BIT_CONVERT      , MVT::i32  , Expand);
+  }
+
+  setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
+  setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
+  setOperationAction(ISD::BR_CC            , MVT::Other, Expand);
+  setOperationAction(ISD::SELECT_CC        , MVT::Other, Expand);
+  setOperationAction(ISD::MEMMOVE          , MVT::Other, Expand);
+  if (Subtarget->is64Bit())
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
+  setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
+  setOperationAction(ISD::FREM             , MVT::f64  , Expand);
+
+  setOperationAction(ISD::CTPOP            , MVT::i8   , Expand);
+  setOperationAction(ISD::CTTZ             , MVT::i8   , Expand);
+  setOperationAction(ISD::CTLZ             , MVT::i8   , Expand);
+  setOperationAction(ISD::CTPOP            , MVT::i16  , Expand);
+  setOperationAction(ISD::CTTZ             , MVT::i16  , Expand);
+  setOperationAction(ISD::CTLZ             , MVT::i16  , Expand);
+  setOperationAction(ISD::CTPOP            , MVT::i32  , Expand);
+  setOperationAction(ISD::CTTZ             , MVT::i32  , Expand);
+  setOperationAction(ISD::CTLZ             , MVT::i32  , Expand);
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::CTPOP          , MVT::i64  , Expand);
+    setOperationAction(ISD::CTTZ           , MVT::i64  , Expand);
+    setOperationAction(ISD::CTLZ           , MVT::i64  , Expand);
+  }
+
+  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
+  setOperationAction(ISD::BSWAP            , MVT::i16  , Expand);
+
+  // These should be promoted to a larger select which is supported.
+  setOperationAction(ISD::SELECT           , MVT::i1   , Promote);
+  setOperationAction(ISD::SELECT           , MVT::i8   , Promote);
+  // X86 wants to expand cmov itself.
+  setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
+  setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
+  setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
+  setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
+  setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
+  setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
+  setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
+  setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
+  setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
+    setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
+  }
+  // X86 ret instruction may pop stack.
+  setOperationAction(ISD::RET             , MVT::Other, Custom);
+  if (!Subtarget->is64Bit())
+    setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
+
+  // Darwin ABI issue.
+  setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
+  setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
+  setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
+  setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
+    setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
+    setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
+    setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
+  }
+  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
+  setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
+  setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
+  setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
+  // X86 wants to expand memset / memcpy itself.
+  setOperationAction(ISD::MEMSET          , MVT::Other, Custom);
+  setOperationAction(ISD::MEMCPY          , MVT::Other, Custom);
+
+  // We don't have line number support yet.
+  setOperationAction(ISD::LOCATION, MVT::Other, Expand);
+  setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand);
+  // FIXME - use subtarget debug flags
+  if (!Subtarget->isTargetDarwin() &&
+      !Subtarget->isTargetELF() &&
+      !Subtarget->isTargetCygMing())
+    setOperationAction(ISD::LABEL, MVT::Other, Expand);
+
+  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
+  setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
+  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
+  setOperationAction(ISD::EHSELECTION,   MVT::i32, Expand);
+  if (Subtarget->is64Bit()) {
+    // FIXME: Verify
+    setExceptionPointerRegister(X86::RAX);
+    setExceptionSelectorRegister(X86::RDX);
+  } else {
+    setExceptionPointerRegister(X86::EAX);
+    setExceptionSelectorRegister(X86::EDX);
+  }
+  
+  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
+  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
+  setOperationAction(ISD::VAARG             , MVT::Other, Expand);
+  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
+  if (Subtarget->is64Bit())
+    setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
+  else
+    setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
+
+  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
+  if (Subtarget->is64Bit())
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
+  if (Subtarget->isTargetCygMing())
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+  else
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
+
+  if (X86ScalarSSE) {
+    // Set up the FP register classes.
+    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
+    addRegisterClass(MVT::f64, X86::FR64RegisterClass);
+
+    // Use ANDPD to simulate FABS.
+    setOperationAction(ISD::FABS , MVT::f64, Custom);
+    setOperationAction(ISD::FABS , MVT::f32, Custom);
+
+    // Use XORP to simulate FNEG.
+    setOperationAction(ISD::FNEG , MVT::f64, Custom);
+    setOperationAction(ISD::FNEG , MVT::f32, Custom);
+
+    // Use ANDPD and ORPD to simulate FCOPYSIGN.
+    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+
+    // We don't support sin/cos/fmod
+    setOperationAction(ISD::FSIN , MVT::f64, Expand);
+    setOperationAction(ISD::FCOS , MVT::f64, Expand);
+    setOperationAction(ISD::FREM , MVT::f64, Expand);
+    setOperationAction(ISD::FSIN , MVT::f32, Expand);
+    setOperationAction(ISD::FCOS , MVT::f32, Expand);
+    setOperationAction(ISD::FREM , MVT::f32, Expand);
+
+    // Expand FP immediates into loads from the stack, except for the special
+    // cases we handle.
+    setOperationAction(ISD::ConstantFP, MVT::f64, Expand);
+    setOperationAction(ISD::ConstantFP, MVT::f32, Expand);
+    addLegalFPImmediate(+0.0); // xorps / xorpd
+  } else {
+    // Set up the FP register classes.
+    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
+    addRegisterClass(MVT::f32, X86::RFP32RegisterClass);
+
+    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
+    setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+    setOperationAction(ISD::FP_ROUND,  MVT::f32, Expand);
+
+    if (!UnsafeFPMath) {
+      setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
+      setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
+    }
+
+    setOperationAction(ISD::ConstantFP, MVT::f64, Expand);
+    setOperationAction(ISD::ConstantFP, MVT::f32, Expand);
+    addLegalFPImmediate(+0.0); // FLD0
+    addLegalFPImmediate(+1.0); // FLD1
+    addLegalFPImmediate(-0.0); // FLD0/FCHS
+    addLegalFPImmediate(-1.0); // FLD1/FCHS
+  }
+
+  // First set operation action for all vector types to expand. Then we
+  // will selectively turn on ones that can be effectively codegen'd.
+  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
+    setOperationAction(ISD::ADD , (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::SUB , (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FADD, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FNEG, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FSUB, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::MUL , (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FMUL, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::SDIV, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::UDIV, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FDIV, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::SREM, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::UREM, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::LOAD, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FABS, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FSIN, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FCOS, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FREM, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FPOWI, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FSQRT, (MVT::ValueType)VT, Expand);
+    setOperationAction(ISD::FCOPYSIGN, (MVT::ValueType)VT, Expand);
+  }
+
+  if (Subtarget->hasMMX()) {
+    addRegisterClass(MVT::v8i8,  X86::VR64RegisterClass);
+    addRegisterClass(MVT::v4i16, X86::VR64RegisterClass);
+    addRegisterClass(MVT::v2i32, X86::VR64RegisterClass);
+    addRegisterClass(MVT::v1i64, X86::VR64RegisterClass);
+
+    // FIXME: add MMX packed arithmetic
+
+    setOperationAction(ISD::ADD,                MVT::v8i8,  Legal);
+    setOperationAction(ISD::ADD,                MVT::v4i16, Legal);
+    setOperationAction(ISD::ADD,                MVT::v2i32, Legal);
+    setOperationAction(ISD::ADD,                MVT::v1i64, Legal);
+
+    setOperationAction(ISD::SUB,                MVT::v8i8,  Legal);
+    setOperationAction(ISD::SUB,                MVT::v4i16, Legal);
+    setOperationAction(ISD::SUB,                MVT::v2i32, Legal);
+
+    setOperationAction(ISD::MULHS,              MVT::v4i16, Legal);
+    setOperationAction(ISD::MUL,                MVT::v4i16, Legal);
+
+    setOperationAction(ISD::AND,                MVT::v8i8,  Promote);
+    AddPromotedToType (ISD::AND,                MVT::v8i8,  MVT::v1i64);
+    setOperationAction(ISD::AND,                MVT::v4i16, Promote);
+    AddPromotedToType (ISD::AND,                MVT::v4i16, MVT::v1i64);
+    setOperationAction(ISD::AND,                MVT::v2i32, Promote);
+    AddPromotedToType (ISD::AND,                MVT::v2i32, MVT::v1i64);
+    setOperationAction(ISD::AND,                MVT::v1i64, Legal);
+
+    setOperationAction(ISD::OR,                 MVT::v8i8,  Promote);
+    AddPromotedToType (ISD::OR,                 MVT::v8i8,  MVT::v1i64);
+    setOperationAction(ISD::OR,                 MVT::v4i16, Promote);
+    AddPromotedToType (ISD::OR,                 MVT::v4i16, MVT::v1i64);
+    setOperationAction(ISD::OR,                 MVT::v2i32, Promote);
+    AddPromotedToType (ISD::OR,                 MVT::v2i32, MVT::v1i64);
+    setOperationAction(ISD::OR,                 MVT::v1i64, Legal);
+
+    setOperationAction(ISD::XOR,                MVT::v8i8,  Promote);
+    AddPromotedToType (ISD::XOR,                MVT::v8i8,  MVT::v1i64);
+    setOperationAction(ISD::XOR,                MVT::v4i16, Promote);
+    AddPromotedToType (ISD::XOR,                MVT::v4i16, MVT::v1i64);
+    setOperationAction(ISD::XOR,                MVT::v2i32, Promote);
+    AddPromotedToType (ISD::XOR,                MVT::v2i32, MVT::v1i64);
+    setOperationAction(ISD::XOR,                MVT::v1i64, Legal);
+
+    setOperationAction(ISD::LOAD,               MVT::v8i8,  Promote);
+    AddPromotedToType (ISD::LOAD,               MVT::v8i8,  MVT::v1i64);
+    setOperationAction(ISD::LOAD,               MVT::v4i16, Promote);
+    AddPromotedToType (ISD::LOAD,               MVT::v4i16, MVT::v1i64);
+    setOperationAction(ISD::LOAD,               MVT::v2i32, Promote);
+    AddPromotedToType (ISD::LOAD,               MVT::v2i32, MVT::v1i64);
+    setOperationAction(ISD::LOAD,               MVT::v1i64, Legal);
+
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i8,  Custom);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4i16, Custom);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i32, Custom);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v1i64, Custom);
+
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v8i8,  Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4i16, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i32, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v1i64, Custom);
+
+    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Custom);
+  }
+
+  if (Subtarget->hasSSE1()) {
+    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);
+
+    setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
+    setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
+    setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
+    setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
+    setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
+    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
+    setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
+    setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
+    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
+  }
+
+  if (Subtarget->hasSSE2()) {
+    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);
+    addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
+    addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
+    addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
+    addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);
+
+    setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
+    setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
+    setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
+    setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
+    setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
+    setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
+    setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
+    setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
+    setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
+    setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
+    setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
+    setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
+    setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
+    setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
+    setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
+    setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
+
+    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
+    // Implement v4f32 insert_vector_elt in terms of SSE2 v8i16 ones.
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
+
+    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
+    for (unsigned VT = (unsigned)MVT::v16i8; VT != (unsigned)MVT::v2i64; VT++) {
+      setOperationAction(ISD::BUILD_VECTOR,        (MVT::ValueType)VT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE,      (MVT::ValueType)VT, Custom);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT,  (MVT::ValueType)VT, Custom);
+    }
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
+
+    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
+    for (unsigned VT = (unsigned)MVT::v16i8; VT != (unsigned)MVT::v2i64; VT++) {
+      setOperationAction(ISD::AND,    (MVT::ValueType)VT, Promote);
+      AddPromotedToType (ISD::AND,    (MVT::ValueType)VT, MVT::v2i64);
+      setOperationAction(ISD::OR,     (MVT::ValueType)VT, Promote);
+      AddPromotedToType (ISD::OR,     (MVT::ValueType)VT, MVT::v2i64);
+      setOperationAction(ISD::XOR,    (MVT::ValueType)VT, Promote);
+      AddPromotedToType (ISD::XOR,    (MVT::ValueType)VT, MVT::v2i64);
+      setOperationAction(ISD::LOAD,   (MVT::ValueType)VT, Promote);
+      AddPromotedToType (ISD::LOAD,   (MVT::ValueType)VT, MVT::v2i64);
+      setOperationAction(ISD::SELECT, (MVT::ValueType)VT, Promote);
+      AddPromotedToType (ISD::SELECT, (MVT::ValueType)VT, MVT::v2i64);
+    }
+
+    // Custom lower v2i64 and v2f64 selects.
+    setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
+    setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
+    setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
+    setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
+  }
+
+  // We want to custom lower some of our intrinsics.
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
+  // We have target-specific dag combine patterns for the following nodes:
+  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+  setTargetDAGCombine(ISD::SELECT);
+
+  computeRegisterProperties();
+
+  // FIXME: These should be based on subtarget info. Plus, the values should
+  // be smaller when we are optimizing for size.
+  maxStoresPerMemset = 16; // For %llvm.memset -> sequence of stores
+  maxStoresPerMemcpy = 16; // For %llvm.memcpy -> sequence of stores
+  maxStoresPerMemmove = 16; // For %llvm.memmove -> sequence of stores
+  allowUnalignedMemoryAccesses = true; // x86 supports it!
+}
+
+
+//===----------------------------------------------------------------------===//
+//               Return Value Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "X86GenCallingConv.inc"
+    
+/// LowerRET - Lower an ISD::RET node.
+SDOperand X86TargetLowering::LowerRET(SDOperand Op, SelectionDAG &DAG) {
+  assert((Op.getNumOperands() & 1) == 1 && "ISD::RET should have odd # args");
+  
+  SmallVector<CCValAssign, 16> RVLocs;
+  unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv();
+  bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
+  CCState CCInfo(CC, isVarArg, getTargetMachine(), RVLocs);
+  CCInfo.AnalyzeReturn(Op.Val, RetCC_X86);
+  
+  
+  // If this is the first return lowered for this function, add the regs to the
+  // liveout set for the function.
+  if (DAG.getMachineFunction().liveout_empty()) {
+    for (unsigned i = 0; i != RVLocs.size(); ++i)
+      if (RVLocs[i].isRegLoc())
+        DAG.getMachineFunction().addLiveOut(RVLocs[i].getLocReg());
+  }
+  
+  SDOperand Chain = Op.getOperand(0);
+  SDOperand Flag;
+  
+  // Copy the result values into the output registers.
+  if (RVLocs.size() != 1 || !RVLocs[0].isRegLoc() ||
+      RVLocs[0].getLocReg() != X86::ST0) {
+    for (unsigned i = 0; i != RVLocs.size(); ++i) {
+      CCValAssign &VA = RVLocs[i];
+      assert(VA.isRegLoc() && "Can only return in registers!");
+      Chain = DAG.getCopyToReg(Chain, VA.getLocReg(), Op.getOperand(i*2+1),
+                               Flag);
+      Flag = Chain.getValue(1);
+    }
+  } else {
+    // We need to handle a destination of ST0 specially, because it isn't really
+    // a register.
+    SDOperand Value = Op.getOperand(1);
+    
+    // If this is an FP return with ScalarSSE, we need to move the value from
+    // an XMM register onto the fp-stack.
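+    // Roughly, for a double return this ends up as something like:
+    //   movsd %xmm0, (slot); fldl (slot)
+    // after which the normal FP return path takes over.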
+    if (X86ScalarSSE) {
+      SDOperand MemLoc;
+      
+      // If this value is the result of a scalar-SSE load, don't store it back
+      // to the stack only to reload it: just reuse the load's address for the
+      // FLD below.
+      if (ISD::isNON_EXTLoad(Value.Val) &&
+          (Chain == Value.getValue(1) || Chain == Value.getOperand(0))) {
+        Chain  = Value.getOperand(0);
+        MemLoc = Value.getOperand(1);
+      } else {
+        // Spill the value to memory and reload it into top of stack.
+        unsigned Size = MVT::getSizeInBits(RVLocs[0].getValVT())/8;
+        MachineFunction &MF = DAG.getMachineFunction();
+        int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size);
+        MemLoc = DAG.getFrameIndex(SSFI, getPointerTy());
+        Chain = DAG.getStore(Op.getOperand(0), Value, MemLoc, NULL, 0);
+      }
+      SDVTList Tys = DAG.getVTList(RVLocs[0].getValVT(), MVT::Other);
+      SDOperand Ops[] = {Chain, MemLoc, DAG.getValueType(RVLocs[0].getValVT())};
+      Value = DAG.getNode(X86ISD::FLD, Tys, Ops, 3);
+      Chain = Value.getValue(1);
+    }
+    
+    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+    SDOperand Ops[] = { Chain, Value };
+    Chain = DAG.getNode(X86ISD::FP_SET_RESULT, Tys, Ops, 2);
+    Flag = Chain.getValue(1);
+  }
+  
+  SDOperand BytesToPop = DAG.getConstant(getBytesToPopOnReturn(), MVT::i16);
+  if (Flag.Val)
+    return DAG.getNode(X86ISD::RET_FLAG, MVT::Other, Chain, BytesToPop, Flag);
+  else
+    return DAG.getNode(X86ISD::RET_FLAG, MVT::Other, Chain, BytesToPop);
+}
+
+
+/// LowerCallResult - Lower the result values of an ISD::CALL into the
+/// appropriate copies out of appropriate physical registers.  This assumes that
+/// Chain/InFlag are the input chain/flag to use, and that TheCall is the call
+/// being lowered.  This returns an SDNode with the same number of values as the
+/// ISD::CALL.
+SDNode *X86TargetLowering::
+LowerCallResult(SDOperand Chain, SDOperand InFlag, SDNode *TheCall, 
+                unsigned CallingConv, SelectionDAG &DAG) {
+  
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  bool isVarArg = cast<ConstantSDNode>(TheCall->getOperand(2))->getValue() != 0;
+  CCState CCInfo(CallingConv, isVarArg, getTargetMachine(), RVLocs);
+  CCInfo.AnalyzeCallResult(TheCall, RetCC_X86);
+
+  
+  SmallVector<SDOperand, 8> ResultVals;
+  
+  // Copy all of the result registers out of their specified physreg.
+  if (RVLocs.size() != 1 || RVLocs[0].getLocReg() != X86::ST0) {
+    for (unsigned i = 0; i != RVLocs.size(); ++i) {
+      Chain = DAG.getCopyFromReg(Chain, RVLocs[i].getLocReg(),
+                                 RVLocs[i].getValVT(), InFlag).getValue(1);
+      InFlag = Chain.getValue(2);
+      ResultVals.push_back(Chain.getValue(0));
+    }
+  } else {
+    // Copies from the FP stack are special, as ST0 isn't a valid register
+    // before the fp stackifier runs.
+    
+    // Copy ST0 into an RFP register with FP_GET_RESULT.
+    SDVTList Tys = DAG.getVTList(RVLocs[0].getValVT(), MVT::Other, MVT::Flag);
+    SDOperand GROps[] = { Chain, InFlag };
+    SDOperand RetVal = DAG.getNode(X86ISD::FP_GET_RESULT, Tys, GROps, 2);
+    Chain  = RetVal.getValue(1);
+    InFlag = RetVal.getValue(2);
+    
+    // If we are using ScalarSSE, store ST(0) to the stack and reload it into
+    // an XMM register.
+    if (X86ScalarSSE) {
+      // FIXME: Currently the FST is flagged to the FP_GET_RESULT. This
+      // shouldn't be necessary except that RFP cannot be live across
+      // multiple blocks. When the stackifier is fixed, they can be uncoupled.
+      MachineFunction &MF = DAG.getMachineFunction();
+      int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8);
+      SDOperand StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+      SDOperand Ops[] = {
+        Chain, RetVal, StackSlot, DAG.getValueType(RVLocs[0].getValVT()), InFlag
+      };
+      Chain = DAG.getNode(X86ISD::FST, MVT::Other, Ops, 5);
+      RetVal = DAG.getLoad(RVLocs[0].getValVT(), Chain, StackSlot, NULL, 0);
+      Chain = RetVal.getValue(1);
+    }
+    ResultVals.push_back(RetVal);
+  }
+  
+  // Merge everything together with a MERGE_VALUES node.
+  ResultVals.push_back(Chain);
+  return DAG.getNode(ISD::MERGE_VALUES, TheCall->getVTList(),
+                     &ResultVals[0], ResultVals.size()).Val;
+}
+
+
+//===----------------------------------------------------------------------===//
+//                C & StdCall Calling Convention implementation
+//===----------------------------------------------------------------------===//
+//  The StdCall calling convention is standard for many Windows API routines.
+//  It differs from the C calling convention just a little: the callee, not the
+//  caller, cleans up the stack. Symbols should also be decorated in some fancy
+//  way :) It doesn't support any vector arguments.
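+//  For example, "int __stdcall foo(int, int)" is typically exported as
+//  "_foo@8" and returns with "ret 8", popping its 8 bytes of arguments.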
+
+/// AddLiveIn - This helper function adds the specified physical register to the
+/// MachineFunction as a live in value.  It also creates a corresponding virtual
+/// register for it.
+static unsigned AddLiveIn(MachineFunction &MF, unsigned PReg,
+                          const TargetRegisterClass *RC) {
+  assert(RC->contains(PReg) && "Not the correct regclass!");
+  unsigned VReg = MF.getSSARegMap()->createVirtualRegister(RC);
+  MF.addLiveIn(PReg, VReg);
+  return VReg;
+}
+
+SDOperand X86TargetLowering::LowerCCCArguments(SDOperand Op, SelectionDAG &DAG,
+                                               bool isStdCall) {
+  unsigned NumArgs = Op.Val->getNumValues() - 1;
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  SDOperand Root = Op.getOperand(0);
+  bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(MF.getFunction()->getCallingConv(), isVarArg,
+                 getTargetMachine(), ArgLocs);
+  CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_32_C);
+   
+  SmallVector<SDOperand, 8> ArgValues;
+  unsigned LastVal = ~0U;
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
+    // places.
+    assert(VA.getValNo() != LastVal &&
+           "Don't support value assigned to multiple locs yet");
+    LastVal = VA.getValNo();
+    
+    if (VA.isRegLoc()) {
+      MVT::ValueType RegVT = VA.getLocVT();
+      TargetRegisterClass *RC;
+      if (RegVT == MVT::i32)
+        RC = X86::GR32RegisterClass;
+      else {
+        assert(MVT::isVector(RegVT));
+        RC = X86::VR128RegisterClass;
+      }
+      
+      unsigned Reg = AddLiveIn(DAG.getMachineFunction(), VA.getLocReg(), RC);
+      SDOperand ArgValue = DAG.getCopyFromReg(Root, Reg, RegVT);
+      
+      // If this is an 8 or 16-bit value, it is really passed promoted to 32
+      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
+      // right size.
+      if (VA.getLocInfo() == CCValAssign::SExt)
+        ArgValue = DAG.getNode(ISD::AssertSext, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+      else if (VA.getLocInfo() == CCValAssign::ZExt)
+        ArgValue = DAG.getNode(ISD::AssertZext, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+      
+      if (VA.getLocInfo() != CCValAssign::Full)
+        ArgValue = DAG.getNode(ISD::TRUNCATE, VA.getValVT(), ArgValue);
+      
+      ArgValues.push_back(ArgValue);
+    } else {
+      assert(VA.isMemLoc());
+      
+      // Create the nodes corresponding to a load from this parameter slot.
+      int FI = MFI->CreateFixedObject(MVT::getSizeInBits(VA.getValVT())/8,
+                                      VA.getLocMemOffset());
+      SDOperand FIN = DAG.getFrameIndex(FI, getPointerTy());
+      ArgValues.push_back(DAG.getLoad(VA.getValVT(), Root, FIN, NULL, 0));
+    }
+  }
+  
+  unsigned StackSize = CCInfo.getNextStackOffset();
+
+  ArgValues.push_back(Root);
+
+  // If the function takes a variable number of arguments, make a frame index for
+  // the start of the first vararg value... for expansion of llvm.va_start.
+  if (isVarArg)
+    VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize);
+
+  if (isStdCall && !isVarArg) {
+    BytesToPopOnReturn  = StackSize;    // Callee pops everything.
+    BytesCallerReserves = 0;
+  } else {
+    BytesToPopOnReturn  = 0; // Callee pops nothing.
+    
+    // If this is an sret function, the return should pop the hidden pointer.
+    if (NumArgs &&
+        (cast<ConstantSDNode>(Op.getOperand(3))->getValue() &
+         ISD::ParamFlags::StructReturn))
+      BytesToPopOnReturn = 4;  
+    
+    BytesCallerReserves = StackSize;
+  }
+  
+  RegSaveFrameIndex = 0xAAAAAAA;  // X86-64 only.
+  ReturnAddrIndex = 0;            // No return address slot generated yet.
+
+  MF.getInfo<X86MachineFunctionInfo>()
+    ->setBytesToPopOnReturn(BytesToPopOnReturn);
+
+  // Return the new list of results.
+  return DAG.getNode(ISD::MERGE_VALUES, Op.Val->getVTList(),
+                     &ArgValues[0], ArgValues.size()).getValue(Op.ResNo);
+}
+
+SDOperand X86TargetLowering::LowerCCCCallTo(SDOperand Op, SelectionDAG &DAG,
+                                            unsigned CC) {
+  SDOperand Chain     = Op.getOperand(0);
+  bool isVarArg       = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+  bool isTailCall     = cast<ConstantSDNode>(Op.getOperand(3))->getValue() != 0;
+  SDOperand Callee    = Op.getOperand(4);
+  unsigned NumOps     = (Op.getNumOperands() - 5) / 2;
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+  CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_32_C);
+  
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+
+  Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes, getPointerTy()));
+
+  SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
+  SmallVector<SDOperand, 8> MemOpChains;
+
+  SDOperand StackPtr;
+
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    SDOperand Arg = Op.getOperand(5+2*VA.getValNo());
+    
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    default: assert(0 && "Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, VA.getLocVT(), Arg);
+      break;
+    }
+    
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      assert(VA.isMemLoc());
+      if (StackPtr.Val == 0)
+        StackPtr = DAG.getRegister(getStackPtrReg(), getPointerTy());
+      SDOperand PtrOff = DAG.getConstant(VA.getLocMemOffset(), getPointerTy());
+      PtrOff = DAG.getNode(ISD::ADD, getPointerTy(), StackPtr, PtrOff);
+      MemOpChains.push_back(DAG.getStore(Chain, Arg, PtrOff, NULL, 0));
+    }
+  }
+
+  // If the first argument is an sret pointer, remember it.
+  bool isSRet = NumOps &&
+    (cast<ConstantSDNode>(Op.getOperand(6))->getValue() &
+     ISD::ParamFlags::StructReturn);
+  
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
+                        &MemOpChains[0], MemOpChains.size());
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into registers.
+  SDOperand InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second,
+                             InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // ELF / PIC requires the GOT pointer to be in the EBX register before
+  // function calls made via the PLT.
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+      Subtarget->isPICStyleGOT()) {
+    Chain = DAG.getCopyToReg(Chain, X86::EBX,
+                             DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()),
+                             InFlag);
+    InFlag = Chain.getValue(1);
+  }
+  
+  // If the callee is a GlobalAddress node (quite common, every direct call is)
+  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    // We should use an extra load for direct calls to dllimported functions in
+    // non-JIT mode.
+    if (!Subtarget->GVRequiresExtraLoad(G->getGlobal(),
+                                        getTargetMachine(), true))
+      Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy());
+  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
+    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());
+
+  // Returns a chain & a flag for retval copy to use.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+  SmallVector<SDOperand, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add argument registers to the end of the list so that they are known live
+  // into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  // Add an implicit use of the GOT pointer in EBX.
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+      Subtarget->isPICStyleGOT())
+    Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
+  
+  if (InFlag.Val)
+    Ops.push_back(InFlag);
+
+  Chain = DAG.getNode(isTailCall ? X86ISD::TAILCALL : X86ISD::CALL,
+                      NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  // Create the CALLSEQ_END node.
+  unsigned NumBytesForCalleeToPush = 0;
+
+  if (CC == CallingConv::X86_StdCall) {
+    if (isVarArg)
+      NumBytesForCalleeToPush = isSRet ? 4 : 0;
+    else
+      NumBytesForCalleeToPush = NumBytes;
+  } else {
+    // If this is a call to a struct-return function, the callee
+    // pops the hidden struct pointer, so we have to push it back.
+    // This is common for Darwin/X86, Linux & Mingw32 targets.
+    NumBytesForCalleeToPush = isSRet ? 4 : 0;
+  }
+  
+  NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+  Ops.clear();
+  Ops.push_back(Chain);
+  Ops.push_back(DAG.getConstant(NumBytes, getPointerTy()));
+  Ops.push_back(DAG.getConstant(NumBytesForCalleeToPush, getPointerTy()));
+  Ops.push_back(InFlag);
+  Chain = DAG.getNode(ISD::CALLSEQ_END, NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return SDOperand(LowerCallResult(Chain, InFlag, Op.Val, CC, DAG), Op.ResNo);
+}
+
+
+//===----------------------------------------------------------------------===//
+//                   FastCall Calling Convention implementation
+//===----------------------------------------------------------------------===//
+//
+// The X86 'fastcall' calling convention passes up to two integer arguments in
+// registers (an appropriate portion of ECX/EDX), passes arguments in C order,
+// requires that the callee pop its arguments off the stack (allowing proper
+// tail calls), and has the same return value conventions as the C calling
+// convention.
+//
+// This calling convention always arranges for the callee pop value to be 8n+4
+// bytes, which is needed for tail recursion elimination and stack alignment
+// reasons.
+SDOperand
+X86TargetLowering::LowerFastCCArguments(SDOperand Op, SelectionDAG &DAG) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  SDOperand Root = Op.getOperand(0);
+  bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(MF.getFunction()->getCallingConv(), isVarArg,
+                 getTargetMachine(), ArgLocs);
+  CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_32_FastCall);
+  
+  SmallVector<SDOperand, 8> ArgValues;
+  unsigned LastVal = ~0U;
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
+    // places.
+    assert(VA.getValNo() != LastVal &&
+           "Don't support value assigned to multiple locs yet");
+    LastVal = VA.getValNo();
+    
+    if (VA.isRegLoc()) {
+      MVT::ValueType RegVT = VA.getLocVT();
+      TargetRegisterClass *RC;
+      if (RegVT == MVT::i32)
+        RC = X86::GR32RegisterClass;
+      else {
+        assert(MVT::isVector(RegVT));
+        RC = X86::VR128RegisterClass;
+      }
+      
+      unsigned Reg = AddLiveIn(DAG.getMachineFunction(), VA.getLocReg(), RC);
+      SDOperand ArgValue = DAG.getCopyFromReg(Root, Reg, RegVT);
+      
+      // If this is an 8 or 16-bit value, it is really passed promoted to 32
+      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
+      // right size.
+      if (VA.getLocInfo() == CCValAssign::SExt)
+        ArgValue = DAG.getNode(ISD::AssertSext, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+      else if (VA.getLocInfo() == CCValAssign::ZExt)
+        ArgValue = DAG.getNode(ISD::AssertZext, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+      
+      if (VA.getLocInfo() != CCValAssign::Full)
+        ArgValue = DAG.getNode(ISD::TRUNCATE, VA.getValVT(), ArgValue);
+      
+      ArgValues.push_back(ArgValue);
+    } else {
+      assert(VA.isMemLoc());
+      
+      // Create the nodes corresponding to a load from this parameter slot.
+      int FI = MFI->CreateFixedObject(MVT::getSizeInBits(VA.getValVT())/8,
+                                      VA.getLocMemOffset());
+      SDOperand FIN = DAG.getFrameIndex(FI, getPointerTy());
+      ArgValues.push_back(DAG.getLoad(VA.getValVT(), Root, FIN, NULL, 0));
+    }
+  }
+  
+  ArgValues.push_back(Root);
+
+  unsigned StackSize = CCInfo.getNextStackOffset();
+
+  if (!Subtarget->isTargetCygMing() && !Subtarget->isTargetWindows()) {
+    // Make sure the argument area takes 8n+4 bytes so that the start of the
+    // arguments stays aligned once the return address has been pushed.
+    if ((StackSize & 7) == 0)
+      StackSize += 4;
+  }
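+  // For example (illustrative): StackSize == 12 is already of the form 8n+4
+  // and is left alone by the check above, while StackSize == 16 becomes 20,
+  // keeping the stack 8-byte aligned once the 4-byte return address is pushed.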
+
+  VarArgsFrameIndex = 0xAAAAAAA;   // fastcc functions can't have varargs.
+  RegSaveFrameIndex = 0xAAAAAAA;   // X86-64 only.
+  ReturnAddrIndex = 0;             // No return address slot generated yet.
+  BytesToPopOnReturn = StackSize;  // Callee pops all stack arguments.
+  BytesCallerReserves = 0;
+
+  MF.getInfo<X86MachineFunctionInfo>()
+    ->setBytesToPopOnReturn(BytesToPopOnReturn);
+
+  // Return the new list of results.
+  return DAG.getNode(ISD::MERGE_VALUES, Op.Val->getVTList(),
+                     &ArgValues[0], ArgValues.size()).getValue(Op.ResNo);
+}
+
+SDOperand X86TargetLowering::LowerFastCCCallTo(SDOperand Op, SelectionDAG &DAG,
+                                               unsigned CC) {
+  SDOperand Chain     = Op.getOperand(0);
+  bool isTailCall     = cast<ConstantSDNode>(Op.getOperand(3))->getValue() != 0;
+  bool isVarArg       = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+  SDOperand Callee    = Op.getOperand(4);
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+  CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_32_FastCall);
+  
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+
+  if (!Subtarget->isTargetCygMing() && !Subtarget->isTargetWindows()) {
+    // Make sure the argument area takes 8n+4 bytes so that the start of the
+    // arguments stays aligned once the return address has been pushed.
+    if ((NumBytes & 7) == 0)
+      NumBytes += 4;
+  }
+
+  Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes, getPointerTy()));
+  
+  SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
+  SmallVector<SDOperand, 8> MemOpChains;
+  
+  SDOperand StackPtr;
+  
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    SDOperand Arg = Op.getOperand(5+2*VA.getValNo());
+    
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+      default: assert(0 && "Unknown loc info!");
+      case CCValAssign::Full: break;
+      case CCValAssign::SExt:
+        Arg = DAG.getNode(ISD::SIGN_EXTEND, VA.getLocVT(), Arg);
+        break;
+      case CCValAssign::ZExt:
+        Arg = DAG.getNode(ISD::ZERO_EXTEND, VA.getLocVT(), Arg);
+        break;
+      case CCValAssign::AExt:
+        Arg = DAG.getNode(ISD::ANY_EXTEND, VA.getLocVT(), Arg);
+        break;
+    }
+    
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      assert(VA.isMemLoc());
+      if (StackPtr.Val == 0)
+        StackPtr = DAG.getRegister(getStackPtrReg(), getPointerTy());
+      SDOperand PtrOff = DAG.getConstant(VA.getLocMemOffset(), getPointerTy());
+      PtrOff = DAG.getNode(ISD::ADD, getPointerTy(), StackPtr, PtrOff);
+      MemOpChains.push_back(DAG.getStore(Chain, Arg, PtrOff, NULL, 0));
+    }
+  }
+
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
+                        &MemOpChains[0], MemOpChains.size());
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into registers.
+  SDOperand InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second,
+                             InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // If the callee is a GlobalAddress node (quite common, every direct call is),
+  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    // We should use an extra load for direct calls to dllimported functions
+    // in non-JIT mode.
+    if (!Subtarget->GVRequiresExtraLoad(G->getGlobal(),
+                                        getTargetMachine(), true))
+      Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy());
+  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
+    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());
+
+  // ELF / PIC requires the GOT pointer to be in the EBX register before
+  // making function calls via the PLT.
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+      Subtarget->isPICStyleGOT()) {
+    Chain = DAG.getCopyToReg(Chain, X86::EBX,
+                             DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()),
+                             InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // Returns a chain & a flag for retval copy to use.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+  SmallVector<SDOperand, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add argument registers to the end of the list so that they are known live
+  // into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  // Add an implicit use of the GOT pointer in EBX.
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+      Subtarget->isPICStyleGOT())
+    Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
+
+  if (InFlag.Val)
+    Ops.push_back(InFlag);
+
+  // FIXME: Do not generate X86ISD::TAILCALL for now.
+  Chain = DAG.getNode(isTailCall ? X86ISD::TAILCALL : X86ISD::CALL,
+                      NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  // Returns a flag for retval copy to use.
+  NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+  Ops.clear();
+  Ops.push_back(Chain);
+  Ops.push_back(DAG.getConstant(NumBytes, getPointerTy()));
+  Ops.push_back(DAG.getConstant(NumBytes, getPointerTy()));
+  Ops.push_back(InFlag);
+  Chain = DAG.getNode(ISD::CALLSEQ_END, NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return SDOperand(LowerCallResult(Chain, InFlag, Op.Val, CC, DAG), Op.ResNo);
+}
+
+
+//===----------------------------------------------------------------------===//
+//                 X86-64 C Calling Convention implementation
+//===----------------------------------------------------------------------===//
+
+SDOperand
+X86TargetLowering::LowerX86_64CCCArguments(SDOperand Op, SelectionDAG &DAG) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  SDOperand Root = Op.getOperand(0);
+  bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+
+  static const unsigned GPR64ArgRegs[] = {
+    X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8,  X86::R9
+  };
+  static const unsigned XMMArgRegs[] = {
+    X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+    X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+  };
+
+  
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(MF.getFunction()->getCallingConv(), isVarArg,
+                 getTargetMachine(), ArgLocs);
+  CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_64_C);
+  
+  SmallVector<SDOperand, 8> ArgValues;
+  unsigned LastVal = ~0U;
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
+    // places.
+    assert(VA.getValNo() != LastVal &&
+           "Don't support value assigned to multiple locs yet");
+    LastVal = VA.getValNo();
+    
+    if (VA.isRegLoc()) {
+      MVT::ValueType RegVT = VA.getLocVT();
+      TargetRegisterClass *RC;
+      if (RegVT == MVT::i32)
+        RC = X86::GR32RegisterClass;
+      else if (RegVT == MVT::i64)
+        RC = X86::GR64RegisterClass;
+      else if (RegVT == MVT::f32)
+        RC = X86::FR32RegisterClass;
+      else if (RegVT == MVT::f64)
+        RC = X86::FR64RegisterClass;
+      else {
+        assert(MVT::isVector(RegVT));
+        if (MVT::getSizeInBits(RegVT) == 64) {
+          RC = X86::GR64RegisterClass;       // MMX values are passed in GPRs.
+          RegVT = MVT::i64;
+        } else
+          RC = X86::VR128RegisterClass;
+      }
+
+      unsigned Reg = AddLiveIn(DAG.getMachineFunction(), VA.getLocReg(), RC);
+      SDOperand ArgValue = DAG.getCopyFromReg(Root, Reg, RegVT);
+      
+      // If this is an 8 or 16-bit value, it is really passed promoted to 32
+      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
+      // right size.
+      if (VA.getLocInfo() == CCValAssign::SExt)
+        ArgValue = DAG.getNode(ISD::AssertSext, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+      else if (VA.getLocInfo() == CCValAssign::ZExt)
+        ArgValue = DAG.getNode(ISD::AssertZext, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+      
+      if (VA.getLocInfo() != CCValAssign::Full)
+        ArgValue = DAG.getNode(ISD::TRUNCATE, VA.getValVT(), ArgValue);
+      
+      // Handle MMX values passed in GPRs.
+      if (RegVT != VA.getLocVT() && RC == X86::GR64RegisterClass &&
+          MVT::getSizeInBits(RegVT) == 64)
+        ArgValue = DAG.getNode(ISD::BIT_CONVERT, VA.getLocVT(), ArgValue);
+      
+      ArgValues.push_back(ArgValue);
+    } else {
+      assert(VA.isMemLoc());
+    
+      // Create the nodes corresponding to a load from this parameter slot.
+      int FI = MFI->CreateFixedObject(MVT::getSizeInBits(VA.getValVT())/8,
+                                      VA.getLocMemOffset());
+      SDOperand FIN = DAG.getFrameIndex(FI, getPointerTy());
+      ArgValues.push_back(DAG.getLoad(VA.getValVT(), Root, FIN, NULL, 0));
+    }
+  }
+  
+  unsigned StackSize = CCInfo.getNextStackOffset();
+  
+  // If the function takes variable number of arguments, make a frame index for
+  // the start of the first vararg value... for expansion of llvm.va_start.
+  if (isVarArg) {
+    unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 6);
+    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
+    
+    // For X86-64, if there are vararg parameters that are passed via
+    // registers, then we must store them to their spots on the stack so they
+    // may be loaded by dereferencing the result of va_next.
+    VarArgsGPOffset = NumIntRegs * 8;
+    VarArgsFPOffset = 6 * 8 + NumXMMRegs * 16;
+    VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize);
+    RegSaveFrameIndex = MFI->CreateStackObject(6 * 8 + 8 * 16, 16);
+
+    // Store the integer parameter registers.
+    SmallVector<SDOperand, 8> MemOps;
+    SDOperand RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
+    SDOperand FIN = DAG.getNode(ISD::ADD, getPointerTy(), RSFIN,
+                              DAG.getConstant(VarArgsGPOffset, getPointerTy()));
+    for (; NumIntRegs != 6; ++NumIntRegs) {
+      unsigned VReg = AddLiveIn(MF, GPR64ArgRegs[NumIntRegs],
+                                X86::GR64RegisterClass);
+      SDOperand Val = DAG.getCopyFromReg(Root, VReg, MVT::i64);
+      SDOperand Store = DAG.getStore(Val.getValue(1), Val, FIN, NULL, 0);
+      MemOps.push_back(Store);
+      FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN,
+                        DAG.getConstant(8, getPointerTy()));
+    }
+
+    // Now store the XMM (fp + vector) parameter registers.
+    FIN = DAG.getNode(ISD::ADD, getPointerTy(), RSFIN,
+                      DAG.getConstant(VarArgsFPOffset, getPointerTy()));
+    for (; NumXMMRegs != 8; ++NumXMMRegs) {
+      unsigned VReg = AddLiveIn(MF, XMMArgRegs[NumXMMRegs],
+                                X86::VR128RegisterClass);
+      SDOperand Val = DAG.getCopyFromReg(Root, VReg, MVT::v4f32);
+      SDOperand Store = DAG.getStore(Val.getValue(1), Val, FIN, NULL, 0);
+      MemOps.push_back(Store);
+      FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN,
+                        DAG.getConstant(16, getPointerTy()));
+    }
+    if (!MemOps.empty())
+      Root = DAG.getNode(ISD::TokenFactor, MVT::Other,
+                         &MemOps[0], MemOps.size());
+  }
+
+  ArgValues.push_back(Root);
+
+  ReturnAddrIndex = 0;     // No return address slot generated yet.
+  BytesToPopOnReturn = 0;  // Callee pops nothing.
+  BytesCallerReserves = StackSize;
+
+  // Return the new list of results.
+  return DAG.getNode(ISD::MERGE_VALUES, Op.Val->getVTList(),
+                     &ArgValues[0], ArgValues.size()).getValue(Op.ResNo);
+}
+
+SDOperand
+X86TargetLowering::LowerX86_64CCCCallTo(SDOperand Op, SelectionDAG &DAG,
+                                        unsigned CC) {
+  SDOperand Chain     = Op.getOperand(0);
+  bool isVarArg       = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+  bool isTailCall     = cast<ConstantSDNode>(Op.getOperand(3))->getValue() != 0;
+  SDOperand Callee    = Op.getOperand(4);
+  
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+  CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_64_C);
+    
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+  Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes, getPointerTy()));
+
+  SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
+  SmallVector<SDOperand, 8> MemOpChains;
+
+  SDOperand StackPtr;
+  
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    SDOperand Arg = Op.getOperand(5+2*VA.getValNo());
+    
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    default: assert(0 && "Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, VA.getLocVT(), Arg);
+      break;
+    }
+    
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      assert(VA.isMemLoc());
+      if (StackPtr.Val == 0)
+        StackPtr = DAG.getRegister(getStackPtrReg(), getPointerTy());
+      SDOperand PtrOff = DAG.getConstant(VA.getLocMemOffset(), getPointerTy());
+      PtrOff = DAG.getNode(ISD::ADD, getPointerTy(), StackPtr, PtrOff);
+      MemOpChains.push_back(DAG.getStore(Chain, Arg, PtrOff, NULL, 0));
+    }
+  }
+  
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
+                        &MemOpChains[0], MemOpChains.size());
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into registers.
+  SDOperand InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second,
+                             InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  if (isVarArg) {
+    // From AMD64 ABI document:
+    // For calls that may call functions that use varargs or stdargs
+    // (prototype-less calls or calls to functions containing ellipsis (...) in
+    // the declaration), %al is used as a hidden argument to specify the
+    // number of SSE registers used. The contents of %al do not need to match
+    // exactly the number of registers, but must be an upper bound on the
+    // number of SSE registers used and must be in the range 0 - 8 inclusive.
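+    //
+    // For example (illustrative), for a varargs call such as
+    //   printf("%f %f\n", x, y);
+    // where the two double arguments are passed in XMM0 and XMM1, the copy
+    // below sets %al to 2, the index of the first unallocated XMM register.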
+    
+    // Count the number of XMM registers allocated.
+    static const unsigned XMMArgRegs[] = {
+      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+    };
+    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
+    
+    Chain = DAG.getCopyToReg(Chain, X86::AL,
+                             DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // If the callee is a GlobalAddress node (quite common, every direct call is),
+  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    // We should use an extra load for direct calls to dllimported functions
+    // in non-JIT mode.
+    if (getTargetMachine().getCodeModel() != CodeModel::Large
+        && !Subtarget->GVRequiresExtraLoad(G->getGlobal(),
+                                           getTargetMachine(), true))
+      Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy());
+  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
+    if (getTargetMachine().getCodeModel() != CodeModel::Large)
+      Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());
+
+  // Returns a chain & a flag for retval copy to use.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+  SmallVector<SDOperand, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add argument registers to the end of the list so that they are known live
+  // into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  if (InFlag.Val)
+    Ops.push_back(InFlag);
+
+  // FIXME: Do not generate X86ISD::TAILCALL for now.
+  Chain = DAG.getNode(isTailCall ? X86ISD::TAILCALL : X86ISD::CALL,
+                      NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  // Returns a flag for retval copy to use.
+  NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+  Ops.clear();
+  Ops.push_back(Chain);
+  Ops.push_back(DAG.getConstant(NumBytes, getPointerTy()));
+  Ops.push_back(DAG.getConstant(0, getPointerTy()));
+  Ops.push_back(InFlag);
+  Chain = DAG.getNode(ISD::CALLSEQ_END, NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+  
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return SDOperand(LowerCallResult(Chain, InFlag, Op.Val, CC, DAG), Op.ResNo);
+}
+
+
+//===----------------------------------------------------------------------===//
+//                           Other Lowering Hooks
+//===----------------------------------------------------------------------===//
+
+
+SDOperand X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) {
+  if (ReturnAddrIndex == 0) {
+    // Set up a frame object for the return address.
+    MachineFunction &MF = DAG.getMachineFunction();
+    if (Subtarget->is64Bit())
+      ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(8, -8);
+    else
+      ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(4, -4);
+  }
+
+  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
+}
+
+
+
+/// translateX86CC - do a one to one translation of a ISD::CondCode to the X86
+/// specific condition code. It returns false if it cannot do a direct
+/// translation. X86CC is the translated CondCode.  LHS/RHS are modified as
+/// needed.
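+/// For example, an integer SETGT normally maps to X86::COND_G, while a
+/// floating point SETOLT swaps LHS and RHS and uses X86::COND_A so that an
+/// unordered result comes out false.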
+static bool translateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
+                           unsigned &X86CC, SDOperand &LHS, SDOperand &RHS,
+                           SelectionDAG &DAG) {
+  X86CC = X86::COND_INVALID;
+  if (!isFP) {
+    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
+      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
+        // X > -1   -> X == 0, jump !sign.
+        RHS = DAG.getConstant(0, RHS.getValueType());
+        X86CC = X86::COND_NS;
+        return true;
+      } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
+        // X < 0   -> X == 0, jump on sign.
+        X86CC = X86::COND_S;
+        return true;
+      }
+    }
+
+    switch (SetCCOpcode) {
+    default: break;
+    case ISD::SETEQ:  X86CC = X86::COND_E;  break;
+    case ISD::SETGT:  X86CC = X86::COND_G;  break;
+    case ISD::SETGE:  X86CC = X86::COND_GE; break;
+    case ISD::SETLT:  X86CC = X86::COND_L;  break;
+    case ISD::SETLE:  X86CC = X86::COND_LE; break;
+    case ISD::SETNE:  X86CC = X86::COND_NE; break;
+    case ISD::SETULT: X86CC = X86::COND_B;  break;
+    case ISD::SETUGT: X86CC = X86::COND_A;  break;
+    case ISD::SETULE: X86CC = X86::COND_BE; break;
+    case ISD::SETUGE: X86CC = X86::COND_AE; break;
+    }
+  } else {
+    // On a floating point condition, the flags are set as follows:
+    // ZF  PF  CF   op
+    //  0 | 0 | 0 | X > Y
+    //  0 | 0 | 1 | X < Y
+    //  1 | 0 | 0 | X == Y
+    //  1 | 1 | 1 | unordered
+    bool Flip = false;
+    switch (SetCCOpcode) {
+    default: break;
+    case ISD::SETUEQ:
+    case ISD::SETEQ: X86CC = X86::COND_E;  break;
+    case ISD::SETOLT: Flip = true; // Fallthrough
+    case ISD::SETOGT:
+    case ISD::SETGT: X86CC = X86::COND_A;  break;
+    case ISD::SETOLE: Flip = true; // Fallthrough
+    case ISD::SETOGE:
+    case ISD::SETGE: X86CC = X86::COND_AE; break;
+    case ISD::SETUGT: Flip = true; // Fallthrough
+    case ISD::SETULT:
+    case ISD::SETLT: X86CC = X86::COND_B;  break;
+    case ISD::SETUGE: Flip = true; // Fallthrough
+    case ISD::SETULE:
+    case ISD::SETLE: X86CC = X86::COND_BE; break;
+    case ISD::SETONE:
+    case ISD::SETNE: X86CC = X86::COND_NE; break;
+    case ISD::SETUO: X86CC = X86::COND_P;  break;
+    case ISD::SETO:  X86CC = X86::COND_NP; break;
+    }
+    if (Flip)
+      std::swap(LHS, RHS);
+  }
+
+  return X86CC != X86::COND_INVALID;
+}
+
+/// hasFPCMov - is there a floating point cmov for the specific X86 condition
+/// code. Current x86 isa includes the following FP cmov instructions:
+/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
+static bool hasFPCMov(unsigned X86CC) {
+  switch (X86CC) {
+  default:
+    return false;
+  case X86::COND_B:
+  case X86::COND_BE:
+  case X86::COND_E:
+  case X86::COND_P:
+  case X86::COND_A:
+  case X86::COND_AE:
+  case X86::COND_NE:
+  case X86::COND_NP:
+    return true;
+  }
+}
+
+/// isUndefOrInRange - Op is either an undef node or a ConstantSDNode.  Return
+/// true if Op is undef or if its value falls within the range [Low, Hi).
+static bool isUndefOrInRange(SDOperand Op, unsigned Low, unsigned Hi) {
+  if (Op.getOpcode() == ISD::UNDEF)
+    return true;
+
+  unsigned Val = cast<ConstantSDNode>(Op)->getValue();
+  return (Val >= Low && Val < Hi);
+}
+
+/// isUndefOrEqual - Op is either an undef node or a ConstantSDNode.  Return
+/// true if Op is undef or if its value is equal to the specified value.
+static bool isUndefOrEqual(SDOperand Op, unsigned Val) {
+  if (Op.getOpcode() == ISD::UNDEF)
+    return true;
+  return cast<ConstantSDNode>(Op)->getValue() == Val;
+}
+
+/// isPSHUFDMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to PSHUFD.
+bool X86::isPSHUFDMask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+  if (N->getNumOperands() != 4)
+    return false;
+
+  // Check if the value doesn't reference the second vector.
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    SDOperand Arg = N->getOperand(i);
+    if (Arg.getOpcode() == ISD::UNDEF) continue;
+    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
+    if (cast<ConstantSDNode>(Arg)->getValue() >= 4)
+      return false;
+  }
+
+  return true;
+}
+
+/// isPSHUFHWMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to PSHUFHW.
+bool X86::isPSHUFHWMask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+  if (N->getNumOperands() != 8)
+    return false;
+
+  // Lower quadword copied in order.
+  for (unsigned i = 0; i != 4; ++i) {
+    SDOperand Arg = N->getOperand(i);
+    if (Arg.getOpcode() == ISD::UNDEF) continue;
+    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
+    if (cast<ConstantSDNode>(Arg)->getValue() != i)
+      return false;
+  }
+
+  // Upper quadword shuffled.
+  for (unsigned i = 4; i != 8; ++i) {
+    SDOperand Arg = N->getOperand(i);
+    if (Arg.getOpcode() == ISD::UNDEF) continue;
+    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
+    unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+    if (Val < 4 || Val > 7)
+      return false;
+  }
+
+  return true;
+}
+
+/// isPSHUFLWMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to PSHUFLW.
+bool X86::isPSHUFLWMask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+  if (N->getNumOperands() != 8)
+    return false;
+
+  // Upper quadword copied in order.
+  for (unsigned i = 4; i != 8; ++i)
+    if (!isUndefOrEqual(N->getOperand(i), i))
+      return false;
+
+  // Lower quadword shuffled.
+  for (unsigned i = 0; i != 4; ++i)
+    if (!isUndefOrInRange(N->getOperand(i), 0, 4))
+      return false;
+
+  return true;
+}
+
+/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to SHUFP*.
+static bool isSHUFPMask(const SDOperand *Elems, unsigned NumElems) {
+  if (NumElems != 2 && NumElems != 4) return false;
+
+  unsigned Half = NumElems / 2;
+  for (unsigned i = 0; i < Half; ++i)
+    if (!isUndefOrInRange(Elems[i], 0, NumElems))
+      return false;
+  for (unsigned i = Half; i < NumElems; ++i)
+    if (!isUndefOrInRange(Elems[i], NumElems, NumElems*2))
+      return false;
+
+  return true;
+}
+
+bool X86::isSHUFPMask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+  return ::isSHUFPMask(N->op_begin(), N->getNumOperands());
+}
+
+/// isCommutedSHUFP - Returns true if the shuffle mask is exactly
+/// the reverse of what x86 shuffles want. x86 shuffles require the lower
+/// half elements to come from vector 1 (which would equal the dest.) and
+/// the upper half to come from vector 2.
+static bool isCommutedSHUFP(const SDOperand *Ops, unsigned NumOps) {
+  if (NumOps != 2 && NumOps != 4) return false;
+
+  unsigned Half = NumOps / 2;
+  for (unsigned i = 0; i < Half; ++i)
+    if (!isUndefOrInRange(Ops[i], NumOps, NumOps*2))
+      return false;
+  for (unsigned i = Half; i < NumOps; ++i)
+    if (!isUndefOrInRange(Ops[i], 0, NumOps))
+      return false;
+  return true;
+}
+
+static bool isCommutedSHUFP(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+  return isCommutedSHUFP(N->op_begin(), N->getNumOperands());
+}
+
+/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
+bool X86::isMOVHLPSMask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+  if (N->getNumOperands() != 4)
+    return false;
+
+  // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
+  return isUndefOrEqual(N->getOperand(0), 6) &&
+         isUndefOrEqual(N->getOperand(1), 7) &&
+         isUndefOrEqual(N->getOperand(2), 2) &&
+         isUndefOrEqual(N->getOperand(3), 3);
+}
+
+/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
+/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
+/// <2, 3, 2, 3>
+bool X86::isMOVHLPS_v_undef_Mask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+  if (N->getNumOperands() != 4)
+    return false;
+
+  // Expect bit0 == 2, bit1 == 3, bit2 == 2, bit3 == 3
+  return isUndefOrEqual(N->getOperand(0), 2) &&
+         isUndefOrEqual(N->getOperand(1), 3) &&
+         isUndefOrEqual(N->getOperand(2), 2) &&
+         isUndefOrEqual(N->getOperand(3), 3);
+}
+
+/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
+bool X86::isMOVLPMask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+  unsigned NumElems = N->getNumOperands();
+  if (NumElems != 2 && NumElems != 4)
+    return false;
+
+  for (unsigned i = 0; i < NumElems/2; ++i)
+    if (!isUndefOrEqual(N->getOperand(i), i + NumElems))
+      return false;
+
+  for (unsigned i = NumElems/2; i < NumElems; ++i)
+    if (!isUndefOrEqual(N->getOperand(i), i))
+      return false;
+
+  return true;
+}
+
+/// isMOVHPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVHP{S|D}
+/// and MOVLHPS.
+bool X86::isMOVHPMask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+  unsigned NumElems = N->getNumOperands();
+  if (NumElems != 2 && NumElems != 4)
+    return false;
+
+  for (unsigned i = 0; i < NumElems/2; ++i)
+    if (!isUndefOrEqual(N->getOperand(i), i))
+      return false;
+
+  for (unsigned i = 0; i < NumElems/2; ++i) {
+    SDOperand Arg = N->getOperand(i + NumElems/2);
+    if (!isUndefOrEqual(Arg, i + NumElems))
+      return false;
+  }
+
+  return true;
+}
+
+/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to UNPCKL.
+bool static isUNPCKLMask(const SDOperand *Elts, unsigned NumElts,
+                         bool V2IsSplat = false) {
+  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
+    return false;
+
+  for (unsigned i = 0, j = 0; i != NumElts; i += 2, ++j) {
+    SDOperand BitI  = Elts[i];
+    SDOperand BitI1 = Elts[i+1];
+    if (!isUndefOrEqual(BitI, j))
+      return false;
+    if (V2IsSplat) {
+      if (isUndefOrEqual(BitI1, NumElts))
+        return false;
+    } else {
+      if (!isUndefOrEqual(BitI1, j + NumElts))
+        return false;
+    }
+  }
+
+  return true;
+}
+
+bool X86::isUNPCKLMask(SDNode *N, bool V2IsSplat) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+  return ::isUNPCKLMask(N->op_begin(), N->getNumOperands(), V2IsSplat);
+}
+
+/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to UNPCKH.
+bool static isUNPCKHMask(const SDOperand *Elts, unsigned NumElts,
+                         bool V2IsSplat = false) {
+  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
+    return false;
+
+  for (unsigned i = 0, j = 0; i != NumElts; i += 2, ++j) {
+    SDOperand BitI  = Elts[i];
+    SDOperand BitI1 = Elts[i+1];
+    if (!isUndefOrEqual(BitI, j + NumElts/2))
+      return false;
+    if (V2IsSplat) {
+      if (isUndefOrEqual(BitI1, NumElts))
+        return false;
+    } else {
+      if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts))
+        return false;
+    }
+  }
+
+  return true;
+}
+
+bool X86::isUNPCKHMask(SDNode *N, bool V2IsSplat) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+  return ::isUNPCKHMask(N->op_begin(), N->getNumOperands(), V2IsSplat);
+}
+
+/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
+/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
+/// <0, 0, 1, 1>
+bool X86::isUNPCKL_v_undef_Mask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+  unsigned NumElems = N->getNumOperands();
+  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
+    return false;
+
+  for (unsigned i = 0, j = 0; i != NumElems; i += 2, ++j) {
+    SDOperand BitI  = N->getOperand(i);
+    SDOperand BitI1 = N->getOperand(i+1);
+
+    if (!isUndefOrEqual(BitI, j))
+      return false;
+    if (!isUndefOrEqual(BitI1, j))
+      return false;
+  }
+
+  return true;
+}
+
+/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
+/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
+/// <2, 2, 3, 3>
+bool X86::isUNPCKH_v_undef_Mask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+  unsigned NumElems = N->getNumOperands();
+  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
+    return false;
+
+  for (unsigned i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
+    SDOperand BitI  = N->getOperand(i);
+    SDOperand BitI1 = N->getOperand(i + 1);
+
+    if (!isUndefOrEqual(BitI, j))
+      return false;
+    if (!isUndefOrEqual(BitI1, j))
+      return false;
+  }
+
+  return true;
+}
+
+/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVSS,
+/// MOVSD, and MOVD, i.e. setting the lowest element.
+static bool isMOVLMask(const SDOperand *Elts, unsigned NumElts) {
+  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
+    return false;
+
+  if (!isUndefOrEqual(Elts[0], NumElts))
+    return false;
+
+  for (unsigned i = 1; i < NumElts; ++i) {
+    if (!isUndefOrEqual(Elts[i], i))
+      return false;
+  }
+
+  return true;
+}
+
+bool X86::isMOVLMask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+  return ::isMOVLMask(N->op_begin(), N->getNumOperands());
+}
+
+/// isCommutedMOVL - Returns true if the shuffle mask is exactly the reverse
+/// of what x86 movss wants. X86 movss requires the lowest element to be the
+/// lowest element of vector 2 and the other elements to come from vector 1
+/// in order.
+static bool isCommutedMOVL(const SDOperand *Ops, unsigned NumOps,
+                           bool V2IsSplat = false,
+                           bool V2IsUndef = false) {
+  if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
+    return false;
+
+  if (!isUndefOrEqual(Ops[0], 0))
+    return false;
+
+  for (unsigned i = 1; i < NumOps; ++i) {
+    SDOperand Arg = Ops[i];
+    if (!(isUndefOrEqual(Arg, i+NumOps) ||
+          (V2IsUndef && isUndefOrInRange(Arg, NumOps, NumOps*2)) ||
+          (V2IsSplat && isUndefOrEqual(Arg, NumOps))))
+      return false;
+  }
+
+  return true;
+}
+
+static bool isCommutedMOVL(SDNode *N, bool V2IsSplat = false,
+                           bool V2IsUndef = false) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+  return isCommutedMOVL(N->op_begin(), N->getNumOperands(),
+                        V2IsSplat, V2IsUndef);
+}
+
+/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
+bool X86::isMOVSHDUPMask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+  if (N->getNumOperands() != 4)
+    return false;
+
+  // Expect 1, 1, 3, 3
+  for (unsigned i = 0; i < 2; ++i) {
+    SDOperand Arg = N->getOperand(i);
+    if (Arg.getOpcode() == ISD::UNDEF) continue;
+    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
+    unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+    if (Val != 1) return false;
+  }
+
+  bool HasHi = false;
+  for (unsigned i = 2; i < 4; ++i) {
+    SDOperand Arg = N->getOperand(i);
+    if (Arg.getOpcode() == ISD::UNDEF) continue;
+    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
+    unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+    if (Val != 3) return false;
+    HasHi = true;
+  }
+
+  // Don't use movshdup if it can be done with a shufps.
+  return HasHi;
+}
+
+/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
+bool X86::isMOVSLDUPMask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+  if (N->getNumOperands() != 4)
+    return false;
+
+  // Expect 0, 0, 2, 2
+  for (unsigned i = 0; i < 2; ++i) {
+    SDOperand Arg = N->getOperand(i);
+    if (Arg.getOpcode() == ISD::UNDEF) continue;
+    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
+    unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+    if (Val != 0) return false;
+  }
+
+  bool HasHi = false;
+  for (unsigned i = 2; i < 4; ++i) {
+    SDOperand Arg = N->getOperand(i);
+    if (Arg.getOpcode() == ISD::UNDEF) continue;
+    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
+    unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+    if (Val != 2) return false;
+    HasHi = true;
+  }
+
+  // Don't use movsldup if it can be done with a shufps.
+  return HasHi;
+}
+
+/// isIdentityMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies an identity operation on the LHS or RHS.
+static bool isIdentityMask(SDNode *N, bool RHS = false) {
+  unsigned NumElems = N->getNumOperands();
+  for (unsigned i = 0; i < NumElems; ++i)
+    if (!isUndefOrEqual(N->getOperand(i), i + (RHS ? NumElems : 0)))
+      return false;
+  return true;
+}
+
+/// isSplatMask - Return true if the specified VECTOR_SHUFFLE operand specifies
+/// a splat of a single element.
+static bool isSplatMask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+  // This is a splat operation if each element of the permute is the same, and
+  // if the value doesn't reference the second vector.
+  unsigned NumElems = N->getNumOperands();
+  SDOperand ElementBase;
+  unsigned i = 0;
+  for (; i != NumElems; ++i) {
+    SDOperand Elt = N->getOperand(i);
+    if (isa<ConstantSDNode>(Elt)) {
+      ElementBase = Elt;
+      break;
+    }
+  }
+
+  if (!ElementBase.Val)
+    return false;
+
+  for (; i != NumElems; ++i) {
+    SDOperand Arg = N->getOperand(i);
+    if (Arg.getOpcode() == ISD::UNDEF) continue;
+    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
+    if (Arg != ElementBase) return false;
+  }
+
+  // Make sure it is a splat of the first vector operand.
+  return cast<ConstantSDNode>(ElementBase)->getValue() < NumElems;
+}
+
+/// isSplatMask - Return true if the specified VECTOR_SHUFFLE operand specifies
+/// a splat of a single element and it's a 2 or 4 element mask.
+bool X86::isSplatMask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+  // We can only splat 64-bit and 32-bit quantities with a single instruction.
+  if (N->getNumOperands() != 4 && N->getNumOperands() != 2)
+    return false;
+  return ::isSplatMask(N);
+}
+
+/// isSplatLoMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a splat of element zero.
+bool X86::isSplatLoMask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+  for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i)
+    if (!isUndefOrEqual(N->getOperand(i), 0))
+      return false;
+  return true;
+}
+
+/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
+/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUF* and SHUFP*
+/// instructions.
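+/// For example, the identity mask <0, 1, 2, 3> yields the immediate 0xE4
+/// (binary 11100100), with the value of element i encoded in bits [2i+1:2i].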
+unsigned X86::getShuffleSHUFImmediate(SDNode *N) {
+  unsigned NumOperands = N->getNumOperands();
+  unsigned Shift = (NumOperands == 4) ? 2 : 1;
+  unsigned Mask = 0;
+  for (unsigned i = 0; i < NumOperands; ++i) {
+    unsigned Val = 0;
+    SDOperand Arg = N->getOperand(NumOperands-i-1);
+    if (Arg.getOpcode() != ISD::UNDEF)
+      Val = cast<ConstantSDNode>(Arg)->getValue();
+    if (Val >= NumOperands) Val -= NumOperands;
+    Mask |= Val;
+    if (i != NumOperands - 1)
+      Mask <<= Shift;
+  }
+
+  return Mask;
+}
+
+/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
+/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFHW
+/// instructions.
+unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) {
+  unsigned Mask = 0;
+  // 8 nodes, but we only care about the last 4.
+  for (unsigned i = 7; i >= 4; --i) {
+    unsigned Val = 0;
+    SDOperand Arg = N->getOperand(i);
+    if (Arg.getOpcode() != ISD::UNDEF)
+      Val = cast<ConstantSDNode>(Arg)->getValue();
+    Mask |= (Val - 4);
+    if (i != 4)
+      Mask <<= 2;
+  }
+
+  return Mask;
+}
+
+/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
+/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFLW
+/// instructions.
+unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) {
+  unsigned Mask = 0;
+  // 8 nodes, but we only care about the first 4.
+  for (int i = 3; i >= 0; --i) {
+    unsigned Val = 0;
+    SDOperand Arg = N->getOperand(i);
+    if (Arg.getOpcode() != ISD::UNDEF)
+      Val = cast<ConstantSDNode>(Arg)->getValue();
+    Mask |= Val;
+    if (i != 0)
+      Mask <<= 2;
+  }
+
+  return Mask;
+}
+
+/// isPSHUFHW_PSHUFLWMask - true if the specified VECTOR_SHUFFLE operand
+/// specifies an 8-element shuffle that can be broken into a pair of
+/// PSHUFHW and PSHUFLW.
+static bool isPSHUFHW_PSHUFLWMask(SDNode *N) {
+  assert(N->getOpcode() == ISD::BUILD_VECTOR);
+
+  if (N->getNumOperands() != 8)
+    return false;
+
+  // Lower quadword shuffled.
+  for (unsigned i = 0; i != 4; ++i) {
+    SDOperand Arg = N->getOperand(i);
+    if (Arg.getOpcode() == ISD::UNDEF) continue;
+    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
+    unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+    if (Val >= 4)
+      return false;
+  }
+
+  // Upper quadword shuffled.
+  for (unsigned i = 4; i != 8; ++i) {
+    SDOperand Arg = N->getOperand(i);
+    if (Arg.getOpcode() == ISD::UNDEF) continue;
+    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
+    unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+    if (Val < 4 || Val > 7)
+      return false;
+  }
+
+  return true;
+}
+
+/// CommuteVectorShuffle - Swap vector_shuffle operands as well as the values
+/// in the permute mask.
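+/// For example, vector_shuffle V1, V2, <0, 5, 2, 7> becomes
+/// vector_shuffle V2, V1, <4, 1, 6, 3>.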
+static SDOperand CommuteVectorShuffle(SDOperand Op, SDOperand &V1,
+                                      SDOperand &V2, SDOperand &Mask,
+                                      SelectionDAG &DAG) {
+  MVT::ValueType VT = Op.getValueType();
+  MVT::ValueType MaskVT = Mask.getValueType();
+  MVT::ValueType EltVT = MVT::getVectorElementType(MaskVT);
+  unsigned NumElems = Mask.getNumOperands();
+  SmallVector<SDOperand, 8> MaskVec;
+
+  for (unsigned i = 0; i != NumElems; ++i) {
+    SDOperand Arg = Mask.getOperand(i);
+    if (Arg.getOpcode() == ISD::UNDEF) {
+      MaskVec.push_back(DAG.getNode(ISD::UNDEF, EltVT));
+      continue;
+    }
+    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
+    unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+    if (Val < NumElems)
+      MaskVec.push_back(DAG.getConstant(Val + NumElems, EltVT));
+    else
+      MaskVec.push_back(DAG.getConstant(Val - NumElems, EltVT));
+  }
+
+  std::swap(V1, V2);
+  Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], MaskVec.size());
+  return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask);
+}
+
+/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
+/// match movhlps. The lower half elements should come from upper half of
+/// V1 (and in order), and the upper half elements should come from the upper
+/// half of V2 (and in order).
+static bool ShouldXformToMOVHLPS(SDNode *Mask) {
+  unsigned NumElems = Mask->getNumOperands();
+  if (NumElems != 4)
+    return false;
+  for (unsigned i = 0, e = 2; i != e; ++i)
+    if (!isUndefOrEqual(Mask->getOperand(i), i+2))
+      return false;
+  for (unsigned i = 2; i != 4; ++i)
+    if (!isUndefOrEqual(Mask->getOperand(i), i+4))
+      return false;
+  return true;
+}
+
+/// isScalarLoadToVector - Returns true if the node is a scalar load that
+/// is promoted to a vector.
+static inline bool isScalarLoadToVector(SDNode *N) {
+  if (N->getOpcode() == ISD::SCALAR_TO_VECTOR) {
+    N = N->getOperand(0).Val;
+    return ISD::isNON_EXTLoad(N);
+  }
+  return false;
+}
+
+/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
+/// match movlp{s|d}. The lower half elements should come from lower half of
+/// V1 (and in order), and the upper half elements should come from the upper
+/// half of V2 (and in order). And since V1 will become the source of the
+/// MOVLP, it must be either a vector load or a scalar load to vector.
+static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, SDNode *Mask) {
+  if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
+    return false;
+  // If V2 is a vector load, don't do this transformation. We will try to use
+  // a load-folding shufps op instead.
+  if (ISD::isNON_EXTLoad(V2))
+    return false;
+
+  unsigned NumElems = Mask->getNumOperands();
+  if (NumElems != 2 && NumElems != 4)
+    return false;
+  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
+    if (!isUndefOrEqual(Mask->getOperand(i), i))
+      return false;
+  for (unsigned i = NumElems/2; i != NumElems; ++i)
+    if (!isUndefOrEqual(Mask->getOperand(i), i+NumElems))
+      return false;
+  return true;
+}
+
+/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
+/// all the same.
+static bool isSplatVector(SDNode *N) {
+  if (N->getOpcode() != ISD::BUILD_VECTOR)
+    return false;
+
+  SDOperand SplatValue = N->getOperand(0);
+  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
+    if (N->getOperand(i) != SplatValue)
+      return false;
+  return true;
+}
+
+/// isUndefShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
+/// to an undef.
+static bool isUndefShuffle(SDNode *N) {
+  if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
+    return false;
+
+  SDOperand V1 = N->getOperand(0);
+  SDOperand V2 = N->getOperand(1);
+  SDOperand Mask = N->getOperand(2);
+  unsigned NumElems = Mask.getNumOperands();
+  for (unsigned i = 0; i != NumElems; ++i) {
+    SDOperand Arg = Mask.getOperand(i);
+    if (Arg.getOpcode() != ISD::UNDEF) {
+      unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+      if (Val < NumElems && V1.getOpcode() != ISD::UNDEF)
+        return false;
+      else if (Val >= NumElems && V2.getOpcode() != ISD::UNDEF)
+        return false;
+    }
+  }
+  return true;
+}
+
+/// isZeroNode - Returns true if Elt is a constant zero or a floating point
+/// constant +0.0.
+static inline bool isZeroNode(SDOperand Elt) {
+  return ((isa<ConstantSDNode>(Elt) &&
+           cast<ConstantSDNode>(Elt)->getValue() == 0) ||
+          (isa<ConstantFPSDNode>(Elt) &&
+           cast<ConstantFPSDNode>(Elt)->isExactlyValue(0.0)));
+}
+
+/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
+/// to a zero vector.
+static bool isZeroShuffle(SDNode *N) {
+  if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
+    return false;
+
+  SDOperand V1 = N->getOperand(0);
+  SDOperand V2 = N->getOperand(1);
+  SDOperand Mask = N->getOperand(2);
+  unsigned NumElems = Mask.getNumOperands();
+  for (unsigned i = 0; i != NumElems; ++i) {
+    SDOperand Arg = Mask.getOperand(i);
+    if (Arg.getOpcode() != ISD::UNDEF) {
+      unsigned Idx = cast<ConstantSDNode>(Arg)->getValue();
+      if (Idx < NumElems) {
+        unsigned Opc = V1.Val->getOpcode();
+        if (Opc == ISD::UNDEF)
+          continue;
+        if (Opc != ISD::BUILD_VECTOR ||
+            !isZeroNode(V1.Val->getOperand(Idx)))
+          return false;
+      } else if (Idx >= NumElems) {
+        unsigned Opc = V2.Val->getOpcode();
+        if (Opc == ISD::UNDEF)
+          continue;
+        if (Opc != ISD::BUILD_VECTOR ||
+            !isZeroNode(V2.Val->getOperand(Idx - NumElems)))
+          return false;
+      }
+    }
+  }
+  return true;
+}
+
+/// getZeroVector - Returns a vector of specified type with all zero elements.
+///
+static SDOperand getZeroVector(MVT::ValueType VT, SelectionDAG &DAG) {
+  assert(MVT::isVector(VT) && "Expected a vector type");
+  unsigned NumElems = MVT::getVectorNumElements(VT);
+  MVT::ValueType EVT = MVT::getVectorElementType(VT);
+  bool isFP = MVT::isFloatingPoint(EVT);
+  SDOperand Zero = isFP ? DAG.getConstantFP(0.0, EVT) : DAG.getConstant(0, EVT);
+  SmallVector<SDOperand, 8> ZeroVec(NumElems, Zero);
+  return DAG.getNode(ISD::BUILD_VECTOR, VT, &ZeroVec[0], ZeroVec.size());
+}
+
+/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
+/// that point to V2 point to its first element.
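+/// For example, with a 4-element mask, <0, 7, 2, 5> becomes <0, 4, 2, 4>.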
+static SDOperand NormalizeMask(SDOperand Mask, SelectionDAG &DAG) {
+  assert(Mask.getOpcode() == ISD::BUILD_VECTOR);
+
+  bool Changed = false;
+  SmallVector<SDOperand, 8> MaskVec;
+  unsigned NumElems = Mask.getNumOperands();
+  for (unsigned i = 0; i != NumElems; ++i) {
+    SDOperand Arg = Mask.getOperand(i);
+    if (Arg.getOpcode() != ISD::UNDEF) {
+      unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
+      if (Val > NumElems) {
+        Arg = DAG.getConstant(NumElems, Arg.getValueType());
+        Changed = true;
+      }
+    }
+    MaskVec.push_back(Arg);
+  }
+
+  if (Changed)
+    Mask = DAG.getNode(ISD::BUILD_VECTOR, Mask.getValueType(),
+                       &MaskVec[0], MaskVec.size());
+  return Mask;
+}
+
+/// getMOVLMask - Returns a vector_shuffle mask for a movs{s|d} or movd
+/// operation of specified width.
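+/// For a width of 4 this is the mask <4, 1, 2, 3>.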
+static SDOperand getMOVLMask(unsigned NumElems, SelectionDAG &DAG) {
+  MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
+  MVT::ValueType BaseVT = MVT::getVectorElementType(MaskVT);
+
+  SmallVector<SDOperand, 8> MaskVec;
+  MaskVec.push_back(DAG.getConstant(NumElems, BaseVT));
+  for (unsigned i = 1; i != NumElems; ++i)
+    MaskVec.push_back(DAG.getConstant(i, BaseVT));
+  return DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], MaskVec.size());
+}
+
+/// getUnpacklMask - Returns a vector_shuffle mask for an unpackl operation
+/// of specified width.
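+/// For a width of 4 this is the mask <0, 4, 1, 5>.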
+static SDOperand getUnpacklMask(unsigned NumElems, SelectionDAG &DAG) {
+  MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
+  MVT::ValueType BaseVT = MVT::getVectorElementType(MaskVT);
+  SmallVector<SDOperand, 8> MaskVec;
+  for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
+    MaskVec.push_back(DAG.getConstant(i,            BaseVT));
+    MaskVec.push_back(DAG.getConstant(i + NumElems, BaseVT));
+  }
+  return DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], MaskVec.size());
+}
+
+/// getUnpackhMask - Returns a vector_shuffle mask for an unpackh operation
+/// of specified width.
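+/// For a width of 4 this is the mask <2, 6, 3, 7>.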
+static SDOperand getUnpackhMask(unsigned NumElems, SelectionDAG &DAG) {
+  MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
+  MVT::ValueType BaseVT = MVT::getVectorElementType(MaskVT);
+  unsigned Half = NumElems/2;
+  SmallVector<SDOperand, 8> MaskVec;
+  for (unsigned i = 0; i != Half; ++i) {
+    MaskVec.push_back(DAG.getConstant(i + Half,            BaseVT));
+    MaskVec.push_back(DAG.getConstant(i + NumElems + Half, BaseVT));
+  }
+  return DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], MaskVec.size());
+}
+
+/// PromoteSplat - Promote a splat of v8i16 or v16i8 to v4i32.
+///
+static SDOperand PromoteSplat(SDOperand Op, SelectionDAG &DAG) {
+  SDOperand V1 = Op.getOperand(0);
+  SDOperand Mask = Op.getOperand(2);
+  MVT::ValueType VT = Op.getValueType();
+  unsigned NumElems = Mask.getNumOperands();
+  Mask = getUnpacklMask(NumElems, DAG);
+  while (NumElems != 4) {
+    V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V1, Mask);
+    NumElems >>= 1;
+  }
+  V1 = DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, V1);
+
+  MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(4);
+  Mask = getZeroVector(MaskVT, DAG);
+  SDOperand Shuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v4i32, V1,
+                                  DAG.getNode(ISD::UNDEF, MVT::v4i32), Mask);
+  return DAG.getNode(ISD::BIT_CONVERT, VT, Shuffle);
+}
+
+/// getShuffleVectorZeroOrUndef - Return a vector_shuffle that places the first
+/// element of V2 at index Idx and fills the remaining elements from a zero or
+/// undef vector.
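+/// For example, with NumElems == 4, Idx == 2, and isZero set, the result is a
+/// vector_shuffle of a zero vector and V2 with mask <0, 0, 4, 0>: element 2 is
+/// V2's first element and every other element is zero.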
+static SDOperand getShuffleVectorZeroOrUndef(SDOperand V2, MVT::ValueType VT,
+                                             unsigned NumElems, unsigned Idx,
+                                             bool isZero, SelectionDAG &DAG) {
+  SDOperand V1 = isZero ? getZeroVector(VT, DAG) : DAG.getNode(ISD::UNDEF, VT);
+  MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
+  MVT::ValueType EVT = MVT::getVectorElementType(MaskVT);
+  SDOperand Zero = DAG.getConstant(0, EVT);
+  SmallVector<SDOperand, 8> MaskVec(NumElems, Zero);
+  MaskVec[Idx] = DAG.getConstant(NumElems, EVT);
+  SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+                               &MaskVec[0], MaskVec.size());
+  return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask);
+}
+
+/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
+///
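+/// Adjacent non-zero bytes are combined pairwise into 16-bit values (the odd
+/// byte shifted into the high half and OR'd with the even byte), inserted
+/// into a v8i16 with INSERT_VECTOR_ELT (i.e. pinsrw), and the result is
+/// bit-converted back to v16i8.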
+static SDOperand LowerBuildVectorv16i8(SDOperand Op, unsigned NonZeros,
+                                       unsigned NumNonZero, unsigned NumZero,
+                                       SelectionDAG &DAG, TargetLowering &TLI) {
+  if (NumNonZero > 8)
+    return SDOperand();
+
+  SDOperand V(0, 0);
+  bool First = true;
+  for (unsigned i = 0; i < 16; ++i) {
+    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
+    if (ThisIsNonZero && First) {
+      if (NumZero)
+        V = getZeroVector(MVT::v8i16, DAG);
+      else
+        V = DAG.getNode(ISD::UNDEF, MVT::v8i16);
+      First = false;
+    }
+
+    if ((i & 1) != 0) {
+      SDOperand ThisElt(0, 0), LastElt(0, 0);
+      bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
+      if (LastIsNonZero) {
+        LastElt = DAG.getNode(ISD::ZERO_EXTEND, MVT::i16, Op.getOperand(i-1));
+      }
+      if (ThisIsNonZero) {
+        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, MVT::i16, Op.getOperand(i));
+        ThisElt = DAG.getNode(ISD::SHL, MVT::i16,
+                              ThisElt, DAG.getConstant(8, MVT::i8));
+        if (LastIsNonZero)
+          ThisElt = DAG.getNode(ISD::OR, MVT::i16, ThisElt, LastElt);
+      } else
+        ThisElt = LastElt;
+
+      if (ThisElt.Val)
+        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, V, ThisElt,
+                        DAG.getConstant(i/2, TLI.getPointerTy()));
+    }
+  }
+
+  return DAG.getNode(ISD::BIT_CONVERT, MVT::v16i8, V);
+}
+
+/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
+///
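+/// Each non-zero element is inserted into a zero or undef v8i16 with
+/// INSERT_VECTOR_ELT (i.e. pinsrw); the zero vector is used only when the
+/// build_vector also contains zero elements.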
+static SDOperand LowerBuildVectorv8i16(SDOperand Op, unsigned NonZeros,
+                                       unsigned NumNonZero, unsigned NumZero,
+                                       SelectionDAG &DAG, TargetLowering &TLI) {
+  if (NumNonZero > 4)
+    return SDOperand();
+
+  SDOperand V(0, 0);
+  bool First = true;
+  for (unsigned i = 0; i < 8; ++i) {
+    bool isNonZero = (NonZeros & (1 << i)) != 0;
+    if (isNonZero) {
+      if (First) {
+        if (NumZero)
+          V = getZeroVector(MVT::v8i16, DAG);
+        else
+          V = DAG.getNode(ISD::UNDEF, MVT::v8i16);
+        First = false;
+      }
+      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, V, Op.getOperand(i),
+                      DAG.getConstant(i, TLI.getPointerTy()));
+    }
+  }
+
+  return V;
+}
+
+SDOperand
+X86TargetLowering::LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG) {
+  // All zeros are handled with pxor.
+  if (ISD::isBuildVectorAllZeros(Op.Val))
+    return Op;
+
+  // All ones are handled with pcmpeqd.
+  if (ISD::isBuildVectorAllOnes(Op.Val))
+    return Op;
+
+  MVT::ValueType VT = Op.getValueType();
+  MVT::ValueType EVT = MVT::getVectorElementType(VT);
+  unsigned EVTBits = MVT::getSizeInBits(EVT);
+
+  unsigned NumElems = Op.getNumOperands();
+  unsigned NumZero  = 0;
+  unsigned NumNonZero = 0;
+  unsigned NonZeros = 0;
+  std::set<SDOperand> Values;
+  for (unsigned i = 0; i < NumElems; ++i) {
+    SDOperand Elt = Op.getOperand(i);
+    if (Elt.getOpcode() != ISD::UNDEF) {
+      Values.insert(Elt);
+      if (isZeroNode(Elt))
+        NumZero++;
+      else {
+        NonZeros |= (1 << i);
+        NumNonZero++;
+      }
+    }
+  }
+
+  if (NumNonZero == 0) {
+    if (NumZero == 0)
+      // All undef vector. Return an UNDEF.
+      return DAG.getNode(ISD::UNDEF, VT);
+    else
+      // A mix of zero and undef. Return a zero vector.
+      return getZeroVector(VT, DAG);
+  }
+
+  // A splat is obviously OK. Let the legalizer expand it to a shuffle.
+  if (Values.size() == 1)
+    return SDOperand();
+
+  // Special case for single non-zero element.
+  if (NumNonZero == 1) {
+    unsigned Idx = CountTrailingZeros_32(NonZeros);
+    SDOperand Item = Op.getOperand(Idx);
+    Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Item);
+    if (Idx == 0)
+      // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
+      return getShuffleVectorZeroOrUndef(Item, VT, NumElems, Idx,
+                                         NumZero > 0, DAG);
+
+    if (EVTBits == 32) {
+      // Turn it into a shuffle of zero and zero-extended scalar to vector.
+      Item = getShuffleVectorZeroOrUndef(Item, VT, NumElems, 0, NumZero > 0,
+                                         DAG);
+      MVT::ValueType MaskVT  = MVT::getIntVectorWithNumElements(NumElems);
+      MVT::ValueType MaskEVT = MVT::getVectorElementType(MaskVT);
+      SmallVector<SDOperand, 8> MaskVec;
+      for (unsigned i = 0; i < NumElems; i++)
+        MaskVec.push_back(DAG.getConstant((i == Idx) ? 0 : 1, MaskEVT));
+      SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+                                   &MaskVec[0], MaskVec.size());
+      return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, Item,
+                         DAG.getNode(ISD::UNDEF, VT), Mask);
+    }
+  }
+
+  // Let legalizer expand 2-wide build_vectors.
+  if (EVTBits == 64)
+    return SDOperand();
+
+  // If element VT is < 32 bits, convert it to inserts into a zero vector.
+  if (EVTBits == 8 && NumElems == 16) {
+    SDOperand V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
+                                        *this);
+    if (V.Val) return V;
+  }
+
+  if (EVTBits == 16 && NumElems == 8) {
+    SDOperand V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
+                                        *this);
+    if (V.Val) return V;
+  }
+
+  // If element VT is == 32 bits, turn it into a number of shuffles.
+  SmallVector<SDOperand, 8> V;
+  V.resize(NumElems);
+  if (NumElems == 4 && NumZero > 0) {
+    for (unsigned i = 0; i < 4; ++i) {
+      bool isZero = !(NonZeros & (1 << i));
+      if (isZero)
+        V[i] = getZeroVector(VT, DAG);
+      else
+        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Op.getOperand(i));
+    }
+
+    for (unsigned i = 0; i < 2; ++i) {
+      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
+        default: break;
+        case 0:
+          V[i] = V[i*2];  // Must be a zero vector.
+          break;
+        case 1:
+          V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i*2+1], V[i*2],
+                             getMOVLMask(NumElems, DAG));
+          break;
+        case 2:
+          V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i*2], V[i*2+1],
+                             getMOVLMask(NumElems, DAG));
+          break;
+        case 3:
+          V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i*2], V[i*2+1],
+                             getUnpacklMask(NumElems, DAG));
+          break;
+      }
+    }
+
+    // Take advantage of the fact that a GR32-to-VR128 scalar_to_vector
+    // (i.e. movd) clears the upper bits.
+    // FIXME: we can do the same for the v4f32 case when we know both parts of
+    // the lower half come from scalar_to_vector (loadf32). We should do
+    // that in the post-legalizer DAG combiner with target specific hooks.
+    if (MVT::isInteger(EVT) && (NonZeros & (0x3 << 2)) == 0)
+      return V[0];
+    MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
+    MVT::ValueType EVT = MVT::getVectorElementType(MaskVT);
+    SmallVector<SDOperand, 8> MaskVec;
+    bool Reverse = (NonZeros & 0x3) == 2;
+    for (unsigned i = 0; i < 2; ++i)
+      if (Reverse)
+        MaskVec.push_back(DAG.getConstant(1-i, EVT));
+      else
+        MaskVec.push_back(DAG.getConstant(i, EVT));
+    Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2;
+    for (unsigned i = 0; i < 2; ++i)
+      if (Reverse)
+        MaskVec.push_back(DAG.getConstant(1-i+NumElems, EVT));
+      else
+        MaskVec.push_back(DAG.getConstant(i+NumElems, EVT));
+    SDOperand ShufMask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+                                     &MaskVec[0], MaskVec.size());
+    return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[0], V[1], ShufMask);
+  }
+
+  if (Values.size() > 2) {
+    // Expand into a number of unpckl*.
+    // e.g. for v4f32
+    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
+    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
+    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
+    SDOperand UnpckMask = getUnpacklMask(NumElems, DAG);
+    for (unsigned i = 0; i < NumElems; ++i)
+      V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Op.getOperand(i));
+    NumElems >>= 1;
+    while (NumElems != 0) {
+      for (unsigned i = 0; i < NumElems; ++i)
+        V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i], V[i + NumElems],
+                           UnpckMask);
+      NumElems >>= 1;
+    }
+    return V[0];
+  }
+
+  return SDOperand();
+}
+
+SDOperand
+X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) {
+  SDOperand V1 = Op.getOperand(0);
+  SDOperand V2 = Op.getOperand(1);
+  SDOperand PermMask = Op.getOperand(2);
+  MVT::ValueType VT = Op.getValueType();
+  unsigned NumElems = PermMask.getNumOperands();
+  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
+  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
+  bool V1IsSplat = false;
+  bool V2IsSplat = false;
+
+  if (isUndefShuffle(Op.Val))
+    return DAG.getNode(ISD::UNDEF, VT);
+
+  if (isZeroShuffle(Op.Val))
+    return getZeroVector(VT, DAG);
+
+  if (isIdentityMask(PermMask.Val))
+    return V1;
+  else if (isIdentityMask(PermMask.Val, true))
+    return V2;
+
+  if (isSplatMask(PermMask.Val)) {
+    if (NumElems <= 4) return Op;
+    // Promote it to a v4i32 splat.
+    return PromoteSplat(Op, DAG);
+  }
+
+  if (X86::isMOVLMask(PermMask.Val))
+    return (V1IsUndef) ? V2 : Op;
+
+  if (X86::isMOVSHDUPMask(PermMask.Val) ||
+      X86::isMOVSLDUPMask(PermMask.Val) ||
+      X86::isMOVHLPSMask(PermMask.Val) ||
+      X86::isMOVHPMask(PermMask.Val) ||
+      X86::isMOVLPMask(PermMask.Val))
+    return Op;
+
+  if (ShouldXformToMOVHLPS(PermMask.Val) ||
+      ShouldXformToMOVLP(V1.Val, V2.Val, PermMask.Val))
+    return CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
+
+  bool Commuted = false;
+  V1IsSplat = isSplatVector(V1.Val);
+  V2IsSplat = isSplatVector(V2.Val);
+  if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
+    Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
+    std::swap(V1IsSplat, V2IsSplat);
+    std::swap(V1IsUndef, V2IsUndef);
+    Commuted = true;
+  }
+
+  if (isCommutedMOVL(PermMask.Val, V2IsSplat, V2IsUndef)) {
+    if (V2IsUndef) return V1;
+    Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
+    if (V2IsSplat) {
+      // V2 is a splat, so the mask may be malformed. That is, it may point
+      // to any V2 element. The instruction selector won't like this. Get
+      // a corrected mask and commute to form a proper MOVS{S|D}.
+      SDOperand NewMask = getMOVLMask(NumElems, DAG);
+      if (NewMask.Val != PermMask.Val)
+        Op = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, NewMask);
+    }
+    return Op;
+  }
+
+  if (X86::isUNPCKL_v_undef_Mask(PermMask.Val) ||
+      X86::isUNPCKH_v_undef_Mask(PermMask.Val) ||
+      X86::isUNPCKLMask(PermMask.Val) ||
+      X86::isUNPCKHMask(PermMask.Val))
+    return Op;
+
+  if (V2IsSplat) {
+    // Normalize the mask so all entries that point to V2 point to its first
+    // element, then try to match unpck{h|l} again. If a match is found, return
+    // a new vector_shuffle with the corrected mask.
+    SDOperand NewMask = NormalizeMask(PermMask, DAG);
+    if (NewMask.Val != PermMask.Val) {
+      if (X86::isUNPCKLMask(PermMask.Val, true)) {
+        SDOperand NewMask = getUnpacklMask(NumElems, DAG);
+        return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, NewMask);
+      } else if (X86::isUNPCKHMask(PermMask.Val, true)) {
+        SDOperand NewMask = getUnpackhMask(NumElems, DAG);
+        return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, NewMask);
+      }
+    }
+  }
+
+  // Normalize the node to match x86 shuffle ops if needed
+  if (V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(PermMask.Val))
+      Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
+
+  if (Commuted) {
+    // Commute it back and try unpck* again.
+    Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
+    if (X86::isUNPCKL_v_undef_Mask(PermMask.Val) ||
+        X86::isUNPCKH_v_undef_Mask(PermMask.Val) ||
+        X86::isUNPCKLMask(PermMask.Val) ||
+        X86::isUNPCKHMask(PermMask.Val))
+      return Op;
+  }
+
+  // If VT is integer, try PSHUF* first, then SHUFP*.
+  if (MVT::isInteger(VT)) {
+    if (X86::isPSHUFDMask(PermMask.Val) ||
+        X86::isPSHUFHWMask(PermMask.Val) ||
+        X86::isPSHUFLWMask(PermMask.Val)) {
+      if (V2.getOpcode() != ISD::UNDEF)
+        return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1,
+                           DAG.getNode(ISD::UNDEF, V1.getValueType()),PermMask);
+      return Op;
+    }
+
+    if (X86::isSHUFPMask(PermMask.Val) &&
+        MVT::getSizeInBits(VT) != 64)    // Don't do this for MMX.
+      return Op;
+
+    // Handle a v8i16 shuffle that needs both a shuffle-high and a shuffle-low
+    // by splitting it into a pair of shuffle nodes.
+    if (VT == MVT::v8i16 && isPSHUFHW_PSHUFLWMask(PermMask.Val)) {
+      MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
+      MVT::ValueType BaseVT = MVT::getVectorElementType(MaskVT);
+      SmallVector<SDOperand, 8> MaskVec;
+      for (unsigned i = 0; i != 4; ++i)
+        MaskVec.push_back(PermMask.getOperand(i));
+      for (unsigned i = 4; i != 8; ++i)
+        MaskVec.push_back(DAG.getConstant(i, BaseVT));
+      SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+                                   &MaskVec[0], MaskVec.size());
+      V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask);
+      MaskVec.clear();
+      for (unsigned i = 0; i != 4; ++i)
+        MaskVec.push_back(DAG.getConstant(i, BaseVT));
+      for (unsigned i = 4; i != 8; ++i)
+        MaskVec.push_back(PermMask.getOperand(i));
+      Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0],MaskVec.size());
+      return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask);
+    }
+  } else {
+    // Floating point cases in the other order.
+    if (X86::isSHUFPMask(PermMask.Val))
+      return Op;
+    if (X86::isPSHUFDMask(PermMask.Val) ||
+        X86::isPSHUFHWMask(PermMask.Val) ||
+        X86::isPSHUFLWMask(PermMask.Val)) {
+      if (V2.getOpcode() != ISD::UNDEF)
+        return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1,
+                           DAG.getNode(ISD::UNDEF, V1.getValueType()),PermMask);
+      return Op;
+    }
+  }
+
+  if (NumElems == 4 && 
+      // Don't do this for MMX.
+      MVT::getSizeInBits(VT) != 64) {
+    MVT::ValueType MaskVT = PermMask.getValueType();
+    MVT::ValueType MaskEVT = MVT::getVectorElementType(MaskVT);
+    SmallVector<std::pair<int, int>, 8> Locs;
+    Locs.resize(NumElems);
+    SmallVector<SDOperand, 8> Mask1(NumElems, DAG.getNode(ISD::UNDEF, MaskEVT));
+    SmallVector<SDOperand, 8> Mask2(NumElems, DAG.getNode(ISD::UNDEF, MaskEVT));
+    unsigned NumHi = 0;
+    unsigned NumLo = 0;
+    // If no more than two elements come from either vector, this can be
+    // implemented with two shuffles. The first shuffle gathers the elements.
+    // The second shuffle, which takes the first shuffle as both of its
+    // vector operands, puts the elements into the right order.
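+    // For example, the mask <2,4,0,6> is gathered with the (V1,V2) shuffle
+    // mask <2,0,4,6>; the result T is then reordered with the (T,T) shuffle
+    // mask <0,2,5,7>, giving <V1[2],V2[0],V1[0],V2[2]> as required.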
+    for (unsigned i = 0; i != NumElems; ++i) {
+      SDOperand Elt = PermMask.getOperand(i);
+      if (Elt.getOpcode() == ISD::UNDEF) {
+        Locs[i] = std::make_pair(-1, -1);
+      } else {
+        unsigned Val = cast<ConstantSDNode>(Elt)->getValue();
+        if (Val < NumElems) {
+          Locs[i] = std::make_pair(0, NumLo);
+          Mask1[NumLo] = Elt;
+          NumLo++;
+        } else {
+          Locs[i] = std::make_pair(1, NumHi);
+          if (2+NumHi < NumElems)
+            Mask1[2+NumHi] = Elt;
+          NumHi++;
+        }
+      }
+    }
+    if (NumLo <= 2 && NumHi <= 2) {
+      V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2,
+                       DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+                                   &Mask1[0], Mask1.size()));
+      for (unsigned i = 0; i != NumElems; ++i) {
+        if (Locs[i].first == -1)
+          continue;
+        else {
+          unsigned Idx = (i < NumElems/2) ? 0 : NumElems;
+          Idx += Locs[i].first * (NumElems/2) + Locs[i].second;
+          Mask2[i] = DAG.getConstant(Idx, MaskEVT);
+        }
+      }
+
+      return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V1,
+                         DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+                                     &Mask2[0], Mask2.size()));
+    }
+
+    // Break it into (shuffle shuffle_hi, shuffle_lo).
+    Locs.clear();
+    Locs.resize(NumElems);
+    SmallVector<SDOperand,8> LoMask(NumElems, DAG.getNode(ISD::UNDEF, MaskEVT));
+    SmallVector<SDOperand,8> HiMask(NumElems, DAG.getNode(ISD::UNDEF, MaskEVT));
+    SmallVector<SDOperand,8> *MaskPtr = &LoMask;
+    unsigned MaskIdx = 0;
+    unsigned LoIdx = 0;
+    unsigned HiIdx = NumElems/2;
+    for (unsigned i = 0; i != NumElems; ++i) {
+      if (i == NumElems/2) {
+        MaskPtr = &HiMask;
+        MaskIdx = 1;
+        LoIdx = 0;
+        HiIdx = NumElems/2;
+      }
+      SDOperand Elt = PermMask.getOperand(i);
+      if (Elt.getOpcode() == ISD::UNDEF) {
+        Locs[i] = std::make_pair(-1, -1);
+      } else if (cast<ConstantSDNode>(Elt)->getValue() < NumElems) {
+        Locs[i] = std::make_pair(MaskIdx, LoIdx);
+        (*MaskPtr)[LoIdx] = Elt;
+        LoIdx++;
+      } else {
+        Locs[i] = std::make_pair(MaskIdx, HiIdx);
+        (*MaskPtr)[HiIdx] = Elt;
+        HiIdx++;
+      }
+    }
+
+    SDOperand LoShuffle =
+      DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2,
+                  DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+                              &LoMask[0], LoMask.size()));
+    SDOperand HiShuffle =
+      DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2,
+                  DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+                              &HiMask[0], HiMask.size()));
+    SmallVector<SDOperand, 8> MaskOps;
+    for (unsigned i = 0; i != NumElems; ++i) {
+      if (Locs[i].first == -1) {
+        MaskOps.push_back(DAG.getNode(ISD::UNDEF, MaskEVT));
+      } else {
+        unsigned Idx = Locs[i].first * NumElems + Locs[i].second;
+        MaskOps.push_back(DAG.getConstant(Idx, MaskEVT));
+      }
+    }
+    return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, LoShuffle, HiShuffle,
+                       DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+                                   &MaskOps[0], MaskOps.size()));
+  }
+
+  return SDOperand();
+}
+
+SDOperand
+X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) {
+  if (!isa<ConstantSDNode>(Op.getOperand(1)))
+    return SDOperand();
+
+  MVT::ValueType VT = Op.getValueType();
+  // TODO: handle v16i8.
+  if (MVT::getSizeInBits(VT) == 16) {
+    // Transform it so it matches pextrw, which produces a 32-bit result.
+    MVT::ValueType EVT = (MVT::ValueType)(VT+1);
+    SDOperand Extract = DAG.getNode(X86ISD::PEXTRW, EVT,
+                                    Op.getOperand(0), Op.getOperand(1));
+    SDOperand Assert  = DAG.getNode(ISD::AssertZext, EVT, Extract,
+                                    DAG.getValueType(VT));
+    return DAG.getNode(ISD::TRUNCATE, VT, Assert);
+  } else if (MVT::getSizeInBits(VT) == 32) {
+    SDOperand Vec = Op.getOperand(0);
+    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
+    if (Idx == 0)
+      return Op;
+    // SHUFPS the element to the lowest double word, then movss.
+    MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(4);
+    SmallVector<SDOperand, 8> IdxVec;
+    IdxVec.push_back(DAG.getConstant(Idx, MVT::getVectorElementType(MaskVT)));
+    IdxVec.push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
+    IdxVec.push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
+    IdxVec.push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
+    SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+                                 &IdxVec[0], IdxVec.size());
+    Vec = DAG.getNode(ISD::VECTOR_SHUFFLE, Vec.getValueType(),
+                      Vec, DAG.getNode(ISD::UNDEF, Vec.getValueType()), Mask);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, VT, Vec,
+                       DAG.getConstant(0, getPointerTy()));
+  } else if (MVT::getSizeInBits(VT) == 64) {
+    SDOperand Vec = Op.getOperand(0);
+    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
+    if (Idx == 0)
+      return Op;
+
+    // UNPCKHPD the element to the lowest double word, then movsd.
+    // Note if the lower 64 bits of the result of the UNPCKHPD are then stored
+    // to an f64mem, the whole operation is folded into a single MOVHPDmr.
+    MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(4);
+    SmallVector<SDOperand, 8> IdxVec;
+    IdxVec.push_back(DAG.getConstant(1, MVT::getVectorElementType(MaskVT)));
+    IdxVec.push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
+    SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+                                 &IdxVec[0], IdxVec.size());
+    Vec = DAG.getNode(ISD::VECTOR_SHUFFLE, Vec.getValueType(),
+                      Vec, DAG.getNode(ISD::UNDEF, Vec.getValueType()), Mask);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, VT, Vec,
+                       DAG.getConstant(0, getPointerTy()));
+  }
+
+  return SDOperand();
+}
+
+SDOperand
+X86TargetLowering::LowerINSERT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) {
+  // Transform it so it matches pinsrw, which expects a 16-bit value in a GR32
+  // as its second argument.
+  MVT::ValueType VT = Op.getValueType();
+  MVT::ValueType BaseVT = MVT::getVectorElementType(VT);
+  SDOperand N0 = Op.getOperand(0);
+  SDOperand N1 = Op.getOperand(1);
+  SDOperand N2 = Op.getOperand(2);
+  if (MVT::getSizeInBits(BaseVT) == 16) {
+    if (N1.getValueType() != MVT::i32)
+      N1 = DAG.getNode(ISD::ANY_EXTEND, MVT::i32, N1);
+    if (N2.getValueType() != MVT::i32)
+      N2 = DAG.getConstant(cast<ConstantSDNode>(N2)->getValue(),getPointerTy());
+    return DAG.getNode(X86ISD::PINSRW, VT, N0, N1, N2);
+  } else if (MVT::getSizeInBits(BaseVT) == 32) {
+    unsigned Idx = cast<ConstantSDNode>(N2)->getValue();
+    if (Idx == 0) {
+      // Use a movss.
+      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, N1);
+      MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(4);
+      MVT::ValueType BaseVT = MVT::getVectorElementType(MaskVT);
+      SmallVector<SDOperand, 8> MaskVec;
+      MaskVec.push_back(DAG.getConstant(4, BaseVT));
+      for (unsigned i = 1; i <= 3; ++i)
+        MaskVec.push_back(DAG.getConstant(i, BaseVT));
+      return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, N0, N1,
+                         DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
+                                     &MaskVec[0], MaskVec.size()));
+    } else {
+      // Use two pinsrw instructions to insert a 32-bit value.
+      Idx <<= 1;
+      if (MVT::isFloatingPoint(N1.getValueType())) {
+        if (ISD::isNON_EXTLoad(N1.Val)) {
+          // Just load directly from f32mem to GR32.
+          LoadSDNode *LD = cast<LoadSDNode>(N1);
+          N1 = DAG.getLoad(MVT::i32, LD->getChain(), LD->getBasePtr(),
+                           LD->getSrcValue(), LD->getSrcValueOffset());
+        } else {
+          N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, MVT::v4f32, N1);
+          N1 = DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, N1);
+          N1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i32, N1,
+                           DAG.getConstant(0, getPointerTy()));
+        }
+      }
+      N0 = DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, N0);
+      N0 = DAG.getNode(X86ISD::PINSRW, MVT::v8i16, N0, N1,
+                       DAG.getConstant(Idx, getPointerTy()));
+      N1 = DAG.getNode(ISD::SRL, MVT::i32, N1, DAG.getConstant(16, MVT::i8));
+      N0 = DAG.getNode(X86ISD::PINSRW, MVT::v8i16, N0, N1,
+                       DAG.getConstant(Idx+1, getPointerTy()));
+      return DAG.getNode(ISD::BIT_CONVERT, VT, N0);
+    }
+  }
+
+  return SDOperand();
+}
+
+SDOperand
+X86TargetLowering::LowerSCALAR_TO_VECTOR(SDOperand Op, SelectionDAG &DAG) {
+  SDOperand AnyExt = DAG.getNode(ISD::ANY_EXTEND, MVT::i32, Op.getOperand(0));
+  return DAG.getNode(X86ISD::S2VEC, Op.getValueType(), AnyExt);
+}
+
+// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
+// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
+// one of the above-mentioned nodes. It has to be wrapped because otherwise
+// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
+// be used to form addressing modes. These wrapped nodes will be selected
+// into MOV32ri.
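+//
+// For example, with 32-bit PIC (non-RIP-relative) codegen a GlobalAddress G is
+// lowered to roughly (add GlobalBaseReg, (Wrapper (TargetGlobalAddress G)));
+// without PIC it is simply (Wrapper (TargetGlobalAddress G)).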
+SDOperand
+X86TargetLowering::LowerConstantPool(SDOperand Op, SelectionDAG &DAG) {
+  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+  SDOperand Result = DAG.getTargetConstantPool(CP->getConstVal(),
+                                               getPointerTy(),
+                                               CP->getAlignment());
+  Result = DAG.getNode(X86ISD::Wrapper, getPointerTy(), Result);
+  // With PIC, the address is actually $g + Offset.
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+      !Subtarget->isPICStyleRIPRel()) {
+    Result = DAG.getNode(ISD::ADD, getPointerTy(),
+                         DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()),
+                         Result);
+  }
+
+  return Result;
+}
+
+SDOperand
+X86TargetLowering::LowerGlobalAddress(SDOperand Op, SelectionDAG &DAG) {
+  GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  SDOperand Result = DAG.getTargetGlobalAddress(GV, getPointerTy());
+  Result = DAG.getNode(X86ISD::Wrapper, getPointerTy(), Result);
+  // With PIC, the address is actually $g + Offset.
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+      !Subtarget->isPICStyleRIPRel()) {
+    Result = DAG.getNode(ISD::ADD, getPointerTy(),
+                         DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()),
+                         Result);
+  }
+  
+  // For Darwin & Mingw32, external and weak symbols are indirect, so we want to
+  // load the value at address GV, not the value of GV itself. This means that
+  // the GlobalAddress must be in the base or index register of the address, not
+  // the GV offset field. The platform check is inside the GVRequiresExtraLoad()
+  // call. The same applies to external symbols during PIC codegen.
+  if (Subtarget->GVRequiresExtraLoad(GV, getTargetMachine(), false))
+    Result = DAG.getLoad(getPointerTy(), DAG.getEntryNode(), Result, NULL, 0);
+
+  return Result;
+}
+
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model
+static SDOperand
+LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+                              const MVT::ValueType PtrVT) {
+  SDOperand InFlag;
+  SDOperand Chain = DAG.getCopyToReg(DAG.getEntryNode(), X86::EBX,
+                                     DAG.getNode(X86ISD::GlobalBaseReg,
+                                                 PtrVT), InFlag);
+  InFlag = Chain.getValue(1);
+
+  // emit leal symbol@TLSGD(,%ebx,1), %eax
+  SDVTList NodeTys = DAG.getVTList(PtrVT, MVT::Other, MVT::Flag);
+  SDOperand TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
+                                             GA->getValueType(0),
+                                             GA->getOffset());
+  SDOperand Ops[] = { Chain,  TGA, InFlag };
+  SDOperand Result = DAG.getNode(X86ISD::TLSADDR, NodeTys, Ops, 3);
+  InFlag = Result.getValue(2);
+  Chain = Result.getValue(1);
+
+  // call ___tls_get_addr. This function receives its argument in
+  // the register EAX.
+  Chain = DAG.getCopyToReg(Chain, X86::EAX, Result, InFlag);
+  InFlag = Chain.getValue(1);
+
+  NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+  SDOperand Ops1[] = { Chain,
+                      DAG.getTargetExternalSymbol("___tls_get_addr",
+                                                  PtrVT),
+                      DAG.getRegister(X86::EAX, PtrVT),
+                      DAG.getRegister(X86::EBX, PtrVT),
+                      InFlag };
+  Chain = DAG.getNode(X86ISD::CALL, NodeTys, Ops1, 5);
+  InFlag = Chain.getValue(1);
+
+  return DAG.getCopyFromReg(Chain, X86::EAX, PtrVT, InFlag);
+}
+
+// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
+// "local exec" model.
+static SDOperand
+LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+                         const MVT::ValueType PtrVT) {
+  // Get the Thread Pointer
+  SDOperand ThreadPointer = DAG.getNode(X86ISD::THREAD_POINTER, PtrVT);
+  // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial
+  // exec)
+  SDOperand TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
+                                             GA->getValueType(0),
+                                             GA->getOffset());
+  SDOperand Offset = DAG.getNode(X86ISD::Wrapper, PtrVT, TGA);
+
+  if (GA->getGlobal()->isDeclaration()) // initial exec TLS model
+    Offset = DAG.getLoad(PtrVT, DAG.getEntryNode(), Offset, NULL, 0);
+
+  // The address of the thread local variable is the sum of the thread
+  // pointer and the offset of the variable.
+  return DAG.getNode(ISD::ADD, PtrVT, ThreadPointer, Offset);
+}
+
+SDOperand
+X86TargetLowering::LowerGlobalTLSAddress(SDOperand Op, SelectionDAG &DAG) {
+  // TODO: implement the "local dynamic" model
+  // TODO: implement the "initial exec" model for PIC executables
+  assert(!Subtarget->is64Bit() && Subtarget->isTargetELF() &&
+         "TLS is only implemented for 32-bit ELF targets");
+  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+  // If the relocation model is PIC, use the "General Dynamic" TLS model;
+  // otherwise use the "Local Exec" TLS model.
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_)
+    return LowerToTLSGeneralDynamicModel(GA, DAG, getPointerTy());
+  else
+    return LowerToTLSExecModel(GA, DAG, getPointerTy());
+}
+
+SDOperand
+X86TargetLowering::LowerExternalSymbol(SDOperand Op, SelectionDAG &DAG) {
+  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
+  SDOperand Result = DAG.getTargetExternalSymbol(Sym, getPointerTy());
+  Result = DAG.getNode(X86ISD::Wrapper, getPointerTy(), Result);
+  // With PIC, the address is actually $g + Offset.
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+      !Subtarget->isPICStyleRIPRel()) {
+    Result = DAG.getNode(ISD::ADD, getPointerTy(),
+                         DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()),
+                         Result);
+  }
+
+  return Result;
+}
+
+SDOperand X86TargetLowering::LowerJumpTable(SDOperand Op, SelectionDAG &DAG) {
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+  SDOperand Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy());
+  Result = DAG.getNode(X86ISD::Wrapper, getPointerTy(), Result);
+  // With PIC, the address is actually $g + Offset.
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+      !Subtarget->isPICStyleRIPRel()) {
+    Result = DAG.getNode(ISD::ADD, getPointerTy(),
+                         DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()),
+                         Result);
+  }
+
+  return Result;
+}
+
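+// LowerShift - Lower SHL_PARTS, SRA_PARTS, and SRL_PARTS (i64 shifts expressed
+// as two i32 halves) using SHLD/SHRD plus CMOVs selected on the result of
+// testing (ShAmt & 32), i.e. whether the shift amount is >= 32.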
+SDOperand X86TargetLowering::LowerShift(SDOperand Op, SelectionDAG &DAG) {
+    assert(Op.getNumOperands() == 3 && Op.getValueType() == MVT::i32 &&
+           "Not an i64 shift!");
+    bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
+    SDOperand ShOpLo = Op.getOperand(0);
+    SDOperand ShOpHi = Op.getOperand(1);
+    SDOperand ShAmt  = Op.getOperand(2);
+    SDOperand Tmp1 = isSRA ?
+      DAG.getNode(ISD::SRA, MVT::i32, ShOpHi, DAG.getConstant(31, MVT::i8)) :
+      DAG.getConstant(0, MVT::i32);
+
+    SDOperand Tmp2, Tmp3;
+    if (Op.getOpcode() == ISD::SHL_PARTS) {
+      Tmp2 = DAG.getNode(X86ISD::SHLD, MVT::i32, ShOpHi, ShOpLo, ShAmt);
+      Tmp3 = DAG.getNode(ISD::SHL, MVT::i32, ShOpLo, ShAmt);
+    } else {
+      Tmp2 = DAG.getNode(X86ISD::SHRD, MVT::i32, ShOpLo, ShOpHi, ShAmt);
+      Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, MVT::i32, ShOpHi, ShAmt);
+    }
+
+    const MVT::ValueType *VTs = DAG.getNodeValueTypes(MVT::Other, MVT::Flag);
+    SDOperand AndNode = DAG.getNode(ISD::AND, MVT::i8, ShAmt,
+                                    DAG.getConstant(32, MVT::i8));
+    SDOperand COps[]={DAG.getEntryNode(), AndNode, DAG.getConstant(0, MVT::i8)};
+    SDOperand InFlag = DAG.getNode(X86ISD::CMP, VTs, 2, COps, 3).getValue(1);
+
+    SDOperand Hi, Lo;
+    SDOperand CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+
+    VTs = DAG.getNodeValueTypes(MVT::i32, MVT::Flag);
+    SmallVector<SDOperand, 4> Ops;
+    if (Op.getOpcode() == ISD::SHL_PARTS) {
+      Ops.push_back(Tmp2);
+      Ops.push_back(Tmp3);
+      Ops.push_back(CC);
+      Ops.push_back(InFlag);
+      Hi = DAG.getNode(X86ISD::CMOV, VTs, 2, &Ops[0], Ops.size());
+      InFlag = Hi.getValue(1);
+
+      Ops.clear();
+      Ops.push_back(Tmp3);
+      Ops.push_back(Tmp1);
+      Ops.push_back(CC);
+      Ops.push_back(InFlag);
+      Lo = DAG.getNode(X86ISD::CMOV, VTs, 2, &Ops[0], Ops.size());
+    } else {
+      Ops.push_back(Tmp2);
+      Ops.push_back(Tmp3);
+      Ops.push_back(CC);
+      Ops.push_back(InFlag);
+      Lo = DAG.getNode(X86ISD::CMOV, VTs, 2, &Ops[0], Ops.size());
+      InFlag = Lo.getValue(1);
+
+      Ops.clear();
+      Ops.push_back(Tmp3);
+      Ops.push_back(Tmp1);
+      Ops.push_back(CC);
+      Ops.push_back(InFlag);
+      Hi = DAG.getNode(X86ISD::CMOV, VTs, 2, &Ops[0], Ops.size());
+    }
+
+    VTs = DAG.getNodeValueTypes(MVT::i32, MVT::i32);
+    Ops.clear();
+    Ops.push_back(Lo);
+    Ops.push_back(Hi);
+    return DAG.getNode(ISD::MERGE_VALUES, VTs, 2, &Ops[0], Ops.size());
+}
+
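+// LowerSINT_TO_FP - Lower a signed integer-to-FP conversion by storing the
+// integer to a stack slot and issuing an FILD. When SSE is enabled, the x87
+// result is stored back to memory and reloaded as an SSE value.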
+SDOperand X86TargetLowering::LowerSINT_TO_FP(SDOperand Op, SelectionDAG &DAG) {
+  assert(Op.getOperand(0).getValueType() <= MVT::i64 &&
+         Op.getOperand(0).getValueType() >= MVT::i16 &&
+         "Unknown SINT_TO_FP to lower!");
+
+  SDOperand Result;
+  MVT::ValueType SrcVT = Op.getOperand(0).getValueType();
+  unsigned Size = MVT::getSizeInBits(SrcVT)/8;
+  MachineFunction &MF = DAG.getMachineFunction();
+  int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size);
+  SDOperand StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+  SDOperand Chain = DAG.getStore(DAG.getEntryNode(), Op.getOperand(0),
+                                 StackSlot, NULL, 0);
+
+  // Build the FILD
+  SDVTList Tys;
+  if (X86ScalarSSE)
+    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag);
+  else
+    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
+  SmallVector<SDOperand, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(StackSlot);
+  Ops.push_back(DAG.getValueType(SrcVT));
+  Result = DAG.getNode(X86ScalarSSE ? X86ISD::FILD_FLAG :X86ISD::FILD,
+                       Tys, &Ops[0], Ops.size());
+
+  if (X86ScalarSSE) {
+    Chain = Result.getValue(1);
+    SDOperand InFlag = Result.getValue(2);
+
+    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
+    // shouldn't be necessary except that RFP cannot be live across
+    // multiple blocks. When stackifier is fixed, they can be uncoupled.
+    MachineFunction &MF = DAG.getMachineFunction();
+    int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8);
+    SDOperand StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+    Tys = DAG.getVTList(MVT::Other);
+    SmallVector<SDOperand, 8> Ops;
+    Ops.push_back(Chain);
+    Ops.push_back(Result);
+    Ops.push_back(StackSlot);
+    Ops.push_back(DAG.getValueType(Op.getValueType()));
+    Ops.push_back(InFlag);
+    Chain = DAG.getNode(X86ISD::FST, Tys, &Ops[0], Ops.size());
+    Result = DAG.getLoad(Op.getValueType(), Chain, StackSlot, NULL, 0);
+  }
+
+  return Result;
+}
+
+SDOperand X86TargetLowering::LowerFP_TO_SINT(SDOperand Op, SelectionDAG &DAG) {
+  assert(Op.getValueType() <= MVT::i64 && Op.getValueType() >= MVT::i16 &&
+         "Unknown FP_TO_SINT to lower!");
+  // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary
+  // stack slot.
+  MachineFunction &MF = DAG.getMachineFunction();
+  unsigned MemSize = MVT::getSizeInBits(Op.getValueType())/8;
+  int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize);
+  SDOperand StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+
+  unsigned Opc;
+  switch (Op.getValueType()) {
+    default: assert(0 && "Invalid FP_TO_SINT to lower!");
+    case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
+    case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
+    case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
+  }
+
+  SDOperand Chain = DAG.getEntryNode();
+  SDOperand Value = Op.getOperand(0);
+  if (X86ScalarSSE) {
+    assert(Op.getValueType() == MVT::i64 && "Invalid FP_TO_SINT to lower!");
+    Chain = DAG.getStore(Chain, Value, StackSlot, NULL, 0);
+    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
+    SDOperand Ops[] = {
+      Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType())
+    };
+    Value = DAG.getNode(X86ISD::FLD, Tys, Ops, 3);
+    Chain = Value.getValue(1);
+    SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize);
+    StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+  }
+
+  // Build the FP_TO_INT*_IN_MEM
+  SDOperand Ops[] = { Chain, Value, StackSlot };
+  SDOperand FIST = DAG.getNode(Opc, MVT::Other, Ops, 3);
+
+  // Load the result.
+  return DAG.getLoad(Op.getValueType(), FIST, StackSlot, NULL, 0);
+}
+
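+// LowerFABS - Lower FABS to an FAND with a constant-pool vector whose elements
+// have every bit set except the sign bit.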
+SDOperand X86TargetLowering::LowerFABS(SDOperand Op, SelectionDAG &DAG) {
+  MVT::ValueType VT = Op.getValueType();
+  MVT::ValueType EltVT = VT;
+  if (MVT::isVector(VT))
+    EltVT = MVT::getVectorElementType(VT);
+  const Type *OpNTy =  MVT::getTypeForValueType(EltVT);
+  std::vector<Constant*> CV;
+  if (EltVT == MVT::f64) {
+    Constant *C = ConstantFP::get(OpNTy, BitsToDouble(~(1ULL << 63)));
+    CV.push_back(C);
+    CV.push_back(C);
+  } else {
+    Constant *C = ConstantFP::get(OpNTy, BitsToFloat(~(1U << 31)));
+    CV.push_back(C);
+    CV.push_back(C);
+    CV.push_back(C);
+    CV.push_back(C);
+  }
+  Constant *CS = ConstantStruct::get(CV);
+  SDOperand CPIdx = DAG.getConstantPool(CS, getPointerTy(), 4);
+  SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+  SmallVector<SDOperand, 3> Ops;
+  Ops.push_back(DAG.getEntryNode());
+  Ops.push_back(CPIdx);
+  Ops.push_back(DAG.getSrcValue(NULL));
+  SDOperand Mask = DAG.getNode(X86ISD::LOAD_PACK, Tys, &Ops[0], Ops.size());
+  return DAG.getNode(X86ISD::FAND, VT, Op.getOperand(0), Mask);
+}
+
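+// LowerFNEG - Lower FNEG to an FXOR with a constant-pool vector whose elements
+// have only the sign bit set.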
+SDOperand X86TargetLowering::LowerFNEG(SDOperand Op, SelectionDAG &DAG) {
+  MVT::ValueType VT = Op.getValueType();
+  MVT::ValueType EltVT = VT;
+  if (MVT::isVector(VT))
+    EltVT = MVT::getVectorElementType(VT);
+  const Type *OpNTy =  MVT::getTypeForValueType(EltVT);
+  std::vector<Constant*> CV;
+  if (EltVT == MVT::f64) {
+    Constant *C = ConstantFP::get(OpNTy, BitsToDouble(1ULL << 63));
+    CV.push_back(C);
+    CV.push_back(C);
+  } else {
+    Constant *C = ConstantFP::get(OpNTy, BitsToFloat(1U << 31));
+    CV.push_back(C);
+    CV.push_back(C);
+    CV.push_back(C);
+    CV.push_back(C);
+  }
+  Constant *CS = ConstantStruct::get(CV);
+  SDOperand CPIdx = DAG.getConstantPool(CS, getPointerTy(), 4);
+  SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+  SmallVector<SDOperand, 3> Ops;
+  Ops.push_back(DAG.getEntryNode());
+  Ops.push_back(CPIdx);
+  Ops.push_back(DAG.getSrcValue(NULL));
+  SDOperand Mask = DAG.getNode(X86ISD::LOAD_PACK, Tys, &Ops[0], Ops.size());
+  return DAG.getNode(X86ISD::FXOR, VT, Op.getOperand(0), Mask);
+}
+
+SDOperand X86TargetLowering::LowerFCOPYSIGN(SDOperand Op, SelectionDAG &DAG) {
+  SDOperand Op0 = Op.getOperand(0);
+  SDOperand Op1 = Op.getOperand(1);
+  MVT::ValueType VT = Op.getValueType();
+  MVT::ValueType SrcVT = Op1.getValueType();
+  const Type *SrcTy =  MVT::getTypeForValueType(SrcVT);
+
+  // If second operand is smaller, extend it first.
+  if (MVT::getSizeInBits(SrcVT) < MVT::getSizeInBits(VT)) {
+    Op1 = DAG.getNode(ISD::FP_EXTEND, VT, Op1);
+    SrcVT = VT;
+  }
+
+  // First get the sign bit of second operand.
+  std::vector<Constant*> CV;
+  if (SrcVT == MVT::f64) {
+    CV.push_back(ConstantFP::get(SrcTy, BitsToDouble(1ULL << 63)));
+    CV.push_back(ConstantFP::get(SrcTy, 0.0));
+  } else {
+    CV.push_back(ConstantFP::get(SrcTy, BitsToFloat(1U << 31)));
+    CV.push_back(ConstantFP::get(SrcTy, 0.0));
+    CV.push_back(ConstantFP::get(SrcTy, 0.0));
+    CV.push_back(ConstantFP::get(SrcTy, 0.0));
+  }
+  Constant *CS = ConstantStruct::get(CV);
+  SDOperand CPIdx = DAG.getConstantPool(CS, getPointerTy(), 4);
+  SDVTList Tys = DAG.getVTList(SrcVT, MVT::Other);
+  SmallVector<SDOperand, 3> Ops;
+  Ops.push_back(DAG.getEntryNode());
+  Ops.push_back(CPIdx);
+  Ops.push_back(DAG.getSrcValue(NULL));
+  SDOperand Mask1 = DAG.getNode(X86ISD::LOAD_PACK, Tys, &Ops[0], Ops.size());
+  SDOperand SignBit = DAG.getNode(X86ISD::FAND, SrcVT, Op1, Mask1);
+
+  // Shift sign bit right or left if the two operands have different types.
+  if (MVT::getSizeInBits(SrcVT) > MVT::getSizeInBits(VT)) {
+    // Op0 is MVT::f32, Op1 is MVT::f64.
+    SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, MVT::v2f64, SignBit);
+    SignBit = DAG.getNode(X86ISD::FSRL, MVT::v2f64, SignBit,
+                          DAG.getConstant(32, MVT::i32));
+    SignBit = DAG.getNode(ISD::BIT_CONVERT, MVT::v4f32, SignBit);
+    SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::f32, SignBit,
+                          DAG.getConstant(0, getPointerTy()));
+  }
+
+  // Clear first operand sign bit.
+  CV.clear();
+  if (VT == MVT::f64) {
+    CV.push_back(ConstantFP::get(SrcTy, BitsToDouble(~(1ULL << 63))));
+    CV.push_back(ConstantFP::get(SrcTy, 0.0));
+  } else {
+    CV.push_back(ConstantFP::get(SrcTy, BitsToFloat(~(1U << 31))));
+    CV.push_back(ConstantFP::get(SrcTy, 0.0));
+    CV.push_back(ConstantFP::get(SrcTy, 0.0));
+    CV.push_back(ConstantFP::get(SrcTy, 0.0));
+  }
+  CS = ConstantStruct::get(CV);
+  CPIdx = DAG.getConstantPool(CS, getPointerTy(), 4);
+  Tys = DAG.getVTList(VT, MVT::Other);
+  Ops.clear();
+  Ops.push_back(DAG.getEntryNode());
+  Ops.push_back(CPIdx);
+  Ops.push_back(DAG.getSrcValue(NULL));
+  SDOperand Mask2 = DAG.getNode(X86ISD::LOAD_PACK, Tys, &Ops[0], Ops.size());
+  SDOperand Val = DAG.getNode(X86ISD::FAND, VT, Op0, Mask2);
+
+  // Or the value with the sign bit.
+  return DAG.getNode(X86ISD::FOR, VT, Val, SignBit);
+}
+
+SDOperand X86TargetLowering::LowerSETCC(SDOperand Op, SelectionDAG &DAG,
+                                        SDOperand Chain) {
+  assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
+  SDOperand Cond;
+  SDOperand Op0 = Op.getOperand(0);
+  SDOperand Op1 = Op.getOperand(1);
+  SDOperand CC = Op.getOperand(2);
+  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+  const MVT::ValueType *VTs1 = DAG.getNodeValueTypes(MVT::Other, MVT::Flag);
+  const MVT::ValueType *VTs2 = DAG.getNodeValueTypes(MVT::i8, MVT::Flag);
+  bool isFP = MVT::isFloatingPoint(Op.getOperand(1).getValueType());
+  unsigned X86CC;
+
+  if (translateX86CC(cast<CondCodeSDNode>(CC)->get(), isFP, X86CC,
+                     Op0, Op1, DAG)) {
+    SDOperand Ops1[] = { Chain, Op0, Op1 };
+    Cond = DAG.getNode(X86ISD::CMP, VTs1, 2, Ops1, 3).getValue(1);
+    SDOperand Ops2[] = { DAG.getConstant(X86CC, MVT::i8), Cond };
+    return DAG.getNode(X86ISD::SETCC, VTs2, 2, Ops2, 2);
+  }
+
+  assert(isFP && "Illegal integer SetCC!");
+
+  SDOperand COps[] = { Chain, Op0, Op1 };
+  Cond = DAG.getNode(X86ISD::CMP, VTs1, 2, COps, 3).getValue(1);
+
+  switch (SetCCOpcode) {
+  default: assert(false && "Illegal floating point SetCC!");
+  case ISD::SETOEQ: {  // !PF & ZF
+    SDOperand Ops1[] = { DAG.getConstant(X86::COND_NP, MVT::i8), Cond };
+    SDOperand Tmp1 = DAG.getNode(X86ISD::SETCC, VTs2, 2, Ops1, 2);
+    SDOperand Ops2[] = { DAG.getConstant(X86::COND_E, MVT::i8),
+                         Tmp1.getValue(1) };
+    SDOperand Tmp2 = DAG.getNode(X86ISD::SETCC, VTs2, 2, Ops2, 2);
+    return DAG.getNode(ISD::AND, MVT::i8, Tmp1, Tmp2);
+  }
+  case ISD::SETUNE: {  // PF | !ZF
+    SDOperand Ops1[] = { DAG.getConstant(X86::COND_P, MVT::i8), Cond };
+    SDOperand Tmp1 = DAG.getNode(X86ISD::SETCC, VTs2, 2, Ops1, 2);
+    SDOperand Ops2[] = { DAG.getConstant(X86::COND_NE, MVT::i8),
+                         Tmp1.getValue(1) };
+    SDOperand Tmp2 = DAG.getNode(X86ISD::SETCC, VTs2, 2, Ops2, 2);
+    return DAG.getNode(ISD::OR, MVT::i8, Tmp1, Tmp2);
+  }
+  }
+}
+
+SDOperand X86TargetLowering::LowerSELECT(SDOperand Op, SelectionDAG &DAG) {
+  bool addTest = true;
+  SDOperand Chain = DAG.getEntryNode();
+  SDOperand Cond  = Op.getOperand(0);
+  SDOperand CC;
+  const MVT::ValueType *VTs = DAG.getNodeValueTypes(MVT::Other, MVT::Flag);
+
+  if (Cond.getOpcode() == ISD::SETCC)
+    Cond = LowerSETCC(Cond, DAG, Chain);
+
+  if (Cond.getOpcode() == X86ISD::SETCC) {
+    CC = Cond.getOperand(0);
+
+    // If the condition flag is set by an X86ISD::CMP, then make a copy of it
+    // (since the flag operand cannot be shared). Use it as the condition-setting
+    // operand in place of the X86ISD::SETCC.
+    // If the X86ISD::SETCC has more than one use, then perhaps it's better
+    // to use a test instead of duplicating the X86ISD::CMP (for register
+    // pressure reasons)?
+    SDOperand Cmp = Cond.getOperand(1);
+    unsigned Opc = Cmp.getOpcode();
+    bool IllegalFPCMov = !X86ScalarSSE &&
+      MVT::isFloatingPoint(Op.getValueType()) &&
+      !hasFPCMov(cast<ConstantSDNode>(CC)->getSignExtended());
+    if ((Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) &&
+        !IllegalFPCMov) {
+      SDOperand Ops[] = { Chain, Cmp.getOperand(1), Cmp.getOperand(2) };
+      Cond = DAG.getNode(Opc, VTs, 2, Ops, 3);
+      addTest = false;
+    }
+  }
+
+  if (addTest) {
+    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+    SDOperand Ops[] = { Chain, Cond, DAG.getConstant(0, MVT::i8) };
+    Cond = DAG.getNode(X86ISD::CMP, VTs, 2, Ops, 3);
+  }
+
+  VTs = DAG.getNodeValueTypes(Op.getValueType(), MVT::Flag);
+  SmallVector<SDOperand, 4> Ops;
+  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
+  // the condition is true.
+  Ops.push_back(Op.getOperand(2));
+  Ops.push_back(Op.getOperand(1));
+  Ops.push_back(CC);
+  Ops.push_back(Cond.getValue(1));
+  return DAG.getNode(X86ISD::CMOV, VTs, 2, &Ops[0], Ops.size());
+}
+
+SDOperand X86TargetLowering::LowerBRCOND(SDOperand Op, SelectionDAG &DAG) {
+  bool addTest = true;
+  SDOperand Chain = Op.getOperand(0);
+  SDOperand Cond  = Op.getOperand(1);
+  SDOperand Dest  = Op.getOperand(2);
+  SDOperand CC;
+  const MVT::ValueType *VTs = DAG.getNodeValueTypes(MVT::Other, MVT::Flag);
+
+  if (Cond.getOpcode() == ISD::SETCC)
+    Cond = LowerSETCC(Cond, DAG, Chain);
+
+  if (Cond.getOpcode() == X86ISD::SETCC) {
+    CC = Cond.getOperand(0);
+
+    // If the condition flag is set by an X86ISD::CMP, then make a copy of it
+    // (since the flag operand cannot be shared). Use it as the condition-setting
+    // operand in place of the X86ISD::SETCC.
+    // If the X86ISD::SETCC has more than one use, then perhaps it's better
+    // to use a test instead of duplicating the X86ISD::CMP (for register
+    // pressure reasons)?
+    SDOperand Cmp = Cond.getOperand(1);
+    unsigned Opc = Cmp.getOpcode();
+    if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) {
+      SDOperand Ops[] = { Chain, Cmp.getOperand(1), Cmp.getOperand(2) };
+      Cond = DAG.getNode(Opc, VTs, 2, Ops, 3);
+      addTest = false;
+    }
+  }
+
+  if (addTest) {
+    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+    SDOperand Ops[] = { Chain, Cond, DAG.getConstant(0, MVT::i8) };
+    Cond = DAG.getNode(X86ISD::CMP, VTs, 2, Ops, 3);
+  }
+  return DAG.getNode(X86ISD::BRCOND, Op.getValueType(),
+                     Cond, Dest, CC, Cond.getValue(1));
+}
+
+SDOperand X86TargetLowering::LowerCALL(SDOperand Op, SelectionDAG &DAG) {
+  unsigned CallingConv= cast<ConstantSDNode>(Op.getOperand(1))->getValue();
+
+  if (Subtarget->is64Bit())
+    return LowerX86_64CCCCallTo(Op, DAG, CallingConv);
+  else
+    switch (CallingConv) {
+    default:
+      assert(0 && "Unsupported calling convention");
+    case CallingConv::Fast:
+      // TODO: Implement fastcc
+      // Falls through
+    case CallingConv::C:
+    case CallingConv::X86_StdCall:
+      return LowerCCCCallTo(Op, DAG, CallingConv);
+    case CallingConv::X86_FastCall:
+      return LowerFastCCCallTo(Op, DAG, CallingConv);
+    }
+}
+
+
+// Lower dynamic stack allocation to an _alloca call for Cygwin/Mingw targets.
+// Calls to _alloca are needed to probe the stack when allocating more than 4k
+// bytes in one go. Touching the stack at 4K increments is necessary to ensure
+// that the guard pages used by the OS virtual memory manager are allocated in
+// the correct sequence.
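+//
+// The lowered sequence is roughly:
+//   movl <size>, %eax
+//   call _alloca
+// with the adjusted stack pointer returned as the result.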
+SDOperand
+X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDOperand Op,
+                                           SelectionDAG &DAG) {
+  assert(Subtarget->isTargetCygMing() &&
+         "This should be used only on Cygwin/Mingw targets");
+  
+  // Get the inputs.
+  SDOperand Chain = Op.getOperand(0);
+  SDOperand Size  = Op.getOperand(1);
+  // FIXME: Ensure alignment here
+
+  SDOperand Flag;
+  
+  MVT::ValueType IntPtr = getPointerTy();
+  MVT::ValueType SPTy = (Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
+
+  Chain = DAG.getCopyToReg(Chain, X86::EAX, Size, Flag);
+  Flag = Chain.getValue(1);
+
+  SDVTList  NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+  SDOperand Ops[] = { Chain,
+                      DAG.getTargetExternalSymbol("_alloca", IntPtr),
+                      DAG.getRegister(X86::EAX, IntPtr),
+                      Flag };
+  Chain = DAG.getNode(X86ISD::CALL, NodeTys, Ops, 4);
+  Flag = Chain.getValue(1);
+
+  Chain = DAG.getCopyFromReg(Chain, X86StackPtr, SPTy).getValue(1);
+  
+  std::vector<MVT::ValueType> Tys;
+  Tys.push_back(SPTy);
+  Tys.push_back(MVT::Other);
+  SDOperand Ops1[2] = { Chain.getValue(0), Chain };
+  return DAG.getNode(ISD::MERGE_VALUES, Tys, Ops1, 2);
+}
+
+SDOperand
+X86TargetLowering::LowerFORMAL_ARGUMENTS(SDOperand Op, SelectionDAG &DAG) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  const Function* Fn = MF.getFunction();
+  if (Fn->hasExternalLinkage() &&
+      Subtarget->isTargetCygMing() &&
+      Fn->getName() == "main")
+    MF.getInfo<X86MachineFunctionInfo>()->setForceFramePointer(true);
+
+  unsigned CC = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
+  if (Subtarget->is64Bit())
+    return LowerX86_64CCCArguments(Op, DAG);
+  else
+    switch(CC) {
+    default:
+      assert(0 && "Unsupported calling convention");
+    case CallingConv::Fast:
+      // TODO: implement fastcc.
+      
+      // Falls through
+    case CallingConv::C:
+      return LowerCCCArguments(Op, DAG);
+    case CallingConv::X86_StdCall:
+      MF.getInfo<X86MachineFunctionInfo>()->setDecorationStyle(StdCall);
+      return LowerCCCArguments(Op, DAG, true);
+    case CallingConv::X86_FastCall:
+      MF.getInfo<X86MachineFunctionInfo>()->setDecorationStyle(FastCall);
+      return LowerFastCCArguments(Op, DAG);
+    }
+}
+
+SDOperand X86TargetLowering::LowerMEMSET(SDOperand Op, SelectionDAG &DAG) {
+  SDOperand InFlag(0, 0);
+  SDOperand Chain = Op.getOperand(0);
+  unsigned Align =
+    (unsigned)cast<ConstantSDNode>(Op.getOperand(4))->getValue();
+  if (Align == 0) Align = 1;
+
+  ConstantSDNode *I = dyn_cast<ConstantSDNode>(Op.getOperand(3));
+  // If the destination is not DWORD aligned, or the size is below the
+  // rep;stos threshold, call the memset library function instead. It knows
+  // how to align to the right boundary first.
+  if ((Align & 3) != 0 ||
+      (I && I->getValue() < Subtarget->getMinRepStrSizeThreshold())) {
+    MVT::ValueType IntPtr = getPointerTy();
+    const Type *IntPtrTy = getTargetData()->getIntPtrType();
+    TargetLowering::ArgListTy Args; 
+    TargetLowering::ArgListEntry Entry;
+    Entry.Node = Op.getOperand(1);
+    Entry.Ty = IntPtrTy;
+    Args.push_back(Entry);
+    // Extend the unsigned i8 argument to be an int value for the call.
+    Entry.Node = DAG.getNode(ISD::ZERO_EXTEND, MVT::i32, Op.getOperand(2));
+    Entry.Ty = IntPtrTy;
+    Args.push_back(Entry);
+    Entry.Node = Op.getOperand(3);
+    Args.push_back(Entry);
+    std::pair<SDOperand,SDOperand> CallResult =
+      LowerCallTo(Chain, Type::VoidTy, false, false, CallingConv::C, false,
+                  DAG.getExternalSymbol("memset", IntPtr), Args, DAG);
+    return CallResult.second;
+  }
+
+  MVT::ValueType AVT;
+  SDOperand Count;
+  ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+  unsigned BytesLeft = 0;
+  bool TwoRepStos = false;
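+  // When the byte count is a compile-time constant, the wide rep;stos covers
+  // count / element-size stores and the remaining 1-7 bytes are written with
+  // explicit stores below. Otherwise the count is shifted right by two and a
+  // second rep;stos (TwoRepStos) handles the leftover bytes.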
+  if (ValC) {
+    unsigned ValReg;
+    uint64_t Val = ValC->getValue() & 255;
+
+    // If the value is a constant, then we can potentially use larger sets.
+    switch (Align & 3) {
+      case 2:   // WORD aligned
+        AVT = MVT::i16;
+        ValReg = X86::AX;
+        Val = (Val << 8) | Val;
+        break;
+      case 0:  // DWORD aligned
+        AVT = MVT::i32;
+        ValReg = X86::EAX;
+        Val = (Val << 8)  | Val;
+        Val = (Val << 16) | Val;
+        if (Subtarget->is64Bit() && ((Align & 0xF) == 0)) {  // QWORD aligned
+          AVT = MVT::i64;
+          ValReg = X86::RAX;
+          Val = (Val << 32) | Val;
+        }
+        break;
+      default:  // Byte aligned
+        AVT = MVT::i8;
+        ValReg = X86::AL;
+        Count = Op.getOperand(3);
+        break;
+    }
+
+    if (AVT > MVT::i8) {
+      if (I) {
+        unsigned UBytes = MVT::getSizeInBits(AVT) / 8;
+        Count = DAG.getConstant(I->getValue() / UBytes, getPointerTy());
+        BytesLeft = I->getValue() % UBytes;
+      } else {
+        assert(AVT >= MVT::i32 &&
+               "Do not use rep;stos if not at least DWORD aligned");
+        Count = DAG.getNode(ISD::SRL, Op.getOperand(3).getValueType(),
+                            Op.getOperand(3), DAG.getConstant(2, MVT::i8));
+        TwoRepStos = true;
+      }
+    }
+
+    Chain  = DAG.getCopyToReg(Chain, ValReg, DAG.getConstant(Val, AVT),
+                              InFlag);
+    InFlag = Chain.getValue(1);
+  } else {
+    AVT = MVT::i8;
+    Count  = Op.getOperand(3);
+    Chain  = DAG.getCopyToReg(Chain, X86::AL, Op.getOperand(2), InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  Chain  = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RCX : X86::ECX,
+                            Count, InFlag);
+  InFlag = Chain.getValue(1);
+  Chain  = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RDI : X86::EDI,
+                            Op.getOperand(1), InFlag);
+  InFlag = Chain.getValue(1);
+
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+  SmallVector<SDOperand, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(DAG.getValueType(AVT));
+  Ops.push_back(InFlag);
+  Chain  = DAG.getNode(X86ISD::REP_STOS, Tys, &Ops[0], Ops.size());
+
+  if (TwoRepStos) {
+    InFlag = Chain.getValue(1);
+    Count = Op.getOperand(3);
+    MVT::ValueType CVT = Count.getValueType();
+    SDOperand Left = DAG.getNode(ISD::AND, CVT, Count,
+                               DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT));
+    Chain  = DAG.getCopyToReg(Chain, (CVT == MVT::i64) ? X86::RCX : X86::ECX,
+                              Left, InFlag);
+    InFlag = Chain.getValue(1);
+    Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+    Ops.clear();
+    Ops.push_back(Chain);
+    Ops.push_back(DAG.getValueType(MVT::i8));
+    Ops.push_back(InFlag);
+    Chain  = DAG.getNode(X86ISD::REP_STOS, Tys, &Ops[0], Ops.size());
+  } else if (BytesLeft) {
+    // Issue stores for the last 1 - 7 bytes.
+    SDOperand Value;
+    unsigned Val = ValC->getValue() & 255;
+    unsigned Offset = I->getValue() - BytesLeft;
+    SDOperand DstAddr = Op.getOperand(1);
+    MVT::ValueType AddrVT = DstAddr.getValueType();
+    if (BytesLeft >= 4) {
+      Val = (Val << 8)  | Val;
+      Val = (Val << 16) | Val;
+      Value = DAG.getConstant(Val, MVT::i32);
+      Chain = DAG.getStore(Chain, Value,
+                           DAG.getNode(ISD::ADD, AddrVT, DstAddr,
+                                       DAG.getConstant(Offset, AddrVT)),
+                           NULL, 0);
+      BytesLeft -= 4;
+      Offset += 4;
+    }
+    if (BytesLeft >= 2) {
+      Value = DAG.getConstant((Val << 8) | Val, MVT::i16);
+      Chain = DAG.getStore(Chain, Value,
+                           DAG.getNode(ISD::ADD, AddrVT, DstAddr,
+                                       DAG.getConstant(Offset, AddrVT)),
+                           NULL, 0);
+      BytesLeft -= 2;
+      Offset += 2;
+    }
+    if (BytesLeft == 1) {
+      Value = DAG.getConstant(Val, MVT::i8);
+      Chain = DAG.getStore(Chain, Value,
+                           DAG.getNode(ISD::ADD, AddrVT, DstAddr,
+                                       DAG.getConstant(Offset, AddrVT)),
+                           NULL, 0);
+    }
+  }
+
+  return Chain;
+}
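+// Illustrative note (not part of the lowering): for a constant-size,
+// constant-value call such as
+//
+//   void clear18(char *p) { memset(p, 0, 18); }
+//
+// the code above broadcasts the byte into EAX, puts 18/4 into ECX and the
+// destination into EDI, emits a single "rep stosl", and then issues one
+// 2-byte store at offset 16 from the original destination for the remainder.
+// Exact registers and the rep;stos width depend on the subtarget and on the
+// alignment computed above, so treat this only as a sketch.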
+
+SDOperand X86TargetLowering::LowerMEMCPY(SDOperand Op, SelectionDAG &DAG) {
+  SDOperand Chain = Op.getOperand(0);
+  unsigned Align =
+    (unsigned)cast<ConstantSDNode>(Op.getOperand(4))->getValue();
+  if (Align == 0) Align = 1;
+
+  ConstantSDNode *I = dyn_cast<ConstantSDNode>(Op.getOperand(3));
+  // If the operands are not at least DWORD aligned, or the (constant) size is
+  // below the rep-string threshold, just call the memcpy library function; it
+  // knows how to align to the right boundary first.
+  if ((Align & 3) != 0 ||
+      (I && I->getValue() < Subtarget->getMinRepStrSizeThreshold())) {
+    MVT::ValueType IntPtr = getPointerTy();
+    TargetLowering::ArgListTy Args;
+    TargetLowering::ArgListEntry Entry;
+    Entry.Ty = getTargetData()->getIntPtrType();
+    Entry.Node = Op.getOperand(1); Args.push_back(Entry);
+    Entry.Node = Op.getOperand(2); Args.push_back(Entry);
+    Entry.Node = Op.getOperand(3); Args.push_back(Entry);
+    std::pair<SDOperand,SDOperand> CallResult =
+      LowerCallTo(Chain, Type::VoidTy, false, false, CallingConv::C, false,
+                  DAG.getExternalSymbol("memcpy", IntPtr), Args, DAG);
+    return CallResult.second;
+  }
+
+  MVT::ValueType AVT;
+  SDOperand Count;
+  unsigned BytesLeft = 0;
+  bool TwoRepMovs = false;
+  switch (Align & 3) {
+    case 2:   // WORD aligned
+      AVT = MVT::i16;
+      break;
+    case 0:  // DWORD aligned
+      AVT = MVT::i32;
+      if (Subtarget->is64Bit() && ((Align & 0xF) == 0))  // QWORD aligned
+        AVT = MVT::i64;
+      break;
+    default:  // Byte aligned
+      AVT = MVT::i8;
+      Count = Op.getOperand(3);
+      break;
+  }
+
+  if (AVT > MVT::i8) {
+    if (I) {
+      unsigned UBytes = MVT::getSizeInBits(AVT) / 8;
+      Count = DAG.getConstant(I->getValue() / UBytes, getPointerTy());
+      BytesLeft = I->getValue() % UBytes;
+    } else {
+      assert(AVT >= MVT::i32 &&
+             "Do not use rep;movs if not at least DWORD aligned");
+      Count = DAG.getNode(ISD::SRL, Op.getOperand(3).getValueType(),
+                          Op.getOperand(3),
+                          DAG.getConstant((AVT == MVT::i64) ? 3 : 2, MVT::i8));
+      TwoRepMovs = true;
+    }
+  }
+
+  SDOperand InFlag(0, 0);
+  Chain  = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RCX : X86::ECX,
+                            Count, InFlag);
+  InFlag = Chain.getValue(1);
+  Chain  = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RDI : X86::EDI,
+                            Op.getOperand(1), InFlag);
+  InFlag = Chain.getValue(1);
+  Chain  = DAG.getCopyToReg(Chain, Subtarget->is64Bit() ? X86::RSI : X86::ESI,
+                            Op.getOperand(2), InFlag);
+  InFlag = Chain.getValue(1);
+
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+  SmallVector<SDOperand, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(DAG.getValueType(AVT));
+  Ops.push_back(InFlag);
+  Chain = DAG.getNode(X86ISD::REP_MOVS, Tys, &Ops[0], Ops.size());
+
+  if (TwoRepMovs) {
+    InFlag = Chain.getValue(1);
+    Count = Op.getOperand(3);
+    MVT::ValueType CVT = Count.getValueType();
+    SDOperand Left = DAG.getNode(ISD::AND, CVT, Count,
+                               DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT));
+    Chain  = DAG.getCopyToReg(Chain, (CVT == MVT::i64) ? X86::RCX : X86::ECX,
+                              Left, InFlag);
+    InFlag = Chain.getValue(1);
+    Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+    Ops.clear();
+    Ops.push_back(Chain);
+    Ops.push_back(DAG.getValueType(MVT::i8));
+    Ops.push_back(InFlag);
+    Chain = DAG.getNode(X86ISD::REP_MOVS, Tys, &Ops[0], Ops.size());
+  } else if (BytesLeft) {
+    // Issue loads and stores for the last 1 - 7 bytes.
+    unsigned Offset = I->getValue() - BytesLeft;
+    SDOperand DstAddr = Op.getOperand(1);
+    MVT::ValueType DstVT = DstAddr.getValueType();
+    SDOperand SrcAddr = Op.getOperand(2);
+    MVT::ValueType SrcVT = SrcAddr.getValueType();
+    SDOperand Value;
+    if (BytesLeft >= 4) {
+      Value = DAG.getLoad(MVT::i32, Chain,
+                          DAG.getNode(ISD::ADD, SrcVT, SrcAddr,
+                                      DAG.getConstant(Offset, SrcVT)),
+                          NULL, 0);
+      Chain = Value.getValue(1);
+      Chain = DAG.getStore(Chain, Value,
+                           DAG.getNode(ISD::ADD, DstVT, DstAddr,
+                                       DAG.getConstant(Offset, DstVT)),
+                           NULL, 0);
+      BytesLeft -= 4;
+      Offset += 4;
+    }
+    if (BytesLeft >= 2) {
+      Value = DAG.getLoad(MVT::i16, Chain,
+                          DAG.getNode(ISD::ADD, SrcVT, SrcAddr,
+                                      DAG.getConstant(Offset, SrcVT)),
+                          NULL, 0);
+      Chain = Value.getValue(1);
+      Chain = DAG.getStore(Chain, Value,
+                           DAG.getNode(ISD::ADD, DstVT, DstAddr,
+                                       DAG.getConstant(Offset, DstVT)),
+                           NULL, 0);
+      BytesLeft -= 2;
+      Offset += 2;
+    }
+
+    if (BytesLeft == 1) {
+      Value = DAG.getLoad(MVT::i8, Chain,
+                          DAG.getNode(ISD::ADD, SrcVT, SrcAddr,
+                                      DAG.getConstant(Offset, SrcVT)),
+                          NULL, 0);
+      Chain = Value.getValue(1);
+      Chain = DAG.getStore(Chain, Value,
+                           DAG.getNode(ISD::ADD, DstVT, DstAddr,
+                                       DAG.getConstant(Offset, DstVT)),
+                           NULL, 0);
+    }
+  }
+
+  return Chain;
+}
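+// Illustrative note (not part of the lowering): a DWORD-aligned, constant-size
+// copy such as
+//
+//   void copy10(int *d, int *s) { memcpy(d, s, 10); }
+//
+// becomes ECX=10/4, EDI=d, ESI=s followed by "rep movsl", plus one 2-byte
+// load/store pair for the trailing bytes at offset 8.  When the size is not a
+// compile-time constant, the count is shifted instead and a second "rep movsb"
+// copies the remainder (the TwoRepMovs path above).  Widths and registers are
+// only a sketch; they depend on alignment and the subtarget.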
+
+SDOperand
+X86TargetLowering::LowerREADCYCLCECOUNTER(SDOperand Op, SelectionDAG &DAG) {
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+  SDOperand TheOp = Op.getOperand(0);
+  SDOperand rd = DAG.getNode(X86ISD::RDTSC_DAG, Tys, &TheOp, 1);
+  if (Subtarget->is64Bit()) {
+    SDOperand Copy1 = DAG.getCopyFromReg(rd, X86::RAX, MVT::i64, rd.getValue(1));
+    SDOperand Copy2 = DAG.getCopyFromReg(Copy1.getValue(1), X86::RDX,
+                                         MVT::i64, Copy1.getValue(2));
+    SDOperand Tmp = DAG.getNode(ISD::SHL, MVT::i64, Copy2,
+                                DAG.getConstant(32, MVT::i8));
+    SDOperand Ops[] = {
+      DAG.getNode(ISD::OR, MVT::i64, Copy1, Tmp), Copy2.getValue(1)
+    };
+    
+    Tys = DAG.getVTList(MVT::i64, MVT::Other);
+    return DAG.getNode(ISD::MERGE_VALUES, Tys, Ops, 2);
+  }
+  
+  SDOperand Copy1 = DAG.getCopyFromReg(rd, X86::EAX, MVT::i32, rd.getValue(1));
+  SDOperand Copy2 = DAG.getCopyFromReg(Copy1.getValue(1), X86::EDX,
+                                       MVT::i32, Copy1.getValue(2));
+  SDOperand Ops[] = { Copy1, Copy2, Copy2.getValue(1) };
+  Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
+  return DAG.getNode(ISD::MERGE_VALUES, Tys, Ops, 3);
+}
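+// Illustrative note: rdtsc writes the timestamp into EDX:EAX.  On x86-32 the
+// two CopyFromReg values above are simply returned as an i32 pair; on x86-64
+// the code shifts RDX left by 32 and ORs it into RAX to build one i64 result,
+// roughly:
+//
+//   rdtsc
+//   shlq $32, %rdx
+//   orq  %rdx, %rax
+//
+// This is only a sketch of the expected instruction sequence.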
+
+SDOperand X86TargetLowering::LowerVASTART(SDOperand Op, SelectionDAG &DAG) {
+  SrcValueSDNode *SV = cast<SrcValueSDNode>(Op.getOperand(2));
+
+  if (!Subtarget->is64Bit()) {
+    // vastart just stores the address of the VarArgsFrameIndex slot into the
+    // memory location argument.
+    SDOperand FR = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
+    return DAG.getStore(Op.getOperand(0), FR,Op.getOperand(1), SV->getValue(),
+                        SV->getOffset());
+  }
+
+  // __va_list_tag:
+  //   gp_offset         (0 - 6 * 8)
+  //   fp_offset         (48 - 48 + 8 * 16)
+  //   overflow_arg_area (point to parameters coming in memory).
+  //   reg_save_area
+  SmallVector<SDOperand, 8> MemOps;
+  SDOperand FIN = Op.getOperand(1);
+  // Store gp_offset
+  SDOperand Store = DAG.getStore(Op.getOperand(0),
+                                 DAG.getConstant(VarArgsGPOffset, MVT::i32),
+                                 FIN, SV->getValue(), SV->getOffset());
+  MemOps.push_back(Store);
+
+  // Store fp_offset
+  FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN,
+                    DAG.getConstant(4, getPointerTy()));
+  Store = DAG.getStore(Op.getOperand(0),
+                       DAG.getConstant(VarArgsFPOffset, MVT::i32),
+                       FIN, SV->getValue(), SV->getOffset());
+  MemOps.push_back(Store);
+
+  // Store ptr to overflow_arg_area
+  FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN,
+                    DAG.getConstant(4, getPointerTy()));
+  SDOperand OVFIN = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
+  Store = DAG.getStore(Op.getOperand(0), OVFIN, FIN, SV->getValue(),
+                       SV->getOffset());
+  MemOps.push_back(Store);
+
+  // Store ptr to reg_save_area.
+  FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN,
+                    DAG.getConstant(8, getPointerTy()));
+  SDOperand RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
+  Store = DAG.getStore(Op.getOperand(0), RSFIN, FIN, SV->getValue(),
+                       SV->getOffset());
+  MemOps.push_back(Store);
+  return DAG.getNode(ISD::TokenFactor, MVT::Other, &MemOps[0], MemOps.size());
+}
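+// Illustrative note: the four stores above fill in the x86-64 va_list record,
+// which (per the SysV ABI) is laid out roughly like this C struct:
+//
+//   struct __va_list_tag {
+//     unsigned int gp_offset;    // offset 0, written first
+//     unsigned int fp_offset;    // offset 4
+//     void *overflow_arg_area;   // offset 8, points at stack parameters
+//     void *reg_save_area;       // offset 16
+//   };
+//
+// The names are only for orientation; the code keys off the field offsets
+// (0, 4, 8, 16) rather than any named type.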
+
+SDOperand X86TargetLowering::LowerVACOPY(SDOperand Op, SelectionDAG &DAG) {
+  // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
+  SDOperand Chain = Op.getOperand(0);
+  SDOperand DstPtr = Op.getOperand(1);
+  SDOperand SrcPtr = Op.getOperand(2);
+  SrcValueSDNode *DstSV = cast<SrcValueSDNode>(Op.getOperand(3));
+  SrcValueSDNode *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4));
+
+  SrcPtr = DAG.getLoad(getPointerTy(), Chain, SrcPtr,
+                       SrcSV->getValue(), SrcSV->getOffset());
+  Chain = SrcPtr.getValue(1);
+  for (unsigned i = 0; i < 3; ++i) {
+    SDOperand Val = DAG.getLoad(MVT::i64, Chain, SrcPtr,
+                                SrcSV->getValue(), SrcSV->getOffset());
+    Chain = Val.getValue(1);
+    Chain = DAG.getStore(Chain, Val, DstPtr,
+                         DstSV->getValue(), DstSV->getOffset());
+    if (i == 2)
+      break;
+    SrcPtr = DAG.getNode(ISD::ADD, getPointerTy(), SrcPtr, 
+                         DAG.getConstant(8, getPointerTy()));
+    DstPtr = DAG.getNode(ISD::ADD, getPointerTy(), DstPtr, 
+                         DAG.getConstant(8, getPointerTy()));
+  }
+  return Chain;
+}
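+// Illustrative note: since the x86-64 va_list is a 24-byte record, va_copy is
+// just a small structure copy.  The loop above performs it as three i64
+// load/store pairs, advancing both addresses by 8 bytes between iterations.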
+
+SDOperand
+X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDOperand Op, SelectionDAG &DAG) {
+  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getValue();
+  switch (IntNo) {
+  default: return SDOperand();    // Don't custom lower most intrinsics.
+    // Comparison intrinsics.
+  case Intrinsic::x86_sse_comieq_ss:
+  case Intrinsic::x86_sse_comilt_ss:
+  case Intrinsic::x86_sse_comile_ss:
+  case Intrinsic::x86_sse_comigt_ss:
+  case Intrinsic::x86_sse_comige_ss:
+  case Intrinsic::x86_sse_comineq_ss:
+  case Intrinsic::x86_sse_ucomieq_ss:
+  case Intrinsic::x86_sse_ucomilt_ss:
+  case Intrinsic::x86_sse_ucomile_ss:
+  case Intrinsic::x86_sse_ucomigt_ss:
+  case Intrinsic::x86_sse_ucomige_ss:
+  case Intrinsic::x86_sse_ucomineq_ss:
+  case Intrinsic::x86_sse2_comieq_sd:
+  case Intrinsic::x86_sse2_comilt_sd:
+  case Intrinsic::x86_sse2_comile_sd:
+  case Intrinsic::x86_sse2_comigt_sd:
+  case Intrinsic::x86_sse2_comige_sd:
+  case Intrinsic::x86_sse2_comineq_sd:
+  case Intrinsic::x86_sse2_ucomieq_sd:
+  case Intrinsic::x86_sse2_ucomilt_sd:
+  case Intrinsic::x86_sse2_ucomile_sd:
+  case Intrinsic::x86_sse2_ucomigt_sd:
+  case Intrinsic::x86_sse2_ucomige_sd:
+  case Intrinsic::x86_sse2_ucomineq_sd: {
+    unsigned Opc = 0;
+    ISD::CondCode CC = ISD::SETCC_INVALID;
+    switch (IntNo) {
+    default: break;
+    case Intrinsic::x86_sse_comieq_ss:
+    case Intrinsic::x86_sse2_comieq_sd:
+      Opc = X86ISD::COMI;
+      CC = ISD::SETEQ;
+      break;
+    case Intrinsic::x86_sse_comilt_ss:
+    case Intrinsic::x86_sse2_comilt_sd:
+      Opc = X86ISD::COMI;
+      CC = ISD::SETLT;
+      break;
+    case Intrinsic::x86_sse_comile_ss:
+    case Intrinsic::x86_sse2_comile_sd:
+      Opc = X86ISD::COMI;
+      CC = ISD::SETLE;
+      break;
+    case Intrinsic::x86_sse_comigt_ss:
+    case Intrinsic::x86_sse2_comigt_sd:
+      Opc = X86ISD::COMI;
+      CC = ISD::SETGT;
+      break;
+    case Intrinsic::x86_sse_comige_ss:
+    case Intrinsic::x86_sse2_comige_sd:
+      Opc = X86ISD::COMI;
+      CC = ISD::SETGE;
+      break;
+    case Intrinsic::x86_sse_comineq_ss:
+    case Intrinsic::x86_sse2_comineq_sd:
+      Opc = X86ISD::COMI;
+      CC = ISD::SETNE;
+      break;
+    case Intrinsic::x86_sse_ucomieq_ss:
+    case Intrinsic::x86_sse2_ucomieq_sd:
+      Opc = X86ISD::UCOMI;
+      CC = ISD::SETEQ;
+      break;
+    case Intrinsic::x86_sse_ucomilt_ss:
+    case Intrinsic::x86_sse2_ucomilt_sd:
+      Opc = X86ISD::UCOMI;
+      CC = ISD::SETLT;
+      break;
+    case Intrinsic::x86_sse_ucomile_ss:
+    case Intrinsic::x86_sse2_ucomile_sd:
+      Opc = X86ISD::UCOMI;
+      CC = ISD::SETLE;
+      break;
+    case Intrinsic::x86_sse_ucomigt_ss:
+    case Intrinsic::x86_sse2_ucomigt_sd:
+      Opc = X86ISD::UCOMI;
+      CC = ISD::SETGT;
+      break;
+    case Intrinsic::x86_sse_ucomige_ss:
+    case Intrinsic::x86_sse2_ucomige_sd:
+      Opc = X86ISD::UCOMI;
+      CC = ISD::SETGE;
+      break;
+    case Intrinsic::x86_sse_ucomineq_ss:
+    case Intrinsic::x86_sse2_ucomineq_sd:
+      Opc = X86ISD::UCOMI;
+      CC = ISD::SETNE;
+      break;
+    }
+
+    unsigned X86CC;
+    SDOperand LHS = Op.getOperand(1);
+    SDOperand RHS = Op.getOperand(2);
+    translateX86CC(CC, true, X86CC, LHS, RHS, DAG);
+
+    const MVT::ValueType *VTs = DAG.getNodeValueTypes(MVT::Other, MVT::Flag);
+    SDOperand Ops1[] = { DAG.getEntryNode(), LHS, RHS };
+    SDOperand Cond = DAG.getNode(Opc, VTs, 2, Ops1, 3);
+    VTs = DAG.getNodeValueTypes(MVT::i8, MVT::Flag);
+    SDOperand Ops2[] = { DAG.getConstant(X86CC, MVT::i8), Cond };
+    SDOperand SetCC = DAG.getNode(X86ISD::SETCC, VTs, 2, Ops2, 2);
+    return DAG.getNode(ISD::ANY_EXTEND, MVT::i32, SetCC);
+  }
+  }
+}
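+// Illustrative note: a comparison intrinsic such as
+//
+//   int lt(__m128 a, __m128 b) { return __builtin_ia32_comilt(a, b); }
+//
+// becomes a COMI node feeding an X86ISD::SETCC, which selects to roughly
+// "comiss %xmm1, %xmm0 ; setb %al ; movzbl %al, %eax".  The builtin name and
+// the particular setcc condition are shown only for orientation; the real
+// mapping is the intrinsic-to-condition-code switch above.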
+
+SDOperand X86TargetLowering::LowerRETURNADDR(SDOperand Op, SelectionDAG &DAG) {
+  // Depths > 0 not supported yet!
+  if (cast<ConstantSDNode>(Op.getOperand(0))->getValue() > 0)
+    return SDOperand();
+  
+  // Just load the return address
+  SDOperand RetAddrFI = getReturnAddressFrameIndex(DAG);
+  return DAG.getLoad(getPointerTy(), DAG.getEntryNode(), RetAddrFI, NULL, 0);
+}
+
+SDOperand X86TargetLowering::LowerFRAMEADDR(SDOperand Op, SelectionDAG &DAG) {
+  // Depths > 0 not supported yet!
+  if (cast<ConstantSDNode>(Op.getOperand(0))->getValue() > 0)
+    return SDOperand();
+    
+  SDOperand RetAddrFI = getReturnAddressFrameIndex(DAG);
+  return DAG.getNode(ISD::SUB, getPointerTy(), RetAddrFI, 
+                     DAG.getConstant(4, getPointerTy()));
+}
+
+SDOperand X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDOperand Op,
+                                                       SelectionDAG &DAG) {
+  // Not yet supported on x86-64.
+  if (Subtarget->is64Bit())
+    return SDOperand();
+  
+  return DAG.getConstant(8, getPointerTy());
+}
+
+SDOperand X86TargetLowering::LowerEH_RETURN(SDOperand Op, SelectionDAG &DAG)
+{
+  assert(!Subtarget->is64Bit() &&
+         "Lowering of eh_return builtin is not supported yet on x86-64");
+    
+  MachineFunction &MF = DAG.getMachineFunction();
+  SDOperand Chain     = Op.getOperand(0);
+  SDOperand Offset    = Op.getOperand(1);
+  SDOperand Handler   = Op.getOperand(2);
+
+  SDOperand Frame = DAG.getRegister(RegInfo->getFrameRegister(MF),
+                                    getPointerTy());
+
+  SDOperand StoreAddr = DAG.getNode(ISD::SUB, getPointerTy(), Frame,
+                                    DAG.getConstant(-4UL, getPointerTy()));
+  StoreAddr = DAG.getNode(ISD::ADD, getPointerTy(), StoreAddr, Offset);
+  Chain = DAG.getStore(Chain, Handler, StoreAddr, NULL, 0);
+  Chain = DAG.getCopyToReg(Chain, X86::ECX, StoreAddr);
+  MF.addLiveOut(X86::ECX);
+
+  return DAG.getNode(X86ISD::EH_RETURN, MVT::Other,
+                     Chain, DAG.getRegister(X86::ECX, getPointerTy()));
+}
+
+/// LowerOperation - Provide custom lowering hooks for some operations.
+///
+SDOperand X86TargetLowering::LowerOperation(SDOperand Op, SelectionDAG &DAG) {
+  switch (Op.getOpcode()) {
+  default: assert(0 && "Should not custom lower this!");
+  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
+  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
+  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
+  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
+  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
+  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
+  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
+  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
+  case ISD::SHL_PARTS:
+  case ISD::SRA_PARTS:
+  case ISD::SRL_PARTS:          return LowerShift(Op, DAG);
+  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
+  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
+  case ISD::FABS:               return LowerFABS(Op, DAG);
+  case ISD::FNEG:               return LowerFNEG(Op, DAG);
+  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
+  case ISD::SETCC:              return LowerSETCC(Op, DAG, DAG.getEntryNode());
+  case ISD::SELECT:             return LowerSELECT(Op, DAG);
+  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
+  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
+  case ISD::CALL:               return LowerCALL(Op, DAG);
+  case ISD::RET:                return LowerRET(Op, DAG);
+  case ISD::FORMAL_ARGUMENTS:   return LowerFORMAL_ARGUMENTS(Op, DAG);
+  case ISD::MEMSET:             return LowerMEMSET(Op, DAG);
+  case ISD::MEMCPY:             return LowerMEMCPY(Op, DAG);
+  case ISD::READCYCLECOUNTER:   return LowerREADCYCLCECOUNTER(Op, DAG);
+  case ISD::VASTART:            return LowerVASTART(Op, DAG);
+  case ISD::VACOPY:             return LowerVACOPY(Op, DAG);
+  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
+  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
+  case ISD::FRAME_TO_ARGS_OFFSET:
+                                return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
+  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
+  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
+  }
+  return SDOperand();
+}
+
+const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  default: return NULL;
+  case X86ISD::SHLD:               return "X86ISD::SHLD";
+  case X86ISD::SHRD:               return "X86ISD::SHRD";
+  case X86ISD::FAND:               return "X86ISD::FAND";
+  case X86ISD::FOR:                return "X86ISD::FOR";
+  case X86ISD::FXOR:               return "X86ISD::FXOR";
+  case X86ISD::FSRL:               return "X86ISD::FSRL";
+  case X86ISD::FILD:               return "X86ISD::FILD";
+  case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
+  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
+  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
+  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
+  case X86ISD::FLD:                return "X86ISD::FLD";
+  case X86ISD::FST:                return "X86ISD::FST";
+  case X86ISD::FP_GET_RESULT:      return "X86ISD::FP_GET_RESULT";
+  case X86ISD::FP_SET_RESULT:      return "X86ISD::FP_SET_RESULT";
+  case X86ISD::CALL:               return "X86ISD::CALL";
+  case X86ISD::TAILCALL:           return "X86ISD::TAILCALL";
+  case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
+  case X86ISD::CMP:                return "X86ISD::CMP";
+  case X86ISD::COMI:               return "X86ISD::COMI";
+  case X86ISD::UCOMI:              return "X86ISD::UCOMI";
+  case X86ISD::SETCC:              return "X86ISD::SETCC";
+  case X86ISD::CMOV:               return "X86ISD::CMOV";
+  case X86ISD::BRCOND:             return "X86ISD::BRCOND";
+  case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
+  case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
+  case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
+  case X86ISD::LOAD_PACK:          return "X86ISD::LOAD_PACK";
+  case X86ISD::LOAD_UA:            return "X86ISD::LOAD_UA";
+  case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
+  case X86ISD::Wrapper:            return "X86ISD::Wrapper";
+  case X86ISD::S2VEC:              return "X86ISD::S2VEC";
+  case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
+  case X86ISD::PINSRW:             return "X86ISD::PINSRW";
+  case X86ISD::FMAX:               return "X86ISD::FMAX";
+  case X86ISD::FMIN:               return "X86ISD::FMIN";
+  case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
+  case X86ISD::FRCP:               return "X86ISD::FRCP";
+  case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
+  case X86ISD::THREAD_POINTER:     return "X86ISD::THREAD_POINTER";
+  case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
+  }
+}
+
+// isLegalAddressingMode - Return true if the addressing mode represented
+// by AM is legal for this target, for a load/store of the specified type.
+bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 
+                                              const Type *Ty) const {
+  // X86 supports extremely general addressing modes.
+  
+  // X86 allows a sign-extended 32-bit immediate field as a displacement.
+  if (AM.BaseOffs < -(1LL << 31) || AM.BaseOffs > (1LL << 31)-1)
+    return false;
+  
+  if (AM.BaseGV) {
+    // X86-64 only supports addr of globals in small code model.
+    if (Subtarget->is64Bit() &&
+        getTargetMachine().getCodeModel() != CodeModel::Small)
+      return false;
+    
+    // We can only fold this if we don't need a load either.
+    if (Subtarget->GVRequiresExtraLoad(AM.BaseGV, getTargetMachine(), false))
+      return false;
+  }
+  
+  switch (AM.Scale) {
+  case 0:
+  case 1:
+  case 2:
+  case 4:
+  case 8:
+    // These scales always work.
+    break;
+  case 3:
+  case 5:
+  case 9:
+    // These scales are formed with basereg+scalereg.  Only accept if there is
+    // no basereg yet.
+    if (AM.HasBaseReg)
+      return false;
+    break;
+  default:  // Other stuff never works.
+    return false;
+  }
+  
+  return true;
+}
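+// Illustrative note: the accepted modes correspond to the general x86 form
+// disp(base, index, scale) with scale in {1,2,4,8}.  Scales of 3, 5 and 9 are
+// legal only because x*{3,5,9} can be rewritten as x + x*{2,4,8}, which uses
+// up the base-register slot -- hence the HasBaseReg check above.  For example,
+//
+//   extern char A[];
+//   char g(int i) { return A[5*i]; }
+//
+// can fold the whole address into one operand, something like
+// "movb A(%eax,%eax,4), %al".  Register choices here are illustrative only.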
+
+
+/// isShuffleMaskLegal - Targets can use this to indicate that they only
+/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
+/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
+/// are assumed to be legal.
+bool
+X86TargetLowering::isShuffleMaskLegal(SDOperand Mask, MVT::ValueType VT) const {
+  // Only do shuffles on 128-bit vector types for now.
+  if (MVT::getSizeInBits(VT) == 64) return false;
+  return (Mask.Val->getNumOperands() <= 4 ||
+          isIdentityMask(Mask.Val) ||
+          isIdentityMask(Mask.Val, true) ||
+          isSplatMask(Mask.Val)  ||
+          isPSHUFHW_PSHUFLWMask(Mask.Val) ||
+          X86::isUNPCKLMask(Mask.Val) ||
+          X86::isUNPCKHMask(Mask.Val) ||
+          X86::isUNPCKL_v_undef_Mask(Mask.Val) ||
+          X86::isUNPCKH_v_undef_Mask(Mask.Val));
+}
+
+bool X86TargetLowering::isVectorClearMaskLegal(std::vector<SDOperand> &BVOps,
+                                               MVT::ValueType EVT,
+                                               SelectionDAG &DAG) const {
+  unsigned NumElts = BVOps.size();
+  // Only do shuffles on 128-bit vector types for now.
+  if (MVT::getSizeInBits(EVT) * NumElts == 64) return false;
+  if (NumElts == 2) return true;
+  if (NumElts == 4) {
+    return (isMOVLMask(&BVOps[0], 4)  ||
+            isCommutedMOVL(&BVOps[0], 4, true) ||
+            isSHUFPMask(&BVOps[0], 4) || 
+            isCommutedSHUFP(&BVOps[0], 4));
+  }
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+//                           X86 Scheduler Hooks
+//===----------------------------------------------------------------------===//
+
+MachineBasicBlock *
+X86TargetLowering::InsertAtEndOfBasicBlock(MachineInstr *MI,
+                                           MachineBasicBlock *BB) {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  switch (MI->getOpcode()) {
+  default: assert(false && "Unexpected instr type to insert");
+  case X86::CMOV_FR32:
+  case X86::CMOV_FR64:
+  case X86::CMOV_V4F32:
+  case X86::CMOV_V2F64:
+  case X86::CMOV_V2I64: {
+    // To "insert" a SELECT_CC instruction, we actually have to insert the
+    // diamond control-flow pattern.  The incoming instruction knows the
+    // destination vreg to set, the condition code register to branch on, the
+    // true/false values to select between, and a branch opcode to use.
+    const BasicBlock *LLVM_BB = BB->getBasicBlock();
+    ilist<MachineBasicBlock>::iterator It = BB;
+    ++It;
+
+    //  thisMBB:
+    //  ...
+    //   TrueVal = ...
+    //   cmpTY ccX, r1, r2
+    //   bCC copy1MBB
+    //   fallthrough --> copy0MBB
+    MachineBasicBlock *thisMBB = BB;
+    MachineBasicBlock *copy0MBB = new MachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *sinkMBB = new MachineBasicBlock(LLVM_BB);
+    unsigned Opc =
+      X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
+    BuildMI(BB, TII->get(Opc)).addMBB(sinkMBB);
+    MachineFunction *F = BB->getParent();
+    F->getBasicBlockList().insert(It, copy0MBB);
+    F->getBasicBlockList().insert(It, sinkMBB);
+    // Update machine-CFG edges by first adding all successors of the current
+    // block to the new block which will contain the Phi node for the select.
+    for(MachineBasicBlock::succ_iterator i = BB->succ_begin(),
+        e = BB->succ_end(); i != e; ++i)
+      sinkMBB->addSuccessor(*i);
+    // Next, remove all successors of the current block, and add the true
+    // and fallthrough blocks as its successors.
+    while(!BB->succ_empty())
+      BB->removeSuccessor(BB->succ_begin());
+    BB->addSuccessor(copy0MBB);
+    BB->addSuccessor(sinkMBB);
+
+    //  copy0MBB:
+    //   %FalseValue = ...
+    //   # fallthrough to sinkMBB
+    BB = copy0MBB;
+
+    // Update machine-CFG edges
+    BB->addSuccessor(sinkMBB);
+
+    //  sinkMBB:
+    //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+    //  ...
+    BB = sinkMBB;
+    BuildMI(BB, TII->get(X86::PHI), MI->getOperand(0).getReg())
+      .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
+      .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+
+    delete MI;   // The pseudo instruction is gone now.
+    return BB;
+  }
+
+  case X86::FP32_TO_INT16_IN_MEM:
+  case X86::FP32_TO_INT32_IN_MEM:
+  case X86::FP32_TO_INT64_IN_MEM:
+  case X86::FP64_TO_INT16_IN_MEM:
+  case X86::FP64_TO_INT32_IN_MEM:
+  case X86::FP64_TO_INT64_IN_MEM: {
+    // Change the floating point control register to use "round towards zero"
+    // mode when truncating to an integer value.
+    MachineFunction *F = BB->getParent();
+    int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2);
+    addFrameReference(BuildMI(BB, TII->get(X86::FNSTCW16m)), CWFrameIdx);
+
+    // Load the old value of the control word so it can be restored later...
+    unsigned OldCW =
+      F->getSSARegMap()->createVirtualRegister(X86::GR16RegisterClass);
+    addFrameReference(BuildMI(BB, TII->get(X86::MOV16rm), OldCW), CWFrameIdx);
+
+    // Set the rounding control bits to round toward zero (truncate)...
+    addFrameReference(BuildMI(BB, TII->get(X86::MOV16mi)), CWFrameIdx)
+      .addImm(0xC7F);
+
+    // Reload the modified control word now...
+    addFrameReference(BuildMI(BB, TII->get(X86::FLDCW16m)), CWFrameIdx);
+
+    // Restore the memory image of control word to original value
+    addFrameReference(BuildMI(BB, TII->get(X86::MOV16mr)), CWFrameIdx)
+      .addReg(OldCW);
+
+    // Get the X86 opcode to use.
+    unsigned Opc;
+    switch (MI->getOpcode()) {
+    default: assert(0 && "illegal opcode!");
+    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
+    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
+    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
+    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
+    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
+    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
+    }
+
+    X86AddressMode AM;
+    MachineOperand &Op = MI->getOperand(0);
+    if (Op.isRegister()) {
+      AM.BaseType = X86AddressMode::RegBase;
+      AM.Base.Reg = Op.getReg();
+    } else {
+      AM.BaseType = X86AddressMode::FrameIndexBase;
+      AM.Base.FrameIndex = Op.getFrameIndex();
+    }
+    Op = MI->getOperand(1);
+    if (Op.isImmediate())
+      AM.Scale = Op.getImm();
+    Op = MI->getOperand(2);
+    if (Op.isImmediate())
+      AM.IndexReg = Op.getImm();
+    Op = MI->getOperand(3);
+    if (Op.isGlobalAddress()) {
+      AM.GV = Op.getGlobal();
+    } else {
+      AM.Disp = Op.getImm();
+    }
+    addFullAddress(BuildMI(BB, TII->get(Opc)), AM)
+                      .addReg(MI->getOperand(4).getReg());
+
+    // Reload the original control word now.
+    addFrameReference(BuildMI(BB, TII->get(X86::FLDCW16m)), CWFrameIdx);
+
+    delete MI;   // The pseudo instruction is gone now.
+    return BB;
+  }
+  }
+}
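+// Illustrative note: the FP*_TO_INT*_IN_MEM expansion above produces a
+// sequence along the lines of
+//
+//   fnstcw  (slot)          ; spill the current FP control word
+//   movw    (slot), %ax     ; remember it in a register
+//   movw    $0xC7F, (slot)  ; rounding control = round toward zero
+//   fldcw   (slot)
+//   movw    %ax, (slot)     ; put the original image back in the slot
+//   fistpl  dest            ; truncating integer store
+//   fldcw   (slot)          ; reload the original control word
+//
+// The spill slot, registers and the exact fistp flavour depend on the frame
+// layout and on the source/destination widths; this is only a sketch.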
+
+//===----------------------------------------------------------------------===//
+//                           X86 Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+void X86TargetLowering::computeMaskedBitsForTargetNode(const SDOperand Op,
+                                                       uint64_t Mask,
+                                                       uint64_t &KnownZero,
+                                                       uint64_t &KnownOne,
+                                                       const SelectionDAG &DAG,
+                                                       unsigned Depth) const {
+  unsigned Opc = Op.getOpcode();
+  assert((Opc >= ISD::BUILTIN_OP_END ||
+          Opc == ISD::INTRINSIC_WO_CHAIN ||
+          Opc == ISD::INTRINSIC_W_CHAIN ||
+          Opc == ISD::INTRINSIC_VOID) &&
+         "Should use MaskedValueIsZero if you don't know whether Op"
+         " is a target node!");
+
+  KnownZero = KnownOne = 0;   // Don't know anything.
+  switch (Opc) {
+  default: break;
+  case X86ISD::SETCC:
+    KnownZero |= (MVT::getIntVTBitMask(Op.getValueType()) ^ 1ULL);
+    break;
+  }
+}
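+// Illustrative note: X86ISD::SETCC materializes its result as an i8 that is
+// always 0 or 1, so every bit except bit 0 is known zero.  The mask computed
+// above is getIntVTBitMask(i8) ^ 1, i.e. 0xFE, which lets zero-extensions of
+// the flag value fold away.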
+
+/// getShuffleScalarElt - Returns the scalar element that will make up the ith
+/// element of the result of the vector shuffle.
+static SDOperand getShuffleScalarElt(SDNode *N, unsigned i, SelectionDAG &DAG) {
+  MVT::ValueType VT = N->getValueType(0);
+  SDOperand PermMask = N->getOperand(2);
+  unsigned NumElems = PermMask.getNumOperands();
+  SDOperand V = (i < NumElems) ? N->getOperand(0) : N->getOperand(1);
+  i %= NumElems;
+  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+    return (i == 0)
+      ? V.getOperand(0) : DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(VT));
+  } else if (V.getOpcode() == ISD::VECTOR_SHUFFLE) {
+    SDOperand Idx = PermMask.getOperand(i);
+    if (Idx.getOpcode() == ISD::UNDEF)
+      return DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(VT));
+    return getShuffleScalarElt(V.Val,cast<ConstantSDNode>(Idx)->getValue(),DAG);
+  }
+  return SDOperand();
+}
+
+/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
+/// node is a GlobalAddress + an offset.
+static bool isGAPlusOffset(SDNode *N, GlobalValue* &GA, int64_t &Offset) {
+  unsigned Opc = N->getOpcode();
+  if (Opc == X86ISD::Wrapper) {
+    if (dyn_cast<GlobalAddressSDNode>(N->getOperand(0))) {
+      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
+      return true;
+    }
+  } else if (Opc == ISD::ADD) {
+    SDOperand N1 = N->getOperand(0);
+    SDOperand N2 = N->getOperand(1);
+    if (isGAPlusOffset(N1.Val, GA, Offset)) {
+      ConstantSDNode *V = dyn_cast<ConstantSDNode>(N2);
+      if (V) {
+        Offset += V->getSignExtended();
+        return true;
+      }
+    } else if (isGAPlusOffset(N2.Val, GA, Offset)) {
+      ConstantSDNode *V = dyn_cast<ConstantSDNode>(N1);
+      if (V) {
+        Offset += V->getSignExtended();
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+/// isConsecutiveLoad - Returns true if N is loading from an address of Base
+/// + Dist * Size.
+static bool isConsecutiveLoad(SDNode *N, SDNode *Base, int Dist, int Size,
+                              MachineFrameInfo *MFI) {
+  if (N->getOperand(0).Val != Base->getOperand(0).Val)
+    return false;
+
+  SDOperand Loc = N->getOperand(1);
+  SDOperand BaseLoc = Base->getOperand(1);
+  if (Loc.getOpcode() == ISD::FrameIndex) {
+    if (BaseLoc.getOpcode() != ISD::FrameIndex)
+      return false;
+    int FI  = dyn_cast<FrameIndexSDNode>(Loc)->getIndex();
+    int BFI = dyn_cast<FrameIndexSDNode>(BaseLoc)->getIndex();
+    int FS  = MFI->getObjectSize(FI);
+    int BFS = MFI->getObjectSize(BFI);
+    if (FS != BFS || FS != Size) return false;
+    return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Size);
+  } else {
+    GlobalValue *GV1 = NULL;
+    GlobalValue *GV2 = NULL;
+    int64_t Offset1 = 0;
+    int64_t Offset2 = 0;
+    bool isGA1 = isGAPlusOffset(Loc.Val, GV1, Offset1);
+    bool isGA2 = isGAPlusOffset(BaseLoc.Val, GV2, Offset2);
+    if (isGA1 && isGA2 && GV1 == GV2)
+      return Offset1 == (Offset2 + Dist*Size);
+  }
+
+  return false;
+}
+
+static bool isBaseAlignment16(SDNode *Base, MachineFrameInfo *MFI,
+                              const X86Subtarget *Subtarget) {
+  GlobalValue *GV;
+  int64_t Offset;
+  if (isGAPlusOffset(Base, GV, Offset))
+    return (GV->getAlignment() >= 16 && (Offset % 16) == 0);
+  else {
+    assert(Base->getOpcode() == ISD::FrameIndex && "Unexpected base node!");
+    int BFI = dyn_cast<FrameIndexSDNode>(Base)->getIndex();
+    if (BFI < 0)
+      // Fixed objects do not specify alignment; however, their offsets are known.
+      return ((Subtarget->getStackAlignment() % 16) == 0 &&
+              (MFI->getObjectOffset(BFI) % 16) == 0);
+    else
+      return MFI->getObjectAlignment(BFI) >= 16;
+  }
+  return false;
+}
+
+
+/// PerformShuffleCombine - Combine a vector_shuffle that is equal to
+/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
+/// if the load addresses are consecutive, non-overlapping, and in the right
+/// order.
+static SDOperand PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
+                                       const X86Subtarget *Subtarget) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MVT::ValueType VT = N->getValueType(0);
+  MVT::ValueType EVT = MVT::getVectorElementType(VT);
+  SDOperand PermMask = N->getOperand(2);
+  int NumElems = (int)PermMask.getNumOperands();
+  SDNode *Base = NULL;
+  for (int i = 0; i < NumElems; ++i) {
+    SDOperand Idx = PermMask.getOperand(i);
+    if (Idx.getOpcode() == ISD::UNDEF) {
+      if (!Base) return SDOperand();
+    } else {
+      SDOperand Arg =
+        getShuffleScalarElt(N, cast<ConstantSDNode>(Idx)->getValue(), DAG);
+      if (!Arg.Val || !ISD::isNON_EXTLoad(Arg.Val))
+        return SDOperand();
+      if (!Base)
+        Base = Arg.Val;
+      else if (!isConsecutiveLoad(Arg.Val, Base,
+                                  i, MVT::getSizeInBits(EVT)/8,MFI))
+        return SDOperand();
+    }
+  }
+
+  bool isAlign16 = isBaseAlignment16(Base->getOperand(1).Val, MFI, Subtarget);
+  if (isAlign16) {
+    LoadSDNode *LD = cast<LoadSDNode>(Base);
+    return DAG.getLoad(VT, LD->getChain(), LD->getBasePtr(), LD->getSrcValue(),
+                       LD->getSrcValueOffset());
+  } else {
+    // Just use movups, it's shorter.
+    SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
+    SmallVector<SDOperand, 3> Ops;
+    Ops.push_back(Base->getOperand(0));
+    Ops.push_back(Base->getOperand(1));
+    Ops.push_back(Base->getOperand(2));
+    return DAG.getNode(ISD::BIT_CONVERT, VT,
+                       DAG.getNode(X86ISD::LOAD_UA, Tys, &Ops[0], Ops.size()));
+  }
+}
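+// Illustrative note: the combine above targets IR of the shape
+//
+//   %a = load float* %p        ; four consecutive, non-overlapping loads
+//   %b = load float* %q        ; %q == %p + 4, and so on
+//   ...
+//   ; build_vector / shufflevector with the identity mask <0, 1, 2, 3>
+//
+// and replaces it with a single 16-byte load: a normal vector load (movaps)
+// when the base is known to be 16-byte aligned, X86ISD::LOAD_UA (movups)
+// otherwise.  The IR above is schematic, not exact syntax for this revision.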
+
+/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
+static SDOperand PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget *Subtarget) {
+  SDOperand Cond = N->getOperand(0);
+
+  // If we have SSE[12] support, try to form min/max nodes.
+  if (Subtarget->hasSSE2() &&
+      (N->getValueType(0) == MVT::f32 || N->getValueType(0) == MVT::f64)) {
+    if (Cond.getOpcode() == ISD::SETCC) {
+      // Get the LHS/RHS of the select.
+      SDOperand LHS = N->getOperand(1);
+      SDOperand RHS = N->getOperand(2);
+      ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+      unsigned Opcode = 0;
+      if (LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) {
+        switch (CC) {
+        default: break;
+        case ISD::SETOLE: // (X <= Y) ? X : Y -> min
+        case ISD::SETULE:
+        case ISD::SETLE:
+          if (!UnsafeFPMath) break;
+          // FALL THROUGH.
+        case ISD::SETOLT:  // (X olt/lt Y) ? X : Y -> min
+        case ISD::SETLT:
+          Opcode = X86ISD::FMIN;
+          break;
+
+        case ISD::SETOGT: // (X > Y) ? X : Y -> max
+        case ISD::SETUGT:
+        case ISD::SETGT:
+          if (!UnsafeFPMath) break;
+          // FALL THROUGH.
+        case ISD::SETUGE:  // (X uge/ge Y) ? X : Y -> max
+        case ISD::SETGE:
+          Opcode = X86ISD::FMAX;
+          break;
+        }
+      } else if (LHS == Cond.getOperand(1) && RHS == Cond.getOperand(0)) {
+        switch (CC) {
+        default: break;
+        case ISD::SETOGT: // (X > Y) ? Y : X -> min
+        case ISD::SETUGT:
+        case ISD::SETGT:
+          if (!UnsafeFPMath) break;
+          // FALL THROUGH.
+        case ISD::SETUGE:  // (X uge/ge Y) ? Y : X -> min
+        case ISD::SETGE:
+          Opcode = X86ISD::FMIN;
+          break;
+
+        case ISD::SETOLE:   // (X <= Y) ? Y : X -> max
+        case ISD::SETULE:
+        case ISD::SETLE:
+          if (!UnsafeFPMath) break;
+          // FALL THROUGH.
+        case ISD::SETOLT:   // (X olt/lt Y) ? Y : X -> max
+        case ISD::SETLT:
+          Opcode = X86ISD::FMAX;
+          break;
+        }
+      }
+
+      if (Opcode)
+        return DAG.getNode(Opcode, N->getValueType(0), LHS, RHS);
+    }
+
+  }
+
+  return SDOperand();
+}
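+// Illustrative note: with SSE2 available, a scalar FP select such as
+//
+//   double dmin(double x, double y) { return x < y ? x : y; }
+//
+// matches the first LHS/RHS pattern above and becomes a single "minsd" rather
+// than a compare-and-branch.  Several of the other orderings are only accepted
+// under unsafe-fp-math because min/max do not preserve the exact NaN and
+// signed-zero behaviour of the original select.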
+
+
+SDOperand X86TargetLowering::PerformDAGCombine(SDNode *N,
+                                               DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  switch (N->getOpcode()) {
+  default: break;
+  case ISD::VECTOR_SHUFFLE:
+    return PerformShuffleCombine(N, DAG, Subtarget);
+  case ISD::SELECT:
+    return PerformSELECTCombine(N, DAG, Subtarget);
+  }
+
+  return SDOperand();
+}
+
+//===----------------------------------------------------------------------===//
+//                           X86 Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+X86TargetLowering::ConstraintType
+X86TargetLowering::getConstraintType(const std::string &Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    case 'A':
+    case 'r':
+    case 'R':
+    case 'l':
+    case 'q':
+    case 'Q':
+    case 'x':
+    case 'Y':
+      return C_RegisterClass;
+    default:
+      break;
+    }
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+/// isOperandValidForConstraint - Return the specified operand (possibly
+/// modified) if the specified SDOperand is valid for the specified target
+/// constraint letter, otherwise return null.
+SDOperand X86TargetLowering::
+isOperandValidForConstraint(SDOperand Op, char Constraint, SelectionDAG &DAG) {
+  switch (Constraint) {
+  default: break;
+  case 'I':
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (C->getValue() <= 31)
+        return DAG.getTargetConstant(C->getValue(), Op.getValueType());
+    }
+    return SDOperand(0,0);
+  case 'N':
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+      if (C->getValue() <= 255)
+        return DAG.getTargetConstant(C->getValue(), Op.getValueType());
+    }
+    return SDOperand(0,0);
+  case 'i': {
+    // Literal immediates are always ok.
+    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op))
+      return DAG.getTargetConstant(CST->getValue(), Op.getValueType());
+
+    // If we are in non-pic codegen mode, we allow the address of a global (with
+    // an optional displacement) to be used with 'i'.
+    GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op);
+    int64_t Offset = 0;
+    
+    // Match either (GA) or (GA+C)
+    if (GA) {
+      Offset = GA->getOffset();
+    } else if (Op.getOpcode() == ISD::ADD) {
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+      GA = dyn_cast<GlobalAddressSDNode>(Op.getOperand(0));
+      if (C && GA) {
+        Offset = GA->getOffset()+C->getValue();
+      } else {
+        C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+        GA = dyn_cast<GlobalAddressSDNode>(Op.getOperand(0));
+        if (C && GA)
+          Offset = GA->getOffset()+C->getValue();
+        else
+          C = 0, GA = 0;
+      }
+    }
+    
+    if (GA) {
+      // If addressing this global requires a load (e.g. in PIC mode), we can't
+      // match.
+      if (Subtarget->GVRequiresExtraLoad(GA->getGlobal(), getTargetMachine(),
+                                         false))
+        return SDOperand(0, 0);
+
+      Op = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0),
+                                      Offset);
+      return Op;
+    }
+
+    // Otherwise, not valid for this mode.
+    return SDOperand(0, 0);
+  }
+  }
+  return TargetLowering::isOperandValidForConstraint(Op, Constraint, DAG);
+}
+
+std::vector<unsigned> X86TargetLowering::
+getRegClassForInlineAsmConstraint(const std::string &Constraint,
+                                  MVT::ValueType VT) const {
+  if (Constraint.size() == 1) {
+    // FIXME: not handling fp-stack yet!
+    switch (Constraint[0]) {      // GCC X86 Constraint Letters
+    default: break;  // Unknown constraint letter
+    case 'A':   // EAX/EDX
+      if (VT == MVT::i32 || VT == MVT::i64)
+        return make_vector<unsigned>(X86::EAX, X86::EDX, 0);
+      break;
+    case 'q':   // Q_REGS (GENERAL_REGS in 64-bit mode)
+    case 'Q':   // Q_REGS
+      if (VT == MVT::i32)
+        return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0);
+      else if (VT == MVT::i16)
+        return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0);
+      else if (VT == MVT::i8)
+        return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0);
+      break;
+    }
+  }
+
+  return std::vector<unsigned>();
+}
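+// Illustrative note: these letters follow GCC's x86 constraint names, so an
+// asm statement such as the classic port write
+//
+//   __asm__ __volatile__("outb %b0, %w1" : : "q"(value), "Nd"(port));
+//
+// asks for 'value' in one of the byte-addressable Q_REGS chosen above, and
+// lets 'port' be either an 8-bit constant (the 'N' case handled earlier in
+// isOperandValidForConstraint) or the DX register.  The example assumes
+// 'value' and 'port' are unsigned char / unsigned short variables.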
+
+std::pair<unsigned, const TargetRegisterClass*>
+X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+                                                MVT::ValueType VT) const {
+  // First, see if this is a constraint that directly corresponds to an LLVM
+  // register class.
+  if (Constraint.size() == 1) {
+    // GCC Constraint Letters
+    switch (Constraint[0]) {
+    default: break;
+    case 'r':   // GENERAL_REGS
+    case 'R':   // LEGACY_REGS
+    case 'l':   // INDEX_REGS
+      if (VT == MVT::i64 && Subtarget->is64Bit())
+        return std::make_pair(0U, X86::GR64RegisterClass);
+      if (VT == MVT::i32)
+        return std::make_pair(0U, X86::GR32RegisterClass);
+      else if (VT == MVT::i16)
+        return std::make_pair(0U, X86::GR16RegisterClass);
+      else if (VT == MVT::i8)
+        return std::make_pair(0U, X86::GR8RegisterClass);
+      break;
+    case 'y':   // MMX_REGS if MMX allowed.
+      if (!Subtarget->hasMMX()) break;
+      return std::make_pair(0U, X86::VR64RegisterClass);
+    case 'Y':   // SSE_REGS if SSE2 allowed
+      if (!Subtarget->hasSSE2()) break;
+      // FALL THROUGH.
+    case 'x':   // SSE_REGS if SSE1 allowed
+      if (!Subtarget->hasSSE1()) break;
+      
+      switch (VT) {
+      default: break;
+      // Scalar SSE types.
+      case MVT::f32:
+      case MVT::i32:
+        return std::make_pair(0U, X86::FR32RegisterClass);
+      case MVT::f64:
+      case MVT::i64:
+        return std::make_pair(0U, X86::FR64RegisterClass);
+      // Vector types.
+      case MVT::v16i8:
+      case MVT::v8i16:
+      case MVT::v4i32:
+      case MVT::v2i64:
+      case MVT::v4f32:
+      case MVT::v2f64:
+        return std::make_pair(0U, X86::VR128RegisterClass);
+      }
+      break;
+    }
+  }
+  
+  // Use the default implementation in TargetLowering to convert the register
+  // constraint into a member of a register class.
+  std::pair<unsigned, const TargetRegisterClass*> Res;
+  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+
+  // Not found as a standard register?
+  if (Res.second == 0) {
+    // GCC calls "st(0)" just plain "st".
+    if (StringsEqualNoCase("{st}", Constraint)) {
+      Res.first = X86::ST0;
+      Res.second = X86::RSTRegisterClass;
+    }
+
+    return Res;
+  }
+
+  // Otherwise, check to see if this is a register class of the wrong value
+  // type.  For example, we want to map "{ax},i32" -> {eax}; we don't want it
+  // to turn into {ax},{dx}.
+  if (Res.second->hasType(VT))
+    return Res;   // Correct type already, nothing to do.
+
+  // All of the single-register GCC register classes map their values onto
+  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
+  // really want an 8-bit or 32-bit register, map to the appropriate register
+  // class and return the appropriate register.
+  if (Res.second != X86::GR16RegisterClass)
+    return Res;
+
+  if (VT == MVT::i8) {
+    unsigned DestReg = 0;
+    switch (Res.first) {
+    default: break;
+    case X86::AX: DestReg = X86::AL; break;
+    case X86::DX: DestReg = X86::DL; break;
+    case X86::CX: DestReg = X86::CL; break;
+    case X86::BX: DestReg = X86::BL; break;
+    }
+    if (DestReg) {
+      Res.first = DestReg;
+      Res.second = X86::GR8RegisterClass;
+    }
+  } else if (VT == MVT::i32) {
+    unsigned DestReg = 0;
+    switch (Res.first) {
+    default: break;
+    case X86::AX: DestReg = X86::EAX; break;
+    case X86::DX: DestReg = X86::EDX; break;
+    case X86::CX: DestReg = X86::ECX; break;
+    case X86::BX: DestReg = X86::EBX; break;
+    case X86::SI: DestReg = X86::ESI; break;
+    case X86::DI: DestReg = X86::EDI; break;
+    case X86::BP: DestReg = X86::EBP; break;
+    case X86::SP: DestReg = X86::ESP; break;
+    }
+    if (DestReg) {
+      Res.first = DestReg;
+      Res.second = X86::GR32RegisterClass;
+    }
+  } else if (VT == MVT::i64) {
+    unsigned DestReg = 0;
+    switch (Res.first) {
+    default: break;
+    case X86::AX: DestReg = X86::RAX; break;
+    case X86::DX: DestReg = X86::RDX; break;
+    case X86::CX: DestReg = X86::RCX; break;
+    case X86::BX: DestReg = X86::RBX; break;
+    case X86::SI: DestReg = X86::RSI; break;
+    case X86::DI: DestReg = X86::RDI; break;
+    case X86::BP: DestReg = X86::RBP; break;
+    case X86::SP: DestReg = X86::RSP; break;
+    }
+    if (DestReg) {
+      Res.first = DestReg;
+      Res.second = X86::GR64RegisterClass;
+    }
+  }
+
+  return Res;
+}
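+// Illustrative note: the remapping above exists because explicit register
+// constraints are often written with their 16-bit names.  An i32 inline-asm
+// operand constrained to "{ax}" should really live in EAX (and an i64 one in
+// RAX), not in an AX/DX pair, so the switch tables simply translate
+// AX/DX/CX/BX/SI/DI/BP/SP to the register of the requested width.  The "{st}"
+// special case covers GCC's spelling of ST(0).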
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
new file mode 100644
index 0000000..07a96d3
--- /dev/null
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -0,0 +1,437 @@
+//===-- X86ISelLowering.h - X86 DAG Lowering Interface ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that X86 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86ISELLOWERING_H
+#define X86ISELLOWERING_H
+
+#include "X86Subtarget.h"
+#include "X86RegisterInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+
+namespace llvm {
+  namespace X86ISD {
+    // X86 Specific DAG Nodes
+    enum NodeType {
+      // Start the numbering where the builtin ops leave off.
+      FIRST_NUMBER = ISD::BUILTIN_OP_END+X86::INSTRUCTION_LIST_END,
+
+      /// SHLD, SHRD - Double shift instructions. These correspond to
+      /// X86::SHLDxx and X86::SHRDxx instructions.
+      SHLD,
+      SHRD,
+
+      /// FAND - Bitwise logical AND of floating point values. This corresponds
+      /// to X86::ANDPS or X86::ANDPD.
+      FAND,
+
+      /// FOR - Bitwise logical OR of floating point values. This corresponds
+      /// to X86::ORPS or X86::ORPD.
+      FOR,
+
+      /// FXOR - Bitwise logical XOR of floating point values. This corresponds
+      /// to X86::XORPS or X86::XORPD.
+      FXOR,
+
+      /// FSRL - Bitwise logical right shift of floating point values. This
+      /// corresponds to X86::PSRLDQ.
+      FSRL,
+
+      /// FILD, FILD_FLAG - This instruction implements SINT_TO_FP with the
+      /// integer source in memory and FP reg result.  This corresponds to the
+      /// X86::FILD*m instructions. It has three inputs (token chain, address,
+      /// and source type) and two outputs (FP value and token chain). FILD_FLAG
+      /// also produces a flag.
+      FILD,
+      FILD_FLAG,
+
+      /// FP_TO_INT*_IN_MEM - This instruction implements FP_TO_SINT with the
+      /// integer destination in memory and a FP reg source.  This corresponds
+      /// to the X86::FIST*m instructions and the rounding mode change stuff. It
+      /// has two inputs (token chain and address) and two outputs (int value
+      /// and token chain).
+      FP_TO_INT16_IN_MEM,
+      FP_TO_INT32_IN_MEM,
+      FP_TO_INT64_IN_MEM,
+
+      /// FLD - This instruction implements an extending load to FP stack slots.
+      /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
+      /// operand, ptr to load from, and a ValueType node indicating the type
+      /// to load to.
+      FLD,
+
+      /// FST - This instruction implements a truncating store to FP stack
+      /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
+      /// chain operand, value to store, address, and a ValueType to store it
+      /// as.
+      FST,
+
+      /// FP_GET_RESULT - This corresponds to FpGETRESULT pseudo instruction
+      /// which copies from ST(0) to the destination. It takes a chain and
+      /// writes a RFP result and a chain.
+      FP_GET_RESULT,
+
+      /// FP_SET_RESULT - This corresponds to FpSETRESULT pseudo instruction
+      /// which copies the source operand to ST(0). It takes a chain+value and
+      /// returns a chain and a flag.
+      FP_SET_RESULT,
+
+      /// CALL/TAILCALL - These operations represent an abstract X86 call
+      /// instruction, which includes a bunch of information.  In particular the
+      /// operands of these node are:
+      ///
+      ///     #0 - The incoming token chain
+      ///     #1 - The callee
+      ///     #2 - The number of arg bytes the caller pushes on the stack.
+      ///     #3 - The number of arg bytes the callee pops off the stack.
+      ///     #4 - The value to pass in AL/AX/EAX (optional)
+      ///     #5 - The value to pass in DL/DX/EDX (optional)
+      ///
+      /// The result values of these nodes are:
+      ///
+      ///     #0 - The outgoing token chain
+      ///     #1 - The first register result value (optional)
+      ///     #2 - The second register result value (optional)
+      ///
+      /// The CALL vs TAILCALL distinction boils down to whether the callee is
+      /// known not to modify the caller's stack frame, as is standard with
+      /// LLVM.
+      CALL,
+      TAILCALL,
+      
+      /// RDTSC_DAG - This operation implements the lowering for 
+      /// readcyclecounter
+      RDTSC_DAG,
+
+      /// X86 compare and logical compare instructions.
+      CMP, TEST, COMI, UCOMI,
+
+      /// X86 SetCC. Operand 1 is condition code, and operand 2 is the flag
+      /// operand produced by a CMP instruction.
+      SETCC,
+
+      /// X86 conditional moves. Operand 1 and operand 2 are the two values
+      /// to select from (operand 1 is a R/W operand). Operand 3 is the
+      /// condition code, and operand 4 is the flag operand produced by a CMP
+      /// or TEST instruction. It also writes a flag result.
+      CMOV,
+
+      /// X86 conditional branches. Operand 1 is the chain operand, operand 2
+      /// is the block to branch if condition is true, operand 3 is the
+      /// condition code, and operand 4 is the flag operand produced by a CMP
+      /// or TEST instruction.
+      BRCOND,
+
+      /// Return with a flag operand. Operand 1 is the chain operand, operand
+      /// 2 is the number of bytes of stack to pop.
+      RET_FLAG,
+
+      /// REP_STOS - Repeat fill, corresponds to X86::REP_STOSx.
+      REP_STOS,
+
+      /// REP_MOVS - Repeat move, corresponds to X86::REP_MOVSx.
+      REP_MOVS,
+
+      /// LOAD_PACK - Load a 128-bit packed float / double value. It has the same
+      /// operands as a normal load.
+      LOAD_PACK,
+
+      /// LOAD_UA - Load an unaligned 128-bit value. It has the same operands as
+      /// a normal load.
+      LOAD_UA,
+
+      /// GlobalBaseReg - On Darwin, this node represents the result of the popl
+      /// at function entry, used for PIC code.
+      GlobalBaseReg,
+
+      /// Wrapper - A wrapper node for TargetConstantPool,
+      /// TargetExternalSymbol, and TargetGlobalAddress.
+      Wrapper,
+
+      /// WrapperRIP - Special wrapper used under X86-64 PIC mode for RIP
+      /// relative displacements.
+      WrapperRIP,
+
+      /// S2VEC - X86 version of SCALAR_TO_VECTOR. The destination base does not
+      /// have to match the operand type.
+      S2VEC,
+
+      /// PEXTRW - Extract a 16-bit value from a vector and zero extend it to
+      /// i32, corresponds to X86::PEXTRW.
+      PEXTRW,
+
+      /// PINSRW - Insert the lower 16-bits of a 32-bit value to a vector,
+      /// corresponds to X86::PINSRW.
+      PINSRW,
+
+      /// FMAX, FMIN - Floating point max and min.
+      ///
+      FMAX, FMIN,
+
+      /// FRSQRT, FRCP - Floating point reciprocal-sqrt and reciprocal
+      /// approximation.  Note that these typically require refinement
+      /// in order to obtain suitable precision.
+      FRSQRT, FRCP,
+
+      // Thread Local Storage
+      TLSADDR, THREAD_POINTER,
+
+      // Exception Handling helpers
+      EH_RETURN
+    };
+  }
+
+ /// Define some predicates that are used for node matching.
+ namespace X86 {
+   /// isPSHUFDMask - Return true if the specified VECTOR_SHUFFLE operand
+   /// specifies a shuffle of elements that is suitable for input to PSHUFD.
+   bool isPSHUFDMask(SDNode *N);
+
+   /// isPSHUFHWMask - Return true if the specified VECTOR_SHUFFLE operand
+   /// specifies a shuffle of elements that is suitable for input to PSHUFHW.
+   bool isPSHUFHWMask(SDNode *N);
+
+   /// isPSHUFLWMask - Return true if the specified VECTOR_SHUFFLE operand
+   /// specifies a shuffle of elements that is suitable for input to PSHUFLW.
+   bool isPSHUFLWMask(SDNode *N);
+
+   /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
+   /// specifies a shuffle of elements that is suitable for input to SHUFP*.
+   bool isSHUFPMask(SDNode *N);
+
+   /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
+   /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
+   bool isMOVHLPSMask(SDNode *N);
+
+   /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
+   /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
+   /// <2, 3, 2, 3>
+   bool isMOVHLPS_v_undef_Mask(SDNode *N);
+
+   /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
+   /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
+   bool isMOVLPMask(SDNode *N);
+
+   /// isMOVHPMask - Return true if the specified VECTOR_SHUFFLE operand
+   /// specifies a shuffle of elements that is suitable for input to MOVHP{S|D}
+   /// as well as MOVLHPS.
+   bool isMOVHPMask(SDNode *N);
+
+   /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
+   /// specifies a shuffle of elements that is suitable for input to UNPCKL.
+   bool isUNPCKLMask(SDNode *N, bool V2IsSplat = false);
+
+   /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
+   /// specifies a shuffle of elements that is suitable for input to UNPCKH.
+   bool isUNPCKHMask(SDNode *N, bool V2IsSplat = false);
+
+   /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
+   /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
+   /// <0, 0, 1, 1>
+   bool isUNPCKL_v_undef_Mask(SDNode *N);
+
+   /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
+   /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
+   /// <2, 2, 3, 3>
+   bool isUNPCKH_v_undef_Mask(SDNode *N);
+
+   /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
+   /// specifies a shuffle of elements that is suitable for input to MOVSS,
+   /// MOVSD, and MOVD, i.e. setting the lowest element.
+   bool isMOVLMask(SDNode *N);
+
+   /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+   /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
+   bool isMOVSHDUPMask(SDNode *N);
+
+   /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+   /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
+   bool isMOVSLDUPMask(SDNode *N);
+
+   /// isSplatMask - Return true if the specified VECTOR_SHUFFLE operand
+   /// specifies a splat of a single element.
+   bool isSplatMask(SDNode *N);
+
+   /// isSplatLoMask - Return true if the specified VECTOR_SHUFFLE operand
+   /// specifies a splat of element zero.
+   bool isSplatLoMask(SDNode *N);
+
+   /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
+   /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUF* and SHUFP*
+   /// instructions.
+   unsigned getShuffleSHUFImmediate(SDNode *N);
+
+   /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
+   /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFHW
+   /// instructions.
+   unsigned getShufflePSHUFHWImmediate(SDNode *N);
+
+   /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
+   /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFLW
+   /// instructions.
+   unsigned getShufflePSHUFLWImmediate(SDNode *N);
+ }
+
+  //===--------------------------------------------------------------------===//
+  //  X86TargetLowering - X86 Implementation of the TargetLowering interface
+  class X86TargetLowering : public TargetLowering {
+    int VarArgsFrameIndex;            // FrameIndex for start of varargs area.
+    int RegSaveFrameIndex;            // X86-64 vararg func register save area.
+    unsigned VarArgsGPOffset;         // X86-64 vararg func int reg offset.
+    unsigned VarArgsFPOffset;         // X86-64 vararg func fp reg offset.
+    int ReturnAddrIndex;              // FrameIndex for return slot.
+    int BytesToPopOnReturn;           // Number of arg bytes ret should pop.
+    int BytesCallerReserves;          // Number of arg bytes caller makes.
+  public:
+    X86TargetLowering(TargetMachine &TM);
+
+    // Return the number of bytes that a function should pop when it returns (in
+    // addition to the space used by the return address).
+    //
+    unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; }
+
+    // Return the number of bytes that the caller reserves for arguments passed
+    // to this function.
+    unsigned getBytesCallerReserves() const { return BytesCallerReserves; }
+ 
+    /// getStackPtrReg - Return the stack pointer register we are using: either
+    /// ESP or RSP.
+    unsigned getStackPtrReg() const { return X86StackPtr; }
+    
+    /// LowerOperation - Provide custom lowering hooks for some operations.
+    ///
+    virtual SDOperand LowerOperation(SDOperand Op, SelectionDAG &DAG);
+
+    virtual SDOperand PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+    virtual MachineBasicBlock *InsertAtEndOfBasicBlock(MachineInstr *MI,
+                                                       MachineBasicBlock *MBB);
+
+    /// getTargetNodeName - This method returns the name of a target specific
+    /// DAG node.
+    virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+    /// computeMaskedBitsForTargetNode - Determine which of the bits specified 
+    /// in Mask are known to be either zero or one and return them in the 
+    /// KnownZero/KnownOne bitsets.
+    virtual void computeMaskedBitsForTargetNode(const SDOperand Op,
+                                                uint64_t Mask,
+                                                uint64_t &KnownZero, 
+                                                uint64_t &KnownOne,
+                                                const SelectionDAG &DAG,
+                                                unsigned Depth = 0) const;
+    
+    SDOperand getReturnAddressFrameIndex(SelectionDAG &DAG);
+
+    ConstraintType getConstraintType(const std::string &Constraint) const;
+     
+    std::vector<unsigned> 
+      getRegClassForInlineAsmConstraint(const std::string &Constraint,
+                                        MVT::ValueType VT) const;
+    /// isOperandValidForConstraint - Return the specified operand (possibly
+    /// modified) if the specified SDOperand is valid for the specified target
+    /// constraint letter, otherwise return null.
+    SDOperand isOperandValidForConstraint(SDOperand Op, char ConstraintLetter,
+                                          SelectionDAG &DAG);
+    
+    /// getRegForInlineAsmConstraint - Given a physical register constraint
+    /// (e.g. {edx}), return the register number and the register class for the
+    /// register.  This should only be used for C_Register constraints.  On
+    /// error, this returns a register number of 0.
+    std::pair<unsigned, const TargetRegisterClass*> 
+      getRegForInlineAsmConstraint(const std::string &Constraint,
+                                   MVT::ValueType VT) const;
+    
+    /// isLegalAddressingMode - Return true if the addressing mode represented
+    /// by AM is legal for this target, for a load/store of the specified type.
+    virtual bool isLegalAddressingMode(const AddrMode &AM, const Type *Ty)const;
+
+    /// isShuffleMaskLegal - Targets can use this to indicate that they only
+    /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
+    /// By default, if a target supports the VECTOR_SHUFFLE node, all mask
+    /// values are assumed to be legal.
+    virtual bool isShuffleMaskLegal(SDOperand Mask, MVT::ValueType VT) const;
+
+    /// isVectorClearMaskLegal - Similar to isShuffleMaskLegal. Targets can use
+    /// this to indicate if there is a suitable VECTOR_SHUFFLE that can be used
+    /// to replace a VAND with a constant pool entry.
+    virtual bool isVectorClearMaskLegal(std::vector<SDOperand> &BVOps,
+                                        MVT::ValueType EVT,
+                                        SelectionDAG &DAG) const;
+  private:
+    /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+    /// make the right decision when generating code for different targets.
+    const X86Subtarget *Subtarget;
+    const MRegisterInfo *RegInfo;
+
+    /// X86StackPtr - X86 physical register used as stack ptr.
+    unsigned X86StackPtr;
+
+    /// X86ScalarSSE - Select between SSE2 or x87 floating point ops.
+    bool X86ScalarSSE;
+
+    SDNode *LowerCallResult(SDOperand Chain, SDOperand InFlag, SDNode*TheCall,
+                            unsigned CallingConv, SelectionDAG &DAG);
+        
+    // C and StdCall Calling Convention implementation.
+    SDOperand LowerCCCArguments(SDOperand Op, SelectionDAG &DAG,
+                                bool isStdCall = false);
+    SDOperand LowerCCCCallTo(SDOperand Op, SelectionDAG &DAG, unsigned CC);
+
+    // X86-64 C Calling Convention implementation.
+    SDOperand LowerX86_64CCCArguments(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerX86_64CCCCallTo(SDOperand Op, SelectionDAG &DAG,unsigned CC);
+
+    // Fast and FastCall Calling Convention implementation.
+    SDOperand LowerFastCCArguments(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerFastCCCallTo(SDOperand Op, SelectionDAG &DAG, unsigned CC);
+
+    SDOperand LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerEXTRACT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerINSERT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerSCALAR_TO_VECTOR(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerConstantPool(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerGlobalAddress(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerGlobalTLSAddress(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerExternalSymbol(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerShift(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerSINT_TO_FP(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerFP_TO_SINT(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerFABS(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerFNEG(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerFCOPYSIGN(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerSETCC(SDOperand Op, SelectionDAG &DAG, SDOperand Chain);
+    SDOperand LowerSELECT(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerBRCOND(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerMEMSET(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerMEMCPY(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerJumpTable(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerCALL(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerRET(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerDYNAMIC_STACKALLOC(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerFORMAL_ARGUMENTS(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerREADCYCLCECOUNTER(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerVASTART(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerVACOPY(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerINTRINSIC_WO_CHAIN(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerRETURNADDR(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerFRAMEADDR(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerFRAME_TO_ARGS_OFFSET(SDOperand Op, SelectionDAG &DAG);
+    SDOperand LowerEH_RETURN(SDOperand Op, SelectionDAG &DAG);
+  };
+}
+
+#endif    // X86ISELLOWERING_H
diff --git a/lib/Target/X86/X86InstrBuilder.h b/lib/Target/X86/X86InstrBuilder.h
new file mode 100644
index 0000000..c0fa58d
--- /dev/null
+++ b/lib/Target/X86/X86InstrBuilder.h
@@ -0,0 +1,125 @@
+//===-- X86InstrBuilder.h - Functions to aid building x86 insts -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes functions that may be used with BuildMI from the
+// MachineInstrBuilder.h file to handle X86'isms in a clean way.
+//
+// The helper functions below may be used with the BuildMI function to add
+// entire memory references in a single, typed call.  X86 memory references
+// can be very complex expressions (described in the README), so wrapping them
+// up behind an easier to use interface makes sense.  Descriptions of the
+// functions are included below.
+//
+// For reference, the order of operands for memory references is:
+// (Operand), Base, Scale, Index, Displacement.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86INSTRBUILDER_H
+#define X86INSTRBUILDER_H
+
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+namespace llvm {
+
+/// X86AddressMode - This struct holds a generalized full x86 address mode.
+/// The base register can be a frame index, which will eventually be replaced
+/// with BP or SP, with Disp adjusted accordingly.  The displacement may
+/// also include the offset of a global value.
+struct X86AddressMode {
+  enum {
+    RegBase,
+    FrameIndexBase
+  } BaseType;
+
+  union {
+    unsigned Reg;
+    int FrameIndex;
+  } Base;
+
+  unsigned Scale;
+  unsigned IndexReg;
+  unsigned Disp;
+  GlobalValue *GV;
+
+  X86AddressMode() : BaseType(RegBase), Scale(1), IndexReg(0), Disp(0), GV(0) {
+    Base.Reg = 0;
+  }
+};
+
+/// addDirectMem - This function is used to add a direct memory reference to the
+/// current instruction -- that is, a dereference of an address in a register,
+/// with no scale, index or displacement. An example is: DWORD PTR [EAX].
+///
+inline const MachineInstrBuilder &addDirectMem(const MachineInstrBuilder &MIB,
+                                               unsigned Reg) {
+  // Because memory references are always represented with four
+  // values, this adds: Reg, [1, NoReg, 0] to the instruction.
+  return MIB.addReg(Reg).addImm(1).addReg(0).addImm(0);
+}
+
+
+/// addRegOffset - This function is used to add a memory reference of the form
+/// [Reg + Offset], i.e., one with no scale or index, but with a
+/// displacement. An example is: DWORD PTR [EAX + 4].
+///
+inline const MachineInstrBuilder &addRegOffset(const MachineInstrBuilder &MIB,
+                                               unsigned Reg, int Offset) {
+  return MIB.addReg(Reg).addImm(1).addReg(0).addImm(Offset);
+}
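+
+// As a usage sketch (TII, DestReg and the chosen opcode are illustrative, not
+// part of this interface): the DWORD PTR [EAX + 4] reference above could be
+// attached to a 32-bit load roughly as
+//
+//   addRegOffset(BuildMI(TII.get(X86::MOV32rm), DestReg), X86::EAX, 4);
+//
+// which appends the Base, Scale, Index and Displacement operands in the order
+// described at the top of this file.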
+
+/// addRegReg - This function is used to add a memory reference of the form:
+/// [Reg + Reg].
+inline const MachineInstrBuilder &addRegReg(const MachineInstrBuilder &MIB,
+                                            unsigned Reg1, unsigned Reg2) {
+  return MIB.addReg(Reg1).addImm(1).addReg(Reg2).addImm(0);
+}
+
+inline const MachineInstrBuilder &addFullAddress(const MachineInstrBuilder &MIB,
+                                                 const X86AddressMode &AM) {
+  assert (AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8);
+
+  if (AM.BaseType == X86AddressMode::RegBase)
+    MIB.addReg(AM.Base.Reg);
+  else if (AM.BaseType == X86AddressMode::FrameIndexBase)
+    MIB.addFrameIndex(AM.Base.FrameIndex);
+  else
+    assert (0);
+  MIB.addImm(AM.Scale).addReg(AM.IndexReg);
+  if (AM.GV)
+    return MIB.addGlobalAddress(AM.GV, AM.Disp);
+  else
+    return MIB.addImm(AM.Disp);
+}
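+
+// A minimal sketch of filling in an X86AddressMode by hand, mirroring how the
+// backend builds a scaled LEA for a left shift (Dest, Src, ShAmt and TII are
+// assumed values here, not part of this interface):
+//
+//   X86AddressMode AM;
+//   AM.Scale = 1 << ShAmt;   // must be 1, 2, 4 or 8 (see the assert above)
+//   AM.IndexReg = Src;       // base stays RegBase with Base.Reg == 0
+//   addFullAddress(BuildMI(TII.get(X86::LEA32r), Dest), AM);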
+
+/// addFrameReference - This function is used to add a reference to the base of
+/// an abstract object on the stack frame of the current function.  The
+/// reference uses the FrameIndex as its base register until it is resolved.
+/// A constant offset may be specified as well.
+///
+inline const MachineInstrBuilder &
+addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) {
+  return MIB.addFrameIndex(FI).addImm(1).addReg(0).addImm(Offset);
+}
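+
+// For example, reloading a value from a stack slot could look roughly like
+// (FrameIdx, DestReg and TII are assumed values):
+//
+//   addFrameReference(BuildMI(TII.get(X86::MOV32rm), DestReg), FrameIdx);
+//
+// The frame index stays in the base operand until it is resolved to a real
+// base register and displacement.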
+
+/// addConstantPoolReference - This function is used to add a reference to the
+/// base of a constant value spilled to the per-function constant pool.  The
+/// reference uses the ConstantPoolIndex as its base register until it is
+/// resolved at machine code emission or assembly output.  An optional
+/// constant offset may be added as well.
+///
+inline const MachineInstrBuilder &
+addConstantPoolReference(const MachineInstrBuilder &MIB, unsigned CPI,
+                         int Offset = 0) {
+  return MIB.addConstantPoolIndex(CPI).addImm(1).addReg(0).addImm(Offset);
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td
new file mode 100644
index 0000000..11aeb07
--- /dev/null
+++ b/lib/Target/X86/X86InstrFPStack.td
@@ -0,0 +1,456 @@
+//==- X86InstrFPStack.td - Describe the X86 Instruction Set -------*- C++ -*-=//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Evan Cheng and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 x87 FPU instruction set, defining the
+// instructions, and properties of the instructions which are needed for code
+// generation, machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// FPStack specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDTX86FpGet     : SDTypeProfile<1, 0, [SDTCisFP<0>]>;
+def SDTX86FpSet     : SDTypeProfile<0, 1, [SDTCisFP<0>]>;
+def SDTX86Fld       : SDTypeProfile<1, 2, [SDTCisFP<0>,
+                                           SDTCisPtrTy<1>, 
+                                           SDTCisVT<2, OtherVT>]>;
+def SDTX86Fst       : SDTypeProfile<0, 3, [SDTCisFP<0>,
+                                           SDTCisPtrTy<1>, 
+                                           SDTCisVT<2, OtherVT>]>;
+def SDTX86Fild      : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisPtrTy<1>,
+                                           SDTCisVT<2, OtherVT>]>;
+def SDTX86FpToIMem  : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
+
+def X86fpget        : SDNode<"X86ISD::FP_GET_RESULT", SDTX86FpGet,
+                        [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;
+def X86fpset        : SDNode<"X86ISD::FP_SET_RESULT", SDTX86FpSet,
+                        [SDNPHasChain, SDNPOutFlag]>;
+def X86fld          : SDNode<"X86ISD::FLD",      SDTX86Fld,
+                        [SDNPHasChain]>;
+def X86fst          : SDNode<"X86ISD::FST",      SDTX86Fst,
+                        [SDNPHasChain, SDNPInFlag]>;
+def X86fild         : SDNode<"X86ISD::FILD",     SDTX86Fild,
+                        [SDNPHasChain]>;
+def X86fildflag     : SDNode<"X86ISD::FILD_FLAG",SDTX86Fild,
+                        [SDNPHasChain, SDNPOutFlag]>;
+def X86fp_to_i16mem : SDNode<"X86ISD::FP_TO_INT16_IN_MEM", SDTX86FpToIMem,
+                        [SDNPHasChain]>;
+def X86fp_to_i32mem : SDNode<"X86ISD::FP_TO_INT32_IN_MEM", SDTX86FpToIMem,
+                        [SDNPHasChain]>;
+def X86fp_to_i64mem : SDNode<"X86ISD::FP_TO_INT64_IN_MEM", SDTX86FpToIMem,
+                        [SDNPHasChain]>;
+
+//===----------------------------------------------------------------------===//
+// FPStack pattern fragments
+//===----------------------------------------------------------------------===//
+
+def fpimm0 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(+0.0);
+}]>;
+
+def fpimmneg0 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(-0.0);
+}]>;
+
+def fpimm1 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(+1.0);
+}]>;
+
+def fpimmneg1 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(-1.0);
+}]>;
+
+// Some 'special' instructions
+let usesCustomDAGSchedInserter = 1 in {  // Expanded by the scheduler.
+  def FP32_TO_INT16_IN_MEM : I<0, Pseudo,
+                              (ops i16mem:$dst, RFP32:$src),
+                              "#FP32_TO_INT16_IN_MEM PSEUDO!",
+                              [(X86fp_to_i16mem RFP32:$src, addr:$dst)]>;
+  def FP32_TO_INT32_IN_MEM : I<0, Pseudo,
+                              (ops i32mem:$dst, RFP32:$src),
+                              "#FP32_TO_INT32_IN_MEM PSEUDO!",
+                              [(X86fp_to_i32mem RFP32:$src, addr:$dst)]>;
+  def FP32_TO_INT64_IN_MEM : I<0, Pseudo,
+                              (ops i64mem:$dst, RFP32:$src),
+                              "#FP32_TO_INT64_IN_MEM PSEUDO!",
+                              [(X86fp_to_i64mem RFP32:$src, addr:$dst)]>;
+  def FP64_TO_INT16_IN_MEM : I<0, Pseudo,
+                              (ops i16mem:$dst, RFP64:$src),
+                              "#FP64_TO_INT16_IN_MEM PSEUDO!",
+                              [(X86fp_to_i16mem RFP64:$src, addr:$dst)]>;
+  def FP64_TO_INT32_IN_MEM : I<0, Pseudo,
+                              (ops i32mem:$dst, RFP64:$src),
+                              "#FP64_TO_INT32_IN_MEM PSEUDO!",
+                              [(X86fp_to_i32mem RFP64:$src, addr:$dst)]>;
+  def FP64_TO_INT64_IN_MEM : I<0, Pseudo,
+                              (ops i64mem:$dst, RFP64:$src),
+                              "#FP64_TO_INT64_IN_MEM PSEUDO!",
+                              [(X86fp_to_i64mem RFP64:$src, addr:$dst)]>;
+}
+
+let isTerminator = 1 in
+  let Defs = [FP0, FP1, FP2, FP3, FP4, FP5, FP6] in
+    def FP_REG_KILL  : I<0, Pseudo, (ops), "#FP_REG_KILL", []>;
+
+// All FP Stack operations are represented with three instructions here.  The
+// first two instructions, generated by the instruction selector, use "RFP32"
+// or "RFP64" registers: traditional register files to reference 32-bit or
+// 64-bit floating point values.  These sizes apply to the values, not the
+// registers, which are always 64 bits; RFP32 and RFP64 can be copied to
+// each other without losing information.  These instructions are all pseudo
+// instructions and use the "_Fp" suffix.
+// In some cases there are additional variants with a mixture of 32-bit and
+// 64-bit registers.
+// The third instruction is defined with FPI, which is the actual instruction
+// emitted by the assembler.  These use "RST" registers, although frequently
+// the actual register(s) used are implicit.  These are always 64 bits.
+// The FP stackifier pass converts one to the other after register allocation 
+// occurs.
+//
+// Note that the FpI instruction should have instruction selection info (e.g.
+// a pattern) and the FPI instruction should have emission info (e.g. opcode
+// encoding and asm printing info).
+
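+// As a concrete illustration of this pairing: an 'fadd' of two 32-bit values
+// is selected to the ADD_Fp32 pseudo (from the FPBinary_rr multiclass below),
+// and the FP stackifier later rewrites it into a real stack form such as
+// ADD_FST0r or ADD_FPrST0, whose encodings are also defined in this file.
+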
+// FPI - Floating Point Instruction template.
+class FPI<bits<8> o, Format F, dag ops, string asm> : I<o, F, ops, asm, []> {}
+
+// FpI_ - Floating Point Pseudo Instruction template. Not Predicated.
+class FpI_<dag ops, FPFormat fp, list<dag> pattern>
+  : X86Inst<0, Pseudo, NoImm, ops, ""> {
+  let FPForm = fp; let FPFormBits = FPForm.Value;
+  let Pattern = pattern;
+}
+
+// Random Pseudo Instructions.
+def FpGETRESULT32 : FpI_<(ops RFP32:$dst), SpecialFP,
+                      [(set RFP32:$dst, X86fpget)]>;           // FPR = ST(0)
+
+def FpGETRESULT64 : FpI_<(ops RFP64:$dst), SpecialFP,
+                      [(set RFP64:$dst, X86fpget)]>;           // FPR = ST(0)
+
+let noResults = 1 in {
+  def FpSETRESULT32 : FpI_<(ops RFP32:$src), SpecialFP,
+                        [(X86fpset RFP32:$src)]>, Imp<[], [ST0]>;// ST(0) = FPR
+
+  def FpSETRESULT64 : FpI_<(ops RFP64:$src), SpecialFP,
+                        [(X86fpset RFP64:$src)]>, Imp<[], [ST0]>;// ST(0) = FPR
+}
+// FpI - Floating Point Pseudo Instruction template. Predicated on FPStack.
+class FpI<dag ops, FPFormat fp, list<dag> pattern> :
+  FpI_<ops, fp, pattern>, Requires<[FPStack]>;
+
+// Register copies.  Just copies, the 64->32 version does not truncate.
+def MOV_Fp3232       : FpI<(ops RFP32:$dst, RFP32:$src), SpecialFP, []>; 
+def MOV_Fp3264       : FpI<(ops RFP64:$dst, RFP32:$src), SpecialFP, []>; 
+def MOV_Fp6432       : FpI<(ops RFP32:$dst, RFP64:$src), SpecialFP, []>; 
+def MOV_Fp6464       : FpI<(ops RFP64:$dst, RFP64:$src), SpecialFP, []>; 
+
+// Factoring for arithmetic.
+multiclass FPBinary_rr<SDNode OpNode> {
+// Register op register -> register
+// These are separated out because they have no reversed form.
+def _Fp32 : FpI<(ops RFP32:$dst, RFP32:$src1, RFP32:$src2), TwoArgFP,
+                [(set RFP32:$dst, (OpNode RFP32:$src1, RFP32:$src2))]>;
+def _Fp64 : FpI<(ops RFP64:$dst, RFP64:$src1, RFP64:$src2), TwoArgFP,
+                [(set RFP64:$dst, (OpNode RFP64:$src1, RFP64:$src2))]>;
+}
+// The FopST0 series is not included here because of the irregularities
+// in where the 'r' goes in assembly output.
+multiclass FPBinary<SDNode OpNode, Format fp, string asmstring> {
+// ST(0) = ST(0) + [mem]
+def _Fp32m  : FpI<(ops RFP32:$dst, RFP32:$src1, f32mem:$src2), OneArgFPRW,
+                  [(set RFP32:$dst, 
+                    (OpNode RFP32:$src1, (loadf32 addr:$src2)))]>;
+def _Fp64m  : FpI<(ops RFP64:$dst, RFP64:$src1, f64mem:$src2), OneArgFPRW,
+                  [(set RFP64:$dst, 
+                    (OpNode RFP64:$src1, (loadf64 addr:$src2)))]>;
+def _Fp64m32: FpI<(ops RFP64:$dst, RFP64:$src1, f32mem:$src2), OneArgFPRW,
+                  [(set RFP64:$dst, 
+                    (OpNode RFP64:$src1, (extloadf32 addr:$src2)))]>;
+def _F32m  : FPI<0xD8, fp, (ops f32mem:$src), 
+                 !strconcat("f", !strconcat(asmstring, "{s} $src"))>;
+def _F64m  : FPI<0xDC, fp, (ops f64mem:$src), 
+                 !strconcat("f", !strconcat(asmstring, "{l} $src"))>;
+// ST(0) = ST(0) + [memint]
+def _FpI16m32 : FpI<(ops RFP32:$dst, RFP32:$src1, i16mem:$src2), OneArgFPRW,
+                    [(set RFP32:$dst, (OpNode RFP32:$src1,
+                                       (X86fild addr:$src2, i16)))]>;
+def _FpI32m32 : FpI<(ops RFP32:$dst, RFP32:$src1, i32mem:$src2), OneArgFPRW,
+                    [(set RFP32:$dst, (OpNode RFP32:$src1,
+                                       (X86fild addr:$src2, i32)))]>;
+def _FpI16m64 : FpI<(ops RFP64:$dst, RFP64:$src1, i16mem:$src2), OneArgFPRW,
+                    [(set RFP64:$dst, (OpNode RFP64:$src1,
+                                       (X86fild addr:$src2, i16)))]>;
+def _FpI32m64 : FpI<(ops RFP64:$dst, RFP64:$src1, i32mem:$src2), OneArgFPRW,
+                    [(set RFP64:$dst, (OpNode RFP64:$src1,
+                                       (X86fild addr:$src2, i32)))]>;
+def _FI16m  : FPI<0xDE, fp, (ops i16mem:$src), 
+                  !strconcat("fi", !strconcat(asmstring, "{s} $src"))>;
+def _FI32m  : FPI<0xDA, fp, (ops i32mem:$src), 
+                  !strconcat("fi", !strconcat(asmstring, "{l} $src"))>;
+}
+
+defm ADD : FPBinary_rr<fadd>;
+defm SUB : FPBinary_rr<fsub>;
+defm MUL : FPBinary_rr<fmul>;
+defm DIV : FPBinary_rr<fdiv>;
+defm ADD : FPBinary<fadd, MRM0m, "add">;
+defm SUB : FPBinary<fsub, MRM4m, "sub">;
+defm SUBR: FPBinary<fsub ,MRM5m, "subr">;
+defm MUL : FPBinary<fmul, MRM1m, "mul">;
+defm DIV : FPBinary<fdiv, MRM6m, "div">;
+defm DIVR: FPBinary<fdiv, MRM7m, "divr">;
+
+class FPST0rInst<bits<8> o, string asm>
+  : FPI<o, AddRegFrm, (ops RST:$op), asm>, D8;
+class FPrST0Inst<bits<8> o, string asm>
+  : FPI<o, AddRegFrm, (ops RST:$op), asm>, DC;
+class FPrST0PInst<bits<8> o, string asm>
+  : FPI<o, AddRegFrm, (ops RST:$op), asm>, DE;
+
+// NOTE: GAS and apparently all other AT&T style assemblers have a broken notion
+// of some of the 'reverse' forms of the fsub and fdiv instructions.  As such,
+// we have to put some 'r's in and take them out of weird places.
+def ADD_FST0r   : FPST0rInst <0xC0, "fadd $op">;
+def ADD_FrST0   : FPrST0Inst <0xC0, "fadd {%st(0), $op|$op, %ST(0)}">;
+def ADD_FPrST0  : FPrST0PInst<0xC0, "faddp $op">;
+def SUBR_FST0r  : FPST0rInst <0xE8, "fsubr $op">;
+def SUB_FrST0   : FPrST0Inst <0xE8, "fsub{r} {%st(0), $op|$op, %ST(0)}">;
+def SUB_FPrST0  : FPrST0PInst<0xE8, "fsub{r}p $op">;
+def SUB_FST0r   : FPST0rInst <0xE0, "fsub $op">;
+def SUBR_FrST0  : FPrST0Inst <0xE0, "fsub{|r} {%st(0), $op|$op, %ST(0)}">;
+def SUBR_FPrST0 : FPrST0PInst<0xE0, "fsub{|r}p $op">;
+def MUL_FST0r   : FPST0rInst <0xC8, "fmul $op">;
+def MUL_FrST0   : FPrST0Inst <0xC8, "fmul {%st(0), $op|$op, %ST(0)}">;
+def MUL_FPrST0  : FPrST0PInst<0xC8, "fmulp $op">;
+def DIVR_FST0r  : FPST0rInst <0xF8, "fdivr $op">;
+def DIV_FrST0   : FPrST0Inst <0xF8, "fdiv{r} {%st(0), $op|$op, %ST(0)}">;
+def DIV_FPrST0  : FPrST0PInst<0xF8, "fdiv{r}p $op">;
+def DIV_FST0r   : FPST0rInst <0xF0, "fdiv $op">;
+def DIVR_FrST0  : FPrST0Inst <0xF0, "fdiv{|r} {%st(0), $op|$op, %ST(0)}">;
+def DIVR_FPrST0 : FPrST0PInst<0xF0, "fdiv{|r}p $op">;
+
+// Unary operations.
+multiclass FPUnary<SDNode OpNode, bits<8> opcode, string asmstring> {
+def _Fp32  : FpI<(ops RFP32:$dst, RFP32:$src), OneArgFPRW,
+                 [(set RFP32:$dst, (OpNode RFP32:$src))]>;
+def _Fp64  : FpI<(ops RFP64:$dst, RFP64:$src), OneArgFPRW,
+                 [(set RFP64:$dst, (OpNode RFP64:$src))]>;
+def _F     : FPI<opcode, RawFrm, (ops), asmstring>, D9;
+}
+
+defm CHS : FPUnary<fneg, 0xE0, "fchs">;
+defm ABS : FPUnary<fabs, 0xE1, "fabs">;
+defm SQRT: FPUnary<fsqrt,0xFA, "fsqrt">;
+defm SIN : FPUnary<fsin, 0xFE, "fsin">;
+defm COS : FPUnary<fcos, 0xFF, "fcos">;
+
+def TST_Fp32  : FpI<(ops RFP32:$src), OneArgFP,
+                 []>;
+def TST_Fp64  : FpI<(ops RFP64:$src), OneArgFP,
+                 []>;
+def TST_F  : FPI<0xE4, RawFrm, (ops), "ftst">, D9;
+
+// Floating point cmovs.
+multiclass FPCMov<PatLeaf cc> {
+  def _Fp32  : FpI<(ops RFP32:$dst, RFP32:$src1, RFP32:$src2), CondMovFP,
+                     [(set RFP32:$dst, (X86cmov RFP32:$src1, RFP32:$src2,
+                                        cc))]>;
+  def _Fp64  : FpI<(ops RFP64:$dst, RFP64:$src1, RFP64:$src2), CondMovFP,
+                     [(set RFP64:$dst, (X86cmov RFP64:$src1, RFP64:$src2,
+                                        cc))]>;
+}
+let isTwoAddress = 1 in {
+defm CMOVB  : FPCMov<X86_COND_B>;
+defm CMOVBE : FPCMov<X86_COND_BE>;
+defm CMOVE  : FPCMov<X86_COND_E>;
+defm CMOVP  : FPCMov<X86_COND_P>;
+defm CMOVNB : FPCMov<X86_COND_AE>;
+defm CMOVNBE: FPCMov<X86_COND_A>;
+defm CMOVNE : FPCMov<X86_COND_NE>;
+defm CMOVNP : FPCMov<X86_COND_NP>;
+}
+
+// These are not factored because there's no clean way to pass DA/DB.
+def CMOVB_F  : FPI<0xC0, AddRegFrm, (ops RST:$op),
+                  "fcmovb {$op, %st(0)|%ST(0), $op}">, DA;
+def CMOVBE_F : FPI<0xD0, AddRegFrm, (ops RST:$op),
+                  "fcmovbe {$op, %st(0)|%ST(0), $op}">, DA;
+def CMOVE_F  : FPI<0xC8, AddRegFrm, (ops RST:$op),
+                  "fcmove {$op, %st(0)|%ST(0), $op}">, DA;
+def CMOVP_F  : FPI<0xD8, AddRegFrm, (ops RST:$op),
+                  "fcmovu  {$op, %st(0)|%ST(0), $op}">, DA;
+def CMOVNB_F : FPI<0xC0, AddRegFrm, (ops RST:$op),
+                  "fcmovnb {$op, %st(0)|%ST(0), $op}">, DB;
+def CMOVNBE_F: FPI<0xD0, AddRegFrm, (ops RST:$op),
+                  "fcmovnbe {$op, %st(0)|%ST(0), $op}">, DB;
+def CMOVNE_F : FPI<0xC8, AddRegFrm, (ops RST:$op),
+                  "fcmovne {$op, %st(0)|%ST(0), $op}">, DB;
+def CMOVNP_F : FPI<0xD8, AddRegFrm, (ops RST:$op),
+                  "fcmovnu {$op, %st(0)|%ST(0), $op}">, DB;
+
+// Floating point loads & stores.
+def LD_Fp32m   : FpI<(ops RFP32:$dst, f32mem:$src), ZeroArgFP,
+                  [(set RFP32:$dst, (loadf32 addr:$src))]>;
+def LD_Fp64m   : FpI<(ops RFP64:$dst, f64mem:$src), ZeroArgFP,
+                  [(set RFP64:$dst, (loadf64 addr:$src))]>;
+def ILD_Fp16m32: FpI<(ops RFP32:$dst, i16mem:$src), ZeroArgFP,
+                  [(set RFP32:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m32: FpI<(ops RFP32:$dst, i32mem:$src), ZeroArgFP,
+                  [(set RFP32:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m32: FpI<(ops RFP32:$dst, i64mem:$src), ZeroArgFP,
+                  [(set RFP32:$dst, (X86fild addr:$src, i64))]>;
+def ILD_Fp16m64: FpI<(ops RFP64:$dst, i16mem:$src), ZeroArgFP,
+                  [(set RFP64:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m64: FpI<(ops RFP64:$dst, i32mem:$src), ZeroArgFP,
+                  [(set RFP64:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m64: FpI<(ops RFP64:$dst, i64mem:$src), ZeroArgFP,
+                  [(set RFP64:$dst, (X86fild addr:$src, i64))]>;
+
+def ST_Fp32m   : FpI<(ops f32mem:$op, RFP32:$src), OneArgFP,
+                  [(store RFP32:$src, addr:$op)]>;
+def ST_Fp64m32 : FpI<(ops f32mem:$op, RFP64:$src), OneArgFP,
+                  [(truncstoref32 RFP64:$src, addr:$op)]>;
+def ST_Fp64m   : FpI<(ops f64mem:$op, RFP64:$src), OneArgFP,
+                  [(store RFP64:$src, addr:$op)]>;
+
+def ST_FpP32m    : FpI<(ops f32mem:$op, RFP32:$src), OneArgFP, []>;
+def ST_FpP64m32  : FpI<(ops f32mem:$op, RFP64:$src), OneArgFP, []>;
+def ST_FpP64m    : FpI<(ops f64mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp16m32  : FpI<(ops i16mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp32m32  : FpI<(ops i32mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp64m32  : FpI<(ops i64mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp16m64  : FpI<(ops i16mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp32m64  : FpI<(ops i32mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp64m64  : FpI<(ops i64mem:$op, RFP64:$src), OneArgFP, []>;
+
+def LD_F32m   : FPI<0xD9, MRM0m, (ops f32mem:$src), "fld{s} $src">;
+def LD_F64m   : FPI<0xDD, MRM0m, (ops f64mem:$src), "fld{l} $src">;
+def ILD_F16m  : FPI<0xDF, MRM0m, (ops i16mem:$src), "fild{s} $src">;
+def ILD_F32m  : FPI<0xDB, MRM0m, (ops i32mem:$src), "fild{l} $src">;
+def ILD_F64m  : FPI<0xDF, MRM5m, (ops i64mem:$src), "fild{ll} $src">;
+def ST_F32m   : FPI<0xD9, MRM2m, (ops f32mem:$dst), "fst{s} $dst">;
+def ST_F64m   : FPI<0xDD, MRM2m, (ops f64mem:$dst), "fst{l} $dst">;
+def ST_FP32m  : FPI<0xD9, MRM3m, (ops f32mem:$dst), "fstp{s} $dst">;
+def ST_FP64m  : FPI<0xDD, MRM3m, (ops f64mem:$dst), "fstp{l} $dst">;
+def IST_F16m  : FPI<0xDF, MRM2m, (ops i16mem:$dst), "fist{s} $dst">;
+def IST_F32m  : FPI<0xDB, MRM2m, (ops i32mem:$dst), "fist{l} $dst">;
+def IST_FP16m : FPI<0xDF, MRM3m, (ops i16mem:$dst), "fistp{s} $dst">;
+def IST_FP32m : FPI<0xDB, MRM3m, (ops i32mem:$dst), "fistp{l} $dst">;
+def IST_FP64m : FPI<0xDF, MRM7m, (ops i64mem:$dst), "fistp{ll} $dst">;
+
+// FISTTP requires SSE3, even though it's an FPStack op.
+def ISTT_Fp16m32 : FpI_<(ops i16mem:$op, RFP32:$src), OneArgFP,
+                    [(X86fp_to_i16mem RFP32:$src, addr:$op)]>,
+                    Requires<[HasSSE3]>;
+def ISTT_Fp32m32 : FpI_<(ops i32mem:$op, RFP32:$src), OneArgFP,
+                    [(X86fp_to_i32mem RFP32:$src, addr:$op)]>,
+                    Requires<[HasSSE3]>;
+def ISTT_Fp64m32 : FpI_<(ops i64mem:$op, RFP32:$src), OneArgFP,
+                    [(X86fp_to_i64mem RFP32:$src, addr:$op)]>,
+                    Requires<[HasSSE3]>;
+def ISTT_Fp16m64 : FpI_<(ops i16mem:$op, RFP64:$src), OneArgFP,
+                    [(X86fp_to_i16mem RFP64:$src, addr:$op)]>,
+                    Requires<[HasSSE3]>;
+def ISTT_Fp32m64 : FpI_<(ops i32mem:$op, RFP64:$src), OneArgFP,
+                    [(X86fp_to_i32mem RFP64:$src, addr:$op)]>,
+                    Requires<[HasSSE3]>;
+def ISTT_Fp64m64 : FpI_<(ops i64mem:$op, RFP64:$src), OneArgFP,
+                    [(X86fp_to_i64mem RFP64:$src, addr:$op)]>,
+                    Requires<[HasSSE3]>;
+
+def ISTT_FP16m : FPI<0xDF, MRM1m, (ops i16mem:$dst), "fisttp{s} $dst">;
+def ISTT_FP32m : FPI<0xDB, MRM1m, (ops i32mem:$dst), "fisttp{l} $dst">;
+def ISTT_FP64m : FPI<0xDD, MRM1m, (ops i64mem:$dst), "fisttp{ll} $dst">;
+
+// FP Stack manipulation instructions.
+def LD_Frr   : FPI<0xC0, AddRegFrm, (ops RST:$op), "fld $op">, D9;
+def ST_Frr   : FPI<0xD0, AddRegFrm, (ops RST:$op), "fst $op">, DD;
+def ST_FPrr  : FPI<0xD8, AddRegFrm, (ops RST:$op), "fstp $op">, DD;
+def XCH_F    : FPI<0xC8, AddRegFrm, (ops RST:$op), "fxch $op">, D9;
+
+// Floating point constant loads.
+let isReMaterializable = 1 in {
+def LD_Fp032 : FpI<(ops RFP32:$dst), ZeroArgFP,
+                [(set RFP32:$dst, fpimm0)]>;
+def LD_Fp132 : FpI<(ops RFP32:$dst), ZeroArgFP,
+                [(set RFP32:$dst, fpimm1)]>;
+def LD_Fp064 : FpI<(ops RFP64:$dst), ZeroArgFP,
+                [(set RFP64:$dst, fpimm0)]>;
+def LD_Fp164 : FpI<(ops RFP64:$dst), ZeroArgFP,
+                [(set RFP64:$dst, fpimm1)]>;
+}
+
+def LD_F0 : FPI<0xEE, RawFrm, (ops), "fldz">, D9;
+def LD_F1 : FPI<0xE8, RawFrm, (ops), "fld1">, D9;
+
+
+// Floating point compares.
+def UCOM_Fpr32 : FpI<(ops RFP32:$lhs, RFP32:$rhs), CompareFP,
+                  []>;  // FPSW = cmp ST(0) with ST(i)
+def UCOM_FpIr32: FpI<(ops RFP32:$lhs, RFP32:$rhs), CompareFP,
+                  [(X86cmp RFP32:$lhs, RFP32:$rhs)]>; // CC = ST(0) cmp ST(i)
+def UCOM_Fpr64 : FpI<(ops RFP64:$lhs, RFP64:$rhs), CompareFP,
+                  []>;  // FPSW = cmp ST(0) with ST(i)
+def UCOM_FpIr64: FpI<(ops RFP64:$lhs, RFP64:$rhs), CompareFP,
+                  [(X86cmp RFP64:$lhs, RFP64:$rhs)]>; // CC = ST(0) cmp ST(i)
+
+def UCOM_Fr    : FPI<0xE0, AddRegFrm,    // FPSW = cmp ST(0) with ST(i)
+                    (ops RST:$reg),
+                    "fucom $reg">, DD, Imp<[ST0],[]>;
+def UCOM_FPr   : FPI<0xE8, AddRegFrm,    // FPSW = cmp ST(0) with ST(i), pop
+                    (ops RST:$reg),
+                    "fucomp $reg">, DD, Imp<[ST0],[]>;
+def UCOM_FPPr  : FPI<0xE9, RawFrm,       // cmp ST(0) with ST(1), pop, pop
+                    (ops),
+                    "fucompp">, DA, Imp<[ST0],[]>;
+
+def UCOM_FIr   : FPI<0xE8, AddRegFrm,     // CC = cmp ST(0) with ST(i)
+                    (ops RST:$reg),
+                    "fucomi {$reg, %st(0)|%ST(0), $reg}">, DB, Imp<[ST0],[]>;
+def UCOM_FIPr  : FPI<0xE8, AddRegFrm,     // CC = cmp ST(0) with ST(i), pop
+                    (ops RST:$reg),
+                    "fucomip {$reg, %st(0)|%ST(0), $reg}">, DF, Imp<[ST0],[]>;
+
+// Floating point flag ops.
+def FNSTSW8r  : I<0xE0, RawFrm,                  // AX = fp flags
+                  (ops), "fnstsw", []>, DF, Imp<[],[AX]>;
+
+def FNSTCW16m : I<0xD9, MRM7m,                   // [mem16] = X87 control word
+                  (ops i16mem:$dst), "fnstcw $dst", []>;
+def FLDCW16m  : I<0xD9, MRM5m,                   // X87 control word = [mem16]
+                  (ops i16mem:$dst), "fldcw $dst", []>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// Required for RET of f32 / f64 values.
+def : Pat<(X86fld addr:$src, f32), (LD_Fp32m addr:$src)>;
+def : Pat<(X86fld addr:$src, f64), (LD_Fp64m addr:$src)>;
+
+// Required for CALLs which return f32 / f64 values.
+def : Pat<(X86fst RFP32:$src, addr:$op, f32), (ST_Fp32m addr:$op, RFP32:$src)>;
+def : Pat<(X86fst RFP64:$src, addr:$op, f32), (ST_Fp64m32 addr:$op, RFP64:$src)>;
+def : Pat<(X86fst RFP64:$src, addr:$op, f64), (ST_Fp64m addr:$op, RFP64:$src)>;
+
+// Floating point constants -0.0 and -1.0.
+def : Pat<(f32 fpimmneg0), (CHS_Fp32 (LD_Fp032))>, Requires<[FPStack]>;
+def : Pat<(f32 fpimmneg1), (CHS_Fp32 (LD_Fp132))>, Requires<[FPStack]>;
+def : Pat<(f64 fpimmneg0), (CHS_Fp64 (LD_Fp064))>, Requires<[FPStack]>;
+def : Pat<(f64 fpimmneg1), (CHS_Fp64 (LD_Fp164))>, Requires<[FPStack]>;
+
+// Used to convert i64 to f64, since there isn't an SSE version.
+def : Pat<(X86fildflag addr:$src, i64), (ILD_Fp64m64 addr:$src)>;
+
+def : Pat<(extloadf32 addr:$src), 
+           (MOV_Fp3264 (LD_Fp32m addr:$src))>, Requires<[FPStack]>;
+def : Pat<(fextend RFP32:$src), (MOV_Fp3264 RFP32:$src)>, Requires<[FPStack]>;
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
new file mode 100644
index 0000000..06b14fe
--- /dev/null
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -0,0 +1,567 @@
+//===- X86InstrInfo.cpp - X86 Instruction Information -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrInfo.h"
+#include "X86.h"
+#include "X86GenInstrInfo.inc"
+#include "X86InstrBuilder.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/LiveVariables.h"
+using namespace llvm;
+
+X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
+  : TargetInstrInfo(X86Insts, sizeof(X86Insts)/sizeof(X86Insts[0])),
+    TM(tm), RI(tm, *this) {
+}
+
+bool X86InstrInfo::isMoveInstr(const MachineInstr& MI,
+                               unsigned& sourceReg,
+                               unsigned& destReg) const {
+  MachineOpCode oc = MI.getOpcode();
+  if (oc == X86::MOV8rr || oc == X86::MOV16rr ||
+      oc == X86::MOV32rr || oc == X86::MOV64rr ||
+      oc == X86::MOV16to16_ || oc == X86::MOV32to32_ ||
+      oc == X86::MOV_Fp3232  || oc == X86::MOVSSrr || oc == X86::MOVSDrr ||
+      oc == X86::MOV_Fp3264 || oc == X86::MOV_Fp6432 || oc == X86::MOV_Fp6464 ||
+      oc == X86::FsMOVAPSrr || oc == X86::FsMOVAPDrr ||
+      oc == X86::MOVAPSrr || oc == X86::MOVAPDrr ||
+      oc == X86::MOVSS2PSrr || oc == X86::MOVSD2PDrr ||
+      oc == X86::MOVPS2SSrr || oc == X86::MOVPD2SDrr ||
+      oc == X86::MMX_MOVD64rr || oc == X86::MMX_MOVQ64rr) {
+      assert(MI.getNumOperands() >= 2 &&
+             MI.getOperand(0).isRegister() &&
+             MI.getOperand(1).isRegister() &&
+             "invalid register-register move instruction");
+      sourceReg = MI.getOperand(1).getReg();
+      destReg = MI.getOperand(0).getReg();
+      return true;
+  }
+  return false;
+}
+
+unsigned X86InstrInfo::isLoadFromStackSlot(MachineInstr *MI, 
+                                           int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case X86::MOV8rm:
+  case X86::MOV16rm:
+  case X86::MOV16_rm:
+  case X86::MOV32rm:
+  case X86::MOV32_rm:
+  case X86::MOV64rm:
+  case X86::LD_Fp64m:
+  case X86::MOVSSrm:
+  case X86::MOVSDrm:
+  case X86::MOVAPSrm:
+  case X86::MOVAPDrm:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
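+    // A simple stack slot load has address operands of the form
+    // [FrameIndex, Scale=1, NoIndexReg, Disp=0]; anything else is not a
+    // direct load from a stack slot.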
+    if (MI->getOperand(1).isFrameIndex() && MI->getOperand(2).isImmediate() &&
+        MI->getOperand(3).isRegister() && MI->getOperand(4).isImmediate() &&
+        MI->getOperand(2).getImmedValue() == 1 &&
+        MI->getOperand(3).getReg() == 0 &&
+        MI->getOperand(4).getImmedValue() == 0) {
+      FrameIndex = MI->getOperand(1).getFrameIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+  return 0;
+}
+
+unsigned X86InstrInfo::isStoreToStackSlot(MachineInstr *MI,
+                                          int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case X86::MOV8mr:
+  case X86::MOV16mr:
+  case X86::MOV16_mr:
+  case X86::MOV32mr:
+  case X86::MOV32_mr:
+  case X86::MOV64mr:
+  case X86::ST_FpP64m:
+  case X86::MOVSSmr:
+  case X86::MOVSDmr:
+  case X86::MOVAPSmr:
+  case X86::MOVAPDmr:
+  case X86::MMX_MOVD64mr:
+  case X86::MMX_MOVQ64mr:
+  case X86::MMX_MOVNTQmr:
+    if (MI->getOperand(0).isFrameIndex() && MI->getOperand(1).isImmediate() &&
+        MI->getOperand(2).isRegister() && MI->getOperand(3).isImmediate() &&
+        MI->getOperand(1).getImmedValue() == 1 &&
+        MI->getOperand(2).getReg() == 0 &&
+        MI->getOperand(3).getImmedValue() == 0) {
+      FrameIndex = MI->getOperand(0).getFrameIndex();
+      return MI->getOperand(4).getReg();
+    }
+    break;
+  }
+  return 0;
+}
+
+
+bool X86InstrInfo::isReallyTriviallyReMaterializable(MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case X86::MOV8rm:
+  case X86::MOV16rm:
+  case X86::MOV16_rm:
+  case X86::MOV32rm:
+  case X86::MOV32_rm:
+  case X86::MOV64rm:
+  case X86::LD_Fp64m:
+  case X86::MOVSSrm:
+  case X86::MOVSDrm:
+  case X86::MOVAPSrm:
+  case X86::MOVAPDrm:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+    // Loads from constant pools are trivially rematerializable.
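+    // That is, the address must be a bare constant pool reference: no base
+    // register, a scale of 1, no index register, and a constant pool index
+    // as the displacement.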
+    return MI->getOperand(1).isRegister() && MI->getOperand(2).isImmediate() &&
+           MI->getOperand(3).isRegister() && MI->getOperand(4).isConstantPoolIndex() &&
+           MI->getOperand(1).getReg() == 0 &&
+           MI->getOperand(2).getImmedValue() == 1 &&
+           MI->getOperand(3).getReg() == 0;
+  }
+  // All other instructions marked M_REMATERIALIZABLE are always trivially
+  // rematerializable.
+  return true;
+}
+
+/// convertToThreeAddress - This method must be implemented by targets that
+/// set the M_CONVERTIBLE_TO_3_ADDR flag.  When this flag is set, the target
+/// may be able to convert a two-address instruction into a true
+/// three-address instruction on demand.  This allows the X86 target (for
+/// example) to convert ADD and SHL instructions into LEA instructions if they
+/// would require register copies due to two-addressness.
+///
+/// This method returns a null pointer if the transformation cannot be
+/// performed, otherwise it returns the new instruction.
+///
+MachineInstr *
+X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
+                                    MachineBasicBlock::iterator &MBBI,
+                                    LiveVariables &LV) const {
+  MachineInstr *MI = MBBI;
+  // All input instructions are two-address instructions.  Get the known operands.
+  unsigned Dest = MI->getOperand(0).getReg();
+  unsigned Src = MI->getOperand(1).getReg();
+
+  MachineInstr *NewMI = NULL;
+  // FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's.  When
+  // we have better subtarget support, enable the 16-bit LEA generation here.
+  bool DisableLEA16 = true;
+
+  switch (MI->getOpcode()) {
+  default: return 0;
+  case X86::SHUFPSrri: {
+    assert(MI->getNumOperands() == 4 && "Unknown shufps instruction!");
+    if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return 0;
+    
+    unsigned A = MI->getOperand(0).getReg();
+    unsigned B = MI->getOperand(1).getReg();
+    unsigned C = MI->getOperand(2).getReg();
+    unsigned M = MI->getOperand(3).getImm();
+    if (B != C) return 0;
+    NewMI = BuildMI(get(X86::PSHUFDri), A).addReg(B).addImm(M);
+    break;
+  }
+  case X86::SHL64ri: {
+    assert(MI->getNumOperands() == 3 && "Unknown shift instruction!");
+    // NOTE: LEA doesn't produce flags like shift does, but LLVM doesn't yet
+    // use the flags produced by a shift, so this is safe.
+    unsigned Dest = MI->getOperand(0).getReg();
+    unsigned Src = MI->getOperand(1).getReg();
+    unsigned ShAmt = MI->getOperand(2).getImm();
+    if (ShAmt == 0 || ShAmt >= 4) return 0;
+    
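+    // The shift amount becomes the LEA scale, e.g. 'shl $3, %reg' is
+    // rewritten as an LEA with a scale of 8 (1 << 3).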
+    NewMI = BuildMI(get(X86::LEA64r), Dest)
+      .addReg(0).addImm(1 << ShAmt).addReg(Src).addImm(0);
+    break;
+  }
+  case X86::SHL32ri: {
+    assert(MI->getNumOperands() == 3 && "Unknown shift instruction!");
+    // NOTE: LEA doesn't produce flags like shift does, but LLVM doesn't yet
+    // use the flags produced by a shift, so this is safe.
+    unsigned Dest = MI->getOperand(0).getReg();
+    unsigned Src = MI->getOperand(1).getReg();
+    unsigned ShAmt = MI->getOperand(2).getImm();
+    if (ShAmt == 0 || ShAmt >= 4) return 0;
+    
+    unsigned Opc = TM.getSubtarget<X86Subtarget>().is64Bit() ?
+      X86::LEA64_32r : X86::LEA32r;
+    NewMI = BuildMI(get(Opc), Dest)
+      .addReg(0).addImm(1 << ShAmt).addReg(Src).addImm(0);
+    break;
+  }
+  case X86::SHL16ri: {
+    assert(MI->getNumOperands() == 3 && "Unknown shift instruction!");
+    if (DisableLEA16) return 0;
+    
+    // NOTE: LEA doesn't produce flags like shift does, but LLVM doesn't yet
+    // use the flags produced by a shift, so this is safe.
+    unsigned Dest = MI->getOperand(0).getReg();
+    unsigned Src = MI->getOperand(1).getReg();
+    unsigned ShAmt = MI->getOperand(2).getImm();
+    if (ShAmt == 0 || ShAmt >= 4) return 0;
+    
+    NewMI = BuildMI(get(X86::LEA16r), Dest)
+      .addReg(0).addImm(1 << ShAmt).addReg(Src).addImm(0);
+    break;
+  }
+  }
+
+  // FIXME: None of these instructions are promotable to LEAs without
+  // additional information.  In particular, LEA doesn't set the flags that
+  // add and inc do.  :(
+  if (0)
+  switch (MI->getOpcode()) {
+  case X86::INC32r:
+  case X86::INC64_32r:
+    assert(MI->getNumOperands() == 2 && "Unknown inc instruction!");
+    NewMI = addRegOffset(BuildMI(get(X86::LEA32r), Dest), Src, 1);
+    break;
+  case X86::INC16r:
+  case X86::INC64_16r:
+    if (DisableLEA16) return 0;
+    assert(MI->getNumOperands() == 2 && "Unknown inc instruction!");
+    NewMI = addRegOffset(BuildMI(get(X86::LEA16r), Dest), Src, 1);
+    break;
+  case X86::DEC32r:
+  case X86::DEC64_32r:
+    assert(MI->getNumOperands() == 2 && "Unknown dec instruction!");
+    NewMI = addRegOffset(BuildMI(get(X86::LEA32r), Dest), Src, -1);
+    break;
+  case X86::DEC16r:
+  case X86::DEC64_16r:
+    if (DisableLEA16) return 0;
+    assert(MI->getNumOperands() == 2 && "Unknown dec instruction!");
+    NewMI = addRegOffset(BuildMI(get(X86::LEA16r), Dest), Src, -1);
+    break;
+  case X86::ADD32rr:
+    assert(MI->getNumOperands() == 3 && "Unknown add instruction!");
+    NewMI = addRegReg(BuildMI(get(X86::LEA32r), Dest), Src,
+                     MI->getOperand(2).getReg());
+    break;
+  case X86::ADD16rr:
+    if (DisableLEA16) return 0;
+    assert(MI->getNumOperands() == 3 && "Unknown add instruction!");
+    NewMI = addRegReg(BuildMI(get(X86::LEA16r), Dest), Src,
+                     MI->getOperand(2).getReg());
+    break;
+  case X86::ADD32ri:
+  case X86::ADD32ri8:
+    assert(MI->getNumOperands() == 3 && "Unknown add instruction!");
+    if (MI->getOperand(2).isImmediate())
+      NewMI = addRegOffset(BuildMI(get(X86::LEA32r), Dest), Src,
+                          MI->getOperand(2).getImmedValue());
+    break;
+  case X86::ADD16ri:
+  case X86::ADD16ri8:
+    if (DisableLEA16) return 0;
+    assert(MI->getNumOperands() == 3 && "Unknown add instruction!");
+    if (MI->getOperand(2).isImmediate())
+      NewMI = addRegOffset(BuildMI(get(X86::LEA16r), Dest), Src,
+                          MI->getOperand(2).getImmedValue());
+    break;
+  case X86::SHL16ri:
+    if (DisableLEA16) return 0;
+  case X86::SHL32ri:
+    assert(MI->getNumOperands() == 3 && MI->getOperand(2).isImmediate() &&
+           "Unknown shl instruction!");
+    unsigned ShAmt = MI->getOperand(2).getImmedValue();
+    if (ShAmt == 1 || ShAmt == 2 || ShAmt == 3) {
+      X86AddressMode AM;
+      AM.Scale = 1 << ShAmt;
+      AM.IndexReg = Src;
+      unsigned Opc = MI->getOpcode() == X86::SHL32ri ? X86::LEA32r :X86::LEA16r;
+      NewMI = addFullAddress(BuildMI(get(Opc), Dest), AM);
+    }
+    break;
+  }
+
+  if (NewMI) {
+    NewMI->copyKillDeadInfo(MI);
+    LV.instructionChanged(MI, NewMI);  // Update live variables
+    MFI->insert(MBBI, NewMI);          // Insert the new inst    
+  }
+  return NewMI;
+}
+
+/// commuteInstruction - We have a few instructions that must be hacked on to
+/// commute them.
+///
+MachineInstr *X86InstrInfo::commuteInstruction(MachineInstr *MI) const {
+  // FIXME: Can commute cmoves by changing the condition!
+  switch (MI->getOpcode()) {
+  case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)
+  case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)
+  case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I)
+  case X86::SHLD32rri8:{// A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I)
+    unsigned Opc;
+    unsigned Size;
+    switch (MI->getOpcode()) {
+    default: assert(0 && "Unreachable!");
+    case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break;
+    case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break;
+    case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break;
+    case X86::SHLD32rri8: Size = 32; Opc = X86::SHRD32rri8; break;
+    }
+    unsigned Amt = MI->getOperand(3).getImmedValue();
+    unsigned A = MI->getOperand(0).getReg();
+    unsigned B = MI->getOperand(1).getReg();
+    unsigned C = MI->getOperand(2).getReg();
+    bool BisKill = MI->getOperand(1).isKill();
+    bool CisKill = MI->getOperand(2).isKill();
+    return BuildMI(get(Opc), A).addReg(C, false, false, CisKill)
+      .addReg(B, false, false, BisKill).addImm(Size-Amt);
+  }
+  default:
+    return TargetInstrInfo::commuteInstruction(MI);
+  }
+}
+
+static X86::CondCode GetCondFromBranchOpc(unsigned BrOpc) {
+  switch (BrOpc) {
+  default: return X86::COND_INVALID;
+  case X86::JE:  return X86::COND_E;
+  case X86::JNE: return X86::COND_NE;
+  case X86::JL:  return X86::COND_L;
+  case X86::JLE: return X86::COND_LE;
+  case X86::JG:  return X86::COND_G;
+  case X86::JGE: return X86::COND_GE;
+  case X86::JB:  return X86::COND_B;
+  case X86::JBE: return X86::COND_BE;
+  case X86::JA:  return X86::COND_A;
+  case X86::JAE: return X86::COND_AE;
+  case X86::JS:  return X86::COND_S;
+  case X86::JNS: return X86::COND_NS;
+  case X86::JP:  return X86::COND_P;
+  case X86::JNP: return X86::COND_NP;
+  case X86::JO:  return X86::COND_O;
+  case X86::JNO: return X86::COND_NO;
+  }
+}
+
+unsigned X86::GetCondBranchFromCond(X86::CondCode CC) {
+  switch (CC) {
+  default: assert(0 && "Illegal condition code!");
+  case X86::COND_E:  return X86::JE;
+  case X86::COND_NE: return X86::JNE;
+  case X86::COND_L:  return X86::JL;
+  case X86::COND_LE: return X86::JLE;
+  case X86::COND_G:  return X86::JG;
+  case X86::COND_GE: return X86::JGE;
+  case X86::COND_B:  return X86::JB;
+  case X86::COND_BE: return X86::JBE;
+  case X86::COND_A:  return X86::JA;
+  case X86::COND_AE: return X86::JAE;
+  case X86::COND_S:  return X86::JS;
+  case X86::COND_NS: return X86::JNS;
+  case X86::COND_P:  return X86::JP;
+  case X86::COND_NP: return X86::JNP;
+  case X86::COND_O:  return X86::JO;
+  case X86::COND_NO: return X86::JNO;
+  }
+}
+
+/// GetOppositeBranchCondition - Return the inverse of the specified condition,
+/// e.g. turning COND_E to COND_NE.
+X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
+  switch (CC) {
+  default: assert(0 && "Illegal condition code!");
+  case X86::COND_E:  return X86::COND_NE;
+  case X86::COND_NE: return X86::COND_E;
+  case X86::COND_L:  return X86::COND_GE;
+  case X86::COND_LE: return X86::COND_G;
+  case X86::COND_G:  return X86::COND_LE;
+  case X86::COND_GE: return X86::COND_L;
+  case X86::COND_B:  return X86::COND_AE;
+  case X86::COND_BE: return X86::COND_A;
+  case X86::COND_A:  return X86::COND_BE;
+  case X86::COND_AE: return X86::COND_B;
+  case X86::COND_S:  return X86::COND_NS;
+  case X86::COND_NS: return X86::COND_S;
+  case X86::COND_P:  return X86::COND_NP;
+  case X86::COND_NP: return X86::COND_P;
+  case X86::COND_O:  return X86::COND_NO;
+  case X86::COND_NO: return X86::COND_O;
+  }
+}
+
+// For purposes of branch analysis do not count FP_REG_KILL as a terminator.
+bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
+  if (MI->getOpcode() == X86::FP_REG_KILL)
+    return false;
+
+  const TargetInstrDescriptor *TID = MI->getInstrDescriptor();
+  if (TID->Flags & M_TERMINATOR_FLAG) {
+    // Conditional branch is a special case.
+    if ((TID->Flags & M_BRANCH_FLAG) != 0 && (TID->Flags & M_BARRIER_FLAG) == 0)
+      return true;
+    if ((TID->Flags & M_PREDICABLE) == 0)
+      return true;
+    return !isPredicated(MI);
+  }
+  return false;
+}
+
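+/// AnalyzeBranch - Examine the terminators of MBB.  Following the convention
+/// used in the body below, returning true means the branch sequence could not
+/// be analyzed; returning false means TBB, FBB and Cond describe the block's
+/// branches.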
+bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, 
+                                 MachineBasicBlock *&TBB,
+                                 MachineBasicBlock *&FBB,
+                                 std::vector<MachineOperand> &Cond) const {
+  // If the block has no terminators, it just falls into the block after it.
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin() || !isUnpredicatedTerminator(--I))
+    return false;
+
+  // Get the last instruction in the block.
+  MachineInstr *LastInst = I;
+  
+  // If there is only one terminator instruction, process it.
+  if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+    if (!isBranch(LastInst->getOpcode()))
+      return true;
+    
+    // If the block ends with a branch there are 3 possibilities:
+    // it's an unconditional, conditional, or indirect branch.
+    
+    if (LastInst->getOpcode() == X86::JMP) {
+      TBB = LastInst->getOperand(0).getMachineBasicBlock();
+      return false;
+    }
+    X86::CondCode BranchCode = GetCondFromBranchOpc(LastInst->getOpcode());
+    if (BranchCode == X86::COND_INVALID)
+      return true;  // Can't handle indirect branch.
+
+    // Otherwise, block ends with fall-through condbranch.
+    TBB = LastInst->getOperand(0).getMachineBasicBlock();
+    Cond.push_back(MachineOperand::CreateImm(BranchCode));
+    return false;
+  }
+  
+  // Get the instruction before it if it's a terminator.
+  MachineInstr *SecondLastInst = I;
+  
+  // If there are three terminators, we don't know what sort of block this is.
+  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I))
+    return true;
+
+  // If the block ends with X86::JMP and a conditional branch, handle it.
+  X86::CondCode BranchCode = GetCondFromBranchOpc(SecondLastInst->getOpcode());
+  if (BranchCode != X86::COND_INVALID && LastInst->getOpcode() == X86::JMP) {
+    TBB = SecondLastInst->getOperand(0).getMachineBasicBlock();
+    Cond.push_back(MachineOperand::CreateImm(BranchCode));
+    FBB = LastInst->getOperand(0).getMachineBasicBlock();
+    return false;
+  }
+
+  // If the block ends with two X86::JMPs, handle it.  The second one is not
+  // executed, so remove it.
+  if (SecondLastInst->getOpcode() == X86::JMP && 
+      LastInst->getOpcode() == X86::JMP) {
+    TBB = SecondLastInst->getOperand(0).getMachineBasicBlock();
+    I = LastInst;
+    I->eraseFromParent();
+    return false;
+  }
+
+  // Otherwise, can't handle this.
+  return true;
+}
+
+unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin()) return 0;
+  --I;
+  if (I->getOpcode() != X86::JMP && 
+      GetCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
+    return 0;
+  
+  // Remove the branch.
+  I->eraseFromParent();
+  
+  I = MBB.end();
+  
+  if (I == MBB.begin()) return 1;
+  --I;
+  if (GetCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
+    return 1;
+  
+  // Remove the branch.
+  I->eraseFromParent();
+  return 2;
+}
+
+unsigned
+X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                           MachineBasicBlock *FBB,
+                           const std::vector<MachineOperand> &Cond) const {
+  // Shouldn't be a fall through.
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 1 || Cond.size() == 0) &&
+         "X86 branch conditions have one component!");
+
+  if (FBB == 0) { // One way branch.
+    if (Cond.empty()) {
+      // Unconditional branch?
+      BuildMI(&MBB, get(X86::JMP)).addMBB(TBB);
+    } else {
+      // Conditional branch.
+      unsigned Opc = GetCondBranchFromCond((X86::CondCode)Cond[0].getImm());
+      BuildMI(&MBB, get(Opc)).addMBB(TBB);
+    }
+    return 1;
+  }
+  
+  // Two-way Conditional branch.
+  unsigned Opc = GetCondBranchFromCond((X86::CondCode)Cond[0].getImm());
+  BuildMI(&MBB, get(Opc)).addMBB(TBB);
+  BuildMI(&MBB, get(X86::JMP)).addMBB(FBB);
+  return 2;
+}
+
+bool X86InstrInfo::BlockHasNoFallThrough(MachineBasicBlock &MBB) const {
+  if (MBB.empty()) return false;
+  
+  switch (MBB.back().getOpcode()) {
+  case X86::RET:     // Return.
+  case X86::RETI:
+  case X86::TAILJMPd:
+  case X86::TAILJMPr:
+  case X86::TAILJMPm:
+  case X86::JMP:     // Uncond branch.
+  case X86::JMP32r:  // Indirect branch.
+  case X86::JMP32m:  // Indirect branch through mem.
+    return true;
+  default: return false;
+  }
+}
+
+bool X86InstrInfo::
+ReverseBranchCondition(std::vector<MachineOperand> &Cond) const {
+  assert(Cond.size() == 1 && "Invalid X86 branch condition!");
+  Cond[0].setImm(GetOppositeBranchCondition((X86::CondCode)Cond[0].getImm()));
+  return false;
+}
+
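+// For illustration only: a minimal sketch of how a generic client of the
+// branch analysis hooks above (a branch-folding style pass, say) could swap
+// which successor gets the conditional branch and which gets the
+// unconditional one.  This hypothetical helper is not used elsewhere.
+static bool ReverseTwoWayBranch(const X86InstrInfo &TII,
+                                MachineBasicBlock &MBB) {
+  MachineBasicBlock *TBB = 0, *FBB = 0;
+  std::vector<MachineOperand> Cond;
+  if (TII.AnalyzeBranch(MBB, TBB, FBB, Cond))
+    return false;                   // Block ends in something unanalyzable.
+  if (Cond.empty() || FBB == 0)
+    return false;                   // Not a two-way conditional branch.
+  if (TII.ReverseBranchCondition(Cond))
+    return false;                   // Condition cannot be reversed.
+  TII.RemoveBranch(MBB);
+  TII.InsertBranch(MBB, FBB, TBB, Cond);  // Conditionally branch to old FBB.
+  return true;
+}
+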
+const TargetRegisterClass *X86InstrInfo::getPointerRegClass() const {
+  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
+  if (Subtarget->is64Bit())
+    return &X86::GR64RegClass;
+  else
+    return &X86::GR32RegClass;
+}
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
new file mode 100644
index 0000000..ec30cc7
--- /dev/null
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -0,0 +1,287 @@
+//===- X86InstrInfo.h - X86 Instruction Information ------------*- C++ -*- ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86INSTRUCTIONINFO_H
+#define X86INSTRUCTIONINFO_H
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "X86RegisterInfo.h"
+
+namespace llvm {
+  class X86RegisterInfo;
+  class X86TargetMachine;
+
+namespace X86 {
+  // X86 specific condition codes. These correspond to the X86_COND_* pattern
+  // leaves in X86InstrInfo.td and must be kept in sync with them.
+  enum CondCode {
+    COND_A  = 0,
+    COND_AE = 1,
+    COND_B  = 2,
+    COND_BE = 3,
+    COND_E  = 4,
+    COND_G  = 5,
+    COND_GE = 6,
+    COND_L  = 7,
+    COND_LE = 8,
+    COND_NE = 9,
+    COND_NO = 10,
+    COND_NP = 11,
+    COND_NS = 12,
+    COND_O  = 13,
+    COND_P  = 14,
+    COND_S  = 15,
+    COND_INVALID
+  };
+  
+  // Turn condition code into conditional branch opcode.
+  unsigned GetCondBranchFromCond(CondCode CC);
+  
+  /// GetOppositeBranchCondition - Return the inverse of the specified cond,
+  /// e.g. turning COND_E to COND_NE.
+  CondCode GetOppositeBranchCondition(X86::CondCode CC);
+
+}
+  
+/// X86II - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace X86II {
+  enum {
+    //===------------------------------------------------------------------===//
+    // Instruction types.  These are the standard/most common forms for X86
+    // instructions.
+    //
+
+    // Pseudo - This represents an instruction that is a pseudo instruction
+    // or one that has not been implemented yet.  It is illegal to code generate
+    // it, but tolerated for intermediate implementation stages.
+    Pseudo         = 0,
+
+    /// Raw - This form is for instructions that don't have any operands, so
+    /// they are just a fixed opcode value, like 'leave'.
+    RawFrm         = 1,
+
+    /// AddRegFrm - This form is used for instructions like 'push r32' that have
+    /// their one register operand added to their opcode.
+    AddRegFrm      = 2,
+
+    /// MRMDestReg - This form is used for instructions that use the Mod/RM byte
+    /// to specify a destination, which in this case is a register.
+    ///
+    MRMDestReg     = 3,
+
+    /// MRMDestMem - This form is used for instructions that use the Mod/RM byte
+    /// to specify a destination, which in this case is memory.
+    ///
+    MRMDestMem     = 4,
+
+    /// MRMSrcReg - This form is used for instructions that use the Mod/RM byte
+    /// to specify a source, which in this case is a register.
+    ///
+    MRMSrcReg      = 5,
+
+    /// MRMSrcMem - This form is used for instructions that use the Mod/RM byte
+    /// to specify a source, which in this case is memory.
+    ///
+    MRMSrcMem      = 6,
+
+    /// MRM[0-7][rm] - These forms are used to represent instructions that use
+    /// a Mod/RM byte, and use the middle field to hold extended opcode
+    /// information.  In the intel manual these are represented as /0, /1, ...
+    ///
+
+    // First, instructions that operate on a register r/m operand...
+    MRM0r = 16,  MRM1r = 17,  MRM2r = 18,  MRM3r = 19, // Format /0 /1 /2 /3
+    MRM4r = 20,  MRM5r = 21,  MRM6r = 22,  MRM7r = 23, // Format /4 /5 /6 /7
+
+    // Next, instructions that operate on a memory r/m operand...
+    MRM0m = 24,  MRM1m = 25,  MRM2m = 26,  MRM3m = 27, // Format /0 /1 /2 /3
+    MRM4m = 28,  MRM5m = 29,  MRM6m = 30,  MRM7m = 31, // Format /4 /5 /6 /7
+
+    // MRMInitReg - This form is used for instructions whose source and
+    // destinations are the same register.
+    MRMInitReg = 32,
+
+    FormMask       = 63,
+
+    //===------------------------------------------------------------------===//
+    // Actual flags...
+
+    // OpSize - Set if this instruction requires an operand size prefix (0x66),
+    // which most often indicates that the instruction operates on 16 bit data
+    // instead of 32 bit data.
+    OpSize      = 1 << 6,
+
+    // AdSize - Set if this instruction requires an address size prefix (0x67),
+    // which most often indicates that the instruction addresses 16 bit
+    // addresses instead of 32 bit addresses (or 32 bit in 64 bit mode).
+    AdSize      = 1 << 7,
+
+    //===------------------------------------------------------------------===//
+    // Op0Mask - There are several prefix bytes that are used to form two byte
+    // opcodes.  These are currently 0x0F, 0xF3, and 0xD8-0xDF.  This mask is
+    // used to obtain the setting of this field.  If no bits in this field are
+    // set, there is no prefix byte for obtaining a multibyte opcode.
+    //
+    Op0Shift    = 8,
+    Op0Mask     = 0xF << Op0Shift,
+
+    // TB - TwoByte - Set if this instruction has a two byte opcode, which
+    // starts with a 0x0F byte before the real opcode.
+    TB          = 1 << Op0Shift,
+
+    // REP - The 0xF3 prefix byte indicating repetition of the following
+    // instruction.
+    REP         = 2 << Op0Shift,
+
+    // D8-DF - These escape opcodes are used by the floating point unit.  These
+    // values must remain sequential.
+    D8 = 3 << Op0Shift,   D9 = 4 << Op0Shift,
+    DA = 5 << Op0Shift,   DB = 6 << Op0Shift,
+    DC = 7 << Op0Shift,   DD = 8 << Op0Shift,
+    DE = 9 << Op0Shift,   DF = 10 << Op0Shift,
+
+    // XS, XD - These prefix codes are for single and double precision scalar
+    // floating point operations performed in the SSE registers.
+    XD = 11 << Op0Shift,  XS = 12 << Op0Shift,
+
+    // T8, TA - Prefix after the 0x0F prefix.
+    T8 = 13 << Op0Shift,  TA = 14 << Op0Shift,
+
+    //===------------------------------------------------------------------===//
+    // REX_W - REX prefixes are instruction prefixes used in 64-bit mode.
+    // They are used to specify GPRs and SSE registers, 64-bit operand size,
+    // etc. We only care about the REX.W and REX.R bits, and only the former
+    // is statically determined.
+    //
+    REXShift    = 12,
+    REX_W       = 1 << REXShift,
+
+    //===------------------------------------------------------------------===//
+    // This three-bit field describes the size of an immediate operand.  Zero is
+    // unused so that we can tell if we forgot to set a value.
+    ImmShift = 13,
+    ImmMask  = 7 << ImmShift,
+    Imm8     = 1 << ImmShift,
+    Imm16    = 2 << ImmShift,
+    Imm32    = 3 << ImmShift,
+    Imm64    = 4 << ImmShift,
+
+    //===------------------------------------------------------------------===//
+    // FP Instruction Classification...  Zero is non-fp instruction.
+
+    // FPTypeMask - Mask for all of the FP types...
+    FPTypeShift = 16,
+    FPTypeMask  = 7 << FPTypeShift,
+
+    // NotFP - The default, set for instructions that do not use FP registers.
+    NotFP      = 0 << FPTypeShift,
+
+    // ZeroArgFP - 0 arg FP instruction which implicitly pushes ST(0), e.g. fld0
+    ZeroArgFP  = 1 << FPTypeShift,
+
+    // OneArgFP - 1 arg FP instructions which implicitly read ST(0), such as fst
+    OneArgFP   = 2 << FPTypeShift,
+
+    // OneArgFPRW - 1 arg FP instruction which implicitly reads ST(0) and
+    // writes a result back to ST(0).  For example, fcos, fsqrt, etc.
+    //
+    OneArgFPRW = 3 << FPTypeShift,
+
+    // TwoArgFP - 2 arg FP instructions which implicitly read ST(0), and an
+    // explicit argument, storing the result to either ST(0) or the implicit
+    // argument.  For example: fadd, fsub, fmul, etc...
+    TwoArgFP   = 4 << FPTypeShift,
+
+    // CompareFP - 2 arg FP instructions which implicitly read ST(0) and an
+    // explicit argument, but have no destination.  Example: fucom, fucomi, ...
+    CompareFP  = 5 << FPTypeShift,
+
+    // CondMovFP - "2 operand" floating point conditional move instructions.
+    CondMovFP  = 6 << FPTypeShift,
+
+    // SpecialFP - Special instruction forms.  Dispatch by opcode explicitly.
+    SpecialFP  = 7 << FPTypeShift,
+
+    // Bits 19 -> 23 are unused
+    OpcodeShift   = 24,
+    OpcodeMask    = 0xFF << OpcodeShift
+  };
+}
+
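+// For illustration only: hypothetical helpers (not part of the backend's API)
+// showing how the X86II bit fields above are meant to be decoded from an
+// instruction's TSFlags word, in the same way getBaseOpcodeFor below extracts
+// the opcode.
+static inline unsigned getX86InstrForm(const TargetInstrDescriptor *TID) {
+  return TID->TSFlags & X86II::FormMask;     // e.g. X86II::MRMSrcMem
+}
+static inline unsigned getX86InstrFPType(const TargetInstrDescriptor *TID) {
+  return TID->TSFlags & X86II::FPTypeMask;   // e.g. X86II::TwoArgFP
+}
+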
+class X86InstrInfo : public TargetInstrInfo {
+  X86TargetMachine &TM;
+  const X86RegisterInfo RI;
+public:
+  X86InstrInfo(X86TargetMachine &tm);
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegisterInfo.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  virtual const MRegisterInfo &getRegisterInfo() const { return RI; }
+
+  // Return true if the instruction is a register to register move and
+  // leave the source and dest operands in the passed parameters.
+  //
+  bool isMoveInstr(const MachineInstr& MI, unsigned& sourceReg,
+                   unsigned& destReg) const;
+  unsigned isLoadFromStackSlot(MachineInstr *MI, int &FrameIndex) const;
+  unsigned isStoreToStackSlot(MachineInstr *MI, int &FrameIndex) const;
+  bool isReallyTriviallyReMaterializable(MachineInstr *MI) const;
+  
+  /// convertToThreeAddress - This method must be implemented by targets that
+  /// set the M_CONVERTIBLE_TO_3_ADDR flag.  When this flag is set, the target
+  /// may be able to convert a two-address instruction into a true
+  /// three-address instruction on demand.  This allows the X86 target (for
+  /// example) to convert ADD and SHL instructions into LEA instructions if they
+  /// would require register copies due to two-addressness.
+  ///
+  /// This method returns a null pointer if the transformation cannot be
+  /// performed, otherwise it returns the new instruction.
+  ///
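+  /// For illustration, a rough sketch of the effect (not exact machine
+  /// instruction syntax): a two-address add whose first input is still live
+  /// would otherwise force a register copy,
+  ///
+  ///     t3 = ADD32rr t1, t2          ; two-address form would clobber t1
+  ///
+  /// but it can instead become an LEA that reads both inputs and writes a
+  /// fresh destination:
+  ///
+  ///     t3 = LEA32r t1, 1, t2, 0     ; base t1, scale 1, index t2, disp 0
+  ///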
+  virtual MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
+                                              MachineBasicBlock::iterator &MBBI,
+                                              LiveVariables &LV) const;
+
+  /// commuteInstruction - We have a few instructions that must be hacked on to
+  /// commute them.
+  ///
+  virtual MachineInstr *commuteInstruction(MachineInstr *MI) const;
+
+  // Branch analysis.
+  virtual bool isUnpredicatedTerminator(const MachineInstr* MI) const;
+  virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                             MachineBasicBlock *&FBB,
+                             std::vector<MachineOperand> &Cond) const;
+  virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                                MachineBasicBlock *FBB,
+                                const std::vector<MachineOperand> &Cond) const;
+  virtual bool BlockHasNoFallThrough(MachineBasicBlock &MBB) const;
+  virtual bool ReverseBranchCondition(std::vector<MachineOperand> &Cond) const;
+
+  const TargetRegisterClass *getPointerRegClass() const;
+
+  // getBaseOpcodeFor - This function returns the "base" X86 opcode for the
+  // specified opcode number.
+  //
+  unsigned char getBaseOpcodeFor(const TargetInstrDescriptor *TID) const {
+    return TID->TSFlags >> X86II::OpcodeShift;
+  }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
new file mode 100644
index 0000000..b24f644
--- /dev/null
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -0,0 +1,2674 @@
+//===- X86InstrInfo.td - Describe the X86 Instruction Set -------*- C++ -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 instruction set, defining the instructions, and
+// properties of the instructions which are needed for code generation, machine
+// code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// X86 specific DAG Nodes.
+//
+
+def SDTIntShiftDOp: SDTypeProfile<1, 3,
+                                  [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+                                   SDTCisInt<0>, SDTCisInt<3>]>;
+
+def SDTX86CmpTest : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
+
+def SDTX86Cmov    : SDTypeProfile<1, 3,
+                                  [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
+                                   SDTCisVT<3, i8>]>;
+
+def SDTX86BrCond  : SDTypeProfile<0, 2,
+                                  [SDTCisVT<0, OtherVT>, SDTCisVT<1, i8>]>;
+
+def SDTX86SetCC   : SDTypeProfile<1, 1,
+                                  [SDTCisVT<0, i8>, SDTCisVT<1, i8>]>;
+
+def SDTX86Ret     : SDTypeProfile<0, 1, [SDTCisVT<0, i16>]>;
+
+def SDT_X86CallSeqStart : SDTypeProfile<0, 1, [ SDTCisVT<0, i32> ]>;
+def SDT_X86CallSeqEnd   : SDTypeProfile<0, 2, [ SDTCisVT<0, i32>,
+                                                SDTCisVT<1, i32> ]>;
+
+def SDT_X86Call   : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>;
+
+def SDTX86RepStr  : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>;
+
+def SDTX86RdTsc   : SDTypeProfile<0, 0, []>;
+
+def SDTX86Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+
+def SDT_X86TLSADDR : SDTypeProfile<1, 1, [SDTCisPtrTy<0>, SDTCisInt<1>]>;
+
+def SDT_X86TLSTP : SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>;
+
+def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def X86shld    : SDNode<"X86ISD::SHLD",     SDTIntShiftDOp>;
+def X86shrd    : SDNode<"X86ISD::SHRD",     SDTIntShiftDOp>;
+
+def X86cmp     : SDNode<"X86ISD::CMP" ,     SDTX86CmpTest,
+                        [SDNPHasChain, SDNPOutFlag]>;
+
+def X86cmov    : SDNode<"X86ISD::CMOV",     SDTX86Cmov,    
+                        [SDNPInFlag, SDNPOutFlag]>;
+def X86brcond  : SDNode<"X86ISD::BRCOND",   SDTX86BrCond,
+                        [SDNPHasChain, SDNPInFlag]>;
+def X86setcc   : SDNode<"X86ISD::SETCC",    SDTX86SetCC,
+                        [SDNPInFlag, SDNPOutFlag]>;
+
+def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret,
+                        [SDNPHasChain, SDNPOptInFlag]>;
+
+def X86callseq_start :
+                 SDNode<"ISD::CALLSEQ_START", SDT_X86CallSeqStart,
+                        [SDNPHasChain, SDNPOutFlag]>;
+def X86callseq_end :
+                 SDNode<"ISD::CALLSEQ_END",   SDT_X86CallSeqEnd,
+                        [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;
+
+def X86call    : SDNode<"X86ISD::CALL",     SDT_X86Call,
+                        [SDNPHasChain, SDNPOutFlag, SDNPOptInFlag]>;
+
+def X86tailcall: SDNode<"X86ISD::TAILCALL",     SDT_X86Call,
+                        [SDNPHasChain, SDNPOutFlag, SDNPOptInFlag]>;
+
+def X86rep_stos: SDNode<"X86ISD::REP_STOS", SDTX86RepStr,
+                        [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;
+def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr,
+                        [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;
+
+def X86rdtsc   : SDNode<"X86ISD::RDTSC_DAG",SDTX86RdTsc,
+                        [SDNPHasChain, SDNPOutFlag]>;
+
+def X86Wrapper    : SDNode<"X86ISD::Wrapper",     SDTX86Wrapper>;
+def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP",  SDTX86Wrapper>;
+
+def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR,
+                        [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;
+def X86TLStp : SDNode<"X86ISD::THREAD_POINTER", SDT_X86TLSTP, []>;
+
+def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET,
+                        [SDNPHasChain]>;
+
+
+//===----------------------------------------------------------------------===//
+// X86 Operand Definitions.
+//
+
+// *mem - Operand definitions for the funky X86 addressing mode operands.
+//
+class X86MemOperand<string printMethod> : Operand<iPTR> {
+  let PrintMethod = printMethod;
+  let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc, i32imm);
+}
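+
+// The four sub-operands above encode the x86 addressing mode
+// "base + scale*index + disp": a base register, a scale immediate, an index
+// register, and a displacement immediate.  They are filled in by the 'addr'
+// complex pattern (SelectAddr) defined below.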
+
+def i8mem   : X86MemOperand<"printi8mem">;
+def i16mem  : X86MemOperand<"printi16mem">;
+def i32mem  : X86MemOperand<"printi32mem">;
+def i64mem  : X86MemOperand<"printi64mem">;
+def i128mem : X86MemOperand<"printi128mem">;
+def f32mem  : X86MemOperand<"printf32mem">;
+def f64mem  : X86MemOperand<"printf64mem">;
+def f128mem : X86MemOperand<"printf128mem">;
+
+def lea32mem : Operand<i32> {
+  let PrintMethod = "printi32mem";
+  let MIOperandInfo = (ops GR32, i8imm, GR32, i32imm);
+}
+
+def SSECC : Operand<i8> {
+  let PrintMethod = "printSSECC";
+}
+
+def piclabel: Operand<i32> {
+  let PrintMethod = "printPICLabel";
+}
+
+// A couple of more descriptive operand definitions.
+// 16 bits, but only 8 bits are significant.
+def i16i8imm  : Operand<i16>;
+// 32 bits, but only 8 bits are significant.
+def i32i8imm  : Operand<i32>;
+
+// Branch targets have OtherVT type.
+def brtarget : Operand<OtherVT>;
+
+//===----------------------------------------------------------------------===//
+// X86 Complex Pattern Definitions.
+//
+
+// Define X86 specific addressing mode.
+def addr      : ComplexPattern<iPTR, 4, "SelectAddr", [], []>;
+def lea32addr : ComplexPattern<i32, 4, "SelectLEAAddr",
+                               [add, mul, shl, or, frameindex], []>;
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Format Definitions.
+//
+
+// Format specifies the encoding used by the instruction.  This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<6> val> {
+  bits<6> Value = val;
+}
+
+def Pseudo     : Format<0>; def RawFrm     : Format<1>;
+def AddRegFrm  : Format<2>; def MRMDestReg : Format<3>;
+def MRMDestMem : Format<4>; def MRMSrcReg  : Format<5>;
+def MRMSrcMem  : Format<6>;
+def MRM0r  : Format<16>; def MRM1r  : Format<17>; def MRM2r  : Format<18>;
+def MRM3r  : Format<19>; def MRM4r  : Format<20>; def MRM5r  : Format<21>;
+def MRM6r  : Format<22>; def MRM7r  : Format<23>;
+def MRM0m  : Format<24>; def MRM1m  : Format<25>; def MRM2m  : Format<26>;
+def MRM3m  : Format<27>; def MRM4m  : Format<28>; def MRM5m  : Format<29>;
+def MRM6m  : Format<30>; def MRM7m  : Format<31>;
+def MRMInitReg : Format<32>;
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Predicate Definitions.
+def HasMMX       : Predicate<"Subtarget->hasMMX()">;
+def HasSSE1      : Predicate<"Subtarget->hasSSE1()">;
+def HasSSE2      : Predicate<"Subtarget->hasSSE2()">;
+def HasSSE3      : Predicate<"Subtarget->hasSSE3()">;
+def HasSSSE3     : Predicate<"Subtarget->hasSSSE3()">;
+def FPStack      : Predicate<"!Subtarget->hasSSE2()">;
+def In32BitMode  : Predicate<"!Subtarget->is64Bit()">;
+def In64BitMode  : Predicate<"Subtarget->is64Bit()">;
+def SmallCode    : Predicate<"TM.getCodeModel() == CodeModel::Small">;
+def NotSmallCode : Predicate<"TM.getCodeModel() != CodeModel::Small">;
+def IsStatic     : Predicate<"TM.getRelocationModel() == Reloc::Static">;
+
+//===----------------------------------------------------------------------===//
+// X86 specific pattern fragments.
+//
+
+// ImmType - This specifies the immediate type used by an instruction. This is
+// part of the ad-hoc solution used to emit machine instruction encodings by our
+// machine code emitter.
+class ImmType<bits<3> val> {
+  bits<3> Value = val;
+}
+def NoImm  : ImmType<0>;
+def Imm8   : ImmType<1>;
+def Imm16  : ImmType<2>;
+def Imm32  : ImmType<3>;
+def Imm64  : ImmType<4>;
+
+// FPFormat - This specifies what form this FP instruction has.  This is used by
+// the Floating-Point stackifier pass.
+class FPFormat<bits<3> val> {
+  bits<3> Value = val;
+}
+def NotFP      : FPFormat<0>;
+def ZeroArgFP  : FPFormat<1>;
+def OneArgFP   : FPFormat<2>;
+def OneArgFPRW : FPFormat<3>;
+def TwoArgFP   : FPFormat<4>;
+def CompareFP  : FPFormat<5>;
+def CondMovFP  : FPFormat<6>;
+def SpecialFP  : FPFormat<7>;
+
+
+class X86Inst<bits<8> opcod, Format f, ImmType i, dag ops, string AsmStr>
+  : Instruction {
+  let Namespace = "X86";
+
+  bits<8> Opcode = opcod;
+  Format Form = f;
+  bits<6> FormBits = Form.Value;
+  ImmType ImmT = i;
+  bits<3> ImmTypeBits = ImmT.Value;
+
+  dag OperandList = ops;
+  string AsmString = AsmStr;
+
+  //
+  // Attributes specific to X86 instructions...
+  //
+  bit hasOpSizePrefix = 0;  // Does this inst have a 0x66 prefix?
+  bit hasAdSizePrefix = 0;  // Does this inst have a 0x67 prefix?
+
+  bits<4> Prefix = 0;       // Which prefix byte does this inst have?
+  bit hasREX_WPrefix  = 0;  // Does this inst require the REX.W prefix?
+  FPFormat FPForm;          // What flavor of FP instruction is this?
+  bits<3> FPFormBits = 0;
+}
+
+
+// Prefix byte classes which are used to indicate to the ad-hoc machine code
+// emitter that various prefix bytes are required.
+class OpSize { bit hasOpSizePrefix = 1; }
+class AdSize { bit hasAdSizePrefix = 1; }
+class REX_W  { bit hasREX_WPrefix = 1; }
+class TB     { bits<4> Prefix = 1; }
+class REP    { bits<4> Prefix = 2; }
+class D8     { bits<4> Prefix = 3; }
+class D9     { bits<4> Prefix = 4; }
+class DA     { bits<4> Prefix = 5; }
+class DB     { bits<4> Prefix = 6; }
+class DC     { bits<4> Prefix = 7; }
+class DD     { bits<4> Prefix = 8; }
+class DE     { bits<4> Prefix = 9; }
+class DF     { bits<4> Prefix = 10; }
+class XD     { bits<4> Prefix = 11; }
+class XS     { bits<4> Prefix = 12; }
+class T8     { bits<4> Prefix = 13; }
+class TA     { bits<4> Prefix = 14; }
+
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments...
+//
+
+// X86 specific condition codes. These correspond to CondCode in
+// X86InstrInfo.h. They must be kept in sync.
+def X86_COND_A   : PatLeaf<(i8 0)>;
+def X86_COND_AE  : PatLeaf<(i8 1)>;
+def X86_COND_B   : PatLeaf<(i8 2)>;
+def X86_COND_BE  : PatLeaf<(i8 3)>;
+def X86_COND_E   : PatLeaf<(i8 4)>;
+def X86_COND_G   : PatLeaf<(i8 5)>;
+def X86_COND_GE  : PatLeaf<(i8 6)>;
+def X86_COND_L   : PatLeaf<(i8 7)>;
+def X86_COND_LE  : PatLeaf<(i8 8)>;
+def X86_COND_NE  : PatLeaf<(i8 9)>;
+def X86_COND_NO  : PatLeaf<(i8 10)>;
+def X86_COND_NP  : PatLeaf<(i8 11)>;
+def X86_COND_NS  : PatLeaf<(i8 12)>;
+def X86_COND_O   : PatLeaf<(i8 13)>;
+def X86_COND_P   : PatLeaf<(i8 14)>;
+def X86_COND_S   : PatLeaf<(i8 15)>;
+
+def i16immSExt8  : PatLeaf<(i16 imm), [{
+  // i16immSExt8 predicate - True if the 16-bit immediate fits in an 8-bit
+  // sign-extended field.
+  return (int16_t)N->getValue() == (int8_t)N->getValue();
+}]>;
+
+def i32immSExt8  : PatLeaf<(i32 imm), [{
+  // i32immSExt8 predicate - True if the 32-bit immediate fits in an 8-bit
+  // sign-extended field.
+  return (int32_t)N->getValue() == (int8_t)N->getValue();
+}]>;
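+
+// For example, an add of -1 or 127 can use the short sign-extended imm8
+// encoding, while an add of 128 or 4080 cannot: only values in the range
+// [-128, 127] can be recovered by sign-extending their low 8 bits.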
+
+// Helper fragments for loads.
+def loadi8  : PatFrag<(ops node:$ptr), (i8  (load node:$ptr))>;
+def loadi16 : PatFrag<(ops node:$ptr), (i16 (load node:$ptr))>;
+def loadi32 : PatFrag<(ops node:$ptr), (i32 (load node:$ptr))>;
+def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>;
+
+def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>;
+def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>;
+
+def sextloadi16i1  : PatFrag<(ops node:$ptr), (i16 (sextloadi1 node:$ptr))>;
+def sextloadi32i1  : PatFrag<(ops node:$ptr), (i32 (sextloadi1 node:$ptr))>;
+def sextloadi16i8  : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>;
+def sextloadi32i8  : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>;
+def sextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (sextloadi16 node:$ptr))>;
+
+def zextloadi8i1   : PatFrag<(ops node:$ptr), (i8  (zextloadi1 node:$ptr))>;
+def zextloadi16i1  : PatFrag<(ops node:$ptr), (i16 (zextloadi1 node:$ptr))>;
+def zextloadi32i1  : PatFrag<(ops node:$ptr), (i32 (zextloadi1 node:$ptr))>;
+def zextloadi16i8  : PatFrag<(ops node:$ptr), (i16 (zextloadi8 node:$ptr))>;
+def zextloadi32i8  : PatFrag<(ops node:$ptr), (i32 (zextloadi8 node:$ptr))>;
+def zextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (zextloadi16 node:$ptr))>;
+
+def extloadi8i1    : PatFrag<(ops node:$ptr), (i8  (extloadi1 node:$ptr))>;
+def extloadi16i1   : PatFrag<(ops node:$ptr), (i16 (extloadi1 node:$ptr))>;
+def extloadi32i1   : PatFrag<(ops node:$ptr), (i32 (extloadi1 node:$ptr))>;
+def extloadi16i8   : PatFrag<(ops node:$ptr), (i16 (extloadi8 node:$ptr))>;
+def extloadi32i8   : PatFrag<(ops node:$ptr), (i32 (extloadi8 node:$ptr))>;
+def extloadi32i16  : PatFrag<(ops node:$ptr), (i32 (extloadi16 node:$ptr))>;
+
+//===----------------------------------------------------------------------===//
+// Instruction templates...
+//
+
+class I<bits<8> o, Format f, dag ops, string asm, list<dag> pattern>
+  : X86Inst<o, f, NoImm, ops, asm> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+class Ii8 <bits<8> o, Format f, dag ops, string asm, list<dag> pattern>
+  : X86Inst<o, f, Imm8 , ops, asm> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+class Ii16<bits<8> o, Format f, dag ops, string asm, list<dag> pattern>
+  : X86Inst<o, f, Imm16, ops, asm> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+class Ii32<bits<8> o, Format f, dag ops, string asm, list<dag> pattern>
+  : X86Inst<o, f, Imm32, ops, asm> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
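+
+// For example, MOV32ri below is defined as Ii32<0xB8, AddRegFrm, ...>: opcode
+// byte 0xB8, the destination register added into the opcode (AddRegFrm), and
+// a 32-bit immediate (Ii32), matching the classic "mov r32, imm32" encoding.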
+
+//===----------------------------------------------------------------------===//
+// Instruction list...
+//
+
+// ADJCALLSTACKDOWN/UP implicitly use/def ESP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+def ADJCALLSTACKDOWN : I<0, Pseudo, (ops i32imm:$amt), "#ADJCALLSTACKDOWN",
+                         [(X86callseq_start imm:$amt)]>, Imp<[ESP],[ESP]>;
+def ADJCALLSTACKUP   : I<0, Pseudo, (ops i32imm:$amt1, i32imm:$amt2),
+                         "#ADJCALLSTACKUP",
+                         [(X86callseq_end imm:$amt1, imm:$amt2)]>,
+                         Imp<[ESP],[ESP]>;
+def IMPLICIT_USE     : I<0, Pseudo, (ops variable_ops), "#IMPLICIT_USE", []>;
+def IMPLICIT_DEF     : I<0, Pseudo, (ops variable_ops), "#IMPLICIT_DEF", []>;
+def IMPLICIT_DEF_GR8  : I<0, Pseudo, (ops GR8:$dst),
+                         "#IMPLICIT_DEF $dst",
+                         [(set GR8:$dst, (undef))]>;
+def IMPLICIT_DEF_GR16  : I<0, Pseudo, (ops GR16:$dst),
+                         "#IMPLICIT_DEF $dst",
+                         [(set GR16:$dst, (undef))]>;
+def IMPLICIT_DEF_GR32  : I<0, Pseudo, (ops GR32:$dst),
+                         "#IMPLICIT_DEF $dst",
+                         [(set GR32:$dst, (undef))]>;
+
+// Nop
+def NOOP : I<0x90, RawFrm, (ops), "nop", []>;
+
+// Truncate
+def TRUNC_32_to8 : I<0x88, MRMDestReg, (ops GR8:$dst, GR32_:$src),
+                     "mov{b} {${src:subreg8}, $dst|$dst, ${src:subreg8}", []>;
+def TRUNC_16_to8 : I<0x88, MRMDestReg, (ops GR8:$dst, GR16_:$src),
+                     "mov{b} {${src:subreg8}, $dst|$dst, ${src:subreg8}}", []>;
+def TRUNC_32to16 : I<0x89, MRMDestReg, (ops GR16:$dst, GR32:$src),
+                     "mov{w} {${src:subreg16}, $dst|$dst, ${src:subreg16}}",
+                     [(set GR16:$dst, (trunc GR32:$src))]>;
+
+//===----------------------------------------------------------------------===//
+//  Control Flow Instructions...
+//
+
+// Return instructions.
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+    hasCtrlDep = 1, noResults = 1 in {
+  def RET    : I<0xC3, RawFrm, (ops), "ret", [(X86retflag 0)]>;
+  def RETI   : Ii16<0xC2, RawFrm, (ops i16imm:$amt), "ret $amt",
+                    [(X86retflag imm:$amt)]>;
+}
+
+// All branches are RawFrm, Void, Branch, and Terminators
+let isBranch = 1, isTerminator = 1, noResults = 1 in
+  class IBr<bits<8> opcode, dag ops, string asm, list<dag> pattern> :
+        I<opcode, RawFrm, ops, asm, pattern>;
+
+// Unconditional branch
+let isBranch = 1, isBarrier = 1 in
+  def JMP : IBr<0xE9, (ops brtarget:$dst), "jmp $dst", [(br bb:$dst)]>;
+
+// Indirect branches
+let isBranch = 1, isTerminator = 1, noResults = 1, isBarrier = 1 in {
+  def JMP32r     : I<0xFF, MRM4r, (ops GR32:$dst), "jmp{l} {*}$dst",
+                     [(brind GR32:$dst)]>;
+  def JMP32m     : I<0xFF, MRM4m, (ops i32mem:$dst), "jmp{l} {*}$dst",
+                     [(brind (loadi32 addr:$dst))]>;
+}
+
+// Conditional branches
+def JE  : IBr<0x84, (ops brtarget:$dst), "je $dst",
+              [(X86brcond bb:$dst, X86_COND_E)]>, TB;
+def JNE : IBr<0x85, (ops brtarget:$dst), "jne $dst",
+              [(X86brcond bb:$dst, X86_COND_NE)]>, TB;
+def JL  : IBr<0x8C, (ops brtarget:$dst), "jl $dst",
+              [(X86brcond bb:$dst, X86_COND_L)]>, TB;
+def JLE : IBr<0x8E, (ops brtarget:$dst), "jle $dst",
+              [(X86brcond bb:$dst, X86_COND_LE)]>, TB;
+def JG  : IBr<0x8F, (ops brtarget:$dst), "jg $dst",
+              [(X86brcond bb:$dst, X86_COND_G)]>, TB;
+def JGE : IBr<0x8D, (ops brtarget:$dst), "jge $dst",
+              [(X86brcond bb:$dst, X86_COND_GE)]>, TB;
+
+def JB  : IBr<0x82, (ops brtarget:$dst), "jb $dst",
+              [(X86brcond bb:$dst, X86_COND_B)]>, TB;
+def JBE : IBr<0x86, (ops brtarget:$dst), "jbe $dst",
+              [(X86brcond bb:$dst, X86_COND_BE)]>, TB;
+def JA  : IBr<0x87, (ops brtarget:$dst), "ja $dst",
+              [(X86brcond bb:$dst, X86_COND_A)]>, TB;
+def JAE : IBr<0x83, (ops brtarget:$dst), "jae $dst",
+              [(X86brcond bb:$dst, X86_COND_AE)]>, TB;
+
+def JS  : IBr<0x88, (ops brtarget:$dst), "js $dst",
+              [(X86brcond bb:$dst, X86_COND_S)]>, TB;
+def JNS : IBr<0x89, (ops brtarget:$dst), "jns $dst",
+              [(X86brcond bb:$dst, X86_COND_NS)]>, TB;
+def JP  : IBr<0x8A, (ops brtarget:$dst), "jp $dst",
+              [(X86brcond bb:$dst, X86_COND_P)]>, TB;
+def JNP : IBr<0x8B, (ops brtarget:$dst), "jnp $dst",
+              [(X86brcond bb:$dst, X86_COND_NP)]>, TB;
+def JO  : IBr<0x80, (ops brtarget:$dst), "jo $dst",
+              [(X86brcond bb:$dst, X86_COND_O)]>, TB;
+def JNO : IBr<0x81, (ops brtarget:$dst), "jno $dst",
+              [(X86brcond bb:$dst, X86_COND_NO)]>, TB;
+
+//===----------------------------------------------------------------------===//
+//  Call Instructions...
+//
+let isCall = 1, noResults = 1 in
+  // All calls clobber the non-callee saved registers...
+  let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
+              MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+              XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7] in {
+    def CALLpcrel32 : I<0xE8, RawFrm, (ops i32imm:$dst, variable_ops),
+                        "call ${dst:call}", []>;
+    def CALL32r     : I<0xFF, MRM2r, (ops GR32:$dst, variable_ops),
+                        "call {*}$dst", [(X86call GR32:$dst)]>;
+    def CALL32m     : I<0xFF, MRM2m, (ops i32mem:$dst, variable_ops),
+                        "call {*}$dst", []>;
+  }
+
+// Tail call stuff.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, noResults = 1 in
+  def TAILJMPd : IBr<0xE9, (ops i32imm:$dst), "jmp ${dst:call}  # TAIL CALL",
+                 []>;
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, noResults = 1 in
+  def TAILJMPr : I<0xFF, MRM4r, (ops GR32:$dst), "jmp {*}$dst  # TAIL CALL",
+                 []>;
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, noResults = 1 in
+  def TAILJMPm : I<0xFF, MRM4m, (ops i32mem:$dst),
+                   "jmp {*}$dst  # TAIL CALL", []>;
+
+//===----------------------------------------------------------------------===//
+//  Miscellaneous Instructions...
+//
+def LEAVE    : I<0xC9, RawFrm,
+                 (ops), "leave", []>, Imp<[EBP,ESP],[EBP,ESP]>;
+def POP32r   : I<0x58, AddRegFrm,
+                 (ops GR32:$reg), "pop{l} $reg", []>, Imp<[ESP],[ESP]>;
+
+def PUSH32r  : I<0x50, AddRegFrm,
+                 (ops GR32:$reg), "push{l} $reg", []>, Imp<[ESP],[ESP]>;
+
+def MovePCtoStack : I<0, Pseudo, (ops piclabel:$label),
+                      "call $label", []>;
+
+let isTwoAddress = 1 in                               // GR32 = bswap GR32
+  def BSWAP32r : I<0xC8, AddRegFrm,
+                   (ops GR32:$dst, GR32:$src),
+                   "bswap{l} $dst", 
+                   [(set GR32:$dst, (bswap GR32:$src))]>, TB;
+
+def XCHG8rr  : I<0x86, MRMDestReg,                    // xchg GR8, GR8
+                 (ops GR8:$src1, GR8:$src2),
+                 "xchg{b} {$src2|$src1}, {$src1|$src2}", []>;
+def XCHG16rr : I<0x87, MRMDestReg,                    // xchg GR16, GR16
+                 (ops GR16:$src1, GR16:$src2),
+                 "xchg{w} {$src2|$src1}, {$src1|$src2}", []>, OpSize;
+def XCHG32rr : I<0x87, MRMDestReg,                    // xchg GR32, GR32
+                 (ops GR32:$src1, GR32:$src2),
+                 "xchg{l} {$src2|$src1}, {$src1|$src2}", []>;
+
+def XCHG8mr  : I<0x86, MRMDestMem,
+                 (ops i8mem:$src1, GR8:$src2),
+                 "xchg{b} {$src2|$src1}, {$src1|$src2}", []>;
+def XCHG16mr : I<0x87, MRMDestMem,
+                 (ops i16mem:$src1, GR16:$src2),
+                 "xchg{w} {$src2|$src1}, {$src1|$src2}", []>, OpSize;
+def XCHG32mr : I<0x87, MRMDestMem,
+                 (ops i32mem:$src1, GR32:$src2),
+                 "xchg{l} {$src2|$src1}, {$src1|$src2}", []>;
+def XCHG8rm  : I<0x86, MRMSrcMem,
+                 (ops GR8:$src1, i8mem:$src2),
+                 "xchg{b} {$src2|$src1}, {$src1|$src2}", []>;
+def XCHG16rm : I<0x87, MRMSrcMem,
+                 (ops GR16:$src1, i16mem:$src2),
+                 "xchg{w} {$src2|$src1}, {$src1|$src2}", []>, OpSize;
+def XCHG32rm : I<0x87, MRMSrcMem,
+                 (ops GR32:$src1, i32mem:$src2),
+                 "xchg{l} {$src2|$src1}, {$src1|$src2}", []>;
+
+def LEA16r   : I<0x8D, MRMSrcMem,
+                 (ops GR16:$dst, i32mem:$src),
+                 "lea{w} {$src|$dst}, {$dst|$src}", []>, OpSize;
+def LEA32r   : I<0x8D, MRMSrcMem,
+                 (ops GR32:$dst, lea32mem:$src),
+                 "lea{l} {$src|$dst}, {$dst|$src}",
+                 [(set GR32:$dst, lea32addr:$src)]>, Requires<[In32BitMode]>;
+
+def REP_MOVSB : I<0xA4, RawFrm, (ops), "{rep;movsb|rep movsb}",
+                  [(X86rep_movs i8)]>,
+                Imp<[ECX,EDI,ESI], [ECX,EDI,ESI]>, REP;
+def REP_MOVSW : I<0xA5, RawFrm, (ops), "{rep;movsw|rep movsw}",
+                  [(X86rep_movs i16)]>,
+                Imp<[ECX,EDI,ESI], [ECX,EDI,ESI]>, REP, OpSize;
+def REP_MOVSD : I<0xA5, RawFrm, (ops), "{rep;movsl|rep movsd}",
+                  [(X86rep_movs i32)]>,
+                Imp<[ECX,EDI,ESI], [ECX,EDI,ESI]>, REP;
+
+def REP_STOSB : I<0xAA, RawFrm, (ops), "{rep;stosb|rep stosb}",
+                  [(X86rep_stos i8)]>,
+                Imp<[AL,ECX,EDI], [ECX,EDI]>, REP;
+def REP_STOSW : I<0xAB, RawFrm, (ops), "{rep;stosw|rep stosw}",
+                  [(X86rep_stos i16)]>,
+                Imp<[AX,ECX,EDI], [ECX,EDI]>, REP, OpSize;
+def REP_STOSD : I<0xAB, RawFrm, (ops), "{rep;stosl|rep stosd}",
+                  [(X86rep_stos i32)]>,
+                Imp<[EAX,ECX,EDI], [ECX,EDI]>, REP;
+
+def RDTSC : I<0x31, RawFrm, (ops), "rdtsc", [(X86rdtsc)]>,
+            TB, Imp<[],[RAX,RDX]>;
+
+//===----------------------------------------------------------------------===//
+//  Input/Output Instructions...
+//
+def IN8rr  : I<0xEC, RawFrm, (ops),
+               "in{b} {%dx, %al|%AL, %DX}",
+               []>,  Imp<[DX], [AL]>;
+def IN16rr : I<0xED, RawFrm, (ops),
+               "in{w} {%dx, %ax|%AX, %DX}",
+               []>,  Imp<[DX], [AX]>, OpSize;
+def IN32rr : I<0xED, RawFrm, (ops),
+               "in{l} {%dx, %eax|%EAX, %DX}",
+               []>, Imp<[DX],[EAX]>;
+
+def IN8ri  : Ii8<0xE4, RawFrm, (ops i16i8imm:$port),
+                  "in{b} {$port, %al|%AL, $port}",
+                 []>,
+             Imp<[], [AL]>;
+def IN16ri : Ii8<0xE5, RawFrm, (ops i16i8imm:$port),
+                  "in{w} {$port, %ax|%AX, $port}",
+                 []>,
+             Imp<[], [AX]>, OpSize;
+def IN32ri : Ii8<0xE5, RawFrm, (ops i16i8imm:$port),
+                  "in{l} {$port, %eax|%EAX, $port}",
+                 []>,
+             Imp<[],[EAX]>;
+
+def OUT8rr  : I<0xEE, RawFrm, (ops),
+                "out{b} {%al, %dx|%DX, %AL}",
+                []>,  Imp<[DX,  AL], []>;
+def OUT16rr : I<0xEF, RawFrm, (ops),
+                "out{w} {%ax, %dx|%DX, %AX}",
+                []>,  Imp<[DX,  AX], []>, OpSize;
+def OUT32rr : I<0xEF, RawFrm, (ops),
+                "out{l} {%eax, %dx|%DX, %EAX}",
+                []>, Imp<[DX, EAX], []>;
+
+def OUT8ir  : Ii8<0xE6, RawFrm, (ops i16i8imm:$port),
+                   "out{b} {%al, $port|$port, %AL}",
+                   []>,
+              Imp<[AL], []>;
+def OUT16ir : Ii8<0xE7, RawFrm, (ops i16i8imm:$port),
+                   "out{w} {%ax, $port|$port, %AX}",
+                   []>,
+              Imp<[AX], []>, OpSize;
+def OUT32ir : Ii8<0xE7, RawFrm, (ops i16i8imm:$port),
+                   "out{l} {%eax, $port|$port, %EAX}",
+                   []>,
+              Imp<[EAX], []>;
+
+//===----------------------------------------------------------------------===//
+//  Move Instructions...
+//
+def MOV8rr  : I<0x88, MRMDestReg, (ops GR8 :$dst, GR8 :$src),
+                "mov{b} {$src, $dst|$dst, $src}", []>;
+def MOV16rr : I<0x89, MRMDestReg, (ops GR16:$dst, GR16:$src),
+                "mov{w} {$src, $dst|$dst, $src}", []>, OpSize;
+def MOV32rr : I<0x89, MRMDestReg, (ops GR32:$dst, GR32:$src),
+                "mov{l} {$src, $dst|$dst, $src}", []>;
+let isReMaterializable = 1 in {
+def MOV8ri  : Ii8 <0xB0, AddRegFrm, (ops GR8 :$dst, i8imm :$src),
+                   "mov{b} {$src, $dst|$dst, $src}",
+                   [(set GR8:$dst, imm:$src)]>;
+def MOV16ri : Ii16<0xB8, AddRegFrm, (ops GR16:$dst, i16imm:$src),
+                   "mov{w} {$src, $dst|$dst, $src}",
+                   [(set GR16:$dst, imm:$src)]>, OpSize;
+def MOV32ri : Ii32<0xB8, AddRegFrm, (ops GR32:$dst, i32imm:$src),
+                   "mov{l} {$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, imm:$src)]>;
+}
+def MOV8mi  : Ii8 <0xC6, MRM0m, (ops i8mem :$dst, i8imm :$src),
+                   "mov{b} {$src, $dst|$dst, $src}",
+                   [(store (i8 imm:$src), addr:$dst)]>;
+def MOV16mi : Ii16<0xC7, MRM0m, (ops i16mem:$dst, i16imm:$src),
+                   "mov{w} {$src, $dst|$dst, $src}",
+                   [(store (i16 imm:$src), addr:$dst)]>, OpSize;
+def MOV32mi : Ii32<0xC7, MRM0m, (ops i32mem:$dst, i32imm:$src),
+                   "mov{l} {$src, $dst|$dst, $src}",
+                   [(store (i32 imm:$src), addr:$dst)]>;
+
+def MOV8rm  : I<0x8A, MRMSrcMem, (ops GR8 :$dst, i8mem :$src),
+                "mov{b} {$src, $dst|$dst, $src}",
+                [(set GR8:$dst, (load addr:$src))]>;
+def MOV16rm : I<0x8B, MRMSrcMem, (ops GR16:$dst, i16mem:$src),
+                "mov{w} {$src, $dst|$dst, $src}",
+                [(set GR16:$dst, (load addr:$src))]>, OpSize;
+def MOV32rm : I<0x8B, MRMSrcMem, (ops GR32:$dst, i32mem:$src),
+                "mov{l} {$src, $dst|$dst, $src}",
+                [(set GR32:$dst, (load addr:$src))]>;
+
+def MOV8mr  : I<0x88, MRMDestMem, (ops i8mem :$dst, GR8 :$src),
+                "mov{b} {$src, $dst|$dst, $src}",
+                [(store GR8:$src, addr:$dst)]>;
+def MOV16mr : I<0x89, MRMDestMem, (ops i16mem:$dst, GR16:$src),
+                "mov{w} {$src, $dst|$dst, $src}",
+                [(store GR16:$src, addr:$dst)]>, OpSize;
+def MOV32mr : I<0x89, MRMDestMem, (ops i32mem:$dst, GR32:$src),
+                "mov{l} {$src, $dst|$dst, $src}",
+                [(store GR32:$src, addr:$dst)]>;
+                
+//===----------------------------------------------------------------------===//
+//  Fixed-Register Multiplication and Division Instructions...
+//
+
+// Extra precision multiplication
+def MUL8r  : I<0xF6, MRM4r, (ops GR8:$src), "mul{b} $src",
+               // FIXME: Used for 8-bit mul, ignore result upper 8 bits.
+               // This probably ought to be moved to a def : Pat<> if the
+               // syntax can be accepted.
+               [(set AL, (mul AL, GR8:$src))]>,
+             Imp<[AL],[AX]>;               // AL,AH = AL*GR8
+def MUL16r : I<0xF7, MRM4r, (ops GR16:$src), "mul{w} $src", []>,
+             Imp<[AX],[AX,DX]>, OpSize;    // AX,DX = AX*GR16
+def MUL32r : I<0xF7, MRM4r, (ops GR32:$src), "mul{l} $src", []>,
+             Imp<[EAX],[EAX,EDX]>;         // EAX,EDX = EAX*GR32
+def MUL8m  : I<0xF6, MRM4m, (ops i8mem :$src),
+               "mul{b} $src",
+               // FIXME: Used for 8-bit mul, ignore result upper 8 bits.
+               // This probably ought to be moved to a def : Pat<> if the
+               // syntax can be accepted.
+               [(set AL, (mul AL, (loadi8 addr:$src)))]>,
+             Imp<[AL],[AX]>;          // AL,AH = AL*[mem8]
+def MUL16m : I<0xF7, MRM4m, (ops i16mem:$src),
+               "mul{w} $src", []>, Imp<[AX],[AX,DX]>,
+               OpSize; // AX,DX = AX*[mem16]
+def MUL32m : I<0xF7, MRM4m, (ops i32mem:$src),
+               "mul{l} $src", []>, Imp<[EAX],[EAX,EDX]>;// EAX,EDX = EAX*[mem32]
+
+def IMUL8r  : I<0xF6, MRM5r, (ops GR8:$src), "imul{b} $src", []>,
+              Imp<[AL],[AX]>;               // AL,AH = AL*GR8
+def IMUL16r : I<0xF7, MRM5r, (ops GR16:$src), "imul{w} $src", []>,
+              Imp<[AX],[AX,DX]>, OpSize;    // AX,DX = AX*GR16
+def IMUL32r : I<0xF7, MRM5r, (ops GR32:$src), "imul{l} $src", []>,
+              Imp<[EAX],[EAX,EDX]>;         // EAX,EDX = EAX*GR32
+def IMUL8m  : I<0xF6, MRM5m, (ops i8mem :$src),
+                "imul{b} $src", []>, Imp<[AL],[AX]>;        // AL,AH = AL*[mem8]
+def IMUL16m : I<0xF7, MRM5m, (ops i16mem:$src),
+                "imul{w} $src", []>, Imp<[AX],[AX,DX]>,
+                OpSize; // AX,DX = AX*[mem16]
+def IMUL32m : I<0xF7, MRM5m, (ops i32mem:$src),
+                "imul{l} $src", []>,
+                Imp<[EAX],[EAX,EDX]>;  // EAX,EDX = EAX*[mem32]
+
+// unsigned division/remainder
+def DIV8r  : I<0xF6, MRM6r, (ops GR8:$src),          // AX/r8 = AL,AH
+               "div{b} $src", []>, Imp<[AX],[AX]>;
+def DIV16r : I<0xF7, MRM6r, (ops GR16:$src),         // DX:AX/r16 = AX,DX
+               "div{w} $src", []>, Imp<[AX,DX],[AX,DX]>, OpSize;
+def DIV32r : I<0xF7, MRM6r, (ops GR32:$src),         // EDX:EAX/r32 = EAX,EDX
+               "div{l} $src", []>, Imp<[EAX,EDX],[EAX,EDX]>;
+def DIV8m  : I<0xF6, MRM6m, (ops i8mem:$src),       // AX/[mem8] = AL,AH
+               "div{b} $src", []>, Imp<[AX],[AX]>;
+def DIV16m : I<0xF7, MRM6m, (ops i16mem:$src),      // DX:AX/[mem16] = AX,DX
+               "div{w} $src", []>, Imp<[AX,DX],[AX,DX]>, OpSize;
+def DIV32m : I<0xF7, MRM6m, (ops i32mem:$src),      // EDX:EAX/[mem32] = EAX,EDX
+               "div{l} $src", []>, Imp<[EAX,EDX],[EAX,EDX]>;
+
+// Signed division/remainder.
+def IDIV8r : I<0xF6, MRM7r, (ops GR8:$src),          // AX/r8 = AL,AH
+               "idiv{b} $src", []>, Imp<[AX],[AX]>;
+def IDIV16r: I<0xF7, MRM7r, (ops GR16:$src),         // DX:AX/r16 = AX,DX
+               "idiv{w} $src", []>, Imp<[AX,DX],[AX,DX]>, OpSize;
+def IDIV32r: I<0xF7, MRM7r, (ops GR32:$src),         // EDX:EAX/r32 = EAX,EDX
+               "idiv{l} $src", []>, Imp<[EAX,EDX],[EAX,EDX]>;
+def IDIV8m : I<0xF6, MRM7m, (ops i8mem:$src),      // AX/[mem8] = AL,AH
+               "idiv{b} $src", []>, Imp<[AX],[AX]>;
+def IDIV16m: I<0xF7, MRM7m, (ops i16mem:$src),     // DX:AX/[mem16] = AX,DX
+               "idiv{w} $src", []>, Imp<[AX,DX],[AX,DX]>, OpSize;
+def IDIV32m: I<0xF7, MRM7m, (ops i32mem:$src),     // EDX:EAX/[mem32] = EAX,EDX
+               "idiv{l} $src", []>, Imp<[EAX,EDX],[EAX,EDX]>;
+
+
+//===----------------------------------------------------------------------===//
+//  Two address Instructions...
+//
+let isTwoAddress = 1 in {
+
+// Conditional moves
+def CMOVB16rr : I<0x42, MRMSrcReg,       // if <u, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovb {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_B))]>,
+                  TB, OpSize;
+def CMOVB16rm : I<0x42, MRMSrcMem,       // if <u, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovb {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_B))]>,
+                  TB, OpSize;
+def CMOVB32rr : I<0x42, MRMSrcReg,       // if <u, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovb {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_B))]>,
+                   TB;
+def CMOVB32rm : I<0x42, MRMSrcMem,       // if <u, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovb {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_B))]>,
+                   TB;
+
+def CMOVAE16rr: I<0x43, MRMSrcReg,       // if >=u, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovae {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_AE))]>,
+                   TB, OpSize;
+def CMOVAE16rm: I<0x43, MRMSrcMem,       // if >=u, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovae {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_AE))]>,
+                   TB, OpSize;
+def CMOVAE32rr: I<0x43, MRMSrcReg,       // if >=u, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovae {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_AE))]>,
+                   TB;
+def CMOVAE32rm: I<0x43, MRMSrcMem,       // if >=u, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovae {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_AE))]>,
+                   TB;
+
+def CMOVE16rr : I<0x44, MRMSrcReg,       // if ==, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmove {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_E))]>,
+                   TB, OpSize;
+def CMOVE16rm : I<0x44, MRMSrcMem,       // if ==, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmove {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_E))]>,
+                   TB, OpSize;
+def CMOVE32rr : I<0x44, MRMSrcReg,       // if ==, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmove {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_E))]>,
+                   TB;
+def CMOVE32rm : I<0x44, MRMSrcMem,       // if ==, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmove {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_E))]>,
+                   TB;
+
+def CMOVNE16rr: I<0x45, MRMSrcReg,       // if !=, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovne {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_NE))]>,
+                   TB, OpSize;
+def CMOVNE16rm: I<0x45, MRMSrcMem,       // if !=, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovne {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_NE))]>,
+                   TB, OpSize;
+def CMOVNE32rr: I<0x45, MRMSrcReg,       // if !=, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovne {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_NE))]>,
+                   TB;
+def CMOVNE32rm: I<0x45, MRMSrcMem,       // if !=, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovne {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_NE))]>,
+                   TB;
+
+def CMOVBE16rr: I<0x46, MRMSrcReg,       // if <=u, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovbe {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_BE))]>,
+                   TB, OpSize;
+def CMOVBE16rm: I<0x46, MRMSrcMem,       // if <=u, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovbe {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_BE))]>,
+                   TB, OpSize;
+def CMOVBE32rr: I<0x46, MRMSrcReg,       // if <=u, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovbe {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_BE))]>,
+                   TB;
+def CMOVBE32rm: I<0x46, MRMSrcMem,       // if <=u, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovbe {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_BE))]>,
+                   TB;
+
+def CMOVA16rr : I<0x47, MRMSrcReg,       // if >u, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmova {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_A))]>,
+                   TB, OpSize;
+def CMOVA16rm : I<0x47, MRMSrcMem,       // if >u, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmova {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_A))]>,
+                   TB, OpSize;
+def CMOVA32rr : I<0x47, MRMSrcReg,       // if >u, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmova {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_A))]>,
+                   TB;
+def CMOVA32rm : I<0x47, MRMSrcMem,       // if >u, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmova {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_A))]>,
+                   TB;
+
+def CMOVL16rr : I<0x4C, MRMSrcReg,       // if <s, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovl {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_L))]>,
+                   TB, OpSize;
+def CMOVL16rm : I<0x4C, MRMSrcMem,       // if <s, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovl {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_L))]>,
+                   TB, OpSize;
+def CMOVL32rr : I<0x4C, MRMSrcReg,       // if <s, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovl {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_L))]>,
+                   TB;
+def CMOVL32rm : I<0x4C, MRMSrcMem,       // if <s, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovl {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_L))]>,
+                   TB;
+
+def CMOVGE16rr: I<0x4D, MRMSrcReg,       // if >=s, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovge {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_GE))]>,
+                   TB, OpSize;
+def CMOVGE16rm: I<0x4D, MRMSrcMem,       // if >=s, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovge {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_GE))]>,
+                   TB, OpSize;
+def CMOVGE32rr: I<0x4D, MRMSrcReg,       // if >=s, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovge {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_GE))]>,
+                   TB;
+def CMOVGE32rm: I<0x4D, MRMSrcMem,       // if >=s, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovge {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_GE))]>,
+                   TB;
+
+def CMOVLE16rr: I<0x4E, MRMSrcReg,       // if <=s, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovle {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_LE))]>,
+                   TB, OpSize;
+def CMOVLE16rm: I<0x4E, MRMSrcMem,       // if <=s, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovle {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_LE))]>,
+                   TB, OpSize;
+def CMOVLE32rr: I<0x4E, MRMSrcReg,       // if <=s, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovle {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_LE))]>,
+                   TB;
+def CMOVLE32rm: I<0x4E, MRMSrcMem,       // if <=s, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovle {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_LE))]>,
+                   TB;
+
+def CMOVG16rr : I<0x4F, MRMSrcReg,       // if >s, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovg {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_G))]>,
+                   TB, OpSize;
+def CMOVG16rm : I<0x4F, MRMSrcMem,       // if >s, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovg {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_G))]>,
+                   TB, OpSize;
+def CMOVG32rr : I<0x4F, MRMSrcReg,       // if >s, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovg {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_G))]>,
+                   TB;
+def CMOVG32rm : I<0x4F, MRMSrcMem,       // if >s, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovg {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_G))]>,
+                   TB;
+
+def CMOVS16rr : I<0x48, MRMSrcReg,       // if signed, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovs {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_S))]>,
+                  TB, OpSize;
+def CMOVS16rm : I<0x48, MRMSrcMem,       // if signed, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovs {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_S))]>,
+                  TB, OpSize;
+def CMOVS32rr : I<0x48, MRMSrcReg,       // if signed, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovs {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_S))]>,
+                  TB;
+def CMOVS32rm : I<0x48, MRMSrcMem,       // if signed, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovs {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_S))]>,
+                  TB;
+
+def CMOVNS16rr: I<0x49, MRMSrcReg,       // if !signed, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovns {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_NS))]>,
+                  TB, OpSize;
+def CMOVNS16rm: I<0x49, MRMSrcMem,       // if !signed, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovns {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_NS))]>,
+                  TB, OpSize;
+def CMOVNS32rr: I<0x49, MRMSrcReg,       // if !signed, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovns {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_NS))]>,
+                  TB;
+def CMOVNS32rm: I<0x49, MRMSrcMem,       // if !signed, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovns {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_NS))]>,
+                  TB;
+
+def CMOVP16rr : I<0x4A, MRMSrcReg,       // if parity, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovp {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_P))]>,
+                  TB, OpSize;
+def CMOVP16rm : I<0x4A, MRMSrcMem,       // if parity, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovp {$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_P))]>,
+                  TB, OpSize;
+def CMOVP32rr : I<0x4A, MRMSrcReg,       // if parity, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovp {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_P))]>,
+                  TB;
+def CMOVP32rm : I<0x4A, MRMSrcMem,       // if parity, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovp {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_P))]>,
+                  TB;
+
+def CMOVNP16rr : I<0x4B, MRMSrcReg,       // if !parity, GR16 = GR16
+                  (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                  "cmovnp {$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                    X86_COND_NP))]>,
+                  TB, OpSize;
+def CMOVNP16rm : I<0x4B, MRMSrcMem,       // if !parity, GR16 = [mem16]
+                  (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                  "cmovnp {$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                    X86_COND_NP))]>,
+                  TB, OpSize;
+def CMOVNP32rr : I<0x4B, MRMSrcReg,       // if !parity, GR32 = GR32
+                  (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "cmovnp {$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                    X86_COND_NP))]>,
+                  TB;
+def CMOVNP32rm : I<0x4B, MRMSrcMem,       // if !parity, GR32 = [mem32]
+                  (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                  "cmovnp {$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                    X86_COND_NP))]>,
+                  TB;
+
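+// How to read the cmov patterns above: CMOVcc only writes its destination
+// when the condition holds, and $dst is tied to $src1 in this two-address
+// form, so the selected value is roughly $dst = cc ? $src2 : $src1.
+// Sketch (register choice arbitrary): (X86cmov GR32:$a, GR32:$b, X86_COND_NE)
+// becomes "cmovne %ebx, %eax", i.e. %eax = (ZF == 0) ? %ebx : %eax.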
+
+// Unary instructions
+let CodeSize = 2 in {
+def NEG8r  : I<0xF6, MRM3r, (ops GR8 :$dst, GR8 :$src), "neg{b} $dst",
+               [(set GR8:$dst, (ineg GR8:$src))]>;
+def NEG16r : I<0xF7, MRM3r, (ops GR16:$dst, GR16:$src), "neg{w} $dst",
+               [(set GR16:$dst, (ineg GR16:$src))]>, OpSize;
+def NEG32r : I<0xF7, MRM3r, (ops GR32:$dst, GR32:$src), "neg{l} $dst",
+               [(set GR32:$dst, (ineg GR32:$src))]>;
+let isTwoAddress = 0 in {
+  def NEG8m  : I<0xF6, MRM3m, (ops i8mem :$dst), "neg{b} $dst",
+                 [(store (ineg (loadi8 addr:$dst)), addr:$dst)]>;
+  def NEG16m : I<0xF7, MRM3m, (ops i16mem:$dst), "neg{w} $dst",
+                 [(store (ineg (loadi16 addr:$dst)), addr:$dst)]>, OpSize;
+  def NEG32m : I<0xF7, MRM3m, (ops i32mem:$dst), "neg{l} $dst",
+                 [(store (ineg (loadi32 addr:$dst)), addr:$dst)]>;
+}
+
+def NOT8r  : I<0xF6, MRM2r, (ops GR8 :$dst, GR8 :$src), "not{b} $dst",
+               [(set GR8:$dst, (not GR8:$src))]>;
+def NOT16r : I<0xF7, MRM2r, (ops GR16:$dst, GR16:$src), "not{w} $dst",
+               [(set GR16:$dst, (not GR16:$src))]>, OpSize;
+def NOT32r : I<0xF7, MRM2r, (ops GR32:$dst, GR32:$src), "not{l} $dst",
+               [(set GR32:$dst, (not GR32:$src))]>;
+let isTwoAddress = 0 in {
+  def NOT8m  : I<0xF6, MRM2m, (ops i8mem :$dst), "not{b} $dst",
+                 [(store (not (loadi8 addr:$dst)), addr:$dst)]>;
+  def NOT16m : I<0xF7, MRM2m, (ops i16mem:$dst), "not{w} $dst",
+                 [(store (not (loadi16 addr:$dst)), addr:$dst)]>, OpSize;
+  def NOT32m : I<0xF7, MRM2m, (ops i32mem:$dst), "not{l} $dst",
+                 [(store (not (loadi32 addr:$dst)), addr:$dst)]>;
+}
+} // CodeSize
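+// The NEG*m/NOT*m forms above fold the load, the operation and the store
+// into one instruction, e.g. (store (not (loadi32 addr:$p)), addr:$p)
+// becomes a single "notl (%eax)" (sketch; addressing mode arbitrary).
+// The `not` fragment is the usual xor-with-all-ones pattern, since LLVM IR
+// has no dedicated not instruction.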
+
+// TODO: inc/dec is slow for P4, but fast for Pentium-M.
+let CodeSize = 2 in
+def INC8r  : I<0xFE, MRM0r, (ops GR8 :$dst, GR8 :$src), "inc{b} $dst",
+               [(set GR8:$dst, (add GR8:$src, 1))]>;
+let isConvertibleToThreeAddress = 1, CodeSize = 1 in {  // Can xform into LEA.
+def INC16r : I<0x40, AddRegFrm, (ops GR16:$dst, GR16:$src), "inc{w} $dst",
+               [(set GR16:$dst, (add GR16:$src, 1))]>,
+             OpSize, Requires<[In32BitMode]>;
+def INC32r : I<0x40, AddRegFrm, (ops GR32:$dst, GR32:$src), "inc{l} $dst",
+               [(set GR32:$dst, (add GR32:$src, 1))]>, Requires<[In32BitMode]>;
+}
+let isTwoAddress = 0, CodeSize = 2 in {
+  def INC8m  : I<0xFE, MRM0m, (ops i8mem :$dst), "inc{b} $dst",
+               [(store (add (loadi8 addr:$dst), 1), addr:$dst)]>;
+  def INC16m : I<0xFF, MRM0m, (ops i16mem:$dst), "inc{w} $dst",
+               [(store (add (loadi16 addr:$dst), 1), addr:$dst)]>, OpSize;
+  def INC32m : I<0xFF, MRM0m, (ops i32mem:$dst), "inc{l} $dst",
+               [(store (add (loadi32 addr:$dst), 1), addr:$dst)]>;
+}
+
+let CodeSize = 2 in
+def DEC8r  : I<0xFE, MRM1r, (ops GR8 :$dst, GR8 :$src), "dec{b} $dst",
+               [(set GR8:$dst, (add GR8:$src, -1))]>;
+let isConvertibleToThreeAddress = 1, CodeSize = 1 in {   // Can xform into LEA.
+def DEC16r : I<0x48, AddRegFrm, (ops GR16:$dst, GR16:$src), "dec{w} $dst",
+               [(set GR16:$dst, (add GR16:$src, -1))]>,
+             OpSize, Requires<[In32BitMode]>;
+def DEC32r : I<0x48, AddRegFrm, (ops GR32:$dst, GR32:$src), "dec{l} $dst",
+               [(set GR32:$dst, (add GR32:$src, -1))]>, Requires<[In32BitMode]>;
+}
+
+let isTwoAddress = 0, CodeSize = 2 in {
+  def DEC8m  : I<0xFE, MRM1m, (ops i8mem :$dst), "dec{b} $dst",
+               [(store (add (loadi8 addr:$dst), -1), addr:$dst)]>;
+  def DEC16m : I<0xFF, MRM1m, (ops i16mem:$dst), "dec{w} $dst",
+               [(store (add (loadi16 addr:$dst), -1), addr:$dst)]>, OpSize;
+  def DEC32m : I<0xFF, MRM1m, (ops i32mem:$dst), "dec{l} $dst",
+               [(store (add (loadi32 addr:$dst), -1), addr:$dst)]>;
+}
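+// The isConvertibleToThreeAddress INC/DEC register forms may be rewritten
+// by the two-address pass when the source register must stay live; sketch:
+//   incl %eax                  # needs dst == src
+// can instead be emitted as
+//   leal 1(%ecx), %eax         # three-address, %ecx preserved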
+
+// Logical operators...
+let isCommutable = 1 in {   // X = AND Y, Z   --> X = AND Z, Y
+def AND8rr   : I<0x20, MRMDestReg,
+                (ops GR8 :$dst, GR8 :$src1, GR8 :$src2),
+                "and{b} {$src2, $dst|$dst, $src2}",
+                [(set GR8:$dst, (and GR8:$src1, GR8:$src2))]>;
+def AND16rr  : I<0x21, MRMDestReg,
+                 (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                 "and{w} {$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, (and GR16:$src1, GR16:$src2))]>, OpSize;
+def AND32rr  : I<0x21, MRMDestReg, 
+                 (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                 "and{l} {$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (and GR32:$src1, GR32:$src2))]>;
+}
+
+def AND8rm   : I<0x22, MRMSrcMem, 
+                 (ops GR8 :$dst, GR8 :$src1, i8mem :$src2),
+                 "and{b} {$src2, $dst|$dst, $src2}",
+                [(set GR8:$dst, (and GR8:$src1, (load addr:$src2)))]>;
+def AND16rm  : I<0x23, MRMSrcMem, 
+                 (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                 "and{w} {$src2, $dst|$dst, $src2}",
+                [(set GR16:$dst, (and GR16:$src1, (load addr:$src2)))]>, OpSize;
+def AND32rm  : I<0x23, MRMSrcMem,
+                 (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                 "and{l} {$src2, $dst|$dst, $src2}",
+                [(set GR32:$dst, (and GR32:$src1, (load addr:$src2)))]>;
+
+def AND8ri   : Ii8<0x80, MRM4r, 
+                   (ops GR8 :$dst, GR8 :$src1, i8imm :$src2),
+                   "and{b} {$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (and GR8:$src1, imm:$src2))]>;
+def AND16ri  : Ii16<0x81, MRM4r, 
+                    (ops GR16:$dst, GR16:$src1, i16imm:$src2),
+                    "and{w} {$src2, $dst|$dst, $src2}",
+                    [(set GR16:$dst, (and GR16:$src1, imm:$src2))]>, OpSize;
+def AND32ri  : Ii32<0x81, MRM4r, 
+                    (ops GR32:$dst, GR32:$src1, i32imm:$src2),
+                    "and{l} {$src2, $dst|$dst, $src2}",
+                    [(set GR32:$dst, (and GR32:$src1, imm:$src2))]>;
+def AND16ri8 : Ii8<0x83, MRM4r, 
+                   (ops GR16:$dst, GR16:$src1, i16i8imm:$src2),
+                   "and{w} {$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (and GR16:$src1, i16immSExt8:$src2))]>,
+                   OpSize;
+def AND32ri8 : Ii8<0x83, MRM4r, 
+                   (ops GR32:$dst, GR32:$src1, i32i8imm:$src2),
+                   "and{l} {$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (and GR32:$src1, i32immSExt8:$src2))]>;
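+// The ri8 forms use opcode 0x83 with a sign-extended 8-bit immediate, so an
+// immediate in [-128, 127] costs one byte instead of two or four; e.g.
+// "andl $-16, %esp" encodes as 83 E4 F0 rather than 81 E4 F0 FF FF FF.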
+
+let isTwoAddress = 0 in {
+  def AND8mr   : I<0x20, MRMDestMem,
+                   (ops i8mem :$dst, GR8 :$src),
+                   "and{b} {$src, $dst|$dst, $src}",
+                   [(store (and (load addr:$dst), GR8:$src), addr:$dst)]>;
+  def AND16mr  : I<0x21, MRMDestMem,
+                   (ops i16mem:$dst, GR16:$src),
+                   "and{w} {$src, $dst|$dst, $src}",
+                   [(store (and (load addr:$dst), GR16:$src), addr:$dst)]>,
+                   OpSize;
+  def AND32mr  : I<0x21, MRMDestMem,
+                   (ops i32mem:$dst, GR32:$src),
+                   "and{l} {$src, $dst|$dst, $src}",
+                   [(store (and (load addr:$dst), GR32:$src), addr:$dst)]>;
+  def AND8mi   : Ii8<0x80, MRM4m,
+                     (ops i8mem :$dst, i8imm :$src),
+                     "and{b} {$src, $dst|$dst, $src}",
+                      [(store (and (loadi8 addr:$dst), imm:$src), addr:$dst)]>;
+  def AND16mi  : Ii16<0x81, MRM4m,
+                      (ops i16mem:$dst, i16imm:$src),
+                      "and{w} {$src, $dst|$dst, $src}",
+                      [(store (and (loadi16 addr:$dst), imm:$src), addr:$dst)]>,
+                      OpSize;
+  def AND32mi  : Ii32<0x81, MRM4m,
+                      (ops i32mem:$dst, i32imm:$src),
+                      "and{l} {$src, $dst|$dst, $src}",
+                      [(store (and (loadi32 addr:$dst), imm:$src), addr:$dst)]>;
+  def AND16mi8 : Ii8<0x83, MRM4m,
+                     (ops i16mem:$dst, i16i8imm :$src),
+                     "and{w} {$src, $dst|$dst, $src}",
+                [(store (and (load addr:$dst), i16immSExt8:$src), addr:$dst)]>,
+                     OpSize;
+  def AND32mi8 : Ii8<0x83, MRM4m,
+                     (ops i32mem:$dst, i32i8imm :$src),
+                     "and{l} {$src, $dst|$dst, $src}",
+                [(store (and (load addr:$dst), i32immSExt8:$src), addr:$dst)]>;
+}
+
+
+let isCommutable = 1 in {   // X = OR Y, Z   --> X = OR Z, Y
+def OR8rr    : I<0x08, MRMDestReg, (ops GR8 :$dst, GR8 :$src1, GR8 :$src2),
+                 "or{b} {$src2, $dst|$dst, $src2}",
+                 [(set GR8:$dst, (or GR8:$src1, GR8:$src2))]>;
+def OR16rr   : I<0x09, MRMDestReg, (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                 "or{w} {$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, (or GR16:$src1, GR16:$src2))]>, OpSize;
+def OR32rr   : I<0x09, MRMDestReg, (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                 "or{l} {$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (or GR32:$src1, GR32:$src2))]>;
+}
+def OR8rm    : I<0x0A, MRMSrcMem , (ops GR8 :$dst, GR8 :$src1, i8mem :$src2),
+                 "or{b} {$src2, $dst|$dst, $src2}",
+                [(set GR8:$dst, (or GR8:$src1, (load addr:$src2)))]>;
+def OR16rm   : I<0x0B, MRMSrcMem , (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                 "or{w} {$src2, $dst|$dst, $src2}",
+                [(set GR16:$dst, (or GR16:$src1, (load addr:$src2)))]>, OpSize;
+def OR32rm   : I<0x0B, MRMSrcMem , (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                 "or{l} {$src2, $dst|$dst, $src2}",
+                [(set GR32:$dst, (or GR32:$src1, (load addr:$src2)))]>;
+
+def OR8ri    : Ii8 <0x80, MRM1r, (ops GR8 :$dst, GR8 :$src1, i8imm:$src2),
+                    "or{b} {$src2, $dst|$dst, $src2}",
+                    [(set GR8:$dst, (or GR8:$src1, imm:$src2))]>;
+def OR16ri   : Ii16<0x81, MRM1r, (ops GR16:$dst, GR16:$src1, i16imm:$src2),
+                    "or{w} {$src2, $dst|$dst, $src2}", 
+                    [(set GR16:$dst, (or GR16:$src1, imm:$src2))]>, OpSize;
+def OR32ri   : Ii32<0x81, MRM1r, (ops GR32:$dst, GR32:$src1, i32imm:$src2),
+                    "or{l} {$src2, $dst|$dst, $src2}",
+                    [(set GR32:$dst, (or GR32:$src1, imm:$src2))]>;
+
+def OR16ri8  : Ii8<0x83, MRM1r, (ops GR16:$dst, GR16:$src1, i16i8imm:$src2),
+                   "or{w} {$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (or GR16:$src1, i16immSExt8:$src2))]>, OpSize;
+def OR32ri8  : Ii8<0x83, MRM1r, (ops GR32:$dst, GR32:$src1, i32i8imm:$src2),
+                   "or{l} {$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (or GR32:$src1, i32immSExt8:$src2))]>;
+let isTwoAddress = 0 in {
+  def OR8mr  : I<0x08, MRMDestMem, (ops i8mem:$dst, GR8:$src),
+                 "or{b} {$src, $dst|$dst, $src}",
+                 [(store (or (load addr:$dst), GR8:$src), addr:$dst)]>;
+  def OR16mr : I<0x09, MRMDestMem, (ops i16mem:$dst, GR16:$src),
+                 "or{w} {$src, $dst|$dst, $src}",
+                 [(store (or (load addr:$dst), GR16:$src), addr:$dst)]>, OpSize;
+  def OR32mr : I<0x09, MRMDestMem, (ops i32mem:$dst, GR32:$src),
+                 "or{l} {$src, $dst|$dst, $src}",
+                 [(store (or (load addr:$dst), GR32:$src), addr:$dst)]>;
+  def OR8mi    : Ii8<0x80, MRM1m, (ops i8mem :$dst, i8imm:$src),
+                 "or{b} {$src, $dst|$dst, $src}",
+                 [(store (or (loadi8 addr:$dst), imm:$src), addr:$dst)]>;
+  def OR16mi   : Ii16<0x81, MRM1m, (ops i16mem:$dst, i16imm:$src),
+                 "or{w} {$src, $dst|$dst, $src}",
+                 [(store (or (loadi16 addr:$dst), imm:$src), addr:$dst)]>,
+                 OpSize;
+  def OR32mi   : Ii32<0x81, MRM1m, (ops i32mem:$dst, i32imm:$src),
+                 "or{l} {$src, $dst|$dst, $src}",
+                 [(store (or (loadi32 addr:$dst), imm:$src), addr:$dst)]>;
+  def OR16mi8  : Ii8<0x83, MRM1m, (ops i16mem:$dst, i16i8imm:$src),
+                 "or{w} {$src, $dst|$dst, $src}",
+                 [(store (or (load addr:$dst), i16immSExt8:$src), addr:$dst)]>,
+                     OpSize;
+  def OR32mi8  : Ii8<0x83, MRM1m, (ops i32mem:$dst, i32i8imm:$src),
+                 "or{l} {$src, $dst|$dst, $src}",
+                 [(store (or (load addr:$dst), i32immSExt8:$src), addr:$dst)]>;
+}
+
+
+let isCommutable = 1 in {   // X = XOR Y, Z   --> X = XOR Z, Y
+def XOR8rr   : I<0x30, MRMDestReg,
+                 (ops GR8 :$dst, GR8 :$src1, GR8 :$src2),
+                 "xor{b} {$src2, $dst|$dst, $src2}",
+                 [(set GR8:$dst, (xor GR8:$src1, GR8:$src2))]>;
+def XOR16rr  : I<0x31, MRMDestReg, 
+                 (ops GR16:$dst, GR16:$src1, GR16:$src2), 
+                 "xor{w} {$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, (xor GR16:$src1, GR16:$src2))]>, OpSize;
+def XOR32rr  : I<0x31, MRMDestReg, 
+                 (ops GR32:$dst, GR32:$src1, GR32:$src2), 
+                 "xor{l} {$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (xor GR32:$src1, GR32:$src2))]>;
+}
+
+def XOR8rm   : I<0x32, MRMSrcMem , 
+                 (ops GR8 :$dst, GR8:$src1, i8mem :$src2), 
+                 "xor{b} {$src2, $dst|$dst, $src2}",
+                 [(set GR8:$dst, (xor GR8:$src1, (load addr:$src2)))]>;
+def XOR16rm  : I<0x33, MRMSrcMem , 
+                 (ops GR16:$dst, GR16:$src1, i16mem:$src2), 
+                 "xor{w} {$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, (xor GR16:$src1, (load addr:$src2)))]>, OpSize;
+def XOR32rm  : I<0x33, MRMSrcMem , 
+                 (ops GR32:$dst, GR32:$src1, i32mem:$src2), 
+                 "xor{l} {$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (xor GR32:$src1, (load addr:$src2)))]>;
+
+def XOR8ri   : Ii8<0x80, MRM6r, 
+                   (ops GR8:$dst, GR8:$src1, i8imm:$src2), 
+                   "xor{b} {$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (xor GR8:$src1, imm:$src2))]>;
+def XOR16ri  : Ii16<0x81, MRM6r, 
+                    (ops GR16:$dst, GR16:$src1, i16imm:$src2), 
+                    "xor{w} {$src2, $dst|$dst, $src2}",
+                    [(set GR16:$dst, (xor GR16:$src1, imm:$src2))]>, OpSize;
+def XOR32ri  : Ii32<0x81, MRM6r, 
+                    (ops GR32:$dst, GR32:$src1, i32imm:$src2), 
+                    "xor{l} {$src2, $dst|$dst, $src2}",
+                    [(set GR32:$dst, (xor GR32:$src1, imm:$src2))]>;
+def XOR16ri8 : Ii8<0x83, MRM6r, 
+                   (ops GR16:$dst, GR16:$src1, i16i8imm:$src2),
+                   "xor{w} {$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (xor GR16:$src1, i16immSExt8:$src2))]>,
+                   OpSize;
+def XOR32ri8 : Ii8<0x83, MRM6r, 
+                   (ops GR32:$dst, GR32:$src1, i32i8imm:$src2),
+                   "xor{l} {$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (xor GR32:$src1, i32immSExt8:$src2))]>;
+let isTwoAddress = 0 in {
+  def XOR8mr   : I<0x30, MRMDestMem,
+                   (ops i8mem :$dst, GR8 :$src),
+                   "xor{b} {$src, $dst|$dst, $src}",
+                   [(store (xor (load addr:$dst), GR8:$src), addr:$dst)]>;
+  def XOR16mr  : I<0x31, MRMDestMem,
+                   (ops i16mem:$dst, GR16:$src),
+                   "xor{w} {$src, $dst|$dst, $src}",
+                   [(store (xor (load addr:$dst), GR16:$src), addr:$dst)]>,
+                   OpSize;
+  def XOR32mr  : I<0x31, MRMDestMem,
+                   (ops i32mem:$dst, GR32:$src),
+                   "xor{l} {$src, $dst|$dst, $src}",
+                   [(store (xor (load addr:$dst), GR32:$src), addr:$dst)]>;
+  def XOR8mi   : Ii8<0x80, MRM6m,
+                     (ops i8mem :$dst, i8imm :$src),
+                     "xor{b} {$src, $dst|$dst, $src}",
+                    [(store (xor (loadi8 addr:$dst), imm:$src), addr:$dst)]>;
+  def XOR16mi  : Ii16<0x81, MRM6m,
+                      (ops i16mem:$dst, i16imm:$src),
+                      "xor{w} {$src, $dst|$dst, $src}",
+                   [(store (xor (loadi16 addr:$dst), imm:$src), addr:$dst)]>,
+                      OpSize;
+  def XOR32mi  : Ii32<0x81, MRM6m,
+                      (ops i32mem:$dst, i32imm:$src),
+                      "xor{l} {$src, $dst|$dst, $src}",
+                   [(store (xor (loadi32 addr:$dst), imm:$src), addr:$dst)]>;
+  def XOR16mi8 : Ii8<0x83, MRM6m,
+                     (ops i16mem:$dst, i16i8imm :$src),
+                     "xor{w} {$src, $dst|$dst, $src}",
+                 [(store (xor (load addr:$dst), i16immSExt8:$src), addr:$dst)]>,
+                     OpSize;
+  def XOR32mi8 : Ii8<0x83, MRM6m,
+                     (ops i32mem:$dst, i32i8imm :$src),
+                     "xor{l} {$src, $dst|$dst, $src}",
+                 [(store (xor (load addr:$dst), i32immSExt8:$src), addr:$dst)]>;
+}
+
+// Shift instructions
+def SHL8rCL  : I<0xD2, MRM4r, (ops GR8 :$dst, GR8 :$src),
+                 "shl{b} {%cl, $dst|$dst, %CL}",
+                 [(set GR8:$dst, (shl GR8:$src, CL))]>, Imp<[CL],[]>;
+def SHL16rCL : I<0xD3, MRM4r, (ops GR16:$dst, GR16:$src),
+                 "shl{w} {%cl, $dst|$dst, %CL}",
+                 [(set GR16:$dst, (shl GR16:$src, CL))]>, Imp<[CL],[]>, OpSize;
+def SHL32rCL : I<0xD3, MRM4r, (ops GR32:$dst, GR32:$src),
+                 "shl{l} {%cl, $dst|$dst, %CL}",
+                 [(set GR32:$dst, (shl GR32:$src, CL))]>, Imp<[CL],[]>;
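+// The *rCL forms take a variable shift amount implicitly in %cl, and
+// Imp<[CL],[]> records that implicit use; a pattern like (shl GR32:$x, CL)
+// is only selected with the amount placed in ECX/CL first.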
+
+def SHL8ri   : Ii8<0xC0, MRM4r, (ops GR8 :$dst, GR8 :$src1, i8imm:$src2),
+                   "shl{b} {$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))]>;
+let isConvertibleToThreeAddress = 1 in {   // Can transform into LEA.
+def SHL16ri  : Ii8<0xC1, MRM4r, (ops GR16:$dst, GR16:$src1, i8imm:$src2),
+                   "shl{w} {$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))]>, OpSize;
+def SHL32ri  : Ii8<0xC1, MRM4r, (ops GR32:$dst, GR32:$src1, i8imm:$src2),
+                   "shl{l} {$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))]>;
+}
+
+// Shift left by one. Not used because (add x, x) is slightly cheaper.
+def SHL8r1   : I<0xD0, MRM4r, (ops GR8 :$dst, GR8 :$src1),
+                 "shl{b} $dst", []>;
+def SHL16r1  : I<0xD1, MRM4r, (ops GR16:$dst, GR16:$src1),
+                 "shl{w} $dst", []>, OpSize;
+def SHL32r1  : I<0xD1, MRM4r, (ops GR32:$dst, GR32:$src1),
+                 "shl{l} $dst", []>;
+
+let isTwoAddress = 0 in {
+  def SHL8mCL  : I<0xD2, MRM4m, (ops i8mem :$dst),
+                   "shl{b} {%cl, $dst|$dst, %CL}",
+                   [(store (shl (loadi8 addr:$dst), CL), addr:$dst)]>,
+                   Imp<[CL],[]>;
+  def SHL16mCL : I<0xD3, MRM4m, (ops i16mem:$dst),
+                   "shl{w} {%cl, $dst|$dst, %CL}",
+                   [(store (shl (loadi16 addr:$dst), CL), addr:$dst)]>,
+                   Imp<[CL],[]>, OpSize;
+  def SHL32mCL : I<0xD3, MRM4m, (ops i32mem:$dst),
+                   "shl{l} {%cl, $dst|$dst, %CL}",
+                   [(store (shl (loadi32 addr:$dst), CL), addr:$dst)]>,
+                   Imp<[CL],[]>;
+  def SHL8mi   : Ii8<0xC0, MRM4m, (ops i8mem :$dst, i8imm:$src),
+                     "shl{b} {$src, $dst|$dst, $src}",
+                  [(store (shl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+  def SHL16mi  : Ii8<0xC1, MRM4m, (ops i16mem:$dst, i8imm:$src),
+                     "shl{w} {$src, $dst|$dst, $src}",
+                 [(store (shl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+                     OpSize;
+  def SHL32mi  : Ii8<0xC1, MRM4m, (ops i32mem:$dst, i8imm:$src),
+                     "shl{l} {$src, $dst|$dst, $src}",
+                 [(store (shl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+
+  // Shift by 1
+  def SHL8m1   : I<0xD0, MRM4m, (ops i8mem :$dst),
+                   "shl{b} $dst",
+                  [(store (shl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+  def SHL16m1  : I<0xD1, MRM4m, (ops i16mem:$dst),
+                   "shl{w} $dst",
+                 [(store (shl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+                     OpSize;
+  def SHL32m1  : I<0xD1, MRM4m, (ops i32mem:$dst),
+                   "shl{l} $dst",
+                 [(store (shl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+}
+
+def SHR8rCL  : I<0xD2, MRM5r, (ops GR8 :$dst, GR8 :$src),
+                 "shr{b} {%cl, $dst|$dst, %CL}",
+                 [(set GR8:$dst, (srl GR8:$src, CL))]>, Imp<[CL],[]>;
+def SHR16rCL : I<0xD3, MRM5r, (ops GR16:$dst, GR16:$src),
+                 "shr{w} {%cl, $dst|$dst, %CL}",
+                 [(set GR16:$dst, (srl GR16:$src, CL))]>, Imp<[CL],[]>, OpSize;
+def SHR32rCL : I<0xD3, MRM5r, (ops GR32:$dst, GR32:$src),
+                 "shr{l} {%cl, $dst|$dst, %CL}",
+                 [(set GR32:$dst, (srl GR32:$src, CL))]>, Imp<[CL],[]>;
+
+def SHR8ri   : Ii8<0xC0, MRM5r, (ops GR8:$dst, GR8:$src1, i8imm:$src2),
+                   "shr{b} {$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (srl GR8:$src1, (i8 imm:$src2)))]>;
+def SHR16ri  : Ii8<0xC1, MRM5r, (ops GR16:$dst, GR16:$src1, i8imm:$src2),
+                   "shr{w} {$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (srl GR16:$src1, (i8 imm:$src2)))]>, OpSize;
+def SHR32ri  : Ii8<0xC1, MRM5r, (ops GR32:$dst, GR32:$src1, i8imm:$src2),
+                   "shr{l} {$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (srl GR32:$src1, (i8 imm:$src2)))]>;
+
+// Shift by 1
+def SHR8r1   : I<0xD0, MRM5r, (ops GR8:$dst, GR8:$src1),
+                 "shr{b} $dst",
+                 [(set GR8:$dst, (srl GR8:$src1, (i8 1)))]>;
+def SHR16r1  : I<0xD1, MRM5r, (ops GR16:$dst, GR16:$src1),
+                 "shr{w} $dst",
+                 [(set GR16:$dst, (srl GR16:$src1, (i8 1)))]>, OpSize;
+def SHR32r1  : I<0xD1, MRM5r, (ops GR32:$dst, GR32:$src1),
+                 "shr{l} $dst",
+                 [(set GR32:$dst, (srl GR32:$src1, (i8 1)))]>;
+
+let isTwoAddress = 0 in {
+  def SHR8mCL  : I<0xD2, MRM5m, (ops i8mem :$dst),
+                   "shr{b} {%cl, $dst|$dst, %CL}",
+                   [(store (srl (loadi8 addr:$dst), CL), addr:$dst)]>,
+                   Imp<[CL],[]>;
+  def SHR16mCL : I<0xD3, MRM5m, (ops i16mem:$dst),
+                   "shr{w} {%cl, $dst|$dst, %CL}",
+                   [(store (srl (loadi16 addr:$dst), CL), addr:$dst)]>,
+                   Imp<[CL],[]>, OpSize;
+  def SHR32mCL : I<0xD3, MRM5m, (ops i32mem:$dst),
+                   "shr{l} {%cl, $dst|$dst, %CL}",
+                   [(store (srl (loadi32 addr:$dst), CL), addr:$dst)]>,
+                   Imp<[CL],[]>;
+  def SHR8mi   : Ii8<0xC0, MRM5m, (ops i8mem :$dst, i8imm:$src),
+                     "shr{b} {$src, $dst|$dst, $src}",
+                  [(store (srl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+  def SHR16mi  : Ii8<0xC1, MRM5m, (ops i16mem:$dst, i8imm:$src),
+                     "shr{w} {$src, $dst|$dst, $src}",
+                 [(store (srl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+                     OpSize;
+  def SHR32mi  : Ii8<0xC1, MRM5m, (ops i32mem:$dst, i8imm:$src),
+                     "shr{l} {$src, $dst|$dst, $src}",
+                 [(store (srl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+
+  // Shift by 1
+  def SHR8m1   : I<0xD0, MRM5m, (ops i8mem :$dst),
+                   "shr{b} $dst",
+                  [(store (srl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+  def SHR16m1  : I<0xD1, MRM5m, (ops i16mem:$dst),
+                   "shr{w} $dst",
+                 [(store (srl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,OpSize;
+  def SHR32m1  : I<0xD1, MRM5m, (ops i32mem:$dst),
+                   "shr{l} $dst",
+                 [(store (srl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+}
+
+def SAR8rCL  : I<0xD2, MRM7r, (ops GR8 :$dst, GR8 :$src),
+                 "sar{b} {%cl, $dst|$dst, %CL}",
+                 [(set GR8:$dst, (sra GR8:$src, CL))]>, Imp<[CL],[]>;
+def SAR16rCL : I<0xD3, MRM7r, (ops GR16:$dst, GR16:$src),
+                 "sar{w} {%cl, $dst|$dst, %CL}",
+                 [(set GR16:$dst, (sra GR16:$src, CL))]>, Imp<[CL],[]>, OpSize;
+def SAR32rCL : I<0xD3, MRM7r, (ops GR32:$dst, GR32:$src),
+                 "sar{l} {%cl, $dst|$dst, %CL}",
+                 [(set GR32:$dst, (sra GR32:$src, CL))]>, Imp<[CL],[]>;
+
+def SAR8ri   : Ii8<0xC0, MRM7r, (ops GR8 :$dst, GR8 :$src1, i8imm:$src2),
+                   "sar{b} {$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (sra GR8:$src1, (i8 imm:$src2)))]>;
+def SAR16ri  : Ii8<0xC1, MRM7r, (ops GR16:$dst, GR16:$src1, i8imm:$src2),
+                   "sar{w} {$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (sra GR16:$src1, (i8 imm:$src2)))]>,
+                   OpSize;
+def SAR32ri  : Ii8<0xC1, MRM7r, (ops GR32:$dst, GR32:$src1, i8imm:$src2),
+                   "sar{l} {$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (sra GR32:$src1, (i8 imm:$src2)))]>;
+
+// Shift by 1
+def SAR8r1   : I<0xD0, MRM7r, (ops GR8 :$dst, GR8 :$src1),
+                 "sar{b} $dst",
+                 [(set GR8:$dst, (sra GR8:$src1, (i8 1)))]>;
+def SAR16r1  : I<0xD1, MRM7r, (ops GR16:$dst, GR16:$src1),
+                 "sar{w} $dst",
+                 [(set GR16:$dst, (sra GR16:$src1, (i8 1)))]>, OpSize;
+def SAR32r1  : I<0xD1, MRM7r, (ops GR32:$dst, GR32:$src1),
+                 "sar{l} $dst",
+                 [(set GR32:$dst, (sra GR32:$src1, (i8 1)))]>;
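+// Reminder on the two right shifts: srl (zero-filling) selects SHR, while
+// sra (sign-preserving) selects SAR; e.g. "x >> 3" becomes
+//   shrl $3, %eax     when x is unsigned
+//   sarl $3, %eax     when x is signed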
+
+let isTwoAddress = 0 in {
+  def SAR8mCL  : I<0xD2, MRM7m, (ops i8mem :$dst),
+                   "sar{b} {%cl, $dst|$dst, %CL}",
+                   [(store (sra (loadi8 addr:$dst), CL), addr:$dst)]>,
+                   Imp<[CL],[]>;
+  def SAR16mCL : I<0xD3, MRM7m, (ops i16mem:$dst),
+                   "sar{w} {%cl, $dst|$dst, %CL}",
+                   [(store (sra (loadi16 addr:$dst), CL), addr:$dst)]>,
+                   Imp<[CL],[]>, OpSize;
+  def SAR32mCL : I<0xD3, MRM7m, (ops i32mem:$dst), 
+                   "sar{l} {%cl, $dst|$dst, %CL}",
+                   [(store (sra (loadi32 addr:$dst), CL), addr:$dst)]>,
+                   Imp<[CL],[]>;
+  def SAR8mi   : Ii8<0xC0, MRM7m, (ops i8mem :$dst, i8imm:$src),
+                     "sar{b} {$src, $dst|$dst, $src}",
+                  [(store (sra (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+  def SAR16mi  : Ii8<0xC1, MRM7m, (ops i16mem:$dst, i8imm:$src),
+                     "sar{w} {$src, $dst|$dst, $src}",
+                 [(store (sra (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+                     OpSize;
+  def SAR32mi  : Ii8<0xC1, MRM7m, (ops i32mem:$dst, i8imm:$src),
+                     "sar{l} {$src, $dst|$dst, $src}",
+                 [(store (sra (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+
+  // Shift by 1
+  def SAR8m1   : I<0xD0, MRM7m, (ops i8mem :$dst),
+                   "sar{b} $dst",
+                  [(store (sra (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+  def SAR16m1  : I<0xD1, MRM7m, (ops i16mem:$dst),
+                   "sar{w} $dst",
+                 [(store (sra (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+                     OpSize;
+  def SAR32m1  : I<0xD1, MRM7m, (ops i32mem:$dst),
+                   "sar{l} $dst",
+                 [(store (sra (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+}
+
+// Rotate instructions
+// FIXME: provide shorter instructions when imm8 == 1
+def ROL8rCL  : I<0xD2, MRM0r, (ops GR8 :$dst, GR8 :$src),
+                 "rol{b} {%cl, $dst|$dst, %CL}",
+                 [(set GR8:$dst, (rotl GR8:$src, CL))]>, Imp<[CL],[]>;
+def ROL16rCL : I<0xD3, MRM0r, (ops GR16:$dst, GR16:$src),
+                 "rol{w} {%cl, $dst|$dst, %CL}",
+                 [(set GR16:$dst, (rotl GR16:$src, CL))]>, Imp<[CL],[]>, OpSize;
+def ROL32rCL : I<0xD3, MRM0r, (ops GR32:$dst, GR32:$src),
+                 "rol{l} {%cl, $dst|$dst, %CL}",
+                 [(set GR32:$dst, (rotl GR32:$src, CL))]>, Imp<[CL],[]>;
+
+def ROL8ri   : Ii8<0xC0, MRM0r, (ops GR8 :$dst, GR8 :$src1, i8imm:$src2),
+                   "rol{b} {$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))]>;
+def ROL16ri  : Ii8<0xC1, MRM0r, (ops GR16:$dst, GR16:$src1, i8imm:$src2),
+                   "rol{w} {$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))]>, OpSize;
+def ROL32ri  : Ii8<0xC1, MRM0r, (ops GR32:$dst, GR32:$src1, i8imm:$src2),
+                   "rol{l} {$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))]>;
+
+// Rotate by 1
+def ROL8r1   : I<0xD0, MRM0r, (ops GR8 :$dst, GR8 :$src1),
+                 "rol{b} $dst",
+                 [(set GR8:$dst, (rotl GR8:$src1, (i8 1)))]>;
+def ROL16r1  : I<0xD1, MRM0r, (ops GR16:$dst, GR16:$src1),
+                 "rol{w} $dst",
+                 [(set GR16:$dst, (rotl GR16:$src1, (i8 1)))]>, OpSize;
+def ROL32r1  : I<0xD1, MRM0r, (ops GR32:$dst, GR32:$src1),
+                 "rol{l} $dst",
+                 [(set GR32:$dst, (rotl GR32:$src1, (i8 1)))]>;
+
+let isTwoAddress = 0 in {
+  def ROL8mCL  : I<0xD2, MRM0m, (ops i8mem :$dst),
+                   "rol{b} {%cl, $dst|$dst, %CL}",
+                   [(store (rotl (loadi8 addr:$dst), CL), addr:$dst)]>,
+                   Imp<[CL],[]>;
+  def ROL16mCL : I<0xD3, MRM0m, (ops i16mem:$dst),
+                   "rol{w} {%cl, $dst|$dst, %CL}",
+                   [(store (rotl (loadi16 addr:$dst), CL), addr:$dst)]>,
+                   Imp<[CL],[]>, OpSize;
+  def ROL32mCL : I<0xD3, MRM0m, (ops i32mem:$dst),
+                   "rol{l} {%cl, $dst|$dst, %CL}",
+                   [(store (rotl (loadi32 addr:$dst), CL), addr:$dst)]>,
+                   Imp<[CL],[]>;
+  def ROL8mi   : Ii8<0xC0, MRM0m, (ops i8mem :$dst, i8imm:$src),
+                     "rol{b} {$src, $dst|$dst, $src}",
+                 [(store (rotl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+  def ROL16mi  : Ii8<0xC1, MRM0m, (ops i16mem:$dst, i8imm:$src),
+                     "rol{w} {$src, $dst|$dst, $src}",
+                [(store (rotl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+                     OpSize;
+  def ROL32mi  : Ii8<0xC1, MRM0m, (ops i32mem:$dst, i8imm:$src),
+                     "rol{l} {$src, $dst|$dst, $src}",
+                [(store (rotl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+
+  // Rotate by 1
+  def ROL8m1   : I<0xD0, MRM0m, (ops i8mem :$dst),
+                   "rol{b} $dst",
+                 [(store (rotl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+  def ROL16m1  : I<0xD1, MRM0m, (ops i16mem:$dst),
+                   "rol{w} $dst",
+                [(store (rotl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+                     OpSize;
+  def ROL32m1  : I<0xD1, MRM0m, (ops i32mem:$dst),
+                   "rol{l} $dst",
+                [(store (rotl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+}
+
+def ROR8rCL  : I<0xD2, MRM1r, (ops GR8 :$dst, GR8 :$src),
+                 "ror{b} {%cl, $dst|$dst, %CL}",
+                 [(set GR8:$dst, (rotr GR8:$src, CL))]>, Imp<[CL],[]>;
+def ROR16rCL : I<0xD3, MRM1r, (ops GR16:$dst, GR16:$src),
+                 "ror{w} {%cl, $dst|$dst, %CL}",
+                 [(set GR16:$dst, (rotr GR16:$src, CL))]>, Imp<[CL],[]>, OpSize;
+def ROR32rCL : I<0xD3, MRM1r, (ops GR32:$dst, GR32:$src),
+                 "ror{l} {%cl, $dst|$dst, %CL}",
+                 [(set GR32:$dst, (rotr GR32:$src, CL))]>, Imp<[CL],[]>;
+
+def ROR8ri   : Ii8<0xC0, MRM1r, (ops GR8 :$dst, GR8 :$src1, i8imm:$src2),
+                   "ror{b} {$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (rotr GR8:$src1, (i8 imm:$src2)))]>;
+def ROR16ri  : Ii8<0xC1, MRM1r, (ops GR16:$dst, GR16:$src1, i8imm:$src2),
+                   "ror{w} {$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (rotr GR16:$src1, (i8 imm:$src2)))]>, OpSize;
+def ROR32ri  : Ii8<0xC1, MRM1r, (ops GR32:$dst, GR32:$src1, i8imm:$src2),
+                   "ror{l} {$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))]>;
+
+// Rotate by 1
+def ROR8r1   : I<0xD0, MRM1r, (ops GR8 :$dst, GR8 :$src1),
+                 "ror{b} $dst",
+                 [(set GR8:$dst, (rotr GR8:$src1, (i8 1)))]>;
+def ROR16r1  : I<0xD1, MRM1r, (ops GR16:$dst, GR16:$src1),
+                 "ror{w} $dst",
+                 [(set GR16:$dst, (rotr GR16:$src1, (i8 1)))]>, OpSize;
+def ROR32r1  : I<0xD1, MRM1r, (ops GR32:$dst, GR32:$src1),
+                 "ror{l} $dst",
+                 [(set GR32:$dst, (rotr GR32:$src1, (i8 1)))]>;
+
+let isTwoAddress = 0 in {
+  def ROR8mCL  : I<0xD2, MRM1m, (ops i8mem :$dst),
+                   "ror{b} {%cl, $dst|$dst, %CL}",
+                   [(store (rotr (loadi8 addr:$dst), CL), addr:$dst)]>,
+                   Imp<[CL],[]>;
+  def ROR16mCL : I<0xD3, MRM1m, (ops i16mem:$dst),
+                   "ror{w} {%cl, $dst|$dst, %CL}",
+                   [(store (rotr (loadi16 addr:$dst), CL), addr:$dst)]>,
+                   Imp<[CL],[]>, OpSize;
+  def ROR32mCL : I<0xD3, MRM1m, (ops i32mem:$dst), 
+                   "ror{l} {%cl, $dst|$dst, %CL}",
+                   [(store (rotr (loadi32 addr:$dst), CL), addr:$dst)]>,
+                   Imp<[CL],[]>;
+  def ROR8mi   : Ii8<0xC0, MRM1m, (ops i8mem :$dst, i8imm:$src),
+                     "ror{b} {$src, $dst|$dst, $src}",
+                 [(store (rotr (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+  def ROR16mi  : Ii8<0xC1, MRM1m, (ops i16mem:$dst, i8imm:$src),
+                     "ror{w} {$src, $dst|$dst, $src}",
+                [(store (rotr (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+                     OpSize;
+  def ROR32mi  : Ii8<0xC1, MRM1m, (ops i32mem:$dst, i8imm:$src),
+                     "ror{l} {$src, $dst|$dst, $src}",
+                [(store (rotr (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+
+  // Rotate by 1
+  def ROR8m1   : I<0xD0, MRM1m, (ops i8mem :$dst),
+                   "ror{b} $dst",
+                 [(store (rotr (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+  def ROR16m1  : I<0xD1, MRM1m, (ops i16mem:$dst),
+                   "ror{w} $dst",
+                [(store (rotr (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+                     OpSize;
+  def ROR32m1  : I<0xD1, MRM1m, (ops i32mem:$dst),
+                   "ror{l} $dst",
+                [(store (rotr (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+}
+
+
+// Double shift instructions (generalizations of rotate)
+def SHLD32rrCL : I<0xA5, MRMDestReg, (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                   "shld{l} {%cl, $src2, $dst|$dst, $src2, %CL}",
+                   [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))]>,
+                   Imp<[CL],[]>, TB;
+def SHRD32rrCL : I<0xAD, MRMDestReg, (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                   "shrd{l} {%cl, $src2, $dst|$dst, $src2, %CL}",
+                   [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))]>,
+                   Imp<[CL],[]>, TB;
+def SHLD16rrCL : I<0xA5, MRMDestReg, (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                   "shld{w} {%cl, $src2, $dst|$dst, $src2, %CL}",
+                   [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))]>,
+                   Imp<[CL],[]>, TB, OpSize;
+def SHRD16rrCL : I<0xAD, MRMDestReg, (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                   "shrd{w} {%cl, $src2, $dst|$dst, $src2, %CL}",
+                   [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))]>,
+                   Imp<[CL],[]>, TB, OpSize;
+
+let isCommutable = 1 in {  // These instructions commute to each other.
+def SHLD32rri8 : Ii8<0xA4, MRMDestReg,
+                     (ops GR32:$dst, GR32:$src1, GR32:$src2, i8imm:$src3),
+                     "shld{l} {$src3, $src2, $dst|$dst, $src2, $src3}",
+                     [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2,
+                                      (i8 imm:$src3)))]>,
+                 TB;
+def SHRD32rri8 : Ii8<0xAC, MRMDestReg,
+                     (ops GR32:$dst, GR32:$src1, GR32:$src2, i8imm:$src3),
+                     "shrd{l} {$src3, $src2, $dst|$dst, $src2, $src3}",
+                     [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2,
+                                      (i8 imm:$src3)))]>,
+                 TB;
+def SHLD16rri8 : Ii8<0xA4, MRMDestReg,
+                     (ops GR16:$dst, GR16:$src1, GR16:$src2, i8imm:$src3),
+                     "shld{w} {$src3, $src2, $dst|$dst, $src2, $src3}",
+                     [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2,
+                                      (i8 imm:$src3)))]>,
+                     TB, OpSize;
+def SHRD16rri8 : Ii8<0xAC, MRMDestReg,
+                     (ops GR16:$dst, GR16:$src1, GR16:$src2, i8imm:$src3),
+                     "shrd{w} {$src3, $src2, $dst|$dst, $src2, $src3}",
+                     [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2,
+                                      (i8 imm:$src3)))]>,
+                     TB, OpSize;
+}
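+// "Commute to each other" relies on the identity (sketch, 32-bit case):
+//   shld $n, %ebx, %eax  computes the same value as  shrd $(32-n), %eax, %ebx
+// so the two-address pass may switch between the SHLD and SHRD forms when
+// the tied destination operand is inconvenient.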
+
+let isTwoAddress = 0 in {
+  def SHLD32mrCL : I<0xA5, MRMDestMem, (ops i32mem:$dst, GR32:$src2),
+                     "shld{l} {%cl, $src2, $dst|$dst, $src2, %CL}",
+                     [(store (X86shld (loadi32 addr:$dst), GR32:$src2, CL),
+                       addr:$dst)]>,
+                     Imp<[CL],[]>, TB;
+  def SHRD32mrCL : I<0xAD, MRMDestMem, (ops i32mem:$dst, GR32:$src2),
+                    "shrd{l} {%cl, $src2, $dst|$dst, $src2, %CL}",
+                    [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL),
+                      addr:$dst)]>,
+                    Imp<[CL],[]>, TB;
+  def SHLD32mri8 : Ii8<0xA4, MRMDestMem,
+                      (ops i32mem:$dst, GR32:$src2, i8imm:$src3),
+                      "shld{l} {$src3, $src2, $dst|$dst, $src2, $src3}",
+                      [(store (X86shld (loadi32 addr:$dst), GR32:$src2,
+                                        (i8 imm:$src3)), addr:$dst)]>,
+                      TB;
+  def SHRD32mri8 : Ii8<0xAC, MRMDestMem, 
+                       (ops i32mem:$dst, GR32:$src2, i8imm:$src3),
+                       "shrd{l} {$src3, $src2, $dst|$dst, $src2, $src3}",
+                       [(store (X86shrd (loadi32 addr:$dst), GR32:$src2,
+                                         (i8 imm:$src3)), addr:$dst)]>,
+                       TB;
+
+  def SHLD16mrCL : I<0xA5, MRMDestMem, (ops i16mem:$dst, GR16:$src2),
+                     "shld{w} {%cl, $src2, $dst|$dst, $src2, %CL}",
+                     [(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL),
+                       addr:$dst)]>,
+                     Imp<[CL],[]>, TB, OpSize;
+  def SHRD16mrCL : I<0xAD, MRMDestMem, (ops i16mem:$dst, GR16:$src2),
+                    "shrd{w} {%cl, $src2, $dst|$dst, $src2, %CL}",
+                    [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, CL),
+                      addr:$dst)]>,
+                    Imp<[CL],[]>, TB, OpSize;
+  def SHLD16mri8 : Ii8<0xA4, MRMDestMem,
+                      (ops i16mem:$dst, GR16:$src2, i8imm:$src3),
+                      "shld{w} {$src3, $src2, $dst|$dst, $src2, $src3}",
+                      [(store (X86shld (loadi16 addr:$dst), GR16:$src2,
+                                        (i8 imm:$src3)), addr:$dst)]>,
+                      TB, OpSize;
+  def SHRD16mri8 : Ii8<0xAC, MRMDestMem, 
+                       (ops i16mem:$dst, GR16:$src2, i8imm:$src3),
+                       "shrd{w} {$src3, $src2, $dst|$dst, $src2, $src3}",
+                      [(store (X86shrd (loadi16 addr:$dst), GR16:$src2,
+                                        (i8 imm:$src3)), addr:$dst)]>,
+                       TB, OpSize;
+}
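+// Typical use of the double shifts: an i64 held in a register pair, where
+// they move the bits that cross the word boundary; sketch for a left shift
+// by %cl < 32 (larger amounts need an extra compare/cmov or branch):
+//   shldl %cl, %eax, %edx     # high word picks up bits leaving the low word
+//   shll  %cl, %eax           # low word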
+
+
+// Arithmetic.
+let isCommutable = 1 in {   // X = ADD Y, Z   --> X = ADD Z, Y
+def ADD8rr   : I<0x00, MRMDestReg, (ops GR8 :$dst, GR8 :$src1, GR8 :$src2),
+                 "add{b} {$src2, $dst|$dst, $src2}",
+                 [(set GR8:$dst, (add GR8:$src1, GR8:$src2))]>;
+let isConvertibleToThreeAddress = 1 in {   // Can transform into LEA.
+def ADD16rr  : I<0x01, MRMDestReg, (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                 "add{w} {$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, (add GR16:$src1, GR16:$src2))]>, OpSize;
+def ADD32rr  : I<0x01, MRMDestReg, (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                 "add{l} {$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (add GR32:$src1, GR32:$src2))]>;
+} // end isConvertibleToThreeAddress
+} // end isCommutable
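+// As with inc/dec, the convertible-to-three-address ADD forms can become
+// LEA when the destination cannot be tied to the first source; sketch:
+//   addl %ebx, %eax            # needs dst == src1
+// may be rewritten as
+//   leal (%eax,%ebx), %ecx     # result lands in a third register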
+def ADD8rm   : I<0x02, MRMSrcMem, (ops GR8 :$dst, GR8 :$src1, i8mem :$src2),
+                 "add{b} {$src2, $dst|$dst, $src2}",
+                 [(set GR8:$dst, (add GR8:$src1, (load addr:$src2)))]>;
+def ADD16rm  : I<0x03, MRMSrcMem, (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                 "add{w} {$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, (add GR16:$src1, (load addr:$src2)))]>, OpSize;
+def ADD32rm  : I<0x03, MRMSrcMem, (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                 "add{l} {$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (add GR32:$src1, (load addr:$src2)))]>;
+
+def ADD8ri   : Ii8<0x80, MRM0r, (ops GR8:$dst, GR8:$src1, i8imm:$src2),
+                   "add{b} {$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (add GR8:$src1, imm:$src2))]>;
+
+let isConvertibleToThreeAddress = 1 in {   // Can transform into LEA.
+def ADD16ri  : Ii16<0x81, MRM0r, (ops GR16:$dst, GR16:$src1, i16imm:$src2),
+                    "add{w} {$src2, $dst|$dst, $src2}",
+                    [(set GR16:$dst, (add GR16:$src1, imm:$src2))]>, OpSize;
+def ADD32ri  : Ii32<0x81, MRM0r, (ops GR32:$dst, GR32:$src1, i32imm:$src2),
+                    "add{l} {$src2, $dst|$dst, $src2}",
+                    [(set GR32:$dst, (add GR32:$src1, imm:$src2))]>;
+def ADD16ri8 : Ii8<0x83, MRM0r, (ops GR16:$dst, GR16:$src1, i16i8imm:$src2),
+                   "add{w} {$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (add GR16:$src1, i16immSExt8:$src2))]>,
+                   OpSize;
+def ADD32ri8 : Ii8<0x83, MRM0r, (ops GR32:$dst, GR32:$src1, i32i8imm:$src2),
+                   "add{l} {$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (add GR32:$src1, i32immSExt8:$src2))]>;
+}
+
+let isTwoAddress = 0 in {
+  def ADD8mr   : I<0x00, MRMDestMem, (ops i8mem :$dst, GR8 :$src2),
+                   "add{b} {$src2, $dst|$dst, $src2}",
+                   [(store (add (load addr:$dst), GR8:$src2), addr:$dst)]>;
+  def ADD16mr  : I<0x01, MRMDestMem, (ops i16mem:$dst, GR16:$src2),
+                   "add{w} {$src2, $dst|$dst, $src2}",
+                   [(store (add (load addr:$dst), GR16:$src2), addr:$dst)]>,
+                   OpSize;
+  def ADD32mr  : I<0x01, MRMDestMem, (ops i32mem:$dst, GR32:$src2),
+                   "add{l} {$src2, $dst|$dst, $src2}",
+                   [(store (add (load addr:$dst), GR32:$src2), addr:$dst)]>;
+  def ADD8mi   : Ii8<0x80, MRM0m, (ops i8mem :$dst, i8imm :$src2),
+                     "add{b} {$src2, $dst|$dst, $src2}",
+                   [(store (add (loadi8 addr:$dst), imm:$src2), addr:$dst)]>;
+  def ADD16mi  : Ii16<0x81, MRM0m, (ops i16mem:$dst, i16imm:$src2),
+                      "add{w} {$src2, $dst|$dst, $src2}",
+                  [(store (add (loadi16 addr:$dst), imm:$src2), addr:$dst)]>,
+                   OpSize;
+  def ADD32mi  : Ii32<0x81, MRM0m, (ops i32mem:$dst, i32imm:$src2),
+                      "add{l} {$src2, $dst|$dst, $src2}",
+                  [(store (add (loadi32 addr:$dst), imm:$src2), addr:$dst)]>;
+  def ADD16mi8 : Ii8<0x83, MRM0m, (ops i16mem:$dst, i16i8imm :$src2),
+                     "add{w} {$src2, $dst|$dst, $src2}",
+                [(store (add (load addr:$dst), i16immSExt8:$src2), addr:$dst)]>,
+                   OpSize;
+  def ADD32mi8 : Ii8<0x83, MRM0m, (ops i32mem:$dst, i32i8imm :$src2),
+                     "add{l} {$src2, $dst|$dst, $src2}",
+                [(store (add (load addr:$dst), i32immSExt8:$src2), addr:$dst)]>;
+}
+
+let isCommutable = 1 in {  // X = ADC Y, Z --> X = ADC Z, Y
+def ADC32rr  : I<0x11, MRMDestReg, (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                 "adc{l} {$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (adde GR32:$src1, GR32:$src2))]>;
+}
+def ADC32rm  : I<0x13, MRMSrcMem , (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                 "adc{l} {$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (adde GR32:$src1, (load addr:$src2)))]>;
+def ADC32ri  : Ii32<0x81, MRM2r, (ops GR32:$dst, GR32:$src1, i32imm:$src2),
+                    "adc{l} {$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (adde GR32:$src1, imm:$src2))]>;
+def ADC32ri8 : Ii8<0x83, MRM2r, (ops GR32:$dst, GR32:$src1, i32i8imm:$src2),
+                   "adc{l} {$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (adde GR32:$src1, i32immSExt8:$src2))]>;
+
+let isTwoAddress = 0 in {
+  def ADC32mr  : I<0x11, MRMDestMem, (ops i32mem:$dst, GR32:$src2),
+                   "adc{l} {$src2, $dst|$dst, $src2}",
+                   [(store (adde (load addr:$dst), GR32:$src2), addr:$dst)]>;
+  def ADC32mi  : Ii32<0x81, MRM2m, (ops i32mem:$dst, i32imm:$src2),
+                      "adc{l} {$src2, $dst|$dst, $src2}",
+                  [(store (adde (loadi32 addr:$dst), imm:$src2), addr:$dst)]>;
+  def ADC32mi8 : Ii8<0x83, MRM2m, (ops i32mem:$dst, i32i8imm :$src2),
+                     "adc{l} {$src2, $dst|$dst, $src2}",
+             [(store (adde (load addr:$dst), i32immSExt8:$src2), addr:$dst)]>;
+}
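+// ADC consumes the carry produced by a preceding add, which is how a 64-bit
+// add is handled on this 32-bit target: the low halves are added first
+// (setting CF) and the high halves use adc; sketch:
+//   addl %ecx, %eax        # low halves, sets CF
+//   adcl %edx, %ebx        # high halves plus carry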
+
+def SUB8rr   : I<0x28, MRMDestReg, (ops GR8 :$dst, GR8 :$src1, GR8 :$src2),
+                 "sub{b} {$src2, $dst|$dst, $src2}",
+                 [(set GR8:$dst, (sub GR8:$src1, GR8:$src2))]>;
+def SUB16rr  : I<0x29, MRMDestReg, (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                 "sub{w} {$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, (sub GR16:$src1, GR16:$src2))]>, OpSize;
+def SUB32rr  : I<0x29, MRMDestReg, (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                 "sub{l} {$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (sub GR32:$src1, GR32:$src2))]>;
+def SUB8rm   : I<0x2A, MRMSrcMem, (ops GR8 :$dst, GR8 :$src1, i8mem :$src2),
+                 "sub{b} {$src2, $dst|$dst, $src2}",
+                 [(set GR8:$dst, (sub GR8:$src1, (load addr:$src2)))]>;
+def SUB16rm  : I<0x2B, MRMSrcMem, (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                 "sub{w} {$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, (sub GR16:$src1, (load addr:$src2)))]>, OpSize;
+def SUB32rm  : I<0x2B, MRMSrcMem, (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                 "sub{l} {$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (sub GR32:$src1, (load addr:$src2)))]>;
+
+def SUB8ri   : Ii8 <0x80, MRM5r, (ops GR8:$dst, GR8:$src1, i8imm:$src2),
+                    "sub{b} {$src2, $dst|$dst, $src2}",
+                    [(set GR8:$dst, (sub GR8:$src1, imm:$src2))]>;
+def SUB16ri  : Ii16<0x81, MRM5r, (ops GR16:$dst, GR16:$src1, i16imm:$src2),
+                    "sub{w} {$src2, $dst|$dst, $src2}",
+                    [(set GR16:$dst, (sub GR16:$src1, imm:$src2))]>, OpSize;
+def SUB32ri  : Ii32<0x81, MRM5r, (ops GR32:$dst, GR32:$src1, i32imm:$src2),
+                    "sub{l} {$src2, $dst|$dst, $src2}",
+                    [(set GR32:$dst, (sub GR32:$src1, imm:$src2))]>;
+def SUB16ri8 : Ii8<0x83, MRM5r, (ops GR16:$dst, GR16:$src1, i16i8imm:$src2),
+                   "sub{w} {$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (sub GR16:$src1, i16immSExt8:$src2))]>,
+                   OpSize;
+def SUB32ri8 : Ii8<0x83, MRM5r, (ops GR32:$dst, GR32:$src1, i32i8imm:$src2),
+                   "sub{l} {$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (sub GR32:$src1, i32immSExt8:$src2))]>;
+let isTwoAddress = 0 in {
+  def SUB8mr   : I<0x28, MRMDestMem, (ops i8mem :$dst, GR8 :$src2),
+                   "sub{b} {$src2, $dst|$dst, $src2}",
+                   [(store (sub (load addr:$dst), GR8:$src2), addr:$dst)]>;
+  def SUB16mr  : I<0x29, MRMDestMem, (ops i16mem:$dst, GR16:$src2),
+                   "sub{w} {$src2, $dst|$dst, $src2}",
+                   [(store (sub (load addr:$dst), GR16:$src2), addr:$dst)]>,
+                   OpSize;
+  def SUB32mr  : I<0x29, MRMDestMem, (ops i32mem:$dst, GR32:$src2), 
+                   "sub{l} {$src2, $dst|$dst, $src2}",
+                   [(store (sub (load addr:$dst), GR32:$src2), addr:$dst)]>;
+  def SUB8mi   : Ii8<0x80, MRM5m, (ops i8mem :$dst, i8imm:$src2), 
+                     "sub{b} {$src2, $dst|$dst, $src2}",
+                   [(store (sub (loadi8 addr:$dst), imm:$src2), addr:$dst)]>;
+  def SUB16mi  : Ii16<0x81, MRM5m, (ops i16mem:$dst, i16imm:$src2), 
+                      "sub{w} {$src2, $dst|$dst, $src2}",
+                  [(store (sub (loadi16 addr:$dst), imm:$src2), addr:$dst)]>,
+                   OpSize;
+  def SUB32mi  : Ii32<0x81, MRM5m, (ops i32mem:$dst, i32imm:$src2), 
+                      "sub{l} {$src2, $dst|$dst, $src2}",
+                  [(store (sub (loadi32 addr:$dst), imm:$src2), addr:$dst)]>;
+  def SUB16mi8 : Ii8<0x83, MRM5m, (ops i16mem:$dst, i16i8imm :$src2), 
+                     "sub{w} {$src2, $dst|$dst, $src2}",
+                [(store (sub (load addr:$dst), i16immSExt8:$src2), addr:$dst)]>,
+                   OpSize;
+  def SUB32mi8 : Ii8<0x83, MRM5m, (ops i32mem:$dst, i32i8imm :$src2), 
+                     "sub{l} {$src2, $dst|$dst, $src2}",
+                [(store (sub (load addr:$dst), i32immSExt8:$src2), addr:$dst)]>;
+}
+
+def SBB32rr    : I<0x19, MRMDestReg, (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                  "sbb{l} {$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (sube GR32:$src1, GR32:$src2))]>;
+
+let isTwoAddress = 0 in {
+  def SBB32mr  : I<0x19, MRMDestMem, (ops i32mem:$dst, GR32:$src2), 
+                   "sbb{l} {$src2, $dst|$dst, $src2}",
+                   [(store (sube (load addr:$dst), GR32:$src2), addr:$dst)]>;
+  def SBB8mi   : Ii8<0x80, MRM3m, (ops i8mem:$dst, i8imm:$src2), 
+                     "sbb{b} {$src2, $dst|$dst, $src2}",
+                   [(store (sube (loadi8 addr:$dst), imm:$src2), addr:$dst)]>;
+  def SBB32mi  : Ii32<0x81, MRM3m, (ops i32mem:$dst, i32imm:$src2), 
+                      "sbb{l} {$src2, $dst|$dst, $src2}",
+                  [(store (sube (loadi32 addr:$dst), imm:$src2), addr:$dst)]>;
+  def SBB32mi8 : Ii8<0x83, MRM3m, (ops i32mem:$dst, i32i8imm :$src2), 
+                     "sbb{l} {$src2, $dst|$dst, $src2}",
+             [(store (sube (load addr:$dst), i32immSExt8:$src2), addr:$dst)]>;
+}
+def SBB32rm  : I<0x1B, MRMSrcMem, (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                    "sbb{l} {$src2, $dst|$dst, $src2}",
+                    [(set GR32:$dst, (sube GR32:$src1, (load addr:$src2)))]>;
+def SBB32ri  : Ii32<0x81, MRM3r, (ops GR32:$dst, GR32:$src1, i32imm:$src2),
+                    "sbb{l} {$src2, $dst|$dst, $src2}",
+                    [(set GR32:$dst, (sube GR32:$src1, imm:$src2))]>;
+def SBB32ri8 : Ii8<0x83, MRM3r, (ops GR32:$dst, GR32:$src1, i32i8imm:$src2),
+                   "sbb{l} {$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (sube GR32:$src1, i32immSExt8:$src2))]>;
+
+let isCommutable = 1 in {  // X = IMUL Y, Z --> X = IMUL Z, Y
+def IMUL16rr : I<0xAF, MRMSrcReg, (ops GR16:$dst, GR16:$src1, GR16:$src2),
+                 "imul{w} {$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, (mul GR16:$src1, GR16:$src2))]>, TB, OpSize;
+def IMUL32rr : I<0xAF, MRMSrcReg, (ops GR32:$dst, GR32:$src1, GR32:$src2),
+                 "imul{l} {$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (mul GR32:$src1, GR32:$src2))]>, TB;
+}
+def IMUL16rm : I<0xAF, MRMSrcMem, (ops GR16:$dst, GR16:$src1, i16mem:$src2),
+                 "imul{w} {$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, (mul GR16:$src1, (load addr:$src2)))]>,
+                 TB, OpSize;
+def IMUL32rm : I<0xAF, MRMSrcMem, (ops GR32:$dst, GR32:$src1, i32mem:$src2),
+                 "imul{l} {$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (mul GR32:$src1, (load addr:$src2)))]>, TB;
+
+} // end Two Address instructions
+
+// Surprisingly enough, these are not two address instructions!
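+// For illustration, IMUL32rri encodes to something like "imull $5, %ecx, %eax",
+// i.e. EAX = ECX * 5, leaving ECX untouched.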
+def IMUL16rri  : Ii16<0x69, MRMSrcReg,                      // GR16 = GR16*I16
+                      (ops GR16:$dst, GR16:$src1, i16imm:$src2),
+                      "imul{w} {$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR16:$dst, (mul GR16:$src1, imm:$src2))]>, OpSize;
+def IMUL32rri  : Ii32<0x69, MRMSrcReg,                      // GR32 = GR32*I32
+                      (ops GR32:$dst, GR32:$src1, i32imm:$src2),
+                      "imul{l} {$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR32:$dst, (mul GR32:$src1, imm:$src2))]>;
+def IMUL16rri8 : Ii8<0x6B, MRMSrcReg,                       // GR16 = GR16*I8
+                     (ops GR16:$dst, GR16:$src1, i16i8imm:$src2),
+                     "imul{w} {$src2, $src1, $dst|$dst, $src1, $src2}",
+                     [(set GR16:$dst, (mul GR16:$src1, i16immSExt8:$src2))]>,
+                     OpSize;
+def IMUL32rri8 : Ii8<0x6B, MRMSrcReg,                       // GR32 = GR32*I8
+                     (ops GR32:$dst, GR32:$src1, i32i8imm:$src2),
+                     "imul{l} {$src2, $src1, $dst|$dst, $src1, $src2}",
+                     [(set GR32:$dst, (mul GR32:$src1, i32immSExt8:$src2))]>;
+
+def IMUL16rmi  : Ii16<0x69, MRMSrcMem,                      // GR16 = [mem16]*I16
+                      (ops GR16:$dst, i16mem:$src1, i16imm:$src2),
+                      "imul{w} {$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR16:$dst, (mul (load addr:$src1), imm:$src2))]>,
+                      OpSize;
+def IMUL32rmi  : Ii32<0x69, MRMSrcMem,                      // GR32 = [mem32]*I32
+                      (ops GR32:$dst, i32mem:$src1, i32imm:$src2),
+                      "imul{l} {$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR32:$dst, (mul (load addr:$src1), imm:$src2))]>;
+def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem,                       // GR16 = [mem16]*I8
+                     (ops GR16:$dst, i16mem:$src1, i16i8imm :$src2),
+                     "imul{w} {$src2, $src1, $dst|$dst, $src1, $src2}",
+                  [(set GR16:$dst, (mul (load addr:$src1), i16immSExt8:$src2))]>,
+                     OpSize;
+def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem,                       // GR32 = [mem32]*I8
+                     (ops GR32:$dst, i32mem:$src1, i32i8imm: $src2),
+                     "imul{l} {$src2, $src1, $dst|$dst, $src1, $src2}",
+                  [(set GR32:$dst, (mul (load addr:$src1), i32immSExt8:$src2))]>;
+
+//===----------------------------------------------------------------------===//
+// Test instructions are just like AND, except they don't generate a result.
+//
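+// e.g. "testl %ecx, %eax" computes EAX & ECX only to set EFLAGS; the AND
+// result itself is discarded.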
+let isCommutable = 1 in {   // TEST X, Y   --> TEST Y, X
+def TEST8rr  : I<0x84, MRMDestReg, (ops GR8:$src1, GR8:$src2),
+                 "test{b} {$src2, $src1|$src1, $src2}",
+                 [(X86cmp (and GR8:$src1, GR8:$src2), 0)]>;
+def TEST16rr : I<0x85, MRMDestReg, (ops GR16:$src1, GR16:$src2),
+                 "test{w} {$src2, $src1|$src1, $src2}",
+                 [(X86cmp (and GR16:$src1, GR16:$src2), 0)]>, OpSize;
+def TEST32rr : I<0x85, MRMDestReg, (ops GR32:$src1, GR32:$src2),
+                 "test{l} {$src2, $src1|$src1, $src2}",
+                 [(X86cmp (and GR32:$src1, GR32:$src2), 0)]>;
+}
+
+def TEST8rm  : I<0x84, MRMSrcMem, (ops GR8 :$src1, i8mem :$src2),
+                 "test{b} {$src2, $src1|$src1, $src2}",
+                 [(X86cmp (and GR8:$src1, (loadi8 addr:$src2)), 0)]>;
+def TEST16rm : I<0x85, MRMSrcMem, (ops GR16:$src1, i16mem:$src2),
+                 "test{w} {$src2, $src1|$src1, $src2}",
+                 [(X86cmp (and GR16:$src1, (loadi16 addr:$src2)), 0)]>,
+               OpSize;
+def TEST32rm : I<0x85, MRMSrcMem, (ops GR32:$src1, i32mem:$src2),
+                 "test{l} {$src2, $src1|$src1, $src2}",
+                 [(X86cmp (and GR32:$src1, (loadi32 addr:$src2)), 0)]>;
+
+def TEST8ri  : Ii8 <0xF6, MRM0r,                     // flags = GR8  & imm8
+                    (ops GR8:$src1, i8imm:$src2),
+                    "test{b} {$src2, $src1|$src1, $src2}",
+                    [(X86cmp (and GR8:$src1, imm:$src2), 0)]>;
+def TEST16ri : Ii16<0xF7, MRM0r,                     // flags = GR16 & imm16
+                    (ops GR16:$src1, i16imm:$src2),
+                    "test{w} {$src2, $src1|$src1, $src2}",
+                    [(X86cmp (and GR16:$src1, imm:$src2), 0)]>, OpSize;
+def TEST32ri : Ii32<0xF7, MRM0r,                     // flags = GR32 & imm32
+                    (ops GR32:$src1, i32imm:$src2),
+                    "test{l} {$src2, $src1|$src1, $src2}",
+                    [(X86cmp (and GR32:$src1, imm:$src2), 0)]>;
+
+def TEST8mi  : Ii8 <0xF6, MRM0m,                     // flags = [mem8]  & imm8
+                    (ops i8mem:$src1, i8imm:$src2),
+                    "test{b} {$src2, $src1|$src1, $src2}",
+                    [(X86cmp (and (loadi8 addr:$src1), imm:$src2), 0)]>;
+def TEST16mi : Ii16<0xF7, MRM0m,                     // flags = [mem16] & imm16
+                    (ops i16mem:$src1, i16imm:$src2),
+                    "test{w} {$src2, $src1|$src1, $src2}",
+                    [(X86cmp (and (loadi16 addr:$src1), imm:$src2), 0)]>,
+               OpSize;
+def TEST32mi : Ii32<0xF7, MRM0m,                     // flags = [mem32] & imm32
+                    (ops i32mem:$src1, i32imm:$src2),
+                    "test{l} {$src2, $src1|$src1, $src2}",
+                    [(X86cmp (and (loadi32 addr:$src1), imm:$src2), 0)]>;
+
+
+// Condition code ops, incl. set if equal/not equal/...
+def SAHF     : I<0x9E, RawFrm, (ops), "sahf", []>, Imp<[AH],[]>;  // flags = AH
+def LAHF     : I<0x9F, RawFrm, (ops), "lahf", []>, Imp<[],[AH]>;  // AH = flags
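+// Each SETcc below writes 1 or 0 into an 8-bit register or memory byte based
+// on EFLAGS; e.g. "sete %al" roughly yields AL = (ZF ? 1 : 0).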
+
+def SETEr    : I<0x94, MRM0r, 
+                 (ops GR8   :$dst),
+                 "sete $dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_E))]>,
+               TB;                        // GR8 = ==
+def SETEm    : I<0x94, MRM0m, 
+                 (ops i8mem:$dst),
+                 "sete $dst",
+                 [(store (X86setcc X86_COND_E), addr:$dst)]>,
+               TB;                        // [mem8] = ==
+def SETNEr   : I<0x95, MRM0r, 
+                 (ops GR8   :$dst),
+                 "setne $dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_NE))]>,
+               TB;                        // GR8 = !=
+def SETNEm   : I<0x95, MRM0m, 
+                 (ops i8mem:$dst),
+                 "setne $dst",
+                 [(store (X86setcc X86_COND_NE), addr:$dst)]>,
+               TB;                        // [mem8] = !=
+def SETLr    : I<0x9C, MRM0r, 
+                 (ops GR8   :$dst),
+                 "setl $dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_L))]>,
+               TB;                        // GR8 = <  signed
+def SETLm    : I<0x9C, MRM0m, 
+                 (ops i8mem:$dst),
+                 "setl $dst",
+                 [(store (X86setcc X86_COND_L), addr:$dst)]>,
+               TB;                        // [mem8] = <  signed
+def SETGEr   : I<0x9D, MRM0r, 
+                 (ops GR8   :$dst),
+                 "setge $dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_GE))]>,
+               TB;                        // GR8 = >= signed
+def SETGEm   : I<0x9D, MRM0m, 
+                 (ops i8mem:$dst),
+                 "setge $dst",
+                 [(store (X86setcc X86_COND_GE), addr:$dst)]>,
+               TB;                        // [mem8] = >= signed
+def SETLEr   : I<0x9E, MRM0r, 
+                 (ops GR8   :$dst),
+                 "setle $dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_LE))]>,
+               TB;                        // GR8 = <= signed
+def SETLEm   : I<0x9E, MRM0m, 
+                 (ops i8mem:$dst),
+                 "setle $dst",
+                 [(store (X86setcc X86_COND_LE), addr:$dst)]>,
+               TB;                        // [mem8] = <= signed
+def SETGr    : I<0x9F, MRM0r, 
+                 (ops GR8   :$dst),
+                 "setg $dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_G))]>,
+               TB;                        // GR8 = >  signed
+def SETGm    : I<0x9F, MRM0m, 
+                 (ops i8mem:$dst),
+                 "setg $dst",
+                 [(store (X86setcc X86_COND_G), addr:$dst)]>,
+               TB;                        // [mem8] = >  signed
+
+def SETBr    : I<0x92, MRM0r,
+                 (ops GR8   :$dst),
+                 "setb $dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_B))]>,
+               TB;                        // GR8 = <  unsign
+def SETBm    : I<0x92, MRM0m,
+                 (ops i8mem:$dst),
+                 "setb $dst",
+                 [(store (X86setcc X86_COND_B), addr:$dst)]>,
+               TB;                        // [mem8] = <  unsign
+def SETAEr   : I<0x93, MRM0r, 
+                 (ops GR8   :$dst),
+                 "setae $dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_AE))]>,
+               TB;                        // GR8 = >= unsign
+def SETAEm   : I<0x93, MRM0m, 
+                 (ops i8mem:$dst),
+                 "setae $dst",
+                 [(store (X86setcc X86_COND_AE), addr:$dst)]>,
+               TB;                        // [mem8] = >= unsign
+def SETBEr   : I<0x96, MRM0r, 
+                 (ops GR8   :$dst),
+                 "setbe $dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_BE))]>,
+               TB;                        // GR8 = <= unsign
+def SETBEm   : I<0x96, MRM0m, 
+                 (ops i8mem:$dst),
+                 "setbe $dst",
+                 [(store (X86setcc X86_COND_BE), addr:$dst)]>,
+               TB;                        // [mem8] = <= unsign
+def SETAr    : I<0x97, MRM0r, 
+                 (ops GR8   :$dst),
+                 "seta $dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_A))]>,
+               TB;                        // GR8 = >  unsign
+def SETAm    : I<0x97, MRM0m, 
+                 (ops i8mem:$dst),
+                 "seta $dst",
+                 [(store (X86setcc X86_COND_A), addr:$dst)]>,
+               TB;                        // [mem8] = >  unsign
+
+def SETSr    : I<0x98, MRM0r, 
+                 (ops GR8   :$dst),
+                 "sets $dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_S))]>,
+               TB;                        // GR8 = <sign bit>
+def SETSm    : I<0x98, MRM0m, 
+                 (ops i8mem:$dst),
+                 "sets $dst",
+                 [(store (X86setcc X86_COND_S), addr:$dst)]>,
+               TB;                        // [mem8] = <sign bit>
+def SETNSr   : I<0x99, MRM0r, 
+                 (ops GR8   :$dst),
+                 "setns $dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_NS))]>,
+               TB;                        // GR8 = !<sign bit>
+def SETNSm   : I<0x99, MRM0m, 
+                 (ops i8mem:$dst),
+                 "setns $dst",
+                 [(store (X86setcc X86_COND_NS), addr:$dst)]>,
+               TB;                        // [mem8] = !<sign bit>
+def SETPr    : I<0x9A, MRM0r, 
+                 (ops GR8   :$dst),
+                 "setp $dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_P))]>,
+               TB;                        // GR8 = parity
+def SETPm    : I<0x9A, MRM0m, 
+                 (ops i8mem:$dst),
+                 "setp $dst",
+                 [(store (X86setcc X86_COND_P), addr:$dst)]>,
+               TB;                        // [mem8] = parity
+def SETNPr   : I<0x9B, MRM0r, 
+                 (ops GR8   :$dst),
+                 "setnp $dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_NP))]>,
+               TB;                        // GR8 = not parity
+def SETNPm   : I<0x9B, MRM0m, 
+                 (ops i8mem:$dst),
+                 "setnp $dst",
+                 [(store (X86setcc X86_COND_NP), addr:$dst)]>,
+               TB;                        // [mem8] = not parity
+
+// Integer comparisons
+def CMP8rr  : I<0x38, MRMDestReg,
+                (ops GR8 :$src1, GR8 :$src2),
+                "cmp{b} {$src2, $src1|$src1, $src2}",
+                [(X86cmp GR8:$src1, GR8:$src2)]>;
+def CMP16rr : I<0x39, MRMDestReg,
+                (ops GR16:$src1, GR16:$src2),
+                "cmp{w} {$src2, $src1|$src1, $src2}",
+                [(X86cmp GR16:$src1, GR16:$src2)]>, OpSize;
+def CMP32rr : I<0x39, MRMDestReg,
+                (ops GR32:$src1, GR32:$src2),
+                "cmp{l} {$src2, $src1|$src1, $src2}",
+                [(X86cmp GR32:$src1, GR32:$src2)]>;
+def CMP8mr  : I<0x38, MRMDestMem,
+                (ops i8mem :$src1, GR8 :$src2),
+                "cmp{b} {$src2, $src1|$src1, $src2}",
+                [(X86cmp (loadi8 addr:$src1), GR8:$src2)]>;
+def CMP16mr : I<0x39, MRMDestMem,
+                (ops i16mem:$src1, GR16:$src2),
+                "cmp{w} {$src2, $src1|$src1, $src2}",
+                [(X86cmp (loadi16 addr:$src1), GR16:$src2)]>, OpSize;
+def CMP32mr : I<0x39, MRMDestMem,
+                (ops i32mem:$src1, GR32:$src2),
+                "cmp{l} {$src2, $src1|$src1, $src2}",
+                [(X86cmp (loadi32 addr:$src1), GR32:$src2)]>;
+def CMP8rm  : I<0x3A, MRMSrcMem,
+                (ops GR8 :$src1, i8mem :$src2),
+                "cmp{b} {$src2, $src1|$src1, $src2}",
+                [(X86cmp GR8:$src1, (loadi8 addr:$src2))]>;
+def CMP16rm : I<0x3B, MRMSrcMem,
+                (ops GR16:$src1, i16mem:$src2),
+                "cmp{w} {$src2, $src1|$src1, $src2}",
+                [(X86cmp GR16:$src1, (loadi16 addr:$src2))]>, OpSize;
+def CMP32rm : I<0x3B, MRMSrcMem,
+                (ops GR32:$src1, i32mem:$src2),
+                "cmp{l} {$src2, $src1|$src1, $src2}",
+                [(X86cmp GR32:$src1, (loadi32 addr:$src2))]>;
+def CMP8ri  : Ii8<0x80, MRM7r,
+                  (ops GR8:$src1, i8imm:$src2),
+                  "cmp{b} {$src2, $src1|$src1, $src2}",
+                  [(X86cmp GR8:$src1, imm:$src2)]>;
+def CMP16ri : Ii16<0x81, MRM7r,
+                   (ops GR16:$src1, i16imm:$src2),
+                   "cmp{w} {$src2, $src1|$src1, $src2}",
+                   [(X86cmp GR16:$src1, imm:$src2)]>, OpSize;
+def CMP32ri : Ii32<0x81, MRM7r,
+                   (ops GR32:$src1, i32imm:$src2),
+                   "cmp{l} {$src2, $src1|$src1, $src2}",
+                   [(X86cmp GR32:$src1, imm:$src2)]>;
+def CMP8mi  : Ii8 <0x80, MRM7m,
+                   (ops i8mem :$src1, i8imm :$src2),
+                   "cmp{b} {$src2, $src1|$src1, $src2}",
+                   [(X86cmp (loadi8 addr:$src1), imm:$src2)]>;
+def CMP16mi : Ii16<0x81, MRM7m,
+                   (ops i16mem:$src1, i16imm:$src2),
+                   "cmp{w} {$src2, $src1|$src1, $src2}",
+                   [(X86cmp (loadi16 addr:$src1), imm:$src2)]>, OpSize;
+def CMP32mi : Ii32<0x81, MRM7m,
+                   (ops i32mem:$src1, i32imm:$src2),
+                   "cmp{l} {$src2, $src1|$src1, $src2}",
+                   [(X86cmp (loadi32 addr:$src1), imm:$src2)]>;
+def CMP16ri8 : Ii8<0x83, MRM7r,
+                   (ops GR16:$src1, i16i8imm:$src2),
+                   "cmp{w} {$src2, $src1|$src1, $src2}",
+                   [(X86cmp GR16:$src1, i16immSExt8:$src2)]>, OpSize;
+def CMP16mi8 : Ii8<0x83, MRM7m,
+                   (ops i16mem:$src1, i16i8imm:$src2),
+                   "cmp{w} {$src2, $src1|$src1, $src2}",
+                   [(X86cmp (loadi16 addr:$src1), i16immSExt8:$src2)]>, OpSize;
+def CMP32mi8 : Ii8<0x83, MRM7m,
+                   (ops i32mem:$src1, i32i8imm:$src2),
+                   "cmp{l} {$src2, $src1|$src1, $src2}",
+                   [(X86cmp (loadi32 addr:$src1), i32immSExt8:$src2)]>;
+def CMP32ri8 : Ii8<0x83, MRM7r,
+                   (ops GR32:$src1, i32i8imm:$src2),
+                   "cmp{l} {$src2, $src1|$src1, $src2}",
+                   [(X86cmp GR32:$src1, i32immSExt8:$src2)]>;
+
+// Sign/Zero extenders
+def MOVSX16rr8 : I<0xBE, MRMSrcReg, (ops GR16:$dst, GR8 :$src),
+                   "movs{bw|x} {$src, $dst|$dst, $src}",
+                   [(set GR16:$dst, (sext GR8:$src))]>, TB, OpSize;
+def MOVSX16rm8 : I<0xBE, MRMSrcMem, (ops GR16:$dst, i8mem :$src),
+                   "movs{bw|x} {$src, $dst|$dst, $src}",
+                   [(set GR16:$dst, (sextloadi16i8 addr:$src))]>, TB, OpSize;
+def MOVSX32rr8 : I<0xBE, MRMSrcReg, (ops GR32:$dst, GR8 :$src),
+                   "movs{bl|x} {$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (sext GR8:$src))]>, TB;
+def MOVSX32rm8 : I<0xBE, MRMSrcMem, (ops GR32:$dst, i8mem :$src),
+                   "movs{bl|x} {$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (sextloadi32i8 addr:$src))]>, TB;
+def MOVSX32rr16: I<0xBF, MRMSrcReg, (ops GR32:$dst, GR16:$src),
+                   "movs{wl|x} {$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (sext GR16:$src))]>, TB;
+def MOVSX32rm16: I<0xBF, MRMSrcMem, (ops GR32:$dst, i16mem:$src),
+                   "movs{wl|x} {$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (sextloadi32i16 addr:$src))]>, TB;
+
+def MOVZX16rr8 : I<0xB6, MRMSrcReg, (ops GR16:$dst, GR8 :$src),
+                   "movz{bw|x} {$src, $dst|$dst, $src}",
+                   [(set GR16:$dst, (zext GR8:$src))]>, TB, OpSize;
+def MOVZX16rm8 : I<0xB6, MRMSrcMem, (ops GR16:$dst, i8mem :$src),
+                   "movz{bw|x} {$src, $dst|$dst, $src}",
+                   [(set GR16:$dst, (zextloadi16i8 addr:$src))]>, TB, OpSize;
+def MOVZX32rr8 : I<0xB6, MRMSrcReg, (ops GR32:$dst, GR8 :$src),
+                   "movz{bl|x} {$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (zext GR8:$src))]>, TB;
+def MOVZX32rm8 : I<0xB6, MRMSrcMem, (ops GR32:$dst, i8mem :$src),
+                   "movz{bl|x} {$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (zextloadi32i8 addr:$src))]>, TB;
+def MOVZX32rr16: I<0xB7, MRMSrcReg, (ops GR32:$dst, GR16:$src),
+                   "movz{wl|x} {$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (zext GR16:$src))]>, TB;
+def MOVZX32rm16: I<0xB7, MRMSrcMem, (ops GR32:$dst, i16mem:$src),
+                   "movz{wl|x} {$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (zextloadi32i16 addr:$src))]>, TB;
+
+def CBW : I<0x98, RawFrm, (ops),
+            "{cbtw|cbw}", []>, Imp<[AL],[AX]>, OpSize;   // AX = signext(AL)
+def CWDE : I<0x98, RawFrm, (ops),
+            "{cwtl|cwde}", []>, Imp<[AX],[EAX]>;   // EAX = signext(AX)
+
+def CWD : I<0x99, RawFrm, (ops),
+            "{cwtd|cwd}", []>, Imp<[AX],[AX,DX]>, OpSize; // DX:AX = signext(AX)
+def CDQ : I<0x99, RawFrm, (ops),
+            "{cltd|cdq}", []>, Imp<[EAX],[EAX,EDX]>; // EDX:EAX = signext(EAX)
+          
+
+//===----------------------------------------------------------------------===//
+// Alias Instructions
+//===----------------------------------------------------------------------===//
+
+// Alias instructions that map movr0 to xor.
+// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
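+// (The xor form is also shorter: "xorl %eax, %eax" is 2 bytes, while
+// "movl $0, %eax" is 5 bytes.)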
+def MOV8r0   : I<0x30, MRMInitReg, (ops GR8 :$dst),
+                 "xor{b} $dst, $dst",
+                 [(set GR8:$dst, 0)]>;
+def MOV16r0  : I<0x31, MRMInitReg,  (ops GR16:$dst), 
+                 "xor{w} $dst, $dst",
+                 [(set GR16:$dst, 0)]>, OpSize;
+def MOV32r0  : I<0x31, MRMInitReg,  (ops GR32:$dst), 
+                 "xor{l} $dst, $dst",
+                 [(set GR32:$dst, 0)]>;
+
+// Basic operations on GR16 / GR32 subclasses GR16_ and GR32_, which contain only
+// those registers that have GR8 sub-registers (i.e. AX - DX, EAX - EDX).
+def MOV16to16_ : I<0x89, MRMDestReg, (ops GR16_:$dst, GR16:$src),
+                "mov{w} {$src, $dst|$dst, $src}", []>, OpSize;
+def MOV32to32_ : I<0x89, MRMDestReg, (ops GR32_:$dst, GR32:$src),
+                "mov{l} {$src, $dst|$dst, $src}", []>;
+
+def MOV16_rr : I<0x89, MRMDestReg, (ops GR16_:$dst, GR16_:$src),
+                "mov{w} {$src, $dst|$dst, $src}", []>, OpSize;
+def MOV32_rr : I<0x89, MRMDestReg, (ops GR32_:$dst, GR32_:$src),
+                "mov{l} {$src, $dst|$dst, $src}", []>;
+def MOV16_rm : I<0x8B, MRMSrcMem, (ops GR16_:$dst, i16mem:$src),
+                "mov{w} {$src, $dst|$dst, $src}", []>, OpSize;
+def MOV32_rm : I<0x8B, MRMSrcMem, (ops GR32_:$dst, i32mem:$src),
+                "mov{l} {$src, $dst|$dst, $src}", []>;
+def MOV16_mr : I<0x89, MRMDestMem, (ops i16mem:$dst, GR16_:$src),
+                "mov{w} {$src, $dst|$dst, $src}", []>, OpSize;
+def MOV32_mr : I<0x89, MRMDestMem, (ops i32mem:$dst, GR32_:$src),
+                "mov{l} {$src, $dst|$dst, $src}", []>;
+
+//===----------------------------------------------------------------------===//
+// Thread Local Storage Instructions
+//
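+// Sketch of the model assumed here: on x86 ELF the thread pointer is readable
+// at %gs:0, so a thread-local slot is reached either directly as %gs:offset or
+// by adding its offset to the value loaded from %gs:0.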
+
+def TLS_addr : I<0, Pseudo, (ops GR32:$dst, i32imm:$sym),
+               "leal ${sym:mem}(,%ebx,1), $dst",
+               [(set GR32:$dst, (X86tlsaddr tglobaltlsaddr:$sym))]>,
+               Imp<[EBX],[]>;
+
+let AddedComplexity = 10 in
+def TLS_gs_rr : I<0, Pseudo, (ops GR32:$dst, GR32:$src),
+                  "movl %gs:($src), $dst",
+                  [(set GR32:$dst, (load (add X86TLStp, GR32:$src)))]>;
+
+let AddedComplexity = 15 in
+def TLS_gs_ri : I<0, Pseudo, (ops GR32:$dst, i32imm:$src),
+                  "movl %gs:${src:mem}, $dst",
+                  [(set GR32:$dst,
+                    (load (add X86TLStp, (X86Wrapper tglobaltlsaddr:$src))))]>;
+
+def TLS_tp : I<0, Pseudo, (ops GR32:$dst),
+               "movl %gs:0, $dst",
+               [(set GR32:$dst, X86TLStp)]>;
+
+//===----------------------------------------------------------------------===//
+// DWARF Pseudo Instructions
+//
+
+def DWARF_LOC   : I<0, Pseudo, (ops i32imm:$line, i32imm:$col, i32imm:$file),
+                    "; .loc $file, $line, $col",
+                    [(dwarf_loc (i32 imm:$line), (i32 imm:$col),
+                      (i32 imm:$file))]>;
+
+//===----------------------------------------------------------------------===//
+// EH Pseudo Instructions
+//
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+    hasCtrlDep = 1, noResults = 1 in {
+def EH_RETURN   : I<0xC3, RawFrm, (ops GR32:$addr),
+                    "ret #eh_return, addr: $addr",
+                    [(X86ehret GR32:$addr)]>;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable
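+// i.e. a wrapped symbolic address is materialized as an immediate move,
+// roughly "movl $symbol, %eax".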
+def : Pat<(i32 (X86Wrapper tconstpool  :$dst)), (MOV32ri tconstpool  :$dst)>;
+def : Pat<(i32 (X86Wrapper tjumptable  :$dst)), (MOV32ri tjumptable  :$dst)>;
+def : Pat<(i32 (X86Wrapper tglobaltlsaddr:$dst)), (MOV32ri tglobaltlsaddr:$dst)>;
+def : Pat<(i32 (X86Wrapper tglobaladdr :$dst)), (MOV32ri tglobaladdr :$dst)>;
+def : Pat<(i32 (X86Wrapper texternalsym:$dst)), (MOV32ri texternalsym:$dst)>;
+
+def : Pat<(add GR32:$src1, (X86Wrapper tconstpool:$src2)),
+          (ADD32ri GR32:$src1, tconstpool:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper tjumptable:$src2)),
+          (ADD32ri GR32:$src1, tjumptable:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper tglobaladdr :$src2)),
+          (ADD32ri GR32:$src1, tglobaladdr:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper texternalsym:$src2)),
+          (ADD32ri GR32:$src1, texternalsym:$src2)>;
+
+def : Pat<(store (i32 (X86Wrapper tglobaladdr:$src)), addr:$dst),
+          (MOV32mi addr:$dst, tglobaladdr:$src)>;
+def : Pat<(store (i32 (X86Wrapper texternalsym:$src)), addr:$dst),
+          (MOV32mi addr:$dst, texternalsym:$src)>;
+
+// Calls
+def : Pat<(X86tailcall GR32:$dst),
+          (CALL32r     GR32:$dst)>;
+
+def : Pat<(X86tailcall (i32 tglobaladdr:$dst)),
+          (CALLpcrel32 tglobaladdr:$dst)>;
+def : Pat<(X86tailcall (i32 texternalsym:$dst)),
+          (CALLpcrel32 texternalsym:$dst)>;
+
+def : Pat<(X86call (i32 tglobaladdr:$dst)),
+          (CALLpcrel32 tglobaladdr:$dst)>;
+def : Pat<(X86call (i32 texternalsym:$dst)),
+          (CALLpcrel32 texternalsym:$dst)>;
+
+// X86 specific add which produces a flag.
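+// addc/adde (and subc/sube below) are the carry-producing and carry-consuming
+// halves used when legalizing wider arithmetic on this 32-bit target; e.g. an
+// i64 add is typically lowered to an "addl" followed by an "adcl" on the high
+// halves.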
+def : Pat<(addc GR32:$src1, GR32:$src2),
+          (ADD32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(addc GR32:$src1, (load addr:$src2)),
+          (ADD32rm GR32:$src1, addr:$src2)>;
+def : Pat<(addc GR32:$src1, imm:$src2),
+          (ADD32ri GR32:$src1, imm:$src2)>;
+def : Pat<(addc GR32:$src1, i32immSExt8:$src2),
+          (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+def : Pat<(subc GR32:$src1, GR32:$src2),
+          (SUB32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(subc GR32:$src1, (load addr:$src2)),
+          (SUB32rm GR32:$src1, addr:$src2)>;
+def : Pat<(subc GR32:$src1, imm:$src2),
+          (SUB32ri GR32:$src1, imm:$src2)>;
+def : Pat<(subc GR32:$src1, i32immSExt8:$src2),
+          (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+def : Pat<(truncstorei1 (i8 imm:$src), addr:$dst), 
+          (MOV8mi addr:$dst, imm:$src)>;
+def : Pat<(truncstorei1 GR8:$src, addr:$dst), 
+          (MOV8mr addr:$dst, GR8:$src)>;
+
+// Comparisons.
+
+// TEST R,R is smaller than CMP R,0
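+// ("testl %eax, %eax" is 2 bytes, versus 3 bytes for "cmpl $0, %eax" even with
+// the sign-extended imm8 form.)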
+def : Pat<(X86cmp GR8:$src1, 0),
+          (TEST8rr GR8:$src1, GR8:$src1)>;
+def : Pat<(X86cmp GR16:$src1, 0),
+          (TEST16rr GR16:$src1, GR16:$src1)>;
+def : Pat<(X86cmp GR32:$src1, 0),
+          (TEST32rr GR32:$src1, GR32:$src1)>;
+
+// {s|z}extload bool -> {s|z}extload byte
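+// (i1 values occupy a full byte in memory, so a bool extending load is simply
+// selected as the corresponding byte load.)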
+def : Pat<(sextloadi16i1 addr:$src), (MOVSX16rm8 addr:$src)>;
+def : Pat<(sextloadi32i1 addr:$src), (MOVSX32rm8 addr:$src)>;
+def : Pat<(zextloadi8i1  addr:$src), (MOV8rm     addr:$src)>;
+def : Pat<(zextloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>;
+def : Pat<(zextloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
+
+// extload bool -> extload byte
+def : Pat<(extloadi8i1 addr:$src),   (MOV8rm      addr:$src)>;
+def : Pat<(extloadi16i1 addr:$src),  (MOVZX16rm8  addr:$src)>;
+def : Pat<(extloadi32i1 addr:$src),  (MOVZX32rm8  addr:$src)>;
+def : Pat<(extloadi16i8 addr:$src),  (MOVZX16rm8  addr:$src)>;
+def : Pat<(extloadi32i8 addr:$src),  (MOVZX32rm8  addr:$src)>;
+def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>;
+
+// anyext -> zext
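+// anyext only promises meaningful low bits, so the zero-extending forms are a
+// valid (and compact) way to realize it.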
+def : Pat<(i16 (anyext GR8 :$src)), (MOVZX16rr8  GR8 :$src)>;
+def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8  GR8 :$src)>;
+def : Pat<(i32 (anyext GR16:$src)), (MOVZX32rr16 GR16:$src)>;
+def : Pat<(i16 (anyext (loadi8  addr:$src))), (MOVZX16rm8  addr:$src)>;
+def : Pat<(i32 (anyext (loadi8  addr:$src))), (MOVZX32rm8  addr:$src)>;
+def : Pat<(i32 (anyext (loadi16 addr:$src))), (MOVZX32rm16 addr:$src)>;
+
+//===----------------------------------------------------------------------===//
+// Some peepholes
+//===----------------------------------------------------------------------===//
+
+// (shl x, 1) ==> (add x, x)
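+// e.g. a 32-bit "x << 1" is selected as "addl %eax, %eax" rather than
+// "shll $1, %eax"; the add form is generally no more expensive.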
+def : Pat<(shl GR8 :$src1, (i8 1)), (ADD8rr  GR8 :$src1, GR8 :$src1)>;
+def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>;
+def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>;
+
+// (or (x >> c) | (y << (32 - c))) ==> (shrd32 x, y, c)
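+// Worked example: with the count in CL, (x >> CL) | (y << (32 - CL)) funnels
+// the low bits of y into the top of x, which is exactly what a single
+// "shrdl %cl, y, x" computes (for counts 1-31).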
+def : Pat<(or (srl GR32:$src1, CL:$amt),
+              (shl GR32:$src2, (sub 32, CL:$amt))),
+          (SHRD32rrCL GR32:$src1, GR32:$src2)>;
+
+def : Pat<(store (or (srl (loadi32 addr:$dst), CL:$amt),
+                     (shl GR32:$src2, (sub 32, CL:$amt))), addr:$dst),
+          (SHRD32mrCL addr:$dst, GR32:$src2)>;
+
+// (or (x << c) | (y >> (32 - c))) ==> (shld32 x, y, c)
+def : Pat<(or (shl GR32:$src1, CL:$amt),
+              (srl GR32:$src2, (sub 32, CL:$amt))),
+          (SHLD32rrCL GR32:$src1, GR32:$src2)>;
+
+def : Pat<(store (or (shl (loadi32 addr:$dst), CL:$amt),
+                     (srl GR32:$src2, (sub 32, CL:$amt))), addr:$dst),
+          (SHLD32mrCL addr:$dst, GR32:$src2)>;
+
+// (or (x >> c) | (y << (16 - c))) ==> (shrd16 x, y, c)
+def : Pat<(or (srl GR16:$src1, CL:$amt),
+              (shl GR16:$src2, (sub 16, CL:$amt))),
+          (SHRD16rrCL GR16:$src1, GR16:$src2)>;
+
+def : Pat<(store (or (srl (loadi16 addr:$dst), CL:$amt),
+                     (shl GR16:$src2, (sub 16, CL:$amt))), addr:$dst),
+          (SHRD16mrCL addr:$dst, GR16:$src2)>;
+
+// (or (x << c) | (y >> (16 - c))) ==> (shld16 x, y, c)
+def : Pat<(or (shl GR16:$src1, CL:$amt),
+              (srl GR16:$src2, (sub 16, CL:$amt))),
+          (SHLD16rrCL GR16:$src1, GR16:$src2)>;
+
+def : Pat<(store (or (shl (loadi16 addr:$dst), CL:$amt),
+                     (srl GR16:$src2, (sub 16, CL:$amt))), addr:$dst),
+          (SHLD16mrCL addr:$dst, GR16:$src2)>;
+
+
+//===----------------------------------------------------------------------===//
+// Floating Point Stack Support
+//===----------------------------------------------------------------------===//
+
+include "X86InstrFPStack.td"
+
+//===----------------------------------------------------------------------===//
+// MMX and XMM Packed Integer support (requires MMX, SSE, and SSE2)
+//===----------------------------------------------------------------------===//
+
+include "X86InstrMMX.td"
+
+//===----------------------------------------------------------------------===//
+// XMM Floating point support (requires SSE / SSE2)
+//===----------------------------------------------------------------------===//
+
+include "X86InstrSSE.td"
+
+//===----------------------------------------------------------------------===//
+// X86-64 Support
+//===----------------------------------------------------------------------===//
+
+include "X86InstrX86-64.td"
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
new file mode 100644
index 0000000..c774460
--- /dev/null
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -0,0 +1,645 @@
+//====- X86InstrMMX.td - Describe the X86 Instruction Set --*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Evan Cheng and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 MMX instruction set, defining the instructions,
+// and properties of the instructions which are needed for code generation,
+// machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction templates
+//===----------------------------------------------------------------------===//
+
+// MMXI   - MMX instructions with TB prefix.
+// MMXRI  - MMX instructions with TB and REX.W prefixes.
+// MMX2I  - MMX / SSE2 instructions with TB and OpSize prefixes.
+// MMXIi8 - MMX instructions with ImmT == Imm8 and TB prefix.
+// MMXID  - MMX instructions with XD prefix.
+// MMXIS  - MMX instructions with XS prefix.
+class MMXI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : I<o, F, ops, asm, pattern>, TB, Requires<[HasMMX]>;
+class MMXRI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : I<o, F, ops, asm, pattern>, TB, REX_W, Requires<[HasMMX]>;
+class MMX2I<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : I<o, F, ops, asm, pattern>, TB, OpSize, Requires<[HasMMX]>;
+class MMXIi8<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : Ii8<o, F, ops, asm, pattern>, TB, Requires<[HasMMX]>;
+class MMXID<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : Ii8<o, F, ops, asm, pattern>, XD, Requires<[HasMMX]>;
+class MMXIS<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : Ii8<o, F, ops, asm, pattern>, XS, Requires<[HasMMX]>;
+
+// Some 'special' instructions
+def IMPLICIT_DEF_VR64 : I<0, Pseudo, (ops VR64:$dst),
+                          "#IMPLICIT_DEF $dst",
+                          [(set VR64:$dst, (v8i8 (undef)))]>,
+                        Requires<[HasMMX]>;
+
+// 64-bit vector undefs.
+def : Pat<(v8i8  (undef)), (IMPLICIT_DEF_VR64)>;
+def : Pat<(v4i16 (undef)), (IMPLICIT_DEF_VR64)>;
+def : Pat<(v2i32 (undef)), (IMPLICIT_DEF_VR64)>;
+def : Pat<(v1i64 (undef)), (IMPLICIT_DEF_VR64)>;
+
+//===----------------------------------------------------------------------===//
+// MMX Pattern Fragments
+//===----------------------------------------------------------------------===//
+
+def load_mmx : PatFrag<(ops node:$ptr), (v1i64 (load node:$ptr))>;
+
+def bc_v8i8  : PatFrag<(ops node:$in), (v8i8  (bitconvert node:$in))>;
+def bc_v4i16 : PatFrag<(ops node:$in), (v4i16 (bitconvert node:$in))>;
+def bc_v2i32 : PatFrag<(ops node:$in), (v2i32 (bitconvert node:$in))>;
+def bc_v1i64 : PatFrag<(ops node:$in), (v1i64 (bitconvert node:$in))>;
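+// These bitconvert fragments let a single v1i64 load feed patterns of any MMX
+// element type, e.g. (bc_v8i8 (load_mmx addr:$src2)) in the unpack patterns
+// below.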
+
+//===----------------------------------------------------------------------===//
+// MMX Masks
+//===----------------------------------------------------------------------===//
+
+// MMX_SHUFFLE_get_shuf_imm xform function: convert vector_shuffle mask to
+// PSHUFW imm.
+def MMX_SHUFFLE_get_shuf_imm : SDNodeXForm<build_vector, [{
+  return getI8Imm(X86::getShuffleSHUFImmediate(N));
+}]>;
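+// For example, a 4 x i16 reorder of <3, 2, 1, 0> (reverse the elements) would
+// become the PSHUFW immediate 0x1B (0b00_01_10_11), assuming two bits per
+// destination element starting at bit 0.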
+
+// Patterns for: vector_shuffle v1, v2, <2, 6, 3, 7, ...>
+def MMX_UNPCKH_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isUNPCKHMask(N);
+}]>;
+
+// Patterns for: vector_shuffle v1, v2, <0, 4, 2, 5, ...>
+def MMX_UNPCKL_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isUNPCKLMask(N);
+}]>;
+
+// Patterns for: vector_shuffle v1, <undef>, <0, 0, 1, 1, ...>
+def MMX_UNPCKH_v_undef_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isUNPCKH_v_undef_Mask(N);
+}]>;
+
+// Patterns for: vector_shuffle v1, <undef>, <2, 2, 3, 3, ...>
+def MMX_UNPCKL_v_undef_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isUNPCKL_v_undef_Mask(N);
+}]>;
+
+// Patterns for shuffling.
+def MMX_PSHUFW_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isPSHUFDMask(N);
+}], MMX_SHUFFLE_get_shuf_imm>;
+
+// Patterns for: vector_shuffle v1, v2, <4, 5, 2, 3>; etc.
+def MMX_MOVL_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isMOVLMask(N);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// MMX Multiclasses
+//===----------------------------------------------------------------------===//
+
+let isTwoAddress = 1 in {
+  // MMXI_binop_rm - Simple MMX binary operator.
+  multiclass MMXI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                           ValueType OpVT, bit Commutable = 0> {
+    def rr : MMXI<opc, MRMSrcReg, (ops VR64:$dst, VR64:$src1, VR64:$src2),
+                  !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+                  [(set VR64:$dst, (OpVT (OpNode VR64:$src1, VR64:$src2)))]> {
+      let isCommutable = Commutable;
+    }
+    def rm : MMXI<opc, MRMSrcMem, (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+                  !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+                  [(set VR64:$dst, (OpVT (OpNode VR64:$src1,
+                                         (bitconvert
+                                          (load_mmx addr:$src2)))))]>;
+  }
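+  // For instance, "defm MMX_PADDB : MMXI_binop_rm<0xFC, "paddb", add, v8i8, 1>;"
+  // below expands to the register-register MMX_PADDBrr and register-memory
+  // MMX_PADDBrm forms.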
+
+  multiclass MMXI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+                               bit Commutable = 0> {
+    def rr : MMXI<opc, MRMSrcReg, (ops VR64:$dst, VR64:$src1, VR64:$src2),
+                 !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+                 [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]> {
+      let isCommutable = Commutable;
+    }
+    def rm : MMXI<opc, MRMSrcMem, (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+                 !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+                 [(set VR64:$dst, (IntId VR64:$src1,
+                                   (bitconvert (load_mmx addr:$src2))))]>;
+  }
+
+  // MMXI_binop_rm_v1i64 - Simple MMX binary operator whose type is v1i64.
+  //
+  // FIXME: we could eliminate this and use MMXI_binop_rm instead if tblgen knew
+  // to collapse (bitconvert VT to VT) into its operand.
+  //
+  multiclass MMXI_binop_rm_v1i64<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                 bit Commutable = 0> {
+    def rr : MMXI<opc, MRMSrcReg, (ops VR64:$dst, VR64:$src1, VR64:$src2),
+                  !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+                  [(set VR64:$dst, (v1i64 (OpNode VR64:$src1, VR64:$src2)))]> {
+      let isCommutable = Commutable;
+    }
+    def rm : MMXI<opc, MRMSrcMem, (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+                  !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+                  [(set VR64:$dst,
+                    (OpNode VR64:$src1,(load_mmx addr:$src2)))]>;
+  }
+
+  multiclass MMXI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
+                                string OpcodeStr, Intrinsic IntId> {
+    def rr : MMXI<opc, MRMSrcReg, (ops VR64:$dst, VR64:$src1, VR64:$src2),
+                  !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+                  [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]>;
+    def rm : MMXI<opc, MRMSrcMem, (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+                  !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+                  [(set VR64:$dst, (IntId VR64:$src1,
+                                    (bitconvert (load_mmx addr:$src2))))]>;
+    def ri : MMXIi8<opc2, ImmForm, (ops VR64:$dst, VR64:$src1, i32i8imm:$src2),
+                    !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+                    [(set VR64:$dst, (IntId VR64:$src1,
+                                      (scalar_to_vector (i32 imm:$src2))))]>;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// MMX EMMS & FEMMS Instructions
+//===----------------------------------------------------------------------===//
+
+def MMX_EMMS  : MMXI<0x77, RawFrm, (ops), "emms",  [(int_x86_mmx_emms)]>;
+def MMX_FEMMS : MMXI<0x0E, RawFrm, (ops), "femms", [(int_x86_mmx_femms)]>;
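+// emms (and the faster 3DNow! femms) roughly marks the x87 register stack as
+// empty so ordinary FP code can run again after a block of MMX code.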
+
+//===----------------------------------------------------------------------===//
+// MMX Scalar Instructions
+//===----------------------------------------------------------------------===//
+
+// Data Transfer Instructions
+def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (ops VR64:$dst, GR32:$src),
+                        "movd {$src, $dst|$dst, $src}", []>;
+def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (ops VR64:$dst, i32mem:$src),
+                        "movd {$src, $dst|$dst, $src}", []>;
+def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (ops i32mem:$dst, VR64:$src),
+                        "movd {$src, $dst|$dst, $src}", []>;
+
+def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (ops VR64:$dst, GR64:$src),
+                             "movd {$src, $dst|$dst, $src}", []>;
+
+def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (ops VR64:$dst, VR64:$src),
+                        "movq {$src, $dst|$dst, $src}", []>;
+def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (ops VR64:$dst, i64mem:$src),
+                        "movq {$src, $dst|$dst, $src}",
+                        [(set VR64:$dst, (load_mmx addr:$src))]>;
+def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (ops i64mem:$dst, VR64:$src),
+                        "movq {$src, $dst|$dst, $src}",
+                        [(store (v1i64 VR64:$src), addr:$dst)]>;
+
+def MMX_MOVDQ2Qrr : MMXID<0xD6, MRMDestMem, (ops VR64:$dst, VR128:$src),
+                          "movdq2q {$src, $dst|$dst, $src}",
+                          [(set VR64:$dst,
+                            (v1i64 (vector_extract (v2i64 VR128:$src),
+                                  (iPTR 0))))]>;
+
+def MMX_MOVQ2DQrr : MMXIS<0xD6, MRMDestMem, (ops VR128:$dst, VR64:$src),
+                          "movq2dq {$src, $dst|$dst, $src}",
+                          [(set VR128:$dst,
+                            (bitconvert (v1i64 VR64:$src)))]>;
+
+def MMX_MOVNTQmr  : MMXI<0xE7, MRMDestMem, (ops i64mem:$dst, VR64:$src),
+                         "movntq {$src, $dst|$dst, $src}",
+                         [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)]>;
+
+let AddedComplexity = 15 in
+// movd to MMX register zero-extends
+def MMX_MOVZDI2PDIrr : MMX2I<0x6E, MRMSrcReg, (ops VR64:$dst, GR32:$src),
+                             "movd {$src, $dst|$dst, $src}",
+                             [(set VR64:$dst,
+                               (v2i32 (vector_shuffle immAllZerosV,
+                                       (v2i32 (scalar_to_vector GR32:$src)),
+                                       MMX_MOVL_shuffle_mask)))]>;
+let AddedComplexity = 20 in
+def MMX_MOVZDI2PDIrm : MMX2I<0x6E, MRMSrcMem, (ops VR64:$dst, i32mem:$src),
+                             "movd {$src, $dst|$dst, $src}",
+                             [(set VR64:$dst,
+                               (v2i32 (vector_shuffle immAllZerosV,
+                                       (v2i32 (scalar_to_vector
+                                               (loadi32 addr:$src))),
+                                       MMX_MOVL_shuffle_mask)))]>;
+
+// Arithmetic Instructions
+
+// -- Addition
+defm MMX_PADDB : MMXI_binop_rm<0xFC, "paddb", add, v8i8,  1>;
+defm MMX_PADDW : MMXI_binop_rm<0xFD, "paddw", add, v4i16, 1>;
+defm MMX_PADDD : MMXI_binop_rm<0xFE, "paddd", add, v2i32, 1>;
+defm MMX_PADDQ : MMXI_binop_rm<0xD4, "paddq", add, v1i64, 1>;
+
+defm MMX_PADDSB  : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b, 1>;
+defm MMX_PADDSW  : MMXI_binop_rm_int<0xED, "paddsw" , int_x86_mmx_padds_w, 1>;
+
+defm MMX_PADDUSB : MMXI_binop_rm_int<0xDC, "paddusb", int_x86_mmx_paddus_b, 1>;
+defm MMX_PADDUSW : MMXI_binop_rm_int<0xDD, "paddusw", int_x86_mmx_paddus_w, 1>;
+
+// -- Subtraction
+defm MMX_PSUBB : MMXI_binop_rm<0xF8, "psubb", sub, v8i8>;
+defm MMX_PSUBW : MMXI_binop_rm<0xF9, "psubw", sub, v4i16>;
+defm MMX_PSUBD : MMXI_binop_rm<0xFA, "psubd", sub, v2i32>;
+defm MMX_PSUBQ : MMXI_binop_rm<0xFB, "psubq", sub, v1i64>;
+
+defm MMX_PSUBSB  : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b>;
+defm MMX_PSUBSW  : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w>;
+
+defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b>;
+defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w>;
+
+// -- Multiplication
+defm MMX_PMULLW  : MMXI_binop_rm<0xD5, "pmullw", mul, v4i16, 1>;
+
+defm MMX_PMULHW  : MMXI_binop_rm_int<0xE5, "pmulhw",  int_x86_mmx_pmulh_w,  1>;
+defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w, 1>;
+defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq, 1>;
+
+// -- Miscellanea
+defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd, 1>;
+
+defm MMX_PAVGB   : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b, 1>;
+defm MMX_PAVGW   : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w, 1>;
+
+defm MMX_PMINUB  : MMXI_binop_rm_int<0xDA, "pminub", int_x86_mmx_pminu_b, 1>;
+defm MMX_PMINSW  : MMXI_binop_rm_int<0xEA, "pminsw", int_x86_mmx_pmins_w, 1>;
+
+defm MMX_PMAXUB  : MMXI_binop_rm_int<0xDE, "pmaxub", int_x86_mmx_pmaxu_b, 1>;
+defm MMX_PMAXSW  : MMXI_binop_rm_int<0xEE, "pmaxsw", int_x86_mmx_pmaxs_w, 1>;
+
+defm MMX_PSADBW  : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw, 1>;
+
+// Logical Instructions
+defm MMX_PAND : MMXI_binop_rm_v1i64<0xDB, "pand", and, 1>;
+defm MMX_POR  : MMXI_binop_rm_v1i64<0xEB, "por" , or,  1>;
+defm MMX_PXOR : MMXI_binop_rm_v1i64<0xEF, "pxor", xor, 1>;
+
+let isTwoAddress = 1 in {
+  def MMX_PANDNrr : MMXI<0xDF, MRMSrcReg,
+                         (ops VR64:$dst, VR64:$src1, VR64:$src2),
+                         "pandn {$src2, $dst|$dst, $src2}",
+                         [(set VR64:$dst, (v1i64 (and (vnot VR64:$src1),
+                                                  VR64:$src2)))]>;
+  def MMX_PANDNrm : MMXI<0xDF, MRMSrcMem,
+                         (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+                         "pandn {$src2, $dst|$dst, $src2}",
+                         [(set VR64:$dst, (v1i64 (and (vnot VR64:$src1),
+                                                  (load addr:$src2))))]>;
+}
+
+// Shift Instructions
+defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw",
+                                    int_x86_mmx_psrl_w>;
+defm MMX_PSRLD : MMXI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld",
+                                    int_x86_mmx_psrl_d>;
+defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq",
+                                    int_x86_mmx_psrl_q>;
+
+defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw",
+                                    int_x86_mmx_psll_w>;
+defm MMX_PSLLD : MMXI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld",
+                                    int_x86_mmx_psll_d>;
+defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq",
+                                    int_x86_mmx_psll_q>;
+
+defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
+                                    int_x86_mmx_psra_w>;
+defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
+                                    int_x86_mmx_psra_d>;
+
+// Comparison Instructions
+defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b>;
+defm MMX_PCMPEQW : MMXI_binop_rm_int<0x75, "pcmpeqw", int_x86_mmx_pcmpeq_w>;
+defm MMX_PCMPEQD : MMXI_binop_rm_int<0x76, "pcmpeqd", int_x86_mmx_pcmpeq_d>;
+
+defm MMX_PCMPGTB : MMXI_binop_rm_int<0x64, "pcmpgtb", int_x86_mmx_pcmpgt_b>;
+defm MMX_PCMPGTW : MMXI_binop_rm_int<0x65, "pcmpgtw", int_x86_mmx_pcmpgt_w>;
+defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d>;
+
+// Conversion Instructions
+
+// -- Unpack Instructions
+let isTwoAddress = 1 in {
+  // Unpack High Packed Data Instructions
+  def MMX_PUNPCKHBWrr : MMXI<0x68, MRMSrcReg, 
+                             (ops VR64:$dst, VR64:$src1, VR64:$src2),
+                             "punpckhbw {$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v8i8 (vector_shuffle VR64:$src1, VR64:$src2,
+                                      MMX_UNPCKH_shuffle_mask)))]>;
+  def MMX_PUNPCKHBWrm : MMXI<0x68, MRMSrcMem, 
+                             (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+                             "punpckhbw {$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v8i8 (vector_shuffle VR64:$src1,
+                                      (bc_v8i8 (load_mmx addr:$src2)),
+                                      MMX_UNPCKH_shuffle_mask)))]>;
+
+  def MMX_PUNPCKHWDrr : MMXI<0x69, MRMSrcReg, 
+                             (ops VR64:$dst, VR64:$src1, VR64:$src2),
+                             "punpckhwd {$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v4i16 (vector_shuffle VR64:$src1, VR64:$src2,
+                                       MMX_UNPCKH_shuffle_mask)))]>;
+  def MMX_PUNPCKHWDrm : MMXI<0x69, MRMSrcMem, 
+                             (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+                             "punpckhwd {$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v4i16 (vector_shuffle VR64:$src1,
+                                       (bc_v4i16 (load_mmx addr:$src2)),
+                                       MMX_UNPCKH_shuffle_mask)))]>;
+
+  def MMX_PUNPCKHDQrr : MMXI<0x6A, MRMSrcReg, 
+                             (ops VR64:$dst, VR64:$src1, VR64:$src2),
+                             "punpckhdq {$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v2i32 (vector_shuffle VR64:$src1, VR64:$src2,
+                                       MMX_UNPCKH_shuffle_mask)))]>;
+  def MMX_PUNPCKHDQrm : MMXI<0x6A, MRMSrcMem,
+                             (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+                             "punpckhdq {$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v2i32 (vector_shuffle VR64:$src1,
+                                       (bc_v2i32 (load_mmx addr:$src2)),
+                                       MMX_UNPCKH_shuffle_mask)))]>;
+
+  // Unpack Low Packed Data Instructions
+  def MMX_PUNPCKLBWrr : MMXI<0x60, MRMSrcReg,
+                             (ops VR64:$dst, VR64:$src1, VR64:$src2),
+                             "punpcklbw {$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v8i8 (vector_shuffle VR64:$src1, VR64:$src2,
+                                      MMX_UNPCKL_shuffle_mask)))]>;
+  def MMX_PUNPCKLBWrm : MMXI<0x60, MRMSrcMem,
+                             (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+                             "punpcklbw {$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v8i8 (vector_shuffle VR64:$src1,
+                                      (bc_v8i8 (load_mmx addr:$src2)),
+                                      MMX_UNPCKL_shuffle_mask)))]>;
+
+  def MMX_PUNPCKLWDrr : MMXI<0x61, MRMSrcReg,
+                             (ops VR64:$dst, VR64:$src1, VR64:$src2),
+                             "punpcklwd {$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v4i16 (vector_shuffle VR64:$src1, VR64:$src2,
+                                       MMX_UNPCKL_shuffle_mask)))]>;
+  def MMX_PUNPCKLWDrm : MMXI<0x61, MRMSrcMem,
+                             (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+                             "punpcklwd {$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v4i16 (vector_shuffle VR64:$src1,
+                                       (bc_v4i16 (load_mmx addr:$src2)),
+                                       MMX_UNPCKL_shuffle_mask)))]>;
+
+  def MMX_PUNPCKLDQrr : MMXI<0x62, MRMSrcReg, 
+                             (ops VR64:$dst, VR64:$src1, VR64:$src2),
+                             "punpckldq {$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v2i32 (vector_shuffle VR64:$src1, VR64:$src2,
+                                       MMX_UNPCKL_shuffle_mask)))]>;
+  def MMX_PUNPCKLDQrm : MMXI<0x62, MRMSrcMem, 
+                             (ops VR64:$dst, VR64:$src1, i64mem:$src2),
+                             "punpckldq {$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v2i32 (vector_shuffle VR64:$src1,
+                                       (bc_v2i32 (load_mmx addr:$src2)),
+                                       MMX_UNPCKL_shuffle_mask)))]>;
+}
+
+// -- Pack Instructions
+defm MMX_PACKSSWB : MMXI_binop_rm_int<0x63, "packsswb", int_x86_mmx_packsswb>;
+defm MMX_PACKSSDW : MMXI_binop_rm_int<0x6B, "packssdw", int_x86_mmx_packssdw>;
+defm MMX_PACKUSWB : MMXI_binop_rm_int<0x67, "packuswb", int_x86_mmx_packuswb>;
+
+// -- Shuffle Instructions
+def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg,
+                          (ops VR64:$dst, VR64:$src1, i8imm:$src2),
+                          "pshufw {$src2, $src1, $dst|$dst, $src1, $src2}",
+                          [(set VR64:$dst,
+                            (v4i16 (vector_shuffle
+                                    VR64:$src1, (undef),
+                                    MMX_PSHUFW_shuffle_mask:$src2)))]>;
+def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem,
+                          (ops VR64:$dst, i64mem:$src1, i8imm:$src2),
+                          "pshufw {$src2, $src1, $dst|$dst, $src1, $src2}",
+                          [(set VR64:$dst,
+                            (v4i16 (vector_shuffle
+                                    (bc_v4i16 (load_mmx addr:$src1)),
+                                    (undef),
+                                    MMX_PSHUFW_shuffle_mask:$src2)))]>;
+
+// -- Conversion Instructions
+def MMX_CVTPD2PIrr  : MMX2I<0x2D, MRMSrcReg, (ops VR64:$dst, VR128:$src),
+                            "cvtpd2pi {$src, $dst|$dst, $src}", []>;
+def MMX_CVTPD2PIrm  : MMX2I<0x2D, MRMSrcMem, (ops VR64:$dst, f128mem:$src),
+                            "cvtpd2pi {$src, $dst|$dst, $src}", []>;
+
+def MMX_CVTPI2PDrr  : MMX2I<0x2A, MRMSrcReg, (ops VR128:$dst, VR64:$src),
+                            "cvtpi2pd {$src, $dst|$dst, $src}", []>;
+def MMX_CVTPI2PDrm  : MMX2I<0x2A, MRMSrcMem, (ops VR128:$dst, i64mem:$src),
+                            "cvtpi2pd {$src, $dst|$dst, $src}", []>;
+
+def MMX_CVTPI2PSrr  : MMXI<0x2A, MRMSrcReg, (ops VR128:$dst, VR64:$src),
+                           "cvtpi2ps {$src, $dst|$dst, $src}", []>;
+def MMX_CVTPI2PSrm  : MMXI<0x2A, MRMSrcMem, (ops VR128:$dst, i64mem:$src),
+                           "cvtpi2ps {$src, $dst|$dst, $src}", []>;
+
+def MMX_CVTPS2PIrr  : MMXI<0x2D, MRMSrcReg, (ops VR64:$dst, VR128:$src),
+                           "cvtps2pi {$src, $dst|$dst, $src}", []>;
+def MMX_CVTPS2PIrm  : MMXI<0x2D, MRMSrcMem, (ops VR64:$dst, f64mem:$src),
+                           "cvtps2pi {$src, $dst|$dst, $src}", []>;
+
+def MMX_CVTTPD2PIrr : MMX2I<0x2C, MRMSrcReg, (ops VR64:$dst, VR128:$src),
+                            "cvttpd2pi {$src, $dst|$dst, $src}", []>;
+def MMX_CVTTPD2PIrm : MMX2I<0x2C, MRMSrcMem, (ops VR64:$dst, f128mem:$src),
+                            "cvttpd2pi {$src, $dst|$dst, $src}", []>;
+
+def MMX_CVTTPS2PIrr : MMXI<0x2C, MRMSrcReg, (ops VR64:$dst, VR128:$src),
+                           "cvttps2pi {$src, $dst|$dst, $src}", []>;
+def MMX_CVTTPS2PIrm : MMXI<0x2C, MRMSrcMem, (ops VR64:$dst, f64mem:$src),
+                           "cvttps2pi {$src, $dst|$dst, $src}", []>;
+
+// Extract / Insert
+def MMX_X86pextrw : SDNode<"X86ISD::PEXTRW", SDTypeProfile<1, 2, []>, []>;
+def MMX_X86pinsrw : SDNode<"X86ISD::PINSRW", SDTypeProfile<1, 3, []>, []>;
+
+def MMX_PEXTRWri  : MMXIi8<0xC5, MRMSrcReg,
+                           (ops GR32:$dst, VR64:$src1, i16i8imm:$src2),
+                           "pextrw {$src2, $src1, $dst|$dst, $src1, $src2}",
+                           [(set GR32:$dst, (MMX_X86pextrw (v4i16 VR64:$src1),
+                                             (iPTR imm:$src2)))]>;
+let isTwoAddress = 1 in {
+  def MMX_PINSRWrri : MMXIi8<0xC4, MRMSrcReg,
+                      (ops VR64:$dst, VR64:$src1, GR32:$src2, i16i8imm:$src3),
+                      "pinsrw {$src3, $src2, $dst|$dst, $src2, $src3}",
+                      [(set VR64:$dst, (v4i16 (MMX_X86pinsrw (v4i16 VR64:$src1),
+                                               GR32:$src2, (iPTR imm:$src3))))]>;
+  def MMX_PINSRWrmi : MMXIi8<0xC4, MRMSrcMem,
+                     (ops VR64:$dst, VR64:$src1, i16mem:$src2, i16i8imm:$src3),
+                     "pinsrw {$src3, $src2, $dst|$dst, $src2, $src3}",
+                     [(set VR64:$dst,
+                       (v4i16 (MMX_X86pinsrw (v4i16 VR64:$src1),
+                               (i32 (anyext (loadi16 addr:$src2))),
+                               (iPTR imm:$src3))))]>;
+}
+
+// Mask creation
+def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (ops GR32:$dst, VR64:$src),
+                          "pmovmskb {$src, $dst|$dst, $src}",
+                          [(set GR32:$dst, (int_x86_mmx_pmovmskb VR64:$src))]>;
+
+// Misc.
+def MMX_MASKMOVQ : MMXI<0xF7, MRMDestMem, (ops VR64:$src, VR64:$mask),
+                        "maskmovq {$mask, $src|$src, $mask}",
+                        [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)]>,
+                        Imp<[EDI],[]>;
+
+//===----------------------------------------------------------------------===//
+// Alias Instructions
+//===----------------------------------------------------------------------===//
+
+// Alias instructions that map zero vector to pxor.
+// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
+let isReMaterializable = 1 in {
+  def MMX_V_SET0       : MMXI<0xEF, MRMInitReg, (ops VR64:$dst),
+                              "pxor $dst, $dst",
+                              [(set VR64:$dst, (v1i64 immAllZerosV))]>;
+  def MMX_V_SETALLONES : MMXI<0x76, MRMInitReg, (ops VR64:$dst),
+                              "pcmpeqd $dst, $dst",
+                              [(set VR64:$dst, (v1i64 immAllOnesV))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// Store 64-bit integer vector values.
+def : Pat<(store (v8i8  VR64:$src), addr:$dst),
+          (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
+def : Pat<(store (v4i16 VR64:$src), addr:$dst),
+          (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
+def : Pat<(store (v2i32 VR64:$src), addr:$dst),
+          (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
+def : Pat<(store (v1i64 VR64:$src), addr:$dst),
+          (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
+
+// 64-bit vector of all zeros.
+def : Pat<(v8i8  immAllZerosV), (MMX_V_SET0)>;
+def : Pat<(v4i16 immAllZerosV), (MMX_V_SET0)>;
+def : Pat<(v2i32 immAllZerosV), (MMX_V_SET0)>;
+def : Pat<(v1i64 immAllZerosV), (MMX_V_SET0)>;
+
+// 64-bit vector of all ones.
+def : Pat<(v8i8  immAllOnesV), (MMX_V_SETALLONES)>;
+def : Pat<(v4i16 immAllOnesV), (MMX_V_SETALLONES)>;
+def : Pat<(v2i32 immAllOnesV), (MMX_V_SETALLONES)>;
+def : Pat<(v1i64 immAllOnesV), (MMX_V_SETALLONES)>;
+
+// Bit convert.
+def : Pat<(v8i8  (bitconvert (v1i64 VR64:$src))), (v8i8  VR64:$src)>;
+def : Pat<(v8i8  (bitconvert (v2i32 VR64:$src))), (v8i8  VR64:$src)>;
+def : Pat<(v8i8  (bitconvert (v4i16 VR64:$src))), (v8i8  VR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v1i64 VR64:$src))), (v4i16 VR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v2i32 VR64:$src))), (v4i16 VR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v8i8  VR64:$src))), (v4i16 VR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v1i64 VR64:$src))), (v2i32 VR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v4i16 VR64:$src))), (v2i32 VR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v8i8  VR64:$src))), (v2i32 VR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v2i32 VR64:$src))), (v1i64 VR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v4i16 VR64:$src))), (v1i64 VR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v8i8  VR64:$src))), (v1i64 VR64:$src)>;
+
+// 64-bit bit convert.
+def : Pat<(v1i64 (bitconvert (i64 GR64:$src))),
+          (MMX_MOVD64to64rr GR64:$src)>;
+def : Pat<(v2i32 (bitconvert (i64 GR64:$src))),
+          (MMX_MOVD64to64rr GR64:$src)>;
+def : Pat<(v4i16 (bitconvert (i64 GR64:$src))),
+          (MMX_MOVD64to64rr GR64:$src)>;
+def : Pat<(v8i8  (bitconvert (i64 GR64:$src))),
+          (MMX_MOVD64to64rr GR64:$src)>;
+
+def MMX_X86s2vec : SDNode<"X86ISD::S2VEC",  SDTypeProfile<1, 1, []>, []>;
+
+// Move scalar to MMX register, zero-extended: movd to an MMX register
+// zero-extends the upper bits.
+let AddedComplexity = 15 in {
+  def : Pat<(v8i8 (vector_shuffle immAllZerosV,
+                    (v8i8 (MMX_X86s2vec GR32:$src)), MMX_MOVL_shuffle_mask)),
+            (MMX_MOVZDI2PDIrr GR32:$src)>;
+  def : Pat<(v4i16 (vector_shuffle immAllZerosV,
+                    (v4i16 (MMX_X86s2vec GR32:$src)), MMX_MOVL_shuffle_mask)),
+            (MMX_MOVZDI2PDIrr GR32:$src)>;
+  def : Pat<(v2i32 (vector_shuffle immAllZerosV,
+                    (v2i32 (MMX_X86s2vec GR32:$src)), MMX_MOVL_shuffle_mask)),
+            (MMX_MOVZDI2PDIrr GR32:$src)>;
+}
+
+// Scalar to v2i32 / v4i16 / v8i8. The source may be a GR32, but only the lower
+// 8 or 16 bits matter.
+def : Pat<(v8i8  (MMX_X86s2vec GR32:$src)), (MMX_MOVD64rr GR32:$src)>;
+def : Pat<(v4i16 (MMX_X86s2vec GR32:$src)), (MMX_MOVD64rr GR32:$src)>;
+def : Pat<(v2i32 (MMX_X86s2vec GR32:$src)), (MMX_MOVD64rr GR32:$src)>;
+
+// Patterns to perform canonical versions of vector shuffling.
+let AddedComplexity = 10 in {
+  def : Pat<(v8i8  (vector_shuffle VR64:$src, (undef),
+                    MMX_UNPCKL_v_undef_shuffle_mask)),
+            (MMX_PUNPCKLBWrr VR64:$src, VR64:$src)>;
+  def : Pat<(v4i16 (vector_shuffle VR64:$src, (undef),
+                    MMX_UNPCKL_v_undef_shuffle_mask)),
+            (MMX_PUNPCKLWDrr VR64:$src, VR64:$src)>;
+  def : Pat<(v2i32 (vector_shuffle VR64:$src, (undef),
+                    MMX_UNPCKL_v_undef_shuffle_mask)),
+            (MMX_PUNPCKLDQrr VR64:$src, VR64:$src)>;
+}
+
+let AddedComplexity = 10 in {
+  def : Pat<(v8i8  (vector_shuffle VR64:$src, (undef),
+                    MMX_UNPCKH_v_undef_shuffle_mask)),
+            (MMX_PUNPCKHBWrr VR64:$src, VR64:$src)>;
+  def : Pat<(v4i16 (vector_shuffle VR64:$src, (undef),
+                    MMX_UNPCKH_v_undef_shuffle_mask)),
+            (MMX_PUNPCKHWDrr VR64:$src, VR64:$src)>;
+  def : Pat<(v2i32 (vector_shuffle VR64:$src, (undef),
+                    MMX_UNPCKH_v_undef_shuffle_mask)),
+            (MMX_PUNPCKHDQrr VR64:$src, VR64:$src)>;
+}
+
+// Patterns to perform vector shuffling with a zeroed out vector.
+let AddedComplexity = 20 in {
+  def : Pat<(bc_v2i32 (vector_shuffle immAllZerosV,
+                       (v2i32 (scalar_to_vector (load_mmx addr:$src))),
+                       MMX_UNPCKL_shuffle_mask)),
+            (MMX_PUNPCKLDQrm VR64:$src, VR64:$src)>;
+}
+
+// Some special case PANDN patterns.
+// FIXME: Get rid of these.
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v2i32 immAllOnesV))),
+                  VR64:$src2)),
+          (MMX_PANDNrr VR64:$src1, VR64:$src2)>;
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v4i16 immAllOnesV))),
+                  VR64:$src2)),
+          (MMX_PANDNrr VR64:$src1, VR64:$src2)>;
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v8i8  immAllOnesV))),
+                  VR64:$src2)),
+          (MMX_PANDNrr VR64:$src1, VR64:$src2)>;
+
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v2i32 immAllOnesV))),
+                  (load addr:$src2))),
+          (MMX_PANDNrm VR64:$src1, addr:$src2)>;
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v4i16 immAllOnesV))),
+                  (load addr:$src2))),
+          (MMX_PANDNrm VR64:$src1, addr:$src2)>;
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v8i8  immAllOnesV))),
+                  (load addr:$src2))),
+          (MMX_PANDNrm VR64:$src1, addr:$src2)>;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
new file mode 100644
index 0000000..5fc7a65
--- /dev/null
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -0,0 +1,2572 @@
+//====- X86InstrSSE.td - Describe the X86 SSE Instruction Set ---*- C++ -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Evan Cheng and is distributed under the University
+// of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 SSE instruction set, defining the instructions,
+// and properties of the instructions which are needed for code generation,
+// machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// SSE specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDTX86FPShiftOp : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>,
+                                            SDTCisFP<0>, SDTCisInt<2> ]>;
+
+def X86loadp   : SDNode<"X86ISD::LOAD_PACK", SDTLoad, [SDNPHasChain]>;
+def X86loadu   : SDNode<"X86ISD::LOAD_UA",   SDTLoad, [SDNPHasChain]>;
+def X86fmin    : SDNode<"X86ISD::FMIN",      SDTFPBinOp>;
+def X86fmax    : SDNode<"X86ISD::FMAX",      SDTFPBinOp>;
+def X86fand    : SDNode<"X86ISD::FAND",      SDTFPBinOp,
+                        [SDNPCommutative, SDNPAssociative]>;
+def X86for     : SDNode<"X86ISD::FOR",       SDTFPBinOp,
+                        [SDNPCommutative, SDNPAssociative]>;
+def X86fxor    : SDNode<"X86ISD::FXOR",      SDTFPBinOp,
+                        [SDNPCommutative, SDNPAssociative]>;
+def X86frsqrt  : SDNode<"X86ISD::FRSQRT",    SDTFPUnaryOp>;
+def X86frcp    : SDNode<"X86ISD::FRCP",      SDTFPUnaryOp>;
+def X86fsrl    : SDNode<"X86ISD::FSRL",      SDTX86FPShiftOp>;
+def X86comi    : SDNode<"X86ISD::COMI",      SDTX86CmpTest,
+                        [SDNPHasChain, SDNPOutFlag]>;
+def X86ucomi   : SDNode<"X86ISD::UCOMI",     SDTX86CmpTest,
+                        [SDNPHasChain, SDNPOutFlag]>;
+def X86s2vec   : SDNode<"X86ISD::S2VEC",  SDTypeProfile<1, 1, []>, []>;
+def X86pextrw  : SDNode<"X86ISD::PEXTRW", SDTypeProfile<1, 2, []>, []>;
+def X86pinsrw  : SDNode<"X86ISD::PINSRW", SDTypeProfile<1, 3, []>, []>;
+
+//===----------------------------------------------------------------------===//
+// SSE 'Special' Instructions
+//===----------------------------------------------------------------------===//
+
+def IMPLICIT_DEF_VR128 : I<0, Pseudo, (ops VR128:$dst),
+                           "#IMPLICIT_DEF $dst",
+                           [(set VR128:$dst, (v4f32 (undef)))]>,
+                         Requires<[HasSSE1]>;
+def IMPLICIT_DEF_FR32  : I<0, Pseudo, (ops FR32:$dst),
+                           "#IMPLICIT_DEF $dst",
+                           [(set FR32:$dst, (undef))]>, Requires<[HasSSE2]>;
+def IMPLICIT_DEF_FR64  : I<0, Pseudo, (ops FR64:$dst),
+                           "#IMPLICIT_DEF $dst",
+                           [(set FR64:$dst, (undef))]>, Requires<[HasSSE2]>;
+
+//===----------------------------------------------------------------------===//
+// SSE Complex Patterns
+//===----------------------------------------------------------------------===//
+
+// These are 'extloads' from a scalar to the low element of a vector, zeroing
+// the top elements.  These are used for the SSE 'ss' and 'sd' instruction
+// forms.
+def sse_load_f32 : ComplexPattern<v4f32, 4, "SelectScalarSSELoad", [],
+                                  [SDNPHasChain]>;
+def sse_load_f64 : ComplexPattern<v2f64, 4, "SelectScalarSSELoad", [],
+                                  [SDNPHasChain]>;
+
+def ssmem : Operand<v4f32> {
+  let PrintMethod = "printf32mem";
+  let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc, i32imm);
+}
+def sdmem : Operand<v2f64> {
+  let PrintMethod = "printf64mem";
+  let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc, i32imm);
+}
+
+//===----------------------------------------------------------------------===//
+// SSE pattern fragments
+//===----------------------------------------------------------------------===//
+
+def X86loadpf32  : PatFrag<(ops node:$ptr), (f32   (X86loadp node:$ptr))>;
+def X86loadpf64  : PatFrag<(ops node:$ptr), (f64   (X86loadp node:$ptr))>;
+
+def loadv4f32    : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>;
+def loadv2f64    : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>;
+def loadv4i32    : PatFrag<(ops node:$ptr), (v4i32 (load node:$ptr))>;
+def loadv2i64    : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>;
+
+def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>;
+def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>;
+def bc_v16i8 : PatFrag<(ops node:$in), (v16i8 (bitconvert node:$in))>;
+def bc_v8i16 : PatFrag<(ops node:$in), (v8i16 (bitconvert node:$in))>;
+def bc_v4i32 : PatFrag<(ops node:$in), (v4i32 (bitconvert node:$in))>;
+def bc_v2i64 : PatFrag<(ops node:$in), (v2i64 (bitconvert node:$in))>;
+
+def fp32imm0 : PatLeaf<(f32 fpimm), [{
+  return N->isExactlyValue(+0.0);
+}]>;
+
+def PSxLDQ_imm  : SDNodeXForm<imm, [{
+  // Transformation function: imm >> 3
+  return getI32Imm(N->getValue() >> 3);
+}]>;
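+// (For illustration: this turns a shift amount given in bits into the byte
+// count the instruction expects, e.g. an immediate of 64 bits becomes
+// 64 >> 3 == 8 bytes.)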
+
+// SHUFFLE_get_shuf_imm xform function: convert vector_shuffle mask to PSHUF*,
+// SHUFP* etc. imm.
+def SHUFFLE_get_shuf_imm : SDNodeXForm<build_vector, [{
+  return getI8Imm(X86::getShuffleSHUFImmediate(N));
+}]>;
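+// (Worked example, assuming the standard 2-bits-per-element SHUFPS/PSHUFD
+// encoding: the identity mask <0,1,2,3> corresponds to imm 0xE4, while the
+// reversing mask <3,2,1,0> corresponds to imm 0x1B.)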
+
+// SHUFFLE_get_pshufhw_imm xform function: convert vector_shuffle mask to 
+// PSHUFHW imm.
+def SHUFFLE_get_pshufhw_imm : SDNodeXForm<build_vector, [{
+  return getI8Imm(X86::getShufflePSHUFHWImmediate(N));
+}]>;
+
+// SHUFFLE_get_pshuflw_imm xform function: convert vector_shuffle mask to 
+// PSHUFLW imm.
+def SHUFFLE_get_pshuflw_imm : SDNodeXForm<build_vector, [{
+  return getI8Imm(X86::getShufflePSHUFLWImmediate(N));
+}]>;
+
+def SSE_splat_mask : PatLeaf<(build_vector), [{
+  return X86::isSplatMask(N);
+}], SHUFFLE_get_shuf_imm>;
+
+def SSE_splat_lo_mask : PatLeaf<(build_vector), [{
+  return X86::isSplatLoMask(N);
+}]>;
+
+def MOVHLPS_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isMOVHLPSMask(N);
+}]>;
+
+def MOVHLPS_v_undef_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isMOVHLPS_v_undef_Mask(N);
+}]>;
+
+def MOVHP_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isMOVHPMask(N);
+}]>;
+
+def MOVLP_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isMOVLPMask(N);
+}]>;
+
+def MOVL_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isMOVLMask(N);
+}]>;
+
+def MOVSHDUP_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isMOVSHDUPMask(N);
+}]>;
+
+def MOVSLDUP_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isMOVSLDUPMask(N);
+}]>;
+
+def UNPCKL_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isUNPCKLMask(N);
+}]>;
+
+def UNPCKH_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isUNPCKHMask(N);
+}]>;
+
+def UNPCKL_v_undef_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isUNPCKL_v_undef_Mask(N);
+}]>;
+
+def UNPCKH_v_undef_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isUNPCKH_v_undef_Mask(N);
+}]>;
+
+def PSHUFD_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isPSHUFDMask(N);
+}], SHUFFLE_get_shuf_imm>;
+
+def PSHUFHW_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isPSHUFHWMask(N);
+}], SHUFFLE_get_pshufhw_imm>;
+
+def PSHUFLW_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isPSHUFLWMask(N);
+}], SHUFFLE_get_pshuflw_imm>;
+
+def SHUFP_unary_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isPSHUFDMask(N);
+}], SHUFFLE_get_shuf_imm>;
+
+def SHUFP_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isSHUFPMask(N);
+}], SHUFFLE_get_shuf_imm>;
+
+def PSHUFD_binary_shuffle_mask : PatLeaf<(build_vector), [{
+  return X86::isSHUFPMask(N);
+}], SHUFFLE_get_shuf_imm>;
+
+//===----------------------------------------------------------------------===//
+// SSE scalar FP Instructions
+//===----------------------------------------------------------------------===//
+
+// CMOV* - Used to implement the SSE SELECT DAG operation.  Expanded by the
+// scheduler into a branch sequence.
+let usesCustomDAGSchedInserter = 1 in {  // Expanded by the scheduler.
+  def CMOV_FR32 : I<0, Pseudo,
+                    (ops FR32:$dst, FR32:$t, FR32:$f, i8imm:$cond),
+                    "#CMOV_FR32 PSEUDO!",
+                    [(set FR32:$dst, (X86cmov FR32:$t, FR32:$f, imm:$cond))]>;
+  def CMOV_FR64 : I<0, Pseudo,
+                    (ops FR64:$dst, FR64:$t, FR64:$f, i8imm:$cond),
+                    "#CMOV_FR64 PSEUDO!",
+                    [(set FR64:$dst, (X86cmov FR64:$t, FR64:$f, imm:$cond))]>;
+  def CMOV_V4F32 : I<0, Pseudo,
+                    (ops VR128:$dst, VR128:$t, VR128:$f, i8imm:$cond),
+                    "#CMOV_V4F32 PSEUDO!",
+                    [(set VR128:$dst,
+                      (v4f32 (X86cmov VR128:$t, VR128:$f, imm:$cond)))]>;
+  def CMOV_V2F64 : I<0, Pseudo,
+                    (ops VR128:$dst, VR128:$t, VR128:$f, i8imm:$cond),
+                    "#CMOV_V2F64 PSEUDO!",
+                    [(set VR128:$dst,
+                      (v2f64 (X86cmov VR128:$t, VR128:$f, imm:$cond)))]>;
+  def CMOV_V2I64 : I<0, Pseudo,
+                    (ops VR128:$dst, VR128:$t, VR128:$f, i8imm:$cond),
+                    "#CMOV_V2I64 PSEUDO!",
+                    [(set VR128:$dst,
+                      (v2i64 (X86cmov VR128:$t, VR128:$f, imm:$cond)))]>;
+}
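+
+// Roughly (an illustrative sketch; the real expansion is performed by the
+// target's custom inserter, not described in this file):
+//
+//   entry:     ...  jCC join           ; skip the false path if $cond holds
+//   falsePath: ...  (falls through)
+//   join:      $dst = phi [$t, entry], [$f, falsePath]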
+
+//===----------------------------------------------------------------------===//
+// SSE1 Instructions
+//===----------------------------------------------------------------------===//
+
+// SSE1 Instruction Templates:
+// 
+//   SSI   - SSE1 instructions with XS prefix.
+//   PSI   - SSE1 instructions with TB prefix.
+//   PSIi8 - SSE1 instructions with ImmT == Imm8 and TB prefix.
+
+class SSI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : I<o, F, ops, asm, pattern>, XS, Requires<[HasSSE1]>;
+class PSI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : I<o, F, ops, asm, pattern>, TB, Requires<[HasSSE1]>;
+class PSIi8<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : Ii8<o, F, ops, asm, pattern>, TB, Requires<[HasSSE1]>;
+
+// Move Instructions
+def MOVSSrr : SSI<0x10, MRMSrcReg, (ops FR32:$dst, FR32:$src),
+                  "movss {$src, $dst|$dst, $src}", []>;
+def MOVSSrm : SSI<0x10, MRMSrcMem, (ops FR32:$dst, f32mem:$src),
+                  "movss {$src, $dst|$dst, $src}",
+                  [(set FR32:$dst, (loadf32 addr:$src))]>;
+def MOVSSmr : SSI<0x11, MRMDestMem, (ops f32mem:$dst, FR32:$src),
+                  "movss {$src, $dst|$dst, $src}",
+                  [(store FR32:$src, addr:$dst)]>;
+
+// Conversion instructions
+def CVTTSS2SIrr : SSI<0x2C, MRMSrcReg, (ops GR32:$dst, FR32:$src),
+                      "cvttss2si {$src, $dst|$dst, $src}",
+                      [(set GR32:$dst, (fp_to_sint FR32:$src))]>;
+def CVTTSS2SIrm : SSI<0x2C, MRMSrcMem, (ops GR32:$dst, f32mem:$src),
+                      "cvttss2si {$src, $dst|$dst, $src}",
+                      [(set GR32:$dst, (fp_to_sint (loadf32 addr:$src)))]>;
+def CVTSI2SSrr  : SSI<0x2A, MRMSrcReg, (ops FR32:$dst, GR32:$src),
+                      "cvtsi2ss {$src, $dst|$dst, $src}",
+                      [(set FR32:$dst, (sint_to_fp GR32:$src))]>;
+def CVTSI2SSrm  : SSI<0x2A, MRMSrcMem, (ops FR32:$dst, i32mem:$src),
+                      "cvtsi2ss {$src, $dst|$dst, $src}",
+                      [(set FR32:$dst, (sint_to_fp (loadi32 addr:$src)))]>;
+
+// Match intrinsics which expect XMM operand(s).
+def Int_CVTSS2SIrr : SSI<0x2D, MRMSrcReg, (ops GR32:$dst, VR128:$src),
+                         "cvtss2si {$src, $dst|$dst, $src}",
+                         [(set GR32:$dst, (int_x86_sse_cvtss2si VR128:$src))]>;
+def Int_CVTSS2SIrm : SSI<0x2D, MRMSrcMem, (ops GR32:$dst, f32mem:$src),
+                         "cvtss2si {$src, $dst|$dst, $src}",
+                         [(set GR32:$dst, (int_x86_sse_cvtss2si
+                                           (load addr:$src)))]>;
+
+// Aliases for intrinsics
+def Int_CVTTSS2SIrr : SSI<0x2C, MRMSrcReg, (ops GR32:$dst, VR128:$src),
+                          "cvttss2si {$src, $dst|$dst, $src}",
+                          [(set GR32:$dst,
+                            (int_x86_sse_cvttss2si VR128:$src))]>;
+def Int_CVTTSS2SIrm : SSI<0x2C, MRMSrcMem, (ops GR32:$dst, f32mem:$src),
+                          "cvttss2si {$src, $dst|$dst, $src}",
+                          [(set GR32:$dst,
+                            (int_x86_sse_cvttss2si (load addr:$src)))]>;
+
+let isTwoAddress = 1 in {
+  def Int_CVTSI2SSrr : SSI<0x2A, MRMSrcReg,
+                           (ops VR128:$dst, VR128:$src1, GR32:$src2),
+                           "cvtsi2ss {$src2, $dst|$dst, $src2}",
+                           [(set VR128:$dst, (int_x86_sse_cvtsi2ss VR128:$src1,
+                                              GR32:$src2))]>;
+  def Int_CVTSI2SSrm : SSI<0x2A, MRMSrcMem,
+                           (ops VR128:$dst, VR128:$src1, i32mem:$src2),
+                           "cvtsi2ss {$src2, $dst|$dst, $src2}",
+                           [(set VR128:$dst, (int_x86_sse_cvtsi2ss VR128:$src1,
+                                              (loadi32 addr:$src2)))]>;
+}
+
+// Comparison instructions
+let isTwoAddress = 1 in {
+  def CMPSSrr : SSI<0xC2, MRMSrcReg, 
+                    (ops FR32:$dst, FR32:$src1, FR32:$src, SSECC:$cc),
+                    "cmp${cc}ss {$src, $dst|$dst, $src}",
+                    []>;
+  def CMPSSrm : SSI<0xC2, MRMSrcMem, 
+                    (ops FR32:$dst, FR32:$src1, f32mem:$src, SSECC:$cc),
+                    "cmp${cc}ss {$src, $dst|$dst, $src}", []>;
+}
+
+def UCOMISSrr: PSI<0x2E, MRMSrcReg, (ops FR32:$src1, FR32:$src2),
+                   "ucomiss {$src2, $src1|$src1, $src2}",
+                   [(X86cmp FR32:$src1, FR32:$src2)]>;
+def UCOMISSrm: PSI<0x2E, MRMSrcMem, (ops FR32:$src1, f32mem:$src2),
+                   "ucomiss {$src2, $src1|$src1, $src2}",
+                   [(X86cmp FR32:$src1, (loadf32 addr:$src2))]>;
+
+// Aliases to match intrinsics which expect XMM operand(s).
+let isTwoAddress = 1 in {
+  def Int_CMPSSrr : SSI<0xC2, MRMSrcReg, 
+                        (ops VR128:$dst, VR128:$src1, VR128:$src, SSECC:$cc),
+                        "cmp${cc}ss {$src, $dst|$dst, $src}",
+                        [(set VR128:$dst, (int_x86_sse_cmp_ss VR128:$src1,
+                                           VR128:$src, imm:$cc))]>;
+  def Int_CMPSSrm : SSI<0xC2, MRMSrcMem, 
+                        (ops VR128:$dst, VR128:$src1, f32mem:$src, SSECC:$cc),
+                        "cmp${cc}ss {$src, $dst|$dst, $src}",
+                        [(set VR128:$dst, (int_x86_sse_cmp_ss VR128:$src1,
+                                           (load addr:$src), imm:$cc))]>;
+}
+
+def Int_UCOMISSrr: PSI<0x2E, MRMSrcReg, (ops VR128:$src1, VR128:$src2),
+                       "ucomiss {$src2, $src1|$src1, $src2}",
+                       [(X86ucomi (v4f32 VR128:$src1), VR128:$src2)]>;
+def Int_UCOMISSrm: PSI<0x2E, MRMSrcMem, (ops VR128:$src1, f128mem:$src2),
+                       "ucomiss {$src2, $src1|$src1, $src2}",
+                       [(X86ucomi (v4f32 VR128:$src1), (load addr:$src2))]>;
+
+def Int_COMISSrr: PSI<0x2F, MRMSrcReg, (ops VR128:$src1, VR128:$src2),
+                      "comiss {$src2, $src1|$src1, $src2}",
+                      [(X86comi (v4f32 VR128:$src1), VR128:$src2)]>;
+def Int_COMISSrm: PSI<0x2F, MRMSrcMem, (ops VR128:$src1, f128mem:$src2),
+                      "comiss {$src2, $src1|$src1, $src2}",
+                      [(X86comi (v4f32 VR128:$src1), (load addr:$src2))]>;
+
+// Aliases of packed SSE1 instructions for scalar use. These all have names that
+// start with 'Fs'.
+
+// Alias instruction that maps fld0 to pxor for SSE.
+def FsFLD0SS : I<0xEF, MRMInitReg, (ops FR32:$dst),
+                 "pxor $dst, $dst", [(set FR32:$dst, fp32imm0)]>,
+               Requires<[HasSSE1]>, TB, OpSize;
+
+// Alias instruction to do FR32 reg-to-reg copy using movaps. Upper bits are
+// disregarded.
+def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (ops FR32:$dst, FR32:$src),
+                     "movaps {$src, $dst|$dst, $src}", []>;
+
+// Alias instruction to load FR32 from f128mem using movaps. Upper bits are
+// disregarded.
+def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (ops FR32:$dst, f128mem:$src),
+                     "movaps {$src, $dst|$dst, $src}",
+                     [(set FR32:$dst, (X86loadpf32 addr:$src))]>;
+
+// Alias bitwise logical operations using SSE logical ops on packed FP values.
+let isTwoAddress = 1 in {
+let isCommutable = 1 in {
+  def FsANDPSrr : PSI<0x54, MRMSrcReg, (ops FR32:$dst, FR32:$src1, FR32:$src2),
+                      "andps {$src2, $dst|$dst, $src2}",
+                      [(set FR32:$dst, (X86fand FR32:$src1, FR32:$src2))]>;
+  def FsORPSrr  : PSI<0x56, MRMSrcReg, (ops FR32:$dst, FR32:$src1, FR32:$src2),
+                      "orps {$src2, $dst|$dst, $src2}",
+                      [(set FR32:$dst, (X86for FR32:$src1, FR32:$src2))]>;
+  def FsXORPSrr : PSI<0x57, MRMSrcReg, (ops FR32:$dst, FR32:$src1, FR32:$src2),
+                      "xorps {$src2, $dst|$dst, $src2}",
+                      [(set FR32:$dst, (X86fxor FR32:$src1, FR32:$src2))]>;
+}
+
+def FsANDPSrm : PSI<0x54, MRMSrcMem, (ops FR32:$dst, FR32:$src1, f128mem:$src2),
+                    "andps {$src2, $dst|$dst, $src2}",
+                    [(set FR32:$dst, (X86fand FR32:$src1,
+                                      (X86loadpf32 addr:$src2)))]>;
+def FsORPSrm  : PSI<0x56, MRMSrcMem, (ops FR32:$dst, FR32:$src1, f128mem:$src2),
+                    "orps {$src2, $dst|$dst, $src2}",
+                    [(set FR32:$dst, (X86for FR32:$src1,
+                                      (X86loadpf32 addr:$src2)))]>;
+def FsXORPSrm : PSI<0x57, MRMSrcMem, (ops FR32:$dst, FR32:$src1, f128mem:$src2),
+                    "xorps {$src2, $dst|$dst, $src2}",
+                    [(set FR32:$dst, (X86fxor FR32:$src1,
+                                      (X86loadpf32 addr:$src2)))]>;
+
+def FsANDNPSrr : PSI<0x55, MRMSrcReg,
+                     (ops FR32:$dst, FR32:$src1, FR32:$src2),
+                     "andnps {$src2, $dst|$dst, $src2}", []>;
+def FsANDNPSrm : PSI<0x55, MRMSrcMem,
+                     (ops FR32:$dst, FR32:$src1, f128mem:$src2),
+                     "andnps {$src2, $dst|$dst, $src2}", []>;
+}
+
+/// basic_sse1_fp_binop_rm - SSE1 binops come in both scalar and vector forms.
+///
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation.  This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a scalar)
+/// and leaves the top elements undefined.
+///
+/// These three forms can each be reg+reg or reg+mem, so there are a total of
+/// six "instructions".
+///
+let isTwoAddress = 1 in {
+multiclass basic_sse1_fp_binop_rm<bits<8> opc, string OpcodeStr,
+                                  SDNode OpNode, Intrinsic F32Int,
+                                  bit Commutable = 0> {
+  // Scalar operation, reg+reg.
+  def SSrr : SSI<opc, MRMSrcReg, (ops FR32:$dst, FR32:$src1, FR32:$src2),
+                 !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"),
+                 [(set FR32:$dst, (OpNode FR32:$src1, FR32:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Scalar operation, reg+mem.
+  def SSrm : SSI<opc, MRMSrcMem, (ops FR32:$dst, FR32:$src1, f32mem:$src2),
+                 !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"),
+                 [(set FR32:$dst, (OpNode FR32:$src1, (load addr:$src2)))]>;
+                 
+  // Vector operation, reg+reg.
+  def PSrr : PSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+               !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (v4f32 (OpNode VR128:$src1, VR128:$src2)))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector operation, reg+mem.
+  def PSrm : PSI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                 !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"),
+                 [(set VR128:$dst, (OpNode VR128:$src1, (loadv4f32 addr:$src2)))]>;
+
+  // Intrinsic operation, reg+reg.
+  def SSrr_Int : SSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                     !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Intrinsic operation, reg+mem.
+  def SSrm_Int : SSI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, ssmem:$src2),
+                     !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (F32Int VR128:$src1,
+                                               sse_load_f32:$src2))]>;
+}
+}
+
+// Arithmetic instructions
+defm ADD : basic_sse1_fp_binop_rm<0x58, "add", fadd, int_x86_sse_add_ss, 1>;
+defm MUL : basic_sse1_fp_binop_rm<0x59, "mul", fmul, int_x86_sse_mul_ss, 1>;
+defm SUB : basic_sse1_fp_binop_rm<0x5C, "sub", fsub, int_x86_sse_sub_ss>;
+defm DIV : basic_sse1_fp_binop_rm<0x5E, "div", fdiv, int_x86_sse_div_ss>;
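+
+// For illustration (assuming the usual tblgen defm naming, where the defm name
+// is prefixed onto each def in the multiclass), each defm above yields six
+// instructions; e.g. ADD gives:
+//   ADDSSrr, ADDSSrm           - scalar FR32 forms
+//   ADDPSrr, ADDPSrm           - packed v4f32 forms
+//   ADDSSrr_Int, ADDSSrm_Int   - intrinsic forms taking a whole vector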
+
+/// sse1_fp_binop_rm - Other SSE1 binops
+///
+/// This multiclass is like basic_sse1_fp_binop_rm, with the addition of
+/// instructions for a full-vector intrinsic form.  Operations that map
+/// onto C operators don't use this form since they just use the plain
+/// vector form instead of having a separate vector intrinsic form.
+///
+/// This provides a total of eight "instructions".
+///
+let isTwoAddress = 1 in {
+multiclass sse1_fp_binop_rm<bits<8> opc, string OpcodeStr,
+                            SDNode OpNode,
+                            Intrinsic F32Int,
+                            Intrinsic V4F32Int,
+                            bit Commutable = 0> {
+
+  // Scalar operation, reg+reg.
+  def SSrr : SSI<opc, MRMSrcReg, (ops FR32:$dst, FR32:$src1, FR32:$src2),
+                 !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"),
+                 [(set FR32:$dst, (OpNode FR32:$src1, FR32:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Scalar operation, reg+mem.
+  def SSrm : SSI<opc, MRMSrcMem, (ops FR32:$dst, FR32:$src1, f32mem:$src2),
+                 !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"),
+                 [(set FR32:$dst, (OpNode FR32:$src1, (load addr:$src2)))]>;
+                 
+  // Vector operation, reg+reg.
+  def PSrr : PSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+               !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (v4f32 (OpNode VR128:$src1, VR128:$src2)))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector operation, reg+mem.
+  def PSrm : PSI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                 !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"),
+                 [(set VR128:$dst, (OpNode VR128:$src1, (loadv4f32 addr:$src2)))]>;
+
+  // Intrinsic operation, reg+reg.
+  def SSrr_Int : SSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                     !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Intrinsic operation, reg+mem.
+  def SSrm_Int : SSI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, ssmem:$src2),
+                     !strconcat(OpcodeStr, "ss {$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (F32Int VR128:$src1,
+                                               sse_load_f32:$src2))]>;
+
+  // Vector intrinsic operation, reg+reg.
+  def PSrr_Int : PSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                     !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (V4F32Int VR128:$src1, VR128:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector intrinsic operation, reg+mem.
+  def PSrm_Int : PSI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f32mem:$src2),
+                     !strconcat(OpcodeStr, "ps {$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (V4F32Int VR128:$src1, (load addr:$src2)))]>;
+}
+}
+
+defm MAX : sse1_fp_binop_rm<0x5F, "max", X86fmax,
+                            int_x86_sse_max_ss, int_x86_sse_max_ps>;
+defm MIN : sse1_fp_binop_rm<0x5D, "min", X86fmin,
+                            int_x86_sse_min_ss, int_x86_sse_min_ps>;
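+// (Assuming the same defm name prefixing as above, MAX and MIN each expand to
+// the six forms of the basic binops plus the two vector-intrinsic forms, e.g.
+// MAXPSrr_Int and MAXPSrm_Int, for the eight instructions noted above.)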
+
+//===----------------------------------------------------------------------===//
+// SSE packed FP Instructions
+
+// Move Instructions
+def MOVAPSrr : PSI<0x28, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                   "movaps {$src, $dst|$dst, $src}", []>;
+def MOVAPSrm : PSI<0x28, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+                   "movaps {$src, $dst|$dst, $src}",
+                   [(set VR128:$dst, (loadv4f32 addr:$src))]>;
+
+def MOVAPSmr : PSI<0x29, MRMDestMem, (ops f128mem:$dst, VR128:$src),
+                   "movaps {$src, $dst|$dst, $src}",
+                   [(store (v4f32 VR128:$src), addr:$dst)]>;
+
+def MOVUPSrr : PSI<0x10, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                   "movups {$src, $dst|$dst, $src}", []>;
+def MOVUPSrm : PSI<0x10, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+                   "movups {$src, $dst|$dst, $src}",
+                   [(set VR128:$dst, (int_x86_sse_loadu_ps addr:$src))]>;
+def MOVUPSmr : PSI<0x11, MRMDestMem, (ops f128mem:$dst, VR128:$src),
+                   "movups {$src, $dst|$dst, $src}",
+                   [(int_x86_sse_storeu_ps addr:$dst, VR128:$src)]>;
+
+let isTwoAddress = 1 in {
+  let AddedComplexity = 20 in {
+    def MOVLPSrm : PSI<0x12, MRMSrcMem,
+                       (ops VR128:$dst, VR128:$src1, f64mem:$src2),
+                       "movlps {$src2, $dst|$dst, $src2}",
+                       [(set VR128:$dst, 
+                         (v4f32 (vector_shuffle VR128:$src1,
+                         (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2)))),
+                                 MOVLP_shuffle_mask)))]>;
+    def MOVHPSrm : PSI<0x16, MRMSrcMem,
+                       (ops VR128:$dst, VR128:$src1, f64mem:$src2),
+                       "movhps {$src2, $dst|$dst, $src2}",
+                       [(set VR128:$dst, 
+                         (v4f32 (vector_shuffle VR128:$src1,
+                         (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2)))),
+                                 MOVHP_shuffle_mask)))]>;
+  } // AddedComplexity
+} // isTwoAddress
+
+def MOVLPSmr : PSI<0x13, MRMDestMem, (ops f64mem:$dst, VR128:$src),
+                   "movlps {$src, $dst|$dst, $src}",
+                   [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
+                                 (iPTR 0))), addr:$dst)]>;
+
+// v2f64 extract element 1 is always custom lowered to unpack high to low
+// and extract element 0, so the non-store version isn't too horrible.
+def MOVHPSmr : PSI<0x17, MRMDestMem, (ops f64mem:$dst, VR128:$src),
+                   "movhps {$src, $dst|$dst, $src}",
+                   [(store (f64 (vector_extract
+                                 (v2f64 (vector_shuffle
+                                         (bc_v2f64 (v4f32 VR128:$src)), (undef),
+                                         UNPCKH_shuffle_mask)), (iPTR 0))),
+                     addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+let AddedComplexity = 15 in {
+def MOVLHPSrr : PSI<0x16, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                    "movlhps {$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst,
+                      (v4f32 (vector_shuffle VR128:$src1, VR128:$src2,
+                              MOVHP_shuffle_mask)))]>;
+
+def MOVHLPSrr : PSI<0x12, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                    "movhlps {$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst,
+                      (v4f32 (vector_shuffle VR128:$src1, VR128:$src2,
+                              MOVHLPS_shuffle_mask)))]>;
+} // AddedComplexity
+} // isTwoAddress
+
+
+
+// Arithmetic
+
+/// sse1_fp_unop_rm - SSE1 unops come in both scalar and vector forms.
+///
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation.  This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a
+/// scalar) and leaves the top elements undefined.
+///
+/// In addition, there is a special variant for the full-vector intrinsic form.
+///
+/// These four forms can each have a reg or a mem operand, so there are a
+/// total of eight "instructions".
+///
+multiclass sse1_fp_unop_rm<bits<8> opc, string OpcodeStr,
+                           SDNode OpNode,
+                           Intrinsic F32Int,
+                           Intrinsic V4F32Int,
+                           bit Commutable = 0> {
+  // Scalar operation, reg.
+  def SSr : SSI<opc, MRMSrcReg, (ops FR32:$dst, FR32:$src),
+                !strconcat(OpcodeStr, "ss {$src, $dst|$dst, $src}"),
+                [(set FR32:$dst, (OpNode FR32:$src))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Scalar operation, mem.
+  def SSm : SSI<opc, MRMSrcMem, (ops FR32:$dst, f32mem:$src),
+                !strconcat(OpcodeStr, "ss {$src, $dst|$dst, $src}"),
+                [(set FR32:$dst, (OpNode (load addr:$src)))]>;
+                 
+  // Vector operation, reg.
+  def PSr : PSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+              !strconcat(OpcodeStr, "ps {$src, $dst|$dst, $src}"),
+              [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector operation, mem.
+  def PSm : PSI<opc, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+                !strconcat(OpcodeStr, "ps {$src, $dst|$dst, $src}"),
+                [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))]>;
+
+  // Intrinsic operation, reg.
+  def SSr_Int : SSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                    !strconcat(OpcodeStr, "ss {$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (F32Int VR128:$src))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Intrinsic operation, mem.
+  def SSm_Int : SSI<opc, MRMSrcMem, (ops VR128:$dst, ssmem:$src),
+                    !strconcat(OpcodeStr, "ss {$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (F32Int sse_load_f32:$src))]>;
+
+  // Vector intrinsic operation, reg
+  def PSr_Int : PSI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                    !strconcat(OpcodeStr, "ps {$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (V4F32Int VR128:$src))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector intrinsic operation, mem
+  def PSm_Int : PSI<opc, MRMSrcMem, (ops VR128:$dst, f32mem:$src),
+                    !strconcat(OpcodeStr, "ps {$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (V4F32Int (load addr:$src)))]>;
+}
+
+// Square root.
+defm SQRT  : sse1_fp_unop_rm<0x51, "sqrt",  fsqrt,
+                             int_x86_sse_sqrt_ss, int_x86_sse_sqrt_ps>;
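+// (Illustration, assuming the usual defm name prefixing: SQRT should expand to
+// SQRTSSr/SQRTSSm, SQRTPSr/SQRTPSm, SQRTSSr_Int/SQRTSSm_Int and
+// SQRTPSr_Int/SQRTPSm_Int; the unop forms take a single source operand.)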
+
+// Reciprocal approximations. Note that these typically require refinement
+// in order to obtain suitable precision.
+defm RSQRT : sse1_fp_unop_rm<0x52, "rsqrt", X86frsqrt,
+                             int_x86_sse_rsqrt_ss, int_x86_sse_rsqrt_ps>;
+defm RCP   : sse1_fp_unop_rm<0x53, "rcp",   X86frcp,
+                             int_x86_sse_rcp_ss, int_x86_sse_rcp_ps>;
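+
+// For reference (not something these patterns emit), a common Newton-Raphson
+// refinement step for these approximations is:
+//   rcp:    x1 = x0 * (2.0 - a * x0)
+//   rsqrt:  x1 = x0 * (1.5 - 0.5 * a * x0 * x0)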
+
+// Logical
+let isTwoAddress = 1 in {
+  let isCommutable = 1 in {
+    def ANDPSrr : PSI<0x54, MRMSrcReg,
+                      (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                      "andps {$src2, $dst|$dst, $src2}",
+                      [(set VR128:$dst, (v2i64
+                                         (and VR128:$src1, VR128:$src2)))]>;
+    def ORPSrr  : PSI<0x56, MRMSrcReg,
+                      (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                      "orps {$src2, $dst|$dst, $src2}",
+                      [(set VR128:$dst, (v2i64
+                                         (or VR128:$src1, VR128:$src2)))]>;
+    def XORPSrr : PSI<0x57, MRMSrcReg,
+                      (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                      "xorps {$src2, $dst|$dst, $src2}",
+                      [(set VR128:$dst, (v2i64
+                                         (xor VR128:$src1, VR128:$src2)))]>;
+  }
+
+  def ANDPSrm : PSI<0x54, MRMSrcMem,
+                    (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                    "andps {$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (and VR128:$src1,
+                                       (bc_v2i64 (loadv4f32 addr:$src2))))]>;
+  def ORPSrm  : PSI<0x56, MRMSrcMem,
+                    (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                    "orps {$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (or VR128:$src1,
+                                       (bc_v2i64 (loadv4f32 addr:$src2))))]>;
+  def XORPSrm : PSI<0x57, MRMSrcMem,
+                    (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                    "xorps {$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (xor VR128:$src1,
+                                       (bc_v2i64 (loadv4f32 addr:$src2))))]>;
+  def ANDNPSrr : PSI<0x55, MRMSrcReg,
+                     (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                     "andnps {$src2, $dst|$dst, $src2}",
+                     [(set VR128:$dst,
+                       (v2i64 (and (xor VR128:$src1,
+                                    (bc_v2i64 (v4i32 immAllOnesV))),
+                               VR128:$src2)))]>;
+  def ANDNPSrm : PSI<0x55, MRMSrcMem,
+                     (ops VR128:$dst, VR128:$src1,f128mem:$src2),
+                     "andnps {$src2, $dst|$dst, $src2}",
+                     [(set VR128:$dst,
+                       (v2i64 (and (xor VR128:$src1,
+                                    (bc_v2i64 (v4i32 immAllOnesV))),
+                               (bc_v2i64 (loadv4f32 addr:$src2)))))]>;
+}
+
+let isTwoAddress = 1 in {
+  def CMPPSrri : PSIi8<0xC2, MRMSrcReg, 
+                      (ops VR128:$dst, VR128:$src1, VR128:$src, SSECC:$cc),
+                      "cmp${cc}ps {$src, $dst|$dst, $src}",
+                      [(set VR128:$dst, (int_x86_sse_cmp_ps VR128:$src1,
+                                         VR128:$src, imm:$cc))]>;
+  def CMPPSrmi : PSIi8<0xC2, MRMSrcMem, 
+                      (ops VR128:$dst, VR128:$src1, f128mem:$src, SSECC:$cc),
+                      "cmp${cc}ps {$src, $dst|$dst, $src}",
+                      [(set VR128:$dst, (int_x86_sse_cmp_ps VR128:$src1,
+                                         (load addr:$src), imm:$cc))]>;
+}
+
+// Shuffle and unpack instructions
+let isTwoAddress = 1 in {
+  let isConvertibleToThreeAddress = 1 in // Convert to pshufd
+    def SHUFPSrri : PSIi8<0xC6, MRMSrcReg, 
+                          (ops VR128:$dst, VR128:$src1,
+                           VR128:$src2, i32i8imm:$src3),
+                          "shufps {$src3, $src2, $dst|$dst, $src2, $src3}",
+                          [(set VR128:$dst,
+                            (v4f32 (vector_shuffle
+                                    VR128:$src1, VR128:$src2,
+                                    SHUFP_shuffle_mask:$src3)))]>;
+  def SHUFPSrmi : PSIi8<0xC6, MRMSrcMem, 
+                        (ops VR128:$dst, VR128:$src1,
+                         f128mem:$src2, i32i8imm:$src3),
+                        "shufps {$src3, $src2, $dst|$dst, $src2, $src3}",
+                        [(set VR128:$dst,
+                          (v4f32 (vector_shuffle
+                                  VR128:$src1, (load addr:$src2),
+                                  SHUFP_shuffle_mask:$src3)))]>;
+
+  let AddedComplexity = 10 in {
+    def UNPCKHPSrr : PSI<0x15, MRMSrcReg, 
+                         (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                         "unpckhps {$src2, $dst|$dst, $src2}",
+                         [(set VR128:$dst,
+                           (v4f32 (vector_shuffle
+                                   VR128:$src1, VR128:$src2,
+                                   UNPCKH_shuffle_mask)))]>;
+    def UNPCKHPSrm : PSI<0x15, MRMSrcMem, 
+                         (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                         "unpckhps {$src2, $dst|$dst, $src2}",
+                         [(set VR128:$dst,
+                           (v4f32 (vector_shuffle
+                                   VR128:$src1, (load addr:$src2),
+                                   UNPCKH_shuffle_mask)))]>;
+
+    def UNPCKLPSrr : PSI<0x14, MRMSrcReg, 
+                         (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                         "unpcklps {$src2, $dst|$dst, $src2}",
+                         [(set VR128:$dst,
+                           (v4f32 (vector_shuffle
+                                   VR128:$src1, VR128:$src2,
+                                   UNPCKL_shuffle_mask)))]>;
+    def UNPCKLPSrm : PSI<0x14, MRMSrcMem, 
+                         (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                         "unpcklps {$src2, $dst|$dst, $src2}",
+                         [(set VR128:$dst,
+                           (v4f32 (vector_shuffle
+                                   VR128:$src1, (load addr:$src2),
+                                   UNPCKL_shuffle_mask)))]>;
+  } // AddedComplexity
+} // isTwoAddress
+
+// Mask creation
+def MOVMSKPSrr : PSI<0x50, MRMSrcReg, (ops GR32:$dst, VR128:$src),
+                     "movmskps {$src, $dst|$dst, $src}",
+                     [(set GR32:$dst, (int_x86_sse_movmsk_ps VR128:$src))]>;
+def MOVMSKPDrr : PSI<0x50, MRMSrcReg, (ops GR32:$dst, VR128:$src),
+                     "movmskpd {$src, $dst|$dst, $src}",
+                     [(set GR32:$dst, (int_x86_sse2_movmsk_pd VR128:$src))]>;
+
+// Prefetching loads.
+// TODO: no intrinsics for these?
+def PREFETCHT0   : PSI<0x18, MRM1m, (ops i8mem:$src), "prefetcht0 $src", []>;
+def PREFETCHT1   : PSI<0x18, MRM2m, (ops i8mem:$src), "prefetcht1 $src", []>;
+def PREFETCHT2   : PSI<0x18, MRM3m, (ops i8mem:$src), "prefetcht2 $src", []>;
+def PREFETCHNTA  : PSI<0x18, MRM0m, (ops i8mem:$src), "prefetchnta $src", []>;
+
+// Non-temporal stores
+def MOVNTPSmr : PSI<0x2B, MRMDestMem, (ops i128mem:$dst, VR128:$src),
+                    "movntps {$src, $dst|$dst, $src}",
+                    [(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>;
+
+// Load, store, and memory fence
+def SFENCE : PSI<0xAE, MRM7m, (ops), "sfence", [(int_x86_sse_sfence)]>;
+
+// MXCSR register
+def LDMXCSR : PSI<0xAE, MRM2m, (ops i32mem:$src),
+                  "ldmxcsr $src", [(int_x86_sse_ldmxcsr addr:$src)]>;
+def STMXCSR : PSI<0xAE, MRM3m, (ops i32mem:$dst),
+                  "stmxcsr $dst", [(int_x86_sse_stmxcsr addr:$dst)]>;
+
+// Alias instruction that maps the zero vector to pxor / xorp* for SSE.
+// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
+let isReMaterializable = 1 in
+def V_SET0 : PSI<0x57, MRMInitReg, (ops VR128:$dst),
+                 "xorps $dst, $dst",
+                 [(set VR128:$dst, (v4f32 immAllZerosV))]>;
+
+// FR32 to 128-bit vector conversion.
+def MOVSS2PSrr : SSI<0x10, MRMSrcReg, (ops VR128:$dst, FR32:$src),
+                      "movss {$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v4f32 (scalar_to_vector FR32:$src)))]>;
+def MOVSS2PSrm : SSI<0x10, MRMSrcMem, (ops VR128:$dst, f32mem:$src),
+                     "movss {$src, $dst|$dst, $src}",
+                     [(set VR128:$dst,
+                       (v4f32 (scalar_to_vector (loadf32 addr:$src))))]>;
+
+// FIXME: we may not be able to eliminate this movss with coalescing because
+// the src and dest register classes are different. We really want to write
+// this pattern like this:
+// def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
+//           (f32 FR32:$src)>;
+def MOVPS2SSrr : SSI<0x10, MRMSrcReg, (ops FR32:$dst, VR128:$src),
+                     "movss {$src, $dst|$dst, $src}",
+                     [(set FR32:$dst, (vector_extract (v4f32 VR128:$src),
+                                       (iPTR 0)))]>;
+def MOVPS2SSmr : SSI<0x11, MRMDestMem, (ops f32mem:$dst, VR128:$src),
+                     "movss {$src, $dst|$dst, $src}",
+                     [(store (f32 (vector_extract (v4f32 VR128:$src),
+                                   (iPTR 0))), addr:$dst)]>;
+
+
+// Move to lower bits of a VR128, leaving upper bits alone.
+// Three-operand (but two-address) aliases.
+let isTwoAddress = 1 in {
+  def MOVLSS2PSrr : SSI<0x10, MRMSrcReg,
+                        (ops VR128:$dst, VR128:$src1, FR32:$src2),
+                        "movss {$src2, $dst|$dst, $src2}", []>;
+
+  let AddedComplexity = 15 in
+    def MOVLPSrr : SSI<0x10, MRMSrcReg,
+                       (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                       "movss {$src2, $dst|$dst, $src2}",
+                       [(set VR128:$dst,
+                         (v4f32 (vector_shuffle VR128:$src1, VR128:$src2,
+                                 MOVL_shuffle_mask)))]>;
+}
+
+// Move to the lower bits of a VR128, zeroing the upper bits.
+// Loading from memory automatically zeroes the upper bits.
+let AddedComplexity = 20 in
+def MOVZSS2PSrm : SSI<0x10, MRMSrcMem, (ops VR128:$dst, f32mem:$src),
+                      "movss {$src, $dst|$dst, $src}",
+                      [(set VR128:$dst, (v4f32 (vector_shuffle immAllZerosV,
+                                 (v4f32 (scalar_to_vector (loadf32 addr:$src))),
+                                                MOVL_shuffle_mask)))]>;
+
+
+//===----------------------------------------------------------------------===//
+// SSE2 Instructions
+//===----------------------------------------------------------------------===//
+
+// SSE2 Instruction Templates:
+// 
+//   SDI   - SSE2 instructions with XD prefix.
+//   PDI   - SSE2 instructions with TB and OpSize prefixes.
+//   PDIi8 - SSE2 instructions with ImmT == Imm8 and TB and OpSize prefixes.
+
+class SDI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : I<o, F, ops, asm, pattern>, XD, Requires<[HasSSE2]>;
+class PDI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : I<o, F, ops, asm, pattern>, TB, OpSize, Requires<[HasSSE2]>;
+class PDIi8<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : Ii8<o, F, ops, asm, pattern>, TB, OpSize, Requires<[HasSSE2]>;
+
+// Move Instructions
+def MOVSDrr : SDI<0x10, MRMSrcReg, (ops FR64:$dst, FR64:$src),
+                  "movsd {$src, $dst|$dst, $src}", []>;
+def MOVSDrm : SDI<0x10, MRMSrcMem, (ops FR64:$dst, f64mem:$src),
+                  "movsd {$src, $dst|$dst, $src}",
+                  [(set FR64:$dst, (loadf64 addr:$src))]>;
+def MOVSDmr : SDI<0x11, MRMDestMem, (ops f64mem:$dst, FR64:$src),
+                  "movsd {$src, $dst|$dst, $src}",
+                  [(store FR64:$src, addr:$dst)]>;
+
+// Conversion instructions
+def CVTTSD2SIrr : SDI<0x2C, MRMSrcReg, (ops GR32:$dst, FR64:$src),
+                      "cvttsd2si {$src, $dst|$dst, $src}",
+                      [(set GR32:$dst, (fp_to_sint FR64:$src))]>;
+def CVTTSD2SIrm : SDI<0x2C, MRMSrcMem, (ops GR32:$dst, f64mem:$src),
+                      "cvttsd2si {$src, $dst|$dst, $src}",
+                      [(set GR32:$dst, (fp_to_sint (loadf64 addr:$src)))]>;
+def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (ops FR32:$dst, FR64:$src),
+                      "cvtsd2ss {$src, $dst|$dst, $src}",
+                      [(set FR32:$dst, (fround FR64:$src))]>;
+def CVTSD2SSrm  : SDI<0x5A, MRMSrcMem, (ops FR32:$dst, f64mem:$src), 
+                      "cvtsd2ss {$src, $dst|$dst, $src}",
+                      [(set FR32:$dst, (fround (loadf64 addr:$src)))]>;
+def CVTSI2SDrr  : SDI<0x2A, MRMSrcReg, (ops FR64:$dst, GR32:$src),
+                      "cvtsi2sd {$src, $dst|$dst, $src}",
+                      [(set FR64:$dst, (sint_to_fp GR32:$src))]>;
+def CVTSI2SDrm  : SDI<0x2A, MRMSrcMem, (ops FR64:$dst, i32mem:$src),
+                      "cvtsi2sd {$src, $dst|$dst, $src}",
+                      [(set FR64:$dst, (sint_to_fp (loadi32 addr:$src)))]>;
+
+// SSE2 instructions with XS prefix
+def CVTSS2SDrr : I<0x5A, MRMSrcReg, (ops FR64:$dst, FR32:$src),
+                   "cvtss2sd {$src, $dst|$dst, $src}",
+                   [(set FR64:$dst, (fextend FR32:$src))]>, XS,
+                 Requires<[HasSSE2]>;
+def CVTSS2SDrm : I<0x5A, MRMSrcMem, (ops FR64:$dst, f32mem:$src),
+                   "cvtss2sd {$src, $dst|$dst, $src}",
+                   [(set FR64:$dst, (extloadf32 addr:$src))]>, XS,
+                 Requires<[HasSSE2]>;
+
+// Match intrinsics which expect XMM operand(s).
+def Int_CVTSD2SIrr : SDI<0x2D, MRMSrcReg, (ops GR32:$dst, VR128:$src),
+                         "cvtsd2si {$src, $dst|$dst, $src}",
+                         [(set GR32:$dst, (int_x86_sse2_cvtsd2si VR128:$src))]>;
+def Int_CVTSD2SIrm : SDI<0x2D, MRMSrcMem, (ops GR32:$dst, f128mem:$src),
+                         "cvtsd2si {$src, $dst|$dst, $src}",
+                         [(set GR32:$dst, (int_x86_sse2_cvtsd2si
+                                           (load addr:$src)))]>;
+
+// Aliases for intrinsics
+def Int_CVTTSD2SIrr : SDI<0x2C, MRMSrcReg, (ops GR32:$dst, VR128:$src),
+                          "cvttsd2si {$src, $dst|$dst, $src}",
+                          [(set GR32:$dst,
+                            (int_x86_sse2_cvttsd2si VR128:$src))]>;
+def Int_CVTTSD2SIrm : SDI<0x2C, MRMSrcMem, (ops GR32:$dst, f128mem:$src),
+                          "cvttsd2si {$src, $dst|$dst, $src}",
+                          [(set GR32:$dst, (int_x86_sse2_cvttsd2si
+                                            (load addr:$src)))]>;
+
+// Comparison instructions
+let isTwoAddress = 1 in {
+  def CMPSDrr : SDI<0xC2, MRMSrcReg, 
+                    (ops FR64:$dst, FR64:$src1, FR64:$src, SSECC:$cc),
+                    "cmp${cc}sd {$src, $dst|$dst, $src}", []>;
+  def CMPSDrm : SDI<0xC2, MRMSrcMem, 
+                    (ops FR64:$dst, FR64:$src1, f64mem:$src, SSECC:$cc),
+                    "cmp${cc}sd {$src, $dst|$dst, $src}", []>;
+}
+
+def UCOMISDrr: PDI<0x2E, MRMSrcReg, (ops FR64:$src1, FR64:$src2),
+                   "ucomisd {$src2, $src1|$src1, $src2}",
+                   [(X86cmp FR64:$src1, FR64:$src2)]>;
+def UCOMISDrm: PDI<0x2E, MRMSrcMem, (ops FR64:$src1, f64mem:$src2),
+                   "ucomisd {$src2, $src1|$src1, $src2}",
+                   [(X86cmp FR64:$src1, (loadf64 addr:$src2))]>;
+
+// Aliases to match intrinsics which expect XMM operand(s).
+let isTwoAddress = 1 in {
+  def Int_CMPSDrr : SDI<0xC2, MRMSrcReg, 
+                        (ops VR128:$dst, VR128:$src1, VR128:$src, SSECC:$cc),
+                        "cmp${cc}sd {$src, $dst|$dst, $src}",
+                        [(set VR128:$dst, (int_x86_sse2_cmp_sd VR128:$src1,
+                                           VR128:$src, imm:$cc))]>;
+  def Int_CMPSDrm : SDI<0xC2, MRMSrcMem, 
+                        (ops VR128:$dst, VR128:$src1, f64mem:$src, SSECC:$cc),
+                        "cmp${cc}sd {$src, $dst|$dst, $src}",
+                        [(set VR128:$dst, (int_x86_sse2_cmp_sd VR128:$src1,
+                                           (load addr:$src), imm:$cc))]>;
+}
+
+def Int_UCOMISDrr: PDI<0x2E, MRMSrcReg, (ops VR128:$src1, VR128:$src2),
+                       "ucomisd {$src2, $src1|$src1, $src2}",
+                       [(X86ucomi (v2f64 VR128:$src1), (v2f64 VR128:$src2))]>;
+def Int_UCOMISDrm: PDI<0x2E, MRMSrcMem, (ops VR128:$src1, f128mem:$src2),
+                       "ucomisd {$src2, $src1|$src1, $src2}",
+                       [(X86ucomi (v2f64 VR128:$src1), (load addr:$src2))]>;
+
+def Int_COMISDrr: PDI<0x2F, MRMSrcReg, (ops VR128:$src1, VR128:$src2),
+                      "comisd {$src2, $src1|$src1, $src2}",
+                      [(X86comi (v2f64 VR128:$src1), (v2f64 VR128:$src2))]>;
+def Int_COMISDrm: PDI<0x2F, MRMSrcMem, (ops VR128:$src1, f128mem:$src2),
+                      "comisd {$src2, $src1|$src1, $src2}",
+                      [(X86comi (v2f64 VR128:$src1), (load addr:$src2))]>;
+
+// Aliases of packed SSE2 instructions for scalar use. These all have names that
+// start with 'Fs'.
+
+// Alias instructions that map fld0 to pxor for sse.
+def FsFLD0SD : I<0xEF, MRMInitReg, (ops FR64:$dst),
+                 "pxor $dst, $dst", [(set FR64:$dst, fpimm0)]>,
+               Requires<[HasSSE2]>, TB, OpSize;
+
+// Alias instruction to do FR64 reg-to-reg copy using movapd. Upper bits are
+// disregarded.
+def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (ops FR64:$dst, FR64:$src),
+                     "movapd {$src, $dst|$dst, $src}", []>;
+
+// Alias instruction to load FR64 from f128mem using movapd. Upper bits are
+// disregarded.
+def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (ops FR64:$dst, f128mem:$src),
+                     "movapd {$src, $dst|$dst, $src}",
+                     [(set FR64:$dst, (X86loadpf64 addr:$src))]>;
+
+// Alias bitwise logical operations using SSE logical ops on packed FP values.
+let isTwoAddress = 1 in {
+let isCommutable = 1 in {
+  def FsANDPDrr : PDI<0x54, MRMSrcReg, (ops FR64:$dst, FR64:$src1, FR64:$src2),
+                      "andpd {$src2, $dst|$dst, $src2}",
+                      [(set FR64:$dst, (X86fand FR64:$src1, FR64:$src2))]>;
+  def FsORPDrr  : PDI<0x56, MRMSrcReg, (ops FR64:$dst, FR64:$src1, FR64:$src2),
+                      "orpd {$src2, $dst|$dst, $src2}",
+                      [(set FR64:$dst, (X86for FR64:$src1, FR64:$src2))]>;
+  def FsXORPDrr : PDI<0x57, MRMSrcReg, (ops FR64:$dst, FR64:$src1, FR64:$src2),
+                      "xorpd {$src2, $dst|$dst, $src2}",
+                      [(set FR64:$dst, (X86fxor FR64:$src1, FR64:$src2))]>;
+}
+
+def FsANDPDrm : PDI<0x54, MRMSrcMem, (ops FR64:$dst, FR64:$src1, f128mem:$src2),
+                    "andpd {$src2, $dst|$dst, $src2}",
+                    [(set FR64:$dst, (X86fand FR64:$src1,
+                                      (X86loadpf64 addr:$src2)))]>;
+def FsORPDrm  : PDI<0x56, MRMSrcMem, (ops FR64:$dst, FR64:$src1, f128mem:$src2),
+                    "orpd {$src2, $dst|$dst, $src2}",
+                    [(set FR64:$dst, (X86for FR64:$src1,
+                                      (X86loadpf64 addr:$src2)))]>;
+def FsXORPDrm : PDI<0x57, MRMSrcMem, (ops FR64:$dst, FR64:$src1, f128mem:$src2),
+                    "xorpd {$src2, $dst|$dst, $src2}",
+                    [(set FR64:$dst, (X86fxor FR64:$src1,
+                                      (X86loadpf64 addr:$src2)))]>;
+
+def FsANDNPDrr : PDI<0x55, MRMSrcReg,
+                     (ops FR64:$dst, FR64:$src1, FR64:$src2),
+                     "andnpd {$src2, $dst|$dst, $src2}", []>;
+def FsANDNPDrm : PDI<0x55, MRMSrcMem,
+                     (ops FR64:$dst, FR64:$src1, f128mem:$src2),
+                     "andnpd {$src2, $dst|$dst, $src2}", []>;
+}
+
+/// basic_sse2_fp_binop_rm - SSE2 binops come in both scalar and vector forms.
+///
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation.  This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a scalar)
+/// and leaves the top elements undefined.
+///
+/// These three forms can each be reg+reg or reg+mem, so there are a total of
+/// six "instructions".
+///
+let isTwoAddress = 1 in {
+multiclass basic_sse2_fp_binop_rm<bits<8> opc, string OpcodeStr,
+                                  SDNode OpNode, Intrinsic F64Int,
+                                  bit Commutable = 0> {
+  // Scalar operation, reg+reg.
+  def SDrr : SDI<opc, MRMSrcReg, (ops FR64:$dst, FR64:$src1, FR64:$src2),
+                 !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
+                 [(set FR64:$dst, (OpNode FR64:$src1, FR64:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Scalar operation, reg+mem.
+  def SDrm : SDI<opc, MRMSrcMem, (ops FR64:$dst, FR64:$src1, f64mem:$src2),
+                 !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
+                 [(set FR64:$dst, (OpNode FR64:$src1, (load addr:$src2)))]>;
+                 
+  // Vector operation, reg+reg.
+  def PDrr : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+               !strconcat(OpcodeStr, "pd {$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (v2f64 (OpNode VR128:$src1, VR128:$src2)))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector operation, reg+mem.
+  def PDrm : PDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                 !strconcat(OpcodeStr, "pd {$src2, $dst|$dst, $src2}"),
+                 [(set VR128:$dst, (OpNode VR128:$src1, (loadv2f64 addr:$src2)))]>;
+
+  // Intrinsic operation, reg+reg.
+  def SDrr_Int : SDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                     !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Intrinsic operation, reg+mem.
+  def SDrm_Int : SDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, sdmem:$src2),
+                     !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (F64Int VR128:$src1,
+                                               sse_load_f64:$src2))]>;
+}
+}
+
+// Arithmetic instructions
+defm ADD : basic_sse2_fp_binop_rm<0x58, "add", fadd, int_x86_sse2_add_sd, 1>;
+defm MUL : basic_sse2_fp_binop_rm<0x59, "mul", fmul, int_x86_sse2_mul_sd, 1>;
+defm SUB : basic_sse2_fp_binop_rm<0x5C, "sub", fsub, int_x86_sse2_sub_sd>;
+defm DIV : basic_sse2_fp_binop_rm<0x5E, "div", fdiv, int_x86_sse2_div_sd>;
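+
+// As a concrete illustration: tblgen expands each defm above by prepending
+// the defm name to the per-form def names, so "defm ADD" produces ADDSDrr,
+// ADDSDrm, ADDPDrr, ADDPDrm, ADDSDrr_Int and ADDSDrm_Int -- the six
+// "instructions" mentioned in the basic_sse2_fp_binop_rm comment.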
+
+/// sse2_fp_binop_rm - Other SSE2 binops
+///
+/// This multiclass is like basic_sse2_fp_binop_rm, with the addition of
+/// instructions for a full-vector intrinsic form.  Operations that map
+/// onto C operators don't use this form since they just use the plain
+/// vector form instead of having a separate vector intrinsic form.
+///
+/// This provides a total of eight "instructions".
+///
+let isTwoAddress = 1 in {
+multiclass sse2_fp_binop_rm<bits<8> opc, string OpcodeStr,
+                            SDNode OpNode,
+                            Intrinsic F64Int,
+                            Intrinsic V2F64Int,
+                            bit Commutable = 0> {
+
+  // Scalar operation, reg+reg.
+  def SDrr : SDI<opc, MRMSrcReg, (ops FR64:$dst, FR64:$src1, FR64:$src2),
+                 !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
+                 [(set FR64:$dst, (OpNode FR64:$src1, FR64:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Scalar operation, reg+mem.
+  def SDrm : SDI<opc, MRMSrcMem, (ops FR64:$dst, FR64:$src1, f64mem:$src2),
+                 !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
+                 [(set FR64:$dst, (OpNode FR64:$src1, (load addr:$src2)))]>;
+                 
+  // Vector operation, reg+reg.
+  def PDrr : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+               !strconcat(OpcodeStr, "pd {$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (v2f64 (OpNode VR128:$src1, VR128:$src2)))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector operation, reg+mem.
+  def PDrm : PDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                 !strconcat(OpcodeStr, "pd {$src2, $dst|$dst, $src2}"),
+                 [(set VR128:$dst, (OpNode VR128:$src1, (loadv2f64 addr:$src2)))]>;
+
+  // Intrinsic operation, reg+reg.
+  def SDrr_Int : SDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                     !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Intrinsic operation, reg+mem.
+  def SDrm_Int : SDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, sdmem:$src2),
+                     !strconcat(OpcodeStr, "sd {$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (F64Int VR128:$src1,
+                                               sse_load_f64:$src2))]>;
+
+  // Vector intrinsic operation, reg+reg.
+  def PDrr_Int : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                     !strconcat(OpcodeStr, "pd {$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (V2F64Int VR128:$src1, VR128:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector intrinsic operation, reg+mem.
+  def PDrm_Int : PDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f64mem:$src2),
+                     !strconcat(OpcodeStr, "pd {$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (V2F64Int VR128:$src1, (load addr:$src2)))]>;
+}
+}
+
+defm MAX : sse2_fp_binop_rm<0x5F, "max", X86fmax,
+                            int_x86_sse2_max_sd, int_x86_sse2_max_pd>;
+defm MIN : sse2_fp_binop_rm<0x5D, "min", X86fmin,
+                            int_x86_sse2_min_sd, int_x86_sse2_min_pd>;
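+
+// Expansion note: "defm MAX" yields MAXSDrr, MAXSDrm, MAXPDrr, MAXPDrm,
+// MAXSDrr_Int, MAXSDrm_Int, MAXPDrr_Int and MAXPDrm_Int -- the eight
+// "instructions" described in the sse2_fp_binop_rm comment (and likewise
+// for MIN).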
+
+//===----------------------------------------------------------------------===//
+// SSE packed FP Instructions
+
+// Move Instructions
+def MOVAPDrr : PDI<0x28, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                   "movapd {$src, $dst|$dst, $src}", []>;
+def MOVAPDrm : PDI<0x28, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+                   "movapd {$src, $dst|$dst, $src}",
+                   [(set VR128:$dst, (loadv2f64 addr:$src))]>;
+
+def MOVAPDmr : PDI<0x29, MRMDestMem, (ops f128mem:$dst, VR128:$src),
+                   "movapd {$src, $dst|$dst, $src}",
+                   [(store (v2f64 VR128:$src), addr:$dst)]>;
+
+def MOVUPDrr : PDI<0x10, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                   "movupd {$src, $dst|$dst, $src}", []>;
+def MOVUPDrm : PDI<0x10, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+                   "movupd {$src, $dst|$dst, $src}",
+                   [(set VR128:$dst, (int_x86_sse2_loadu_pd addr:$src))]>;
+def MOVUPDmr : PDI<0x11, MRMDestMem, (ops f128mem:$dst, VR128:$src),
+                   "movupd {$src, $dst|$dst, $src}",
+                   [(int_x86_sse2_storeu_pd addr:$dst, VR128:$src)]>;
+
+let isTwoAddress = 1 in {
+  let AddedComplexity = 20 in {
+    def MOVLPDrm : PDI<0x12, MRMSrcMem,
+                       (ops VR128:$dst, VR128:$src1, f64mem:$src2),
+                       "movlpd {$src2, $dst|$dst, $src2}",
+                       [(set VR128:$dst, 
+                         (v2f64 (vector_shuffle VR128:$src1,
+                                 (scalar_to_vector (loadf64 addr:$src2)),
+                                 MOVLP_shuffle_mask)))]>;
+    def MOVHPDrm : PDI<0x16, MRMSrcMem,
+                       (ops VR128:$dst, VR128:$src1, f64mem:$src2),
+                       "movhpd {$src2, $dst|$dst, $src2}",
+                       [(set VR128:$dst, 
+                         (v2f64 (vector_shuffle VR128:$src1,
+                                 (scalar_to_vector (loadf64 addr:$src2)),
+                                 MOVHP_shuffle_mask)))]>;
+  } // AddedComplexity
+} // isTwoAddress
+
+def MOVLPDmr : PDI<0x13, MRMDestMem, (ops f64mem:$dst, VR128:$src),
+                   "movlpd {$src, $dst|$dst, $src}",
+                   [(store (f64 (vector_extract (v2f64 VR128:$src),
+                                 (iPTR 0))), addr:$dst)]>;
+
+// v2f64 extract element 1 is always custom lowered to unpack high to low
+// and extract element 0 so the non-store version isn't too horrible.
+def MOVHPDmr : PDI<0x17, MRMDestMem, (ops f64mem:$dst, VR128:$src),
+                   "movhpd {$src, $dst|$dst, $src}",
+                   [(store (f64 (vector_extract
+                                 (v2f64 (vector_shuffle VR128:$src, (undef),
+                                         UNPCKH_shuffle_mask)), (iPTR 0))),
+                     addr:$dst)]>;
+
+// SSE2 instructions without OpSize prefix
+def Int_CVTDQ2PSrr : I<0x5B, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                       "cvtdq2ps {$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))]>,
+                     TB, Requires<[HasSSE2]>;
+def Int_CVTDQ2PSrm : I<0x5B, MRMSrcMem, (ops VR128:$dst, i128mem:$src),
+                       "cvtdq2ps {$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtdq2ps
+                                         (bitconvert (loadv2i64 addr:$src))))]>,
+                     TB, Requires<[HasSSE2]>;
+
+// SSE2 instructions with XS prefix
+def Int_CVTDQ2PDrr : I<0xE6, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                       "cvtdq2pd {$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))]>,
+                     XS, Requires<[HasSSE2]>;
+def Int_CVTDQ2PDrm : I<0xE6, MRMSrcMem, (ops VR128:$dst, i64mem:$src),
+                       "cvtdq2pd {$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtdq2pd
+                                          (bitconvert (loadv2i64 addr:$src))))]>,
+                     XS, Requires<[HasSSE2]>;
+
+def Int_CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                         "cvtps2dq {$src, $dst|$dst, $src}",
+                         [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))]>;
+def Int_CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+                         "cvtps2dq {$src, $dst|$dst, $src}",
+                         [(set VR128:$dst, (int_x86_sse2_cvtps2dq
+                                            (load addr:$src)))]>;
+// SSE2 packed instructions with XS prefix
+def Int_CVTTPS2DQrr : I<0x5B, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                        "cvttps2dq {$src, $dst|$dst, $src}",
+                        [(set VR128:$dst, (int_x86_sse2_cvttps2dq VR128:$src))]>,
+                      XS, Requires<[HasSSE2]>;
+def Int_CVTTPS2DQrm : I<0x5B, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+                        "cvttps2dq {$src, $dst|$dst, $src}",
+                        [(set VR128:$dst, (int_x86_sse2_cvttps2dq
+                                           (load addr:$src)))]>,
+                      XS, Requires<[HasSSE2]>;
+
+// SSE2 packed instructions with XD prefix
+def Int_CVTPD2DQrr : I<0xE6, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                       "cvtpd2dq {$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>,
+                     XD, Requires<[HasSSE2]>;
+def Int_CVTPD2DQrm : I<0xE6, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+                       "cvtpd2dq {$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtpd2dq
+                                          (load addr:$src)))]>,
+                     XD, Requires<[HasSSE2]>;
+
+def Int_CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                          "cvttpd2dq {$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))]>;
+def Int_CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+                          "cvttpd2dq {$src, $dst|$dst, $src}",
+                          [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
+                                             (load addr:$src)))]>;
+
+// SSE2 instructions without OpSize prefix
+def Int_CVTPS2PDrr : I<0x5A, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                       "cvtps2pd {$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>,
+                     TB, Requires<[HasSSE2]>;
+def Int_CVTPS2PDrm : I<0x5A, MRMSrcMem, (ops VR128:$dst, f64mem:$src),
+                       "cvtps2pd {$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtps2pd
+                                          (load addr:$src)))]>,
+                     TB, Requires<[HasSSE2]>;
+
+def Int_CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                         "cvtpd2ps {$src, $dst|$dst, $src}",
+                        [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))]>;
+def Int_CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+                         "cvtpd2ps {$src, $dst|$dst, $src}",
+                         [(set VR128:$dst, (int_x86_sse2_cvtpd2ps
+                                            (load addr:$src)))]>;
+
+// Aliases to match intrinsics which expect XMM operand(s).
+let isTwoAddress = 1 in {
+def Int_CVTSI2SDrr: SDI<0x2A, MRMSrcReg,
+                        (ops VR128:$dst, VR128:$src1, GR32:$src2),
+                        "cvtsi2sd {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst, (int_x86_sse2_cvtsi2sd VR128:$src1,
+                                           GR32:$src2))]>;
+def Int_CVTSI2SDrm: SDI<0x2A, MRMSrcMem,
+                        (ops VR128:$dst, VR128:$src1, i32mem:$src2),
+                        "cvtsi2sd {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst, (int_x86_sse2_cvtsi2sd VR128:$src1,
+                                           (loadi32 addr:$src2)))]>;
+def Int_CVTSD2SSrr: SDI<0x5A, MRMSrcReg,
+                        (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                   "cvtsd2ss {$src2, $dst|$dst, $src2}",
+                   [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1,
+                                      VR128:$src2))]>;
+def Int_CVTSD2SSrm: SDI<0x5A, MRMSrcMem,
+                        (ops VR128:$dst, VR128:$src1, f64mem:$src2), 
+                   "cvtsd2ss {$src2, $dst|$dst, $src2}",
+                   [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1,
+                                      (load addr:$src2)))]>;
+def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
+                      (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                    "cvtss2sd {$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1,
+                                       VR128:$src2))]>, XS,
+                    Requires<[HasSSE2]>;
+def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
+                      (ops VR128:$dst, VR128:$src1, f32mem:$src2),
+                    "cvtss2sd {$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1,
+                                       (load addr:$src2)))]>, XS,
+                    Requires<[HasSSE2]>;
+}
+
+// Arithmetic
+
+/// sse2_fp_unop_rm - SSE2 unops come in both scalar and vector forms.
+///
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation.  This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a
+/// scalar) and leaves the top elements undefined.
+///
+/// And, we have a special variant form for a full-vector intrinsic form.
+///
+/// These four forms can each have a reg or a mem operand, so there are a
+/// total of eight "instructions".
+///
+multiclass sse2_fp_unop_rm<bits<8> opc, string OpcodeStr,
+                           SDNode OpNode,
+                           Intrinsic F64Int,
+                           Intrinsic V2F64Int,
+                           bit Commutable = 0> {
+  // Scalar operation, reg.
+  def SDr : SDI<opc, MRMSrcReg, (ops FR64:$dst, FR64:$src),
+                !strconcat(OpcodeStr, "sd {$src, $dst|$dst, $src}"),
+                [(set FR64:$dst, (OpNode FR64:$src))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Scalar operation, mem.
+  def SDm : SDI<opc, MRMSrcMem, (ops FR64:$dst, f64mem:$src),
+                !strconcat(OpcodeStr, "sd {$src, $dst|$dst, $src}"),
+                [(set FR64:$dst, (OpNode (load addr:$src)))]>;
+                 
+  // Vector operation, reg.
+  def PDr : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+              !strconcat(OpcodeStr, "pd {$src, $dst|$dst, $src}"),
+              [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector operation, mem.
+  def PDm : PDI<opc, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+                !strconcat(OpcodeStr, "pd {$src, $dst|$dst, $src}"),
+                [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))]>;
+
+  // Intrinsic operation, reg.
+  def SDr_Int : SDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                    !strconcat(OpcodeStr, "sd {$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (F64Int VR128:$src))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Intrinsic operation, mem.
+  def SDm_Int : SDI<opc, MRMSrcMem, (ops VR128:$dst, sdmem:$src),
+                    !strconcat(OpcodeStr, "sd {$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (F64Int sse_load_f64:$src))]>;
+
+  // Vector intrinsic operation, reg
+  def PDr_Int : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                    !strconcat(OpcodeStr, "pd {$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (V2F64Int VR128:$src))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector intrinsic operation, mem
+  def PDm_Int : PDI<opc, MRMSrcMem, (ops VR128:$dst, f64mem:$src),
+                    !strconcat(OpcodeStr, "pd {$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (V2F64Int (load addr:$src)))]>;
+}
+
+// Square root.
+defm SQRT  : sse2_fp_unop_rm<0x51, "sqrt",  fsqrt,
+                             int_x86_sse2_sqrt_sd, int_x86_sse2_sqrt_pd>;
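+
+// Expansion note: "defm SQRT" yields SQRTSDr, SQRTSDm, SQRTPDr, SQRTPDm,
+// SQRTSDr_Int, SQRTSDm_Int, SQRTPDr_Int and SQRTPDm_Int -- the eight forms
+// described in the sse2_fp_unop_rm comment above.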
+
+// There is no f64 version of the reciprocal approximation instructions.
+
+// Logical
+let isTwoAddress = 1 in {
+  let isCommutable = 1 in {
+    def ANDPDrr : PDI<0x54, MRMSrcReg,
+                      (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                      "andpd {$src2, $dst|$dst, $src2}",
+                      [(set VR128:$dst,
+                        (and (bc_v2i64 (v2f64 VR128:$src1)),
+                         (bc_v2i64 (v2f64 VR128:$src2))))]>;
+    def ORPDrr  : PDI<0x56, MRMSrcReg,
+                      (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                      "orpd {$src2, $dst|$dst, $src2}",
+                      [(set VR128:$dst,
+                        (or (bc_v2i64 (v2f64 VR128:$src1)),
+                         (bc_v2i64 (v2f64 VR128:$src2))))]>;
+    def XORPDrr : PDI<0x57, MRMSrcReg,
+                      (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                      "xorpd {$src2, $dst|$dst, $src2}",
+                      [(set VR128:$dst,
+                        (xor (bc_v2i64 (v2f64 VR128:$src1)),
+                         (bc_v2i64 (v2f64 VR128:$src2))))]>;
+  }
+
+  def ANDPDrm : PDI<0x54, MRMSrcMem,
+                    (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                    "andpd {$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst,
+                      (and (bc_v2i64 (v2f64 VR128:$src1)),
+                       (bc_v2i64 (loadv2f64 addr:$src2))))]>;
+  def ORPDrm  : PDI<0x56, MRMSrcMem,
+                    (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                    "orpd {$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst,
+                      (or (bc_v2i64 (v2f64 VR128:$src1)),
+                       (bc_v2i64 (loadv2f64 addr:$src2))))]>;
+  def XORPDrm : PDI<0x57, MRMSrcMem,
+                    (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                    "xorpd {$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst,
+                      (xor (bc_v2i64 (v2f64 VR128:$src1)),
+                       (bc_v2i64 (loadv2f64 addr:$src2))))]>;
+  def ANDNPDrr : PDI<0x55, MRMSrcReg,
+                     (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                     "andnpd {$src2, $dst|$dst, $src2}",
+                     [(set VR128:$dst,
+                       (and (vnot (bc_v2i64 (v2f64 VR128:$src1))),
+                        (bc_v2i64 (v2f64 VR128:$src2))))]>;
+  def ANDNPDrm : PDI<0x55, MRMSrcMem,
+                     (ops VR128:$dst, VR128:$src1,f128mem:$src2),
+                     "andnpd {$src2, $dst|$dst, $src2}",
+                     [(set VR128:$dst,
+                       (and (vnot (bc_v2i64 (v2f64 VR128:$src1))),
+                        (bc_v2i64 (loadv2f64 addr:$src2))))]>;
+}
+
+let isTwoAddress = 1 in {
+  def CMPPDrri : PDIi8<0xC2, MRMSrcReg, 
+                      (ops VR128:$dst, VR128:$src1, VR128:$src, SSECC:$cc),
+                      "cmp${cc}pd {$src, $dst|$dst, $src}",
+                      [(set VR128:$dst, (int_x86_sse2_cmp_pd VR128:$src1,
+                                         VR128:$src, imm:$cc))]>;
+  def CMPPDrmi : PDIi8<0xC2, MRMSrcMem, 
+                      (ops VR128:$dst, VR128:$src1, f128mem:$src, SSECC:$cc),
+                      "cmp${cc}pd {$src, $dst|$dst, $src}",
+                      [(set VR128:$dst, (int_x86_sse2_cmp_pd VR128:$src1,
+                                         (load addr:$src), imm:$cc))]>;
+}
+
+// Shuffle and unpack instructions
+let isTwoAddress = 1 in {
+  def SHUFPDrri : PDIi8<0xC6, MRMSrcReg, 
+                        (ops VR128:$dst, VR128:$src1, VR128:$src2, i8imm:$src3),
+                        "shufpd {$src3, $src2, $dst|$dst, $src2, $src3}",
+                        [(set VR128:$dst, (v2f64 (vector_shuffle
+                                                  VR128:$src1, VR128:$src2,
+                                                  SHUFP_shuffle_mask:$src3)))]>;
+  def SHUFPDrmi : PDIi8<0xC6, MRMSrcMem, 
+                        (ops VR128:$dst, VR128:$src1,
+                         f128mem:$src2, i8imm:$src3),
+                        "shufpd {$src3, $src2, $dst|$dst, $src2, $src3}",
+                        [(set VR128:$dst,
+                          (v2f64 (vector_shuffle
+                                  VR128:$src1, (load addr:$src2),
+                                  SHUFP_shuffle_mask:$src3)))]>;
+
+  let AddedComplexity = 10 in {
+    def UNPCKHPDrr : PDI<0x15, MRMSrcReg, 
+                         (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                         "unpckhpd {$src2, $dst|$dst, $src2}",
+                         [(set VR128:$dst,
+                           (v2f64 (vector_shuffle
+                                   VR128:$src1, VR128:$src2,
+                                   UNPCKH_shuffle_mask)))]>;
+    def UNPCKHPDrm : PDI<0x15, MRMSrcMem, 
+                         (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                         "unpckhpd {$src2, $dst|$dst, $src2}",
+                         [(set VR128:$dst,
+                           (v2f64 (vector_shuffle
+                                   VR128:$src1, (load addr:$src2),
+                                   UNPCKH_shuffle_mask)))]>;
+
+    def UNPCKLPDrr : PDI<0x14, MRMSrcReg, 
+                         (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                         "unpcklpd {$src2, $dst|$dst, $src2}",
+                         [(set VR128:$dst,
+                           (v2f64 (vector_shuffle
+                                   VR128:$src1, VR128:$src2,
+                                   UNPCKL_shuffle_mask)))]>;
+    def UNPCKLPDrm : PDI<0x14, MRMSrcMem, 
+                         (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                         "unpcklpd {$src2, $dst|$dst, $src2}",
+                         [(set VR128:$dst,
+                           (v2f64 (vector_shuffle
+                                   VR128:$src1, (load addr:$src2),
+                                   UNPCKL_shuffle_mask)))]>;
+  } // AddedComplexity
+} // isTwoAddress
+
+
+//===----------------------------------------------------------------------===//
+// SSE integer instructions
+
+// Move Instructions
+def MOVDQArr : PDI<0x6F, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                   "movdqa {$src, $dst|$dst, $src}", []>;
+def MOVDQArm : PDI<0x6F, MRMSrcMem, (ops VR128:$dst, i128mem:$src),
+                   "movdqa {$src, $dst|$dst, $src}",
+                   [(set VR128:$dst, (loadv2i64 addr:$src))]>;
+def MOVDQAmr : PDI<0x7F, MRMDestMem, (ops i128mem:$dst, VR128:$src),
+                   "movdqa {$src, $dst|$dst, $src}",
+                   [(store (v2i64 VR128:$src), addr:$dst)]>;
+def MOVDQUrm :   I<0x6F, MRMSrcMem, (ops VR128:$dst, i128mem:$src),
+                   "movdqu {$src, $dst|$dst, $src}",
+                   [(set VR128:$dst, (int_x86_sse2_loadu_dq addr:$src))]>,
+                 XS, Requires<[HasSSE2]>;
+def MOVDQUmr :   I<0x7F, MRMDestMem, (ops i128mem:$dst, VR128:$src),
+                   "movdqu {$src, $dst|$dst, $src}",
+                   [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)]>,
+                 XS, Requires<[HasSSE2]>;
+
+
+let isTwoAddress = 1 in {
+
+multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+                            bit Commutable = 0> {
+  def rr : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+               !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]> {
+    let isCommutable = Commutable;
+  }
+  def rm : PDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+               !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (IntId VR128:$src1,
+                                        (bitconvert (loadv2i64 addr:$src2))))]>;
+}
+
+multiclass PDI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
+                             string OpcodeStr, Intrinsic IntId> {
+  def rr : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+               !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]>;
+  def rm : PDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+               !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (IntId VR128:$src1,
+                                        (bitconvert (loadv2i64 addr:$src2))))]>;
+  def ri : PDIi8<opc2, ImmForm, (ops VR128:$dst, VR128:$src1, i32i8imm:$src2),
+               !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (IntId VR128:$src1,
+                                        (scalar_to_vector (i32 imm:$src2))))]>;
+}
+
+
+/// PDI_binop_rm - Simple SSE2 binary operator.
+multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                        ValueType OpVT, bit Commutable = 0> {
+  def rr : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+               !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (OpVT (OpNode VR128:$src1, VR128:$src2)))]> {
+    let isCommutable = Commutable;
+  }
+  def rm : PDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+               !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (OpVT (OpNode VR128:$src1,
+                                       (bitconvert (loadv2i64 addr:$src2)))))]>;
+}
+
+/// PDI_binop_rm_v2i64 - Simple SSE2 binary operator whose type is v2i64.
+///
+/// FIXME: we could eliminate this and use PDI_binop_rm instead if tblgen knew
+/// to collapse (bitconvert VT to VT) into its operand.
+///
+multiclass PDI_binop_rm_v2i64<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                              bit Commutable = 0> {
+  def rr : PDI<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+               !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))]> {
+    let isCommutable = Commutable;
+  }
+  def rm : PDI<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+               !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (OpNode VR128:$src1,(loadv2i64 addr:$src2)))]>;
+}
+
+} // isTwoAddress
+
+// 128-bit Integer Arithmetic
+
+defm PADDB : PDI_binop_rm<0xFC, "paddb", add, v16i8, 1>;
+defm PADDW : PDI_binop_rm<0xFD, "paddw", add, v8i16, 1>;
+defm PADDD : PDI_binop_rm<0xFE, "paddd", add, v4i32, 1>;
+defm PADDQ : PDI_binop_rm_v2i64<0xD4, "paddq", add, 1>;
+
+defm PADDSB  : PDI_binop_rm_int<0xEC, "paddsb" , int_x86_sse2_padds_b, 1>;
+defm PADDSW  : PDI_binop_rm_int<0xED, "paddsw" , int_x86_sse2_padds_w, 1>;
+defm PADDUSB : PDI_binop_rm_int<0xDC, "paddusb", int_x86_sse2_paddus_b, 1>;
+defm PADDUSW : PDI_binop_rm_int<0xDD, "paddusw", int_x86_sse2_paddus_w, 1>;
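+
+// Expansion note: each of the defm's above and below expands to an rr and an
+// rm form, e.g. PADDBrr / PADDBrm (matching a v16i8 add) from PDI_binop_rm
+// and PADDSBrr / PADDSBrm (selecting int_x86_sse2_padds_b) from
+// PDI_binop_rm_int.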
+
+defm PSUBB : PDI_binop_rm<0xF8, "psubb", sub, v16i8>;
+defm PSUBW : PDI_binop_rm<0xF9, "psubw", sub, v8i16>;
+defm PSUBD : PDI_binop_rm<0xFA, "psubd", sub, v4i32>;
+defm PSUBQ : PDI_binop_rm_v2i64<0xFB, "psubq", sub>;
+
+defm PSUBSB  : PDI_binop_rm_int<0xE8, "psubsb" , int_x86_sse2_psubs_b>;
+defm PSUBSW  : PDI_binop_rm_int<0xE9, "psubsw" , int_x86_sse2_psubs_w>;
+defm PSUBUSB : PDI_binop_rm_int<0xD8, "psubusb", int_x86_sse2_psubus_b>;
+defm PSUBUSW : PDI_binop_rm_int<0xD9, "psubusw", int_x86_sse2_psubus_w>;
+
+defm PMULLW : PDI_binop_rm<0xD5, "pmullw", mul, v8i16, 1>;
+
+defm PMULHUW : PDI_binop_rm_int<0xE4, "pmulhuw", int_x86_sse2_pmulhu_w, 1>;
+defm PMULHW  : PDI_binop_rm_int<0xE5, "pmulhw" , int_x86_sse2_pmulh_w , 1>;
+defm PMULUDQ : PDI_binop_rm_int<0xF4, "pmuludq", int_x86_sse2_pmulu_dq, 1>;
+
+defm PMADDWD : PDI_binop_rm_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd, 1>;
+
+defm PAVGB  : PDI_binop_rm_int<0xE0, "pavgb", int_x86_sse2_pavg_b, 1>;
+defm PAVGW  : PDI_binop_rm_int<0xE3, "pavgw", int_x86_sse2_pavg_w, 1>;
+
+
+defm PMINUB : PDI_binop_rm_int<0xDA, "pminub", int_x86_sse2_pminu_b, 1>;
+defm PMINSW : PDI_binop_rm_int<0xEA, "pminsw", int_x86_sse2_pmins_w, 1>;
+defm PMAXUB : PDI_binop_rm_int<0xDE, "pmaxub", int_x86_sse2_pmaxu_b, 1>;
+defm PMAXSW : PDI_binop_rm_int<0xEE, "pmaxsw", int_x86_sse2_pmaxs_w, 1>;
+defm PSADBW : PDI_binop_rm_int<0xF6, "psadbw", int_x86_sse2_psad_bw, 1>;
+
+
+defm PSLLW : PDI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw", int_x86_sse2_psll_w>;
+defm PSLLD : PDI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld", int_x86_sse2_psll_d>;
+defm PSLLQ : PDI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq", int_x86_sse2_psll_q>;
+
+defm PSRLW : PDI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw", int_x86_sse2_psrl_w>;
+defm PSRLD : PDI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld", int_x86_sse2_psrl_d>;
+defm PSRLQ : PDI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq", int_x86_sse2_psrl_q>;
+
+defm PSRAW : PDI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw", int_x86_sse2_psra_w>;
+defm PSRAD : PDI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad", int_x86_sse2_psra_d>;
+// PSRAQ doesn't exist in SSE[1-3].
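+
+// Expansion note: each PDI_binop_rmi_int defm above produces three forms,
+// e.g. PSLLWrr (shift amount in an XMM register), PSLLWrm (shift amount
+// loaded from memory) and PSLLWri (immediate shift amount).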
+
+// 128-bit logical shifts.
+let isTwoAddress = 1 in {
+  def PSLLDQri : PDIi8<0x73, MRM7r,
+                       (ops VR128:$dst, VR128:$src1, i32i8imm:$src2),
+                       "pslldq {$src2, $dst|$dst, $src2}", []>;
+  def PSRLDQri : PDIi8<0x73, MRM3r,
+                       (ops VR128:$dst, VR128:$src1, i32i8imm:$src2),
+                       "psrldq {$src2, $dst|$dst, $src2}", []>;
+  // PSRADQri doesn't exist in SSE[1-3].
+}
+
+let Predicates = [HasSSE2] in {
+  def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
+            (v2i64 (PSLLDQri VR128:$src1, (PSxLDQ_imm imm:$src2)))>;
+  def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
+            (v2i64 (PSRLDQri VR128:$src1, (PSxLDQ_imm imm:$src2)))>;
+  def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
+            (v2f64 (PSRLDQri VR128:$src1, (PSxLDQ_imm imm:$src2)))>;
+}
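+
+// Note: PSxLDQ_imm is presumably the immediate transform defined earlier in
+// this file that rescales the shift amount, since pslldq / psrldq shift by
+// bytes rather than bits.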
+
+// Logical
+defm PAND : PDI_binop_rm_v2i64<0xDB, "pand", and, 1>;
+defm POR  : PDI_binop_rm_v2i64<0xEB, "por" , or , 1>;
+defm PXOR : PDI_binop_rm_v2i64<0xEF, "pxor", xor, 1>;
+
+let isTwoAddress = 1 in {
+  def PANDNrr : PDI<0xDF, MRMSrcReg,
+                    (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                    "pandn {$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1),
+                                              VR128:$src2)))]>;
+
+  def PANDNrm : PDI<0xDF, MRMSrcMem,
+                    (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+                    "pandn {$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1),
+                                              (load addr:$src2))))]>;
+}
+
+// SSE2 Integer comparison
+defm PCMPEQB  : PDI_binop_rm_int<0x74, "pcmpeqb", int_x86_sse2_pcmpeq_b>;
+defm PCMPEQW  : PDI_binop_rm_int<0x75, "pcmpeqw", int_x86_sse2_pcmpeq_w>;
+defm PCMPEQD  : PDI_binop_rm_int<0x76, "pcmpeqd", int_x86_sse2_pcmpeq_d>;
+defm PCMPGTB  : PDI_binop_rm_int<0x64, "pcmpgtb", int_x86_sse2_pcmpgt_b>;
+defm PCMPGTW  : PDI_binop_rm_int<0x65, "pcmpgtw", int_x86_sse2_pcmpgt_w>;
+defm PCMPGTD  : PDI_binop_rm_int<0x66, "pcmpgtd", int_x86_sse2_pcmpgt_d>;
+
+// Pack instructions
+defm PACKSSWB : PDI_binop_rm_int<0x63, "packsswb", int_x86_sse2_packsswb_128>;
+defm PACKSSDW : PDI_binop_rm_int<0x6B, "packssdw", int_x86_sse2_packssdw_128>;
+defm PACKUSWB : PDI_binop_rm_int<0x67, "packuswb", int_x86_sse2_packuswb_128>;
+
+// Shuffle and unpack instructions
+def PSHUFDri : PDIi8<0x70, MRMSrcReg,
+                     (ops VR128:$dst, VR128:$src1, i8imm:$src2),
+                     "pshufd {$src2, $src1, $dst|$dst, $src1, $src2}",
+                     [(set VR128:$dst, (v4i32 (vector_shuffle
+                                               VR128:$src1, (undef),
+                                               PSHUFD_shuffle_mask:$src2)))]>;
+def PSHUFDmi : PDIi8<0x70, MRMSrcMem,
+                     (ops VR128:$dst, i128mem:$src1, i8imm:$src2),
+                     "pshufd {$src2, $src1, $dst|$dst, $src1, $src2}",
+                     [(set VR128:$dst, (v4i32 (vector_shuffle
+                                               (bc_v4i32(loadv2i64 addr:$src1)),
+                                               (undef),
+                                               PSHUFD_shuffle_mask:$src2)))]>;
+
+// SSE2 with ImmT == Imm8 and XS prefix.
+def PSHUFHWri : Ii8<0x70, MRMSrcReg,
+                    (ops VR128:$dst, VR128:$src1, i8imm:$src2),
+                    "pshufhw {$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set VR128:$dst, (v8i16 (vector_shuffle
+                                              VR128:$src1, (undef),
+                                              PSHUFHW_shuffle_mask:$src2)))]>,
+                XS, Requires<[HasSSE2]>;
+def PSHUFHWmi : Ii8<0x70, MRMSrcMem,
+                    (ops VR128:$dst, i128mem:$src1, i8imm:$src2),
+                    "pshufhw {$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set VR128:$dst, (v8i16 (vector_shuffle
+                                              (bc_v8i16 (loadv2i64 addr:$src1)),
+                                              (undef),
+                                              PSHUFHW_shuffle_mask:$src2)))]>,
+                XS, Requires<[HasSSE2]>;
+
+// SSE2 with ImmT == Imm8 and XD prefix.
+def PSHUFLWri : Ii8<0x70, MRMSrcReg,
+                    (ops VR128:$dst, VR128:$src1, i32i8imm:$src2),
+                    "pshuflw {$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set VR128:$dst, (v8i16 (vector_shuffle
+                                              VR128:$src1, (undef),
+                                              PSHUFLW_shuffle_mask:$src2)))]>,
+                XD, Requires<[HasSSE2]>;
+def PSHUFLWmi : Ii8<0x70, MRMSrcMem,
+                    (ops VR128:$dst, i128mem:$src1, i32i8imm:$src2),
+                    "pshuflw {$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set VR128:$dst, (v8i16 (vector_shuffle
+                                              (bc_v8i16 (loadv2i64 addr:$src1)),
+                                              (undef),
+                                              PSHUFLW_shuffle_mask:$src2)))]>,
+                XD, Requires<[HasSSE2]>;
+
+
+let isTwoAddress = 1 in {
+  def PUNPCKLBWrr : PDI<0x60, MRMSrcReg, 
+                        (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                        "punpcklbw {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v16i8 (vector_shuffle VR128:$src1, VR128:$src2,
+                                  UNPCKL_shuffle_mask)))]>;
+  def PUNPCKLBWrm : PDI<0x60, MRMSrcMem, 
+                        (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+                        "punpcklbw {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v16i8 (vector_shuffle VR128:$src1,
+                                  (bc_v16i8 (loadv2i64 addr:$src2)),
+                                  UNPCKL_shuffle_mask)))]>;
+  def PUNPCKLWDrr : PDI<0x61, MRMSrcReg, 
+                        (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                        "punpcklwd {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v8i16 (vector_shuffle VR128:$src1, VR128:$src2,
+                                  UNPCKL_shuffle_mask)))]>;
+  def PUNPCKLWDrm : PDI<0x61, MRMSrcMem, 
+                        (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+                        "punpcklwd {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v8i16 (vector_shuffle VR128:$src1,
+                                  (bc_v8i16 (loadv2i64 addr:$src2)),
+                                  UNPCKL_shuffle_mask)))]>;
+  def PUNPCKLDQrr : PDI<0x62, MRMSrcReg, 
+                        (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                        "punpckldq {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v4i32 (vector_shuffle VR128:$src1, VR128:$src2,
+                                  UNPCKL_shuffle_mask)))]>;
+  def PUNPCKLDQrm : PDI<0x62, MRMSrcMem, 
+                        (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+                        "punpckldq {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v4i32 (vector_shuffle VR128:$src1,
+                                  (bc_v4i32 (loadv2i64 addr:$src2)),
+                                  UNPCKL_shuffle_mask)))]>;
+  def PUNPCKLQDQrr : PDI<0x6C, MRMSrcReg, 
+                         (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                         "punpcklqdq {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v2i64 (vector_shuffle VR128:$src1, VR128:$src2,
+                                  UNPCKL_shuffle_mask)))]>;
+  def PUNPCKLQDQrm : PDI<0x6C, MRMSrcMem, 
+                         (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+                         "punpcklqdq {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v2i64 (vector_shuffle VR128:$src1,
+                                  (loadv2i64 addr:$src2),
+                                  UNPCKL_shuffle_mask)))]>;
+  
+  def PUNPCKHBWrr : PDI<0x68, MRMSrcReg, 
+                        (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                        "punpckhbw {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v16i8 (vector_shuffle VR128:$src1, VR128:$src2,
+                                  UNPCKH_shuffle_mask)))]>;
+  def PUNPCKHBWrm : PDI<0x68, MRMSrcMem, 
+                        (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+                        "punpckhbw {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v16i8 (vector_shuffle VR128:$src1,
+                                  (bc_v16i8 (loadv2i64 addr:$src2)),
+                                  UNPCKH_shuffle_mask)))]>;
+  def PUNPCKHWDrr : PDI<0x69, MRMSrcReg, 
+                        (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                        "punpckhwd {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v8i16 (vector_shuffle VR128:$src1, VR128:$src2,
+                                  UNPCKH_shuffle_mask)))]>;
+  def PUNPCKHWDrm : PDI<0x69, MRMSrcMem, 
+                        (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+                        "punpckhwd {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v8i16 (vector_shuffle VR128:$src1,
+                                  (bc_v8i16 (loadv2i64 addr:$src2)),
+                                  UNPCKH_shuffle_mask)))]>;
+  def PUNPCKHDQrr : PDI<0x6A, MRMSrcReg, 
+                        (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                        "punpckhdq {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v4i32 (vector_shuffle VR128:$src1, VR128:$src2,
+                                  UNPCKH_shuffle_mask)))]>;
+  def PUNPCKHDQrm : PDI<0x6A, MRMSrcMem, 
+                        (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+                        "punpckhdq {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v4i32 (vector_shuffle VR128:$src1,
+                                  (bc_v4i32 (loadv2i64 addr:$src2)),
+                                  UNPCKH_shuffle_mask)))]>;
+  def PUNPCKHQDQrr : PDI<0x6D, MRMSrcReg, 
+                         (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                         "punpckhqdq {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v2i64 (vector_shuffle VR128:$src1, VR128:$src2,
+                                  UNPCKH_shuffle_mask)))]>;
+  def PUNPCKHQDQrm : PDI<0x6D, MRMSrcMem, 
+                        (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+                        "punpckhqdq {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v2i64 (vector_shuffle VR128:$src1,
+                                  (loadv2i64 addr:$src2),
+                                  UNPCKH_shuffle_mask)))]>;
+}
+
+// Extract / Insert
+def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
+                    (ops GR32:$dst, VR128:$src1, i32i8imm:$src2),
+                    "pextrw {$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1),
+                                     (iPTR imm:$src2)))]>;
+let isTwoAddress = 1 in {
+  def PINSRWrri : PDIi8<0xC4, MRMSrcReg,
+                       (ops VR128:$dst, VR128:$src1,
+                        GR32:$src2, i32i8imm:$src3),
+                       "pinsrw {$src3, $src2, $dst|$dst, $src2, $src3}",
+                       [(set VR128:$dst,
+                         (v8i16 (X86pinsrw (v8i16 VR128:$src1),
+                                 GR32:$src2, (iPTR imm:$src3))))]>;
+  def PINSRWrmi : PDIi8<0xC4, MRMSrcMem,
+                       (ops VR128:$dst, VR128:$src1,
+                        i16mem:$src2, i32i8imm:$src3),
+                       "pinsrw {$src3, $src2, $dst|$dst, $src2, $src3}",
+                       [(set VR128:$dst,
+                         (v8i16 (X86pinsrw (v8i16 VR128:$src1),
+                                 (i32 (anyext (loadi16 addr:$src2))),
+                                 (iPTR imm:$src3))))]>;
+}
+
+// Mask creation
+def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (ops GR32:$dst, VR128:$src),
+                     "pmovmskb {$src, $dst|$dst, $src}",
+                     [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>;
+
+// Conditional store
+def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (ops VR128:$src, VR128:$mask),
+                     "maskmovdqu {$mask, $src|$src, $mask}",
+                     [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>,
+                 Imp<[EDI],[]>;
+
+// Non-temporal stores
+def MOVNTPDmr : PDI<0x2B, MRMDestMem, (ops f128mem:$dst, VR128:$src),
+                    "movntpd {$src, $dst|$dst, $src}",
+                    [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>;
+def MOVNTDQmr : PDI<0xE7, MRMDestMem, (ops i128mem:$dst, VR128:$src),
+                    "movntdq {$src, $dst|$dst, $src}",
+                    [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>;
+def MOVNTImr  :   I<0xC3, MRMDestMem, (ops i32mem:$dst, GR32:$src),
+                    "movnti {$src, $dst|$dst, $src}",
+                    [(int_x86_sse2_movnt_i addr:$dst, GR32:$src)]>, 
+                  TB, Requires<[HasSSE2]>;
+
+// Flush cache
+def CLFLUSH : I<0xAE, MRM7m, (ops i8mem:$src),
+               "clflush $src", [(int_x86_sse2_clflush addr:$src)]>,
+              TB, Requires<[HasSSE2]>;
+
+// Load, store, and memory fence
+def LFENCE : I<0xAE, MRM5m, (ops),
+               "lfence", [(int_x86_sse2_lfence)]>, TB, Requires<[HasSSE2]>;
+def MFENCE : I<0xAE, MRM6m, (ops),
+               "mfence", [(int_x86_sse2_mfence)]>, TB, Requires<[HasSSE2]>;
+
+
+// Alias instruction that maps an all-ones vector to pcmpeqd for SSE.
+// FIXME: remove when we can teach regalloc that pcmpeqd reg, reg is ok.
+let isReMaterializable = 1 in
+  def V_SETALLONES : PDI<0x76, MRMInitReg, (ops VR128:$dst),
+                         "pcmpeqd $dst, $dst",
+                         [(set VR128:$dst, (v2f64 immAllOnesV))]>;
+
+// FR64 to 128-bit vector conversion.
+def MOVSD2PDrr : SDI<0x10, MRMSrcReg, (ops VR128:$dst, FR64:$src),
+                      "movsd {$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v2f64 (scalar_to_vector FR64:$src)))]>;
+def MOVSD2PDrm : SDI<0x10, MRMSrcMem, (ops VR128:$dst, f64mem:$src),
+                     "movsd {$src, $dst|$dst, $src}",
+                     [(set VR128:$dst, 
+                       (v2f64 (scalar_to_vector (loadf64 addr:$src))))]>;
+
+def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (ops VR128:$dst, GR32:$src),
+                      "movd {$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v4i32 (scalar_to_vector GR32:$src)))]>;
+def MOVDI2PDIrm : PDI<0x6E, MRMSrcMem, (ops VR128:$dst, i32mem:$src),
+                      "movd {$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>;
+
+def MOVDI2SSrr  : PDI<0x6E, MRMSrcReg, (ops FR32:$dst, GR32:$src),
+                      "movd {$src, $dst|$dst, $src}",
+                      [(set FR32:$dst, (bitconvert GR32:$src))]>;
+
+def MOVDI2SSrm  : PDI<0x6E, MRMSrcMem, (ops FR32:$dst, i32mem:$src),
+                      "movd {$src, $dst|$dst, $src}",
+                      [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>;
+
+// SSE2 instructions with XS prefix
+def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (ops VR128:$dst, i64mem:$src),
+                    "movq {$src, $dst|$dst, $src}",
+                    [(set VR128:$dst,
+                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
+                  Requires<[HasSSE2]>;
+def MOVPQI2QImr : PDI<0xD6, MRMDestMem, (ops i64mem:$dst, VR128:$src),
+                      "movq {$src, $dst|$dst, $src}",
+                      [(store (i64 (vector_extract (v2i64 VR128:$src),
+                                    (iPTR 0))), addr:$dst)]>;
+
+// FIXME: may not be able to eliminate this movsd with coalescing, since the
+// src and dest register classes are different. We really want to write this
+// pattern like this:
+// def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
+//           (f64 FR64:$src)>;
+def MOVPD2SDrr : SDI<0x10, MRMSrcReg, (ops FR64:$dst, VR128:$src),
+                     "movsd {$src, $dst|$dst, $src}",
+                     [(set FR64:$dst, (vector_extract (v2f64 VR128:$src),
+                                       (iPTR 0)))]>;
+def MOVPD2SDmr : SDI<0x11, MRMDestMem, (ops f64mem:$dst, VR128:$src),
+                     "movsd {$src, $dst|$dst, $src}",
+                     [(store (f64 (vector_extract (v2f64 VR128:$src),
+                                   (iPTR 0))), addr:$dst)]>;
+def MOVPDI2DIrr  : PDI<0x7E, MRMDestReg, (ops GR32:$dst, VR128:$src),
+                       "movd {$src, $dst|$dst, $src}",
+                       [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
+                                        (iPTR 0)))]>;
+def MOVPDI2DImr  : PDI<0x7E, MRMDestMem, (ops i32mem:$dst, VR128:$src),
+                       "movd {$src, $dst|$dst, $src}",
+                       [(store (i32 (vector_extract (v4i32 VR128:$src),
+                                     (iPTR 0))), addr:$dst)]>;
+
+def MOVSS2DIrr  : PDI<0x7E, MRMDestReg, (ops GR32:$dst, FR32:$src),
+                      "movd {$src, $dst|$dst, $src}",
+                      [(set GR32:$dst, (bitconvert FR32:$src))]>;
+def MOVSS2DImr  : PDI<0x7E, MRMDestMem, (ops i32mem:$dst, FR32:$src),
+                      "movd {$src, $dst|$dst, $src}",
+                      [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>;
+
+
+// Move to lower bits of a VR128, leaving upper bits alone.
+// Three operand (but two address) aliases.
+let isTwoAddress = 1 in {
+  def MOVLSD2PDrr : SDI<0x10, MRMSrcReg,
+                        (ops VR128:$dst, VR128:$src1, FR64:$src2),
+                        "movsd {$src2, $dst|$dst, $src2}", []>;
+
+  let AddedComplexity = 15 in
+    def MOVLPDrr : SDI<0x10, MRMSrcReg,
+                       (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                       "movsd {$src2, $dst|$dst, $src2}",
+                       [(set VR128:$dst,
+                         (v2f64 (vector_shuffle VR128:$src1, VR128:$src2,
+                                 MOVL_shuffle_mask)))]>;
+}
+
+// Store / copy lower 64-bits of a XMM register.
+def MOVLQ128mr : PDI<0xD6, MRMDestMem, (ops i64mem:$dst, VR128:$src),
+                     "movq {$src, $dst|$dst, $src}",
+                     [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>;
+
+// Move to lower bits of a VR128, zeroing the upper bits.
+// Loading from memory automatically zeroes the upper bits.
+let AddedComplexity = 20 in
+  def MOVZSD2PDrm : SDI<0x10, MRMSrcMem, (ops VR128:$dst, f64mem:$src),
+                        "movsd {$src, $dst|$dst, $src}",
+                        [(set VR128:$dst,
+                          (v2f64 (vector_shuffle immAllZerosV,
+                                  (v2f64 (scalar_to_vector
+                                          (loadf64 addr:$src))),
+                                  MOVL_shuffle_mask)))]>;
+
+let AddedComplexity = 15 in
+// movd / movq to XMM register zero-extends
+def MOVZDI2PDIrr : PDI<0x6E, MRMSrcReg, (ops VR128:$dst, GR32:$src),
+                       "movd {$src, $dst|$dst, $src}",
+                       [(set VR128:$dst,
+                         (v4i32 (vector_shuffle immAllZerosV,
+                                 (v4i32 (scalar_to_vector GR32:$src)),
+                                 MOVL_shuffle_mask)))]>;
+let AddedComplexity = 20 in
+def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (ops VR128:$dst, i32mem:$src),
+                       "movd {$src, $dst|$dst, $src}",
+                       [(set VR128:$dst,
+                         (v4i32 (vector_shuffle immAllZerosV,
+                                 (v4i32 (scalar_to_vector (loadi32 addr:$src))),
+                                 MOVL_shuffle_mask)))]>;
+
+// Moving from XMM to XMM while still clearing the upper 64 bits.
+let AddedComplexity = 15 in
+def MOVZQI2PQIrr : I<0x7E, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                     "movq {$src, $dst|$dst, $src}",
+                     [(set VR128:$dst, (int_x86_sse2_movl_dq VR128:$src))]>,
+                   XS, Requires<[HasSSE2]>;
+let AddedComplexity = 20 in
+def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (ops VR128:$dst, i64mem:$src),
+                     "movq {$src, $dst|$dst, $src}",
+                     [(set VR128:$dst, (int_x86_sse2_movl_dq
+                                        (bitconvert (loadv2i64 addr:$src))))]>,
+                   XS, Requires<[HasSSE2]>;
+
+
+//===----------------------------------------------------------------------===//
+// SSE3 Instructions
+//===----------------------------------------------------------------------===//
+
+// SSE3 Instruction Templates:
+// 
+//   S3I   - SSE3 instructions with TB and OpSize prefixes.
+//   S3SI  - SSE3 instructions with XS prefix.
+//   S3DI  - SSE3 instructions with XD prefix.
+
+class S3SI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : I<o, F, ops, asm, pattern>, XS, Requires<[HasSSE3]>;
+class S3DI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : I<o, F, ops, asm, pattern>, XD, Requires<[HasSSE3]>;
+class S3I<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : I<o, F, ops, asm, pattern>, TB, OpSize, Requires<[HasSSE3]>;
+
+// Move Instructions
+def MOVSHDUPrr : S3SI<0x16, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                      "movshdup {$src, $dst|$dst, $src}",
+                      [(set VR128:$dst, (v4f32 (vector_shuffle
+                                                VR128:$src, (undef),
+                                                MOVSHDUP_shuffle_mask)))]>;
+def MOVSHDUPrm : S3SI<0x16, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+                      "movshdup {$src, $dst|$dst, $src}",
+                      [(set VR128:$dst, (v4f32 (vector_shuffle
+                                                (loadv4f32 addr:$src), (undef),
+                                                MOVSHDUP_shuffle_mask)))]>;
+
+def MOVSLDUPrr : S3SI<0x12, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                      "movsldup {$src, $dst|$dst, $src}",
+                      [(set VR128:$dst, (v4f32 (vector_shuffle
+                                                VR128:$src, (undef),
+                                                MOVSLDUP_shuffle_mask)))]>;
+def MOVSLDUPrm : S3SI<0x12, MRMSrcMem, (ops VR128:$dst, f128mem:$src),
+                      "movsldup {$src, $dst|$dst, $src}",
+                      [(set VR128:$dst, (v4f32 (vector_shuffle
+                                                (loadv4f32 addr:$src), (undef),
+                                                MOVSLDUP_shuffle_mask)))]>;
+
+def MOVDDUPrr  : S3DI<0x12, MRMSrcReg, (ops VR128:$dst, VR128:$src),
+                      "movddup {$src, $dst|$dst, $src}",
+                      [(set VR128:$dst, (v2f64 (vector_shuffle
+                                                VR128:$src, (undef),
+                                                SSE_splat_lo_mask)))]>;
+def MOVDDUPrm  : S3DI<0x12, MRMSrcMem, (ops VR128:$dst, f64mem:$src),
+                      "movddup {$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v2f64 (vector_shuffle
+                                (scalar_to_vector (loadf64 addr:$src)),
+                                (undef),
+                                SSE_splat_lo_mask)))]>;
+
+// Arithmetic
+let isTwoAddress = 1 in {
+  def ADDSUBPSrr : S3DI<0xD0, MRMSrcReg,
+                        (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                        "addsubps {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst, (int_x86_sse3_addsub_ps VR128:$src1,
+                                           VR128:$src2))]>;
+  def ADDSUBPSrm : S3DI<0xD0, MRMSrcMem,
+                        (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                        "addsubps {$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst, (int_x86_sse3_addsub_ps VR128:$src1,
+                                           (load addr:$src2)))]>;
+  def ADDSUBPDrr : S3I<0xD0, MRMSrcReg,
+                       (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                       "addsubpd {$src2, $dst|$dst, $src2}",
+                       [(set VR128:$dst, (int_x86_sse3_addsub_pd VR128:$src1,
+                                          VR128:$src2))]>;
+  def ADDSUBPDrm : S3I<0xD0, MRMSrcMem,
+                       (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+                       "addsubpd {$src2, $dst|$dst, $src2}",
+                       [(set VR128:$dst, (int_x86_sse3_addsub_pd VR128:$src1,
+                                          (load addr:$src2)))]>;
+}
+
+def LDDQUrm : S3DI<0xF0, MRMSrcMem, (ops VR128:$dst, i128mem:$src),
+                   "lddqu {$src, $dst|$dst, $src}",
+                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>;
+
+// Horizontal ops
+class S3D_Intrr<bits<8> o, string OpcodeStr, Intrinsic IntId>
+  : S3DI<o, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+         !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+         [(set VR128:$dst, (v4f32 (IntId VR128:$src1, VR128:$src2)))]>;
+class S3D_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId>
+  : S3DI<o, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+         !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+         [(set VR128:$dst, (v4f32 (IntId VR128:$src1, (load addr:$src2))))]>;
+class S3_Intrr<bits<8> o, string OpcodeStr, Intrinsic IntId>
+  : S3I<o, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+        !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+        [(set VR128:$dst, (v2f64 (IntId VR128:$src1, VR128:$src2)))]>;
+class S3_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId>
+  : S3I<o, MRMSrcMem, (ops VR128:$dst, VR128:$src1, f128mem:$src2),
+        !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+        [(set VR128:$dst, (v2f64 (IntId VR128:$src1, (load addr:$src2))))]>;
+
+let isTwoAddress = 1 in {
+  def HADDPSrr : S3D_Intrr<0x7C, "haddps", int_x86_sse3_hadd_ps>;
+  def HADDPSrm : S3D_Intrm<0x7C, "haddps", int_x86_sse3_hadd_ps>;
+  def HADDPDrr : S3_Intrr <0x7C, "haddpd", int_x86_sse3_hadd_pd>;
+  def HADDPDrm : S3_Intrm <0x7C, "haddpd", int_x86_sse3_hadd_pd>;
+  def HSUBPSrr : S3D_Intrr<0x7D, "hsubps", int_x86_sse3_hsub_ps>;
+  def HSUBPSrm : S3D_Intrm<0x7D, "hsubps", int_x86_sse3_hsub_ps>;
+  def HSUBPDrr : S3_Intrr <0x7D, "hsubpd", int_x86_sse3_hsub_pd>;
+  def HSUBPDrm : S3_Intrm <0x7D, "hsubpd", int_x86_sse3_hsub_pd>;
+}
+
+// Thread synchronization
+def MONITOR : I<0xC8, RawFrm, (ops), "monitor",
+                [(int_x86_sse3_monitor EAX, ECX, EDX)]>,TB, Requires<[HasSSE3]>;
+def MWAIT   : I<0xC9, RawFrm, (ops), "mwait",
+                [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
+
+// vector_shuffle v1, <undef> <1, 1, 3, 3>
+let AddedComplexity = 15 in
+def : Pat<(v4i32 (vector_shuffle VR128:$src, (undef),
+                  MOVSHDUP_shuffle_mask)),
+          (MOVSHDUPrr VR128:$src)>, Requires<[HasSSE3]>;
+let AddedComplexity = 20 in
+def : Pat<(v4i32 (vector_shuffle (bc_v4i32 (loadv2i64 addr:$src)), (undef),
+                  MOVSHDUP_shuffle_mask)),
+          (MOVSHDUPrm addr:$src)>, Requires<[HasSSE3]>;
+
+// vector_shuffle v1, <undef> <0, 0, 2, 2>
+let AddedComplexity = 15 in
+  def : Pat<(v4i32 (vector_shuffle VR128:$src, (undef),
+                    MOVSLDUP_shuffle_mask)),
+            (MOVSLDUPrr VR128:$src)>, Requires<[HasSSE3]>;
+let AddedComplexity = 20 in
+  def : Pat<(v4i32 (vector_shuffle (bc_v4i32 (loadv2i64 addr:$src)), (undef),
+                    MOVSLDUP_shuffle_mask)),
+            (MOVSLDUPrm addr:$src)>, Requires<[HasSSE3]>;
+
+//===----------------------------------------------------------------------===//
+// SSSE3 Instructions
+//===----------------------------------------------------------------------===//
+
+// SSSE3 Instruction Templates:
+// 
+//   SS38I - SSSE3 instructions with T8 and OpSize prefixes.
+//   SS3AI - SSSE3 instructions with TA and OpSize prefixes.
+
+class SS38I<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : I<o, F, ops, asm, pattern>, T8, OpSize, Requires<[HasSSSE3]>;
+class SS3AI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : I<o, F, ops, asm, pattern>, TA, OpSize, Requires<[HasSSSE3]>;
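+// T8 and TA denote the three-byte opcode escapes 0F 38 and 0F 3A, so together
+// with the 66 prefix from OpSize these templates yield the standard SSSE3
+// encodings 66 0F 38 <op> and 66 0F 3A <op>.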
+
+/// SS3I_binop_rm_int - Simple SSSE3 binary operator whose type is v2i64.
+let isTwoAddress = 1 in {
+  multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+                               bit Commutable = 0> {
+    def rr : SS38I<opc, MRMSrcReg, (ops VR128:$dst, VR128:$src1, VR128:$src2),
+                   !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+                   [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]> {
+      let isCommutable = Commutable;
+    }
+    def rm : SS38I<opc, MRMSrcMem, (ops VR128:$dst, VR128:$src1, i128mem:$src2),
+                   !strconcat(OpcodeStr, " {$src2, $dst|$dst, $src2}"),
+                   [(set VR128:$dst,
+                     (IntId VR128:$src1,
+                      (bitconvert (loadv2i64 addr:$src2))))]>;
+  }
+}
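+// Each defm of this multiclass expands into an rr and an rm instruction; for
+// example, the PMULHRSW128 defm below produces PMULHRSW128rr and
+// PMULHRSW128rm.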
+
+defm PMULHRSW128 : SS3I_binop_rm_int<0x0B, "pmulhrsw",
+                                     int_x86_ssse3_pmulhrsw_128, 1>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// 128-bit vector undefs.
+def : Pat<(v2f64 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>;
+def : Pat<(v16i8 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>;
+def : Pat<(v8i16 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (undef)), (IMPLICIT_DEF_VR128)>, Requires<[HasSSE2]>;
+
+// 128-bit vector all zeros.
+def : Pat<(v16i8 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>;
+def : Pat<(v8i16 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>;
+def : Pat<(v2f64 immAllZerosV), (V_SET0)>, Requires<[HasSSE2]>;
+
+// 128-bit vector all ones.
+def : Pat<(v16i8 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>;
+def : Pat<(v8i16 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE2]>;
+def : Pat<(v4f32 immAllOnesV), (V_SETALLONES)>, Requires<[HasSSE1]>;
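+// V_SET0 and V_SETALLONES are the canonical all-zeros / all-ones XMM idioms
+// defined earlier in this file (typically a register self-xor and a self
+// pcmpeq), so these constant vectors are materialized without a load.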
+
+// Store 128-bit integer vector values.
+def : Pat<(store (v16i8 VR128:$src), addr:$dst),
+          (MOVDQAmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(store (v8i16 VR128:$src), addr:$dst),
+          (MOVDQAmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(store (v4i32 VR128:$src), addr:$dst),
+          (MOVDQAmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+
+// Scalar to v8i16 / v16i8. The source may be a GR32, but only the lower 8 or
+// 16 bits matter.
+def : Pat<(v8i16 (X86s2vec GR32:$src)), (MOVDI2PDIrr GR32:$src)>,
+      Requires<[HasSSE2]>;
+def : Pat<(v16i8 (X86s2vec GR32:$src)), (MOVDI2PDIrr GR32:$src)>,
+      Requires<[HasSSE2]>;
+
+// bit_convert
+let Predicates = [HasSSE2] in {
+  def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
+  def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
+  def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
+  def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
+  def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
+  def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
+  def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
+  def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
+  def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
+  def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
+  def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
+  def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
+  def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
+  def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
+  def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
+  def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
+  def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
+  def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
+  def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
+  def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
+  def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
+  def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
+  def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
+  def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
+  def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
+  def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
+  def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
+  def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
+  def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
+  def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
+}
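+// Since each of these maps a bitconvert to its operand unchanged, a bitcast
+// between any two 128-bit vector types is free: the value stays in the same
+// XMM register and no instruction is emitted.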
+
+// Move scalar to XMM, zero-extending the upper elements.
+// movd to an XMM register zero-extends.
+let AddedComplexity = 15 in {
+def : Pat<(v8i16 (vector_shuffle immAllZerosV,
+                  (v8i16 (X86s2vec GR32:$src)), MOVL_shuffle_mask)),
+          (MOVZDI2PDIrr GR32:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v16i8 (vector_shuffle immAllZerosV,
+                  (v16i8 (X86s2vec GR32:$src)), MOVL_shuffle_mask)),
+          (MOVZDI2PDIrr GR32:$src)>, Requires<[HasSSE2]>;
+// Zero a VR128, then do a MOVS{S|D} to the lower bits.
+def : Pat<(v2f64 (vector_shuffle immAllZerosV,
+                  (v2f64 (scalar_to_vector FR64:$src)), MOVL_shuffle_mask)),
+          (MOVLSD2PDrr (V_SET0), FR64:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v4f32 (vector_shuffle immAllZerosV,
+                  (v4f32 (scalar_to_vector FR32:$src)), MOVL_shuffle_mask)),
+          (MOVLSS2PSrr (V_SET0), FR32:$src)>, Requires<[HasSSE2]>;
+}
+
+// Splat v2f64 / v2i64
+let AddedComplexity = 10 in {
+def : Pat<(vector_shuffle (v2f64 VR128:$src), (undef), SSE_splat_lo_mask:$sm),
+          (UNPCKLPDrr VR128:$src, VR128:$src)>,   Requires<[HasSSE2]>;
+def : Pat<(vector_shuffle (v2f64 VR128:$src), (undef), UNPCKH_shuffle_mask:$sm),
+          (UNPCKHPDrr VR128:$src, VR128:$src)>,   Requires<[HasSSE2]>;
+def : Pat<(vector_shuffle (v2i64 VR128:$src), (undef), SSE_splat_lo_mask:$sm),
+          (PUNPCKLQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(vector_shuffle (v2i64 VR128:$src), (undef), UNPCKH_shuffle_mask:$sm),
+          (PUNPCKHQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+}
+
+// Splat v4f32
+def : Pat<(vector_shuffle (v4f32 VR128:$src), (undef), SSE_splat_mask:$sm),
+          (SHUFPSrri VR128:$src, VR128:$src, SSE_splat_mask:$sm)>,
+      Requires<[HasSSE1]>;
+
+// Special unary SHUFPSrri case.
+// FIXME: when we want non-two-address code, should we use PSHUFD instead?
+def : Pat<(vector_shuffle (v4f32 VR128:$src1), (undef),
+           SHUFP_unary_shuffle_mask:$sm),
+          (SHUFPSrri VR128:$src1, VR128:$src1, SHUFP_unary_shuffle_mask:$sm)>,
+      Requires<[HasSSE1]>;
+// Unary v4f32 shuffle with PSHUF* in order to fold a load.
+def : Pat<(vector_shuffle (loadv4f32 addr:$src1), (undef),
+           SHUFP_unary_shuffle_mask:$sm),
+          (PSHUFDmi addr:$src1, SHUFP_unary_shuffle_mask:$sm)>,
+      Requires<[HasSSE2]>;
+// Special binary v4i32 shuffle cases with SHUFPS.
+def : Pat<(vector_shuffle (v4i32 VR128:$src1), (v4i32 VR128:$src2),
+           PSHUFD_binary_shuffle_mask:$sm),
+          (SHUFPSrri VR128:$src1, VR128:$src2, PSHUFD_binary_shuffle_mask:$sm)>,
+           Requires<[HasSSE2]>;
+def : Pat<(vector_shuffle (v4i32 VR128:$src1),
+           (bc_v4i32 (loadv2i64 addr:$src2)), PSHUFD_binary_shuffle_mask:$sm),
+          (SHUFPSrmi VR128:$src1, addr:$src2, PSHUFD_binary_shuffle_mask:$sm)>,
+           Requires<[HasSSE2]>;
+
+// vector_shuffle v1, <undef>, <0, 0, 1, 1, ...>
+let AddedComplexity = 10 in {
+def : Pat<(v4f32 (vector_shuffle VR128:$src, (undef),
+                  UNPCKL_v_undef_shuffle_mask)),
+          (UNPCKLPSrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v16i8 (vector_shuffle VR128:$src, (undef),
+                  UNPCKL_v_undef_shuffle_mask)),
+          (PUNPCKLBWrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v8i16 (vector_shuffle VR128:$src, (undef),
+                  UNPCKL_v_undef_shuffle_mask)),
+          (PUNPCKLWDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 (vector_shuffle VR128:$src, (undef),
+                  UNPCKL_v_undef_shuffle_mask)),
+          (PUNPCKLDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE1]>;
+}
+
+// vector_shuffle v1, <undef>, <2, 2, 3, 3, ...>
+let AddedComplexity = 10 in {
+def : Pat<(v4f32 (vector_shuffle VR128:$src, (undef),
+                  UNPCKH_v_undef_shuffle_mask)),
+          (UNPCKHPSrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v16i8 (vector_shuffle VR128:$src, (undef),
+                  UNPCKH_v_undef_shuffle_mask)),
+          (PUNPCKHBWrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v8i16 (vector_shuffle VR128:$src, (undef),
+                  UNPCKH_v_undef_shuffle_mask)),
+          (PUNPCKHWDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 (vector_shuffle VR128:$src, (undef),
+                  UNPCKH_v_undef_shuffle_mask)),
+          (PUNPCKHDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE1]>;
+}
+
+let AddedComplexity = 15 in {
+// vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS
+def : Pat<(v4i32 (vector_shuffle VR128:$src1, VR128:$src2,
+                  MOVHP_shuffle_mask)),
+          (MOVLHPSrr VR128:$src1, VR128:$src2)>;
+
+// vector_shuffle v1, v2 <6, 7, 2, 3> using MOVHLPS
+def : Pat<(v4i32 (vector_shuffle VR128:$src1, VR128:$src2,
+                  MOVHLPS_shuffle_mask)),
+          (MOVHLPSrr VR128:$src1, VR128:$src2)>;
+
+// vector_shuffle v1, undef <2, ?, ?, ?> using MOVHLPS
+def : Pat<(v4f32 (vector_shuffle VR128:$src1, (undef),
+                  MOVHLPS_v_undef_shuffle_mask)),
+          (MOVHLPSrr VR128:$src1, VR128:$src1)>;
+def : Pat<(v4i32 (vector_shuffle VR128:$src1, (undef),
+                  MOVHLPS_v_undef_shuffle_mask)),
+          (MOVHLPSrr VR128:$src1, VR128:$src1)>;
+}
+
+let AddedComplexity = 20 in {
+// vector_shuffle v1, (load v2) <4, 5, 2, 3> using MOVLPS
+// vector_shuffle v1, (load v2) <0, 1, 4, 5> using MOVHPS
+def : Pat<(v4f32 (vector_shuffle VR128:$src1, (loadv4f32 addr:$src2),
+                  MOVLP_shuffle_mask)),
+          (MOVLPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE1]>;
+def : Pat<(v2f64 (vector_shuffle VR128:$src1, (loadv2f64 addr:$src2),
+                  MOVLP_shuffle_mask)),
+          (MOVLPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v4f32 (vector_shuffle VR128:$src1, (loadv4f32 addr:$src2),
+                  MOVHP_shuffle_mask)),
+          (MOVHPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE1]>;
+def : Pat<(v2f64 (vector_shuffle VR128:$src1, (loadv2f64 addr:$src2),
+                  MOVHP_shuffle_mask)),
+          (MOVHPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+
+def : Pat<(v4i32 (vector_shuffle VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)),
+                  MOVLP_shuffle_mask)),
+          (MOVLPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (vector_shuffle VR128:$src1, (loadv2i64 addr:$src2),
+                  MOVLP_shuffle_mask)),
+          (MOVLPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 (vector_shuffle VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)),
+                  MOVHP_shuffle_mask)),
+          (MOVHPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE1]>;
+def : Pat<(v2i64 (vector_shuffle VR128:$src1, (loadv2i64 addr:$src2),
+                  MOVHP_shuffle_mask)),
+          (MOVHPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+}
+
+let AddedComplexity = 15 in {
+// Setting the lowest element in the vector.
+def : Pat<(v4i32 (vector_shuffle VR128:$src1, VR128:$src2,
+                  MOVL_shuffle_mask)),
+          (MOVLPSrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (vector_shuffle VR128:$src1, VR128:$src2,
+                  MOVL_shuffle_mask)),
+          (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+
+// vector_shuffle v1, v2 <4, 5, 2, 3> using MOVLPDrr (movsd)
+def : Pat<(v4f32 (vector_shuffle VR128:$src1, VR128:$src2,
+                  MOVLP_shuffle_mask)),
+          (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 (vector_shuffle VR128:$src1, VR128:$src2,
+                  MOVLP_shuffle_mask)),
+          (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+}
+
+// Set lowest element and zero upper elements.
+let AddedComplexity = 20 in
+def : Pat<(bc_v2i64 (vector_shuffle immAllZerosV,
+                     (v2f64 (scalar_to_vector (loadf64 addr:$src))),
+                     MOVL_shuffle_mask)),
+          (MOVZQI2PQIrm addr:$src)>, Requires<[HasSSE2]>;
+
+// FIXME: Temporary workaround since 2-wide shuffle is broken.
+def : Pat<(int_x86_sse2_movs_d  VR128:$src1, VR128:$src2),
+          (v2f64 (MOVLPDrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_loadh_pd VR128:$src1, addr:$src2),
+          (v2f64 (MOVHPDrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_loadl_pd VR128:$src1, addr:$src2),
+          (v2f64 (MOVLPDrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_shuf_pd VR128:$src1, VR128:$src2, imm:$src3),
+          (v2f64 (SHUFPDrri VR128:$src1, VR128:$src2, imm:$src3))>,
+      Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_shuf_pd VR128:$src1, (load addr:$src2), imm:$src3),
+          (v2f64 (SHUFPDrmi VR128:$src1, addr:$src2, imm:$src3))>,
+      Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_unpckh_pd VR128:$src1, VR128:$src2),
+          (v2f64 (UNPCKHPDrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_unpckh_pd VR128:$src1, (load addr:$src2)),
+          (v2f64 (UNPCKHPDrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_unpckl_pd VR128:$src1, VR128:$src2),
+          (v2f64 (UNPCKLPDrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_unpckl_pd VR128:$src1, (load addr:$src2)),
+          (v2f64 (UNPCKLPDrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_punpckh_qdq VR128:$src1, VR128:$src2),
+          (v2i64 (PUNPCKHQDQrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_punpckh_qdq VR128:$src1, (load addr:$src2)),
+          (v2i64 (PUNPCKHQDQrm VR128:$src1, addr:$src2))>, Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_punpckl_qdq VR128:$src1, VR128:$src2),
+          (v2i64 (PUNPCKLQDQrr VR128:$src1, VR128:$src2))>, Requires<[HasSSE2]>;
+def : Pat<(int_x86_sse2_punpckl_qdq VR128:$src1, (load addr:$src2)),
+          (PUNPCKLQDQrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+
+// Some special case pandn patterns.
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))),
+                  VR128:$src2)),
+          (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))),
+                  VR128:$src2)),
+          (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))),
+                  VR128:$src2)),
+          (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))),
+                  (load addr:$src2))),
+          (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))),
+                  (load addr:$src2))),
+          (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))),
+                  (load addr:$src2))),
+          (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
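+// PANDN computes (~$src1) & $src2.  The DAG canonicalizes 'not' as an xor
+// with an all-ones vector, and that all-ones node can appear as any integer
+// vector type, hence one pattern per bitconverted immAllOnesV type above.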
+
+// Unaligned load
+def : Pat<(v4f32 (X86loadu addr:$src)), (MOVUPSrm addr:$src)>,
+      Requires<[HasSSE1]>;
diff --git a/lib/Target/X86/X86InstrX86-64.td b/lib/Target/X86/X86InstrX86-64.td
new file mode 100644
index 0000000..ac43846
--- /dev/null
+++ b/lib/Target/X86/X86InstrX86-64.td
@@ -0,0 +1,1165 @@
+//====- X86InstrX86-64.td - Describe the X86 Instruction Set ----*- C++ -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Evan Cheng and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86-64 instruction set, defining the instructions,
+// and properties of the instructions which are needed for code generation,
+// machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Operand Definitions...
+//
+
+// 64 bits, but only 32 bits are significant.
+def i64i32imm  : Operand<i64>;
+// 64 bits, but only 8 bits are significant.
+def i64i8imm   : Operand<i64>;
+
+def lea64mem : Operand<i64> {
+  let PrintMethod = "printi64mem";
+  let MIOperandInfo = (ops GR64, i8imm, GR64, i32imm);
+}
+
+def lea64_32mem : Operand<i32> {
+  let PrintMethod = "printlea64_32mem";
+  let MIOperandInfo = (ops GR32, i8imm, GR32, i32imm);
+}
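+// The four sub-operands of lea64mem / lea64_32mem follow the usual X86
+// address-mode layout: base register, scale immediate, index register,
+// displacement.  For example, an address such as [rbx + 4*rcx + 8] is carried
+// as base = RBX, scale = 4, index = RCX, disp = 8.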
+
+//===----------------------------------------------------------------------===//
+// Complex Pattern Definitions...
+//
+def lea64addr : ComplexPattern<i64, 4, "SelectLEAAddr",
+                               [add, mul, shl, or, frameindex, X86Wrapper],
+                               []>;
+
+//===----------------------------------------------------------------------===//
+// Instruction templates...
+//
+
+class RI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : I<o, F, ops, asm, pattern>, REX_W;
+class RIi8 <bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : Ii8<o, F, ops, asm, pattern>, REX_W;
+class RIi32 <bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : Ii32<o, F, ops, asm, pattern>, REX_W;
+
+class RIi64<bits<8> o, Format f, dag ops, string asm, list<dag> pattern>
+  : X86Inst<o, f, Imm64, ops, asm>, REX_W {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+
+class RSSI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : SSI<o, F, ops, asm, pattern>, REX_W;
+class RSDI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : SDI<o, F, ops, asm, pattern>, REX_W;
+class RPDI<bits<8> o, Format F, dag ops, string asm, list<dag> pattern>
+      : PDI<o, F, ops, asm, pattern>, REX_W;
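+// REX_W tags an instruction with the REX.W prefix bit, which widens the
+// operand size to 64 bits; these templates therefore reuse the existing
+// 32-bit formats and opcodes with 64-bit register classes.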
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments...
+//
+
+def i64immSExt32  : PatLeaf<(i64 imm), [{
+  // i64immSExt32 predicate - True if the 64-bit immediate fits in a 32-bit
+  // sign extended field.
+  return (int64_t)N->getValue() == (int32_t)N->getValue();
+}]>;
+
+def i64immZExt32  : PatLeaf<(i64 imm), [{
+  // i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit
+  // zero-extended field.
+  return (uint64_t)N->getValue() == (uint32_t)N->getValue();
+}]>;
+
+def i64immSExt8  : PatLeaf<(i64 imm), [{
+  // i64immSExt8 predicate - True if the 64-bit immediate fits in an 8-bit
+  // sign extended field.
+  return (int64_t)N->getValue() == (int8_t)N->getValue();
+}]>;
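+// As concrete examples: -1 and 0x7FFFFFFF satisfy i64immSExt32 (they survive
+// a truncate-to-i32 / sign-extend round trip), 0x00000000FFFFFFFF satisfies
+// i64immZExt32 but not i64immSExt32, and only values in [-128, 127] satisfy
+// i64immSExt8.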
+
+def sextloadi64i1  : PatFrag<(ops node:$ptr), (i64 (sextloadi1 node:$ptr))>;
+def sextloadi64i8  : PatFrag<(ops node:$ptr), (i64 (sextloadi8 node:$ptr))>;
+def sextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (sextloadi16 node:$ptr))>;
+def sextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (sextloadi32 node:$ptr))>;
+
+def zextloadi64i1  : PatFrag<(ops node:$ptr), (i64 (zextloadi1 node:$ptr))>;
+def zextloadi64i8  : PatFrag<(ops node:$ptr), (i64 (zextloadi8 node:$ptr))>;
+def zextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (zextloadi16 node:$ptr))>;
+def zextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (zextloadi32 node:$ptr))>;
+
+def extloadi64i1   : PatFrag<(ops node:$ptr), (i64 (extloadi1 node:$ptr))>;
+def extloadi64i8   : PatFrag<(ops node:$ptr), (i64 (extloadi8 node:$ptr))>;
+def extloadi64i16  : PatFrag<(ops node:$ptr), (i64 (extloadi16 node:$ptr))>;
+def extloadi64i32  : PatFrag<(ops node:$ptr), (i64 (extloadi32 node:$ptr))>;
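+// These fragments just give i64-typed names to the generic extending loads so
+// the 64-bit patterns below can state the source width directly (e.g.
+// sextloadi64i32 in MOVSX64rm32).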
+
+//===----------------------------------------------------------------------===//
+// Instruction list...
+//
+
+def IMPLICIT_DEF_GR64  : I<0, Pseudo, (ops GR64:$dst),
+                         "#IMPLICIT_DEF $dst",
+                         [(set GR64:$dst, (undef))]>;
+
+//===----------------------------------------------------------------------===//
+//  Call Instructions...
+//
+let isCall = 1, noResults = 1 in
+  // All calls clobber the non-callee saved registers...
+  let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+              FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
+              MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+              XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+              XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15] in {
+    def CALL64pcrel32 : I<0xE8, RawFrm, (ops i64imm:$dst, variable_ops),
+                          "call ${dst:call}", []>;
+    def CALL64r       : I<0xFF, MRM2r, (ops GR64:$dst, variable_ops),
+                          "call {*}$dst", [(X86call GR64:$dst)]>;
+    def CALL64m       : I<0xFF, MRM2m, (ops i64mem:$dst, variable_ops),
+                          "call {*}$dst", []>;
+  }
+
+// Branches
+let isBranch = 1, isTerminator = 1, noResults = 1, isBarrier = 1 in {
+  def JMP64r     : I<0xFF, MRM4r, (ops GR64:$dst), "jmp{q} {*}$dst",
+                     [(brind GR64:$dst)]>;
+  def JMP64m     : I<0xFF, MRM4m, (ops i64mem:$dst), "jmp{q} {*}$dst",
+                     [(brind (loadi64 addr:$dst))]>;
+}
+
+//===----------------------------------------------------------------------===//
+//  Miscellaneous Instructions...
+//
+def LEAVE64  : I<0xC9, RawFrm,
+                 (ops), "leave", []>, Imp<[RBP,RSP],[RBP,RSP]>;
+def POP64r   : I<0x58, AddRegFrm,
+                 (ops GR64:$reg), "pop{q} $reg", []>, Imp<[RSP],[RSP]>;
+def PUSH64r  : I<0x50, AddRegFrm,
+                 (ops GR64:$reg), "push{q} $reg", []>, Imp<[RSP],[RSP]>;
+
+def LEA64_32r : I<0x8D, MRMSrcMem,
+                  (ops GR32:$dst, lea64_32mem:$src),
+                  "lea{l} {$src|$dst}, {$dst|$src}",
+                  [(set GR32:$dst, lea32addr:$src)]>, Requires<[In64BitMode]>;
+
+def LEA64r   : RI<0x8D, MRMSrcMem, (ops GR64:$dst, lea64mem:$src),
+                  "lea{q} {$src|$dst}, {$dst|$src}",
+                  [(set GR64:$dst, lea64addr:$src)]>;
+
+let isTwoAddress = 1 in
+def BSWAP64r : RI<0xC8, AddRegFrm, (ops GR64:$dst, GR64:$src),
+                  "bswap{q} $dst", 
+                  [(set GR64:$dst, (bswap GR64:$src))]>, TB;
+// Exchange
+def XCHG64rr : RI<0x87, MRMDestReg, (ops GR64:$src1, GR64:$src2),
+                  "xchg{q} {$src2|$src1}, {$src1|$src2}", []>;
+def XCHG64mr : RI<0x87, MRMDestMem, (ops i64mem:$src1, GR64:$src2),
+                  "xchg{q} {$src2|$src1}, {$src1|$src2}", []>;
+def XCHG64rm : RI<0x87, MRMSrcMem, (ops GR64:$src1, i64mem:$src2),
+                  "xchg{q} {$src2|$src1}, {$src1|$src2}", []>;
+
+// Repeat string ops
+def REP_MOVSQ : RI<0xA5, RawFrm, (ops), "{rep;movsq|rep movsq}",
+                   [(X86rep_movs i64)]>,
+                Imp<[RCX,RDI,RSI], [RCX,RDI,RSI]>, REP;
+def REP_STOSQ : RI<0xAB, RawFrm, (ops), "{rep;stosq|rep stosq}",
+                   [(X86rep_stos i64)]>,
+                Imp<[RAX,RCX,RDI], [RCX,RDI]>, REP;
+
+//===----------------------------------------------------------------------===//
+//  Move Instructions...
+//
+
+def MOV64rr : RI<0x89, MRMDestReg, (ops GR64:$dst, GR64:$src),
+                 "mov{q} {$src, $dst|$dst, $src}", []>;
+
+def MOV64ri : RIi64<0xB8, AddRegFrm, (ops GR64:$dst, i64imm:$src),
+                    "movabs{q} {$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, imm:$src)]>;
+def MOV64ri32 : RIi32<0xC7, MRM0r, (ops GR64:$dst, i64i32imm:$src),
+                      "mov{q} {$src, $dst|$dst, $src}",
+                      [(set GR64:$dst, i64immSExt32:$src)]>;
+
+def MOV64rm : RI<0x8B, MRMSrcMem, (ops GR64:$dst, i64mem:$src),
+                 "mov{q} {$src, $dst|$dst, $src}",
+                 [(set GR64:$dst, (load addr:$src))]>;
+
+def MOV64mr : RI<0x89, MRMDestMem, (ops i64mem:$dst, GR64:$src),
+                 "mov{q} {$src, $dst|$dst, $src}",
+                 [(store GR64:$src, addr:$dst)]>;
+def MOV64mi32 : RIi32<0xC7, MRM0m, (ops i64mem:$dst, i64i32imm:$src),
+                      "mov{q} {$src, $dst|$dst, $src}",
+                      [(store i64immSExt32:$src, addr:$dst)]>;
+
+// Sign/Zero extenders
+
+def MOVSX64rr8 : RI<0xBE, MRMSrcReg, (ops GR64:$dst, GR8 :$src),
+                    "movs{bq|x} {$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sext GR8:$src))]>, TB;
+def MOVSX64rm8 : RI<0xBE, MRMSrcMem, (ops GR64:$dst, i8mem :$src),
+                    "movs{bq|x} {$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sextloadi64i8 addr:$src))]>, TB;
+def MOVSX64rr16: RI<0xBF, MRMSrcReg, (ops GR64:$dst, GR16:$src),
+                    "movs{wq|x} {$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sext GR16:$src))]>, TB;
+def MOVSX64rm16: RI<0xBF, MRMSrcMem, (ops GR64:$dst, i16mem:$src),
+                    "movs{wq|x} {$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sextloadi64i16 addr:$src))]>, TB;
+def MOVSX64rr32: RI<0x63, MRMSrcReg, (ops GR64:$dst, GR32:$src),
+                    "movs{lq|xd} {$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sext GR32:$src))]>;
+def MOVSX64rm32: RI<0x63, MRMSrcMem, (ops GR64:$dst, i32mem:$src),
+                    "movs{lq|xd} {$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sextloadi64i32 addr:$src))]>;
+
+def MOVZX64rr8 : RI<0xB6, MRMSrcReg, (ops GR64:$dst, GR8 :$src),
+                    "movz{bq|x} {$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (zext GR8:$src))]>, TB;
+def MOVZX64rm8 : RI<0xB6, MRMSrcMem, (ops GR64:$dst, i8mem :$src),
+                    "movz{bq|x} {$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (zextloadi64i8 addr:$src))]>, TB;
+def MOVZX64rr16: RI<0xB7, MRMSrcReg, (ops GR64:$dst, GR16:$src),
+                    "movz{wq|x} {$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (zext GR16:$src))]>, TB;
+def MOVZX64rm16: RI<0xB7, MRMSrcMem, (ops GR64:$dst, i16mem:$src),
+                    "movz{wq|x} {$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (zextloadi64i16 addr:$src))]>, TB;
+
+def CDQE : RI<0x98, RawFrm, (ops),
+             "{cltq|cdqe}", []>, Imp<[EAX],[RAX]>;     // RAX = signext(EAX)
+
+def CQO  : RI<0x99, RawFrm, (ops),
+              "{cqto|cqo}", []>, Imp<[RAX],[RAX,RDX]>; // RDX:RAX = signext(RAX)
+
+//===----------------------------------------------------------------------===//
+//  Arithmetic Instructions...
+//
+
+let isTwoAddress = 1 in {
+let isConvertibleToThreeAddress = 1 in {
+let isCommutable = 1 in
+def ADD64rr  : RI<0x01, MRMDestReg, (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                  "add{q} {$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (add GR64:$src1, GR64:$src2))]>;
+
+def ADD64ri32 : RIi32<0x81, MRM0r, (ops GR64:$dst, GR64:$src1, i64i32imm:$src2),
+                      "add{q} {$src2, $dst|$dst, $src2}",
+                      [(set GR64:$dst, (add GR64:$src1, i64immSExt32:$src2))]>;
+def ADD64ri8 : RIi8<0x83, MRM0r, (ops GR64:$dst, GR64:$src1, i64i8imm:$src2),
+                    "add{q} {$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (add GR64:$src1, i64immSExt8:$src2))]>;
+} // isConvertibleToThreeAddress
+
+def ADD64rm  : RI<0x03, MRMSrcMem, (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                  "add{q} {$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (add GR64:$src1, (load addr:$src2)))]>;
+} // isTwoAddress
+
+def ADD64mr  : RI<0x01, MRMDestMem, (ops i64mem:$dst, GR64:$src2),
+                  "add{q} {$src2, $dst|$dst, $src2}",
+                  [(store (add (load addr:$dst), GR64:$src2), addr:$dst)]>;
+def ADD64mi32 : RIi32<0x81, MRM0m, (ops i64mem:$dst, i64i32imm :$src2),
+                      "add{q} {$src2, $dst|$dst, $src2}",
+               [(store (add (load addr:$dst), i64immSExt32:$src2), addr:$dst)]>;
+def ADD64mi8 : RIi8<0x83, MRM0m, (ops i64mem:$dst, i64i8imm :$src2),
+                    "add{q} {$src2, $dst|$dst, $src2}",
+                [(store (add (load addr:$dst), i64immSExt8:$src2), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+let isCommutable = 1 in
+def ADC64rr  : RI<0x11, MRMDestReg, (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                  "adc{q} {$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (adde GR64:$src1, GR64:$src2))]>;
+
+def ADC64rm  : RI<0x13, MRMSrcMem , (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                  "adc{q} {$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (adde GR64:$src1, (load addr:$src2)))]>;
+
+def ADC64ri32 : RIi32<0x81, MRM2r, (ops GR64:$dst, GR64:$src1, i64i32imm:$src2),
+                      "adc{q} {$src2, $dst|$dst, $src2}",
+                      [(set GR64:$dst, (adde GR64:$src1, i64immSExt32:$src2))]>;
+def ADC64ri8 : RIi8<0x83, MRM2r, (ops GR64:$dst, GR64:$src1, i64i8imm:$src2),
+                    "adc{q} {$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (adde GR64:$src1, i64immSExt8:$src2))]>;
+} // isTwoAddress
+
+def ADC64mr  : RI<0x11, MRMDestMem, (ops i64mem:$dst, GR64:$src2),
+                  "adc{q} {$src2, $dst|$dst, $src2}",
+                  [(store (adde (load addr:$dst), GR64:$src2), addr:$dst)]>;
+def ADC64mi32 : RIi32<0x81, MRM2m, (ops i64mem:$dst, i64i32imm:$src2),
+                      "adc{q} {$src2, $dst|$dst, $src2}",
+              [(store (adde (load addr:$dst), i64immSExt32:$src2), addr:$dst)]>;
+def ADC64mi8 : RIi8<0x83, MRM2m, (ops i64mem:$dst, i64i8imm :$src2),
+                    "adc{q} {$src2, $dst|$dst, $src2}",
+               [(store (adde (load addr:$dst), i64immSExt8:$src2), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+def SUB64rr  : RI<0x29, MRMDestReg, (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                  "sub{q} {$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (sub GR64:$src1, GR64:$src2))]>;
+
+def SUB64rm  : RI<0x2B, MRMSrcMem, (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                  "sub{q} {$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (sub GR64:$src1, (load addr:$src2)))]>;
+
+def SUB64ri32 : RIi32<0x81, MRM5r, (ops GR64:$dst, GR64:$src1, i64i32imm:$src2),
+                      "sub{q} {$src2, $dst|$dst, $src2}",
+                      [(set GR64:$dst, (sub GR64:$src1, i64immSExt32:$src2))]>;
+def SUB64ri8 : RIi8<0x83, MRM5r, (ops GR64:$dst, GR64:$src1, i64i8imm:$src2),
+                    "sub{q} {$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (sub GR64:$src1, i64immSExt8:$src2))]>;
+} // isTwoAddress
+
+def SUB64mr  : RI<0x29, MRMDestMem, (ops i64mem:$dst, GR64:$src2), 
+                  "sub{q} {$src2, $dst|$dst, $src2}",
+                  [(store (sub (load addr:$dst), GR64:$src2), addr:$dst)]>;
+def SUB64mi32 : RIi32<0x81, MRM5m, (ops i64mem:$dst, i64i32imm:$src2), 
+                      "sub{q} {$src2, $dst|$dst, $src2}",
+               [(store (sub (load addr:$dst), i64immSExt32:$src2), addr:$dst)]>;
+def SUB64mi8 : RIi8<0x83, MRM5m, (ops i64mem:$dst, i64i8imm :$src2), 
+                    "sub{q} {$src2, $dst|$dst, $src2}",
+                [(store (sub (load addr:$dst), i64immSExt8:$src2), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+def SBB64rr    : RI<0x19, MRMDestReg, (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                    "sbb{q} {$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (sube GR64:$src1, GR64:$src2))]>;
+
+def SBB64rm  : RI<0x1B, MRMSrcMem, (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                  "sbb{q} {$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (sube GR64:$src1, (load addr:$src2)))]>;
+
+def SBB64ri32 : RIi32<0x81, MRM3r, (ops GR64:$dst, GR64:$src1, i64i32imm:$src2),
+                      "sbb{q} {$src2, $dst|$dst, $src2}",
+                      [(set GR64:$dst, (sube GR64:$src1, i64immSExt32:$src2))]>;
+def SBB64ri8 : RIi8<0x83, MRM3r, (ops GR64:$dst, GR64:$src1, i64i8imm:$src2),
+                    "sbb{q} {$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (sube GR64:$src1, i64immSExt8:$src2))]>;
+} // isTwoAddress
+
+def SBB64mr  : RI<0x19, MRMDestMem, (ops i64mem:$dst, GR64:$src2), 
+                  "sbb{q} {$src2, $dst|$dst, $src2}",
+                  [(store (sube (load addr:$dst), GR64:$src2), addr:$dst)]>;
+def SBB64mi32 : RIi32<0x81, MRM3m, (ops i64mem:$dst, i64i32imm:$src2), 
+                      "sbb{q} {$src2, $dst|$dst, $src2}",
+              [(store (sube (load addr:$dst), i64immSExt32:$src2), addr:$dst)]>;
+def SBB64mi8 : RIi8<0x83, MRM3m, (ops i64mem:$dst, i64i8imm :$src2), 
+                    "sbb{q} {$src2, $dst|$dst, $src2}",
+               [(store (sube (load addr:$dst), i64immSExt8:$src2), addr:$dst)]>;
+
+// Unsigned multiplication
+def MUL64r : RI<0xF7, MRM4r, (ops GR64:$src),
+                "mul{q} $src", []>,
+             Imp<[RAX],[RAX,RDX]>;         // RAX,RDX = RAX*GR64
+def MUL64m : RI<0xF7, MRM4m, (ops i64mem:$src),
+                "mul{q} $src", []>,
+             Imp<[RAX],[RAX,RDX]>;         // RAX,RDX = RAX*[mem64]
+
+// Signed multiplication
+def IMUL64r : RI<0xF7, MRM5r, (ops GR64:$src),
+                 "imul{q} $src", []>,
+              Imp<[RAX],[RAX,RDX]>;         // RAX,RDX = RAX*GR64
+def IMUL64m : RI<0xF7, MRM5m, (ops i64mem:$src),
+                 "imul{q} $src", []>,
+              Imp<[RAX],[RAX,RDX]>;         // RAX,RDX = RAX*[mem64]
+
+let isTwoAddress = 1 in {
+let isCommutable = 1 in
+def IMUL64rr : RI<0xAF, MRMSrcReg, (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                  "imul{q} {$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (mul GR64:$src1, GR64:$src2))]>, TB;
+
+def IMUL64rm : RI<0xAF, MRMSrcMem, (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                  "imul{q} {$src2, $dst|$dst, $src2}",
+                 [(set GR64:$dst, (mul GR64:$src1, (load addr:$src2)))]>, TB;
+} // isTwoAddress
+
+// Surprisingly enough, these are not two-address instructions!
+def IMUL64rri32 : RIi32<0x69, MRMSrcReg,                    // GR64 = GR64*I32
+                        (ops GR64:$dst, GR64:$src1, i64i32imm:$src2),
+                        "imul{q} {$src2, $src1, $dst|$dst, $src1, $src2}",
+                       [(set GR64:$dst, (mul GR64:$src1, i64immSExt32:$src2))]>;
+def IMUL64rri8 : RIi8<0x6B, MRMSrcReg,                      // GR64 = GR64*I8
+                      (ops GR64:$dst, GR64:$src1, i64i8imm:$src2),
+                      "imul{q} {$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR64:$dst, (mul GR64:$src1, i64immSExt8:$src2))]>;
+def IMUL64rmi32 : RIi32<0x69, MRMSrcMem,                   // GR64 = [mem64]*I32
+                        (ops GR64:$dst, i64mem:$src1, i64i32imm:$src2),
+                        "imul{q} {$src2, $src1, $dst|$dst, $src1, $src2}",
+                [(set GR64:$dst, (mul (load addr:$src1), i64immSExt32:$src2))]>;
+def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem,                      // GR64 = [mem64]*I8
+                      (ops GR64:$dst, i64mem:$src1, i64i8imm: $src2),
+                      "imul{q} {$src2, $src1, $dst|$dst, $src1, $src2}",
+                 [(set GR64:$dst, (mul (load addr:$src1), i64immSExt8:$src2))]>;
+
+// Unsigned division / remainder
+def DIV64r : RI<0xF7, MRM6r, (ops GR64:$src),        // RDX:RAX/r64 = RAX,RDX
+                "div{q} $src", []>, Imp<[RAX,RDX],[RAX,RDX]>;
+def DIV64m : RI<0xF7, MRM6m, (ops i64mem:$src),      // RDX:RAX/[mem64] = RAX,RDX
+                "div{q} $src", []>, Imp<[RAX,RDX],[RAX,RDX]>;
+
+// Signed division / remainder
+def IDIV64r: RI<0xF7, MRM7r, (ops GR64:$src),        // RDX:RAX/r64 = RAX,RDX
+                "idiv{q} $src", []>, Imp<[RAX,RDX],[RAX,RDX]>;
+def IDIV64m: RI<0xF7, MRM7m, (ops i64mem:$src),      // RDX:RAX/[mem64] = RAX,RDX
+                "idiv{q} $src", []>, Imp<[RAX,RDX],[RAX,RDX]>;
+
+// Unary instructions
+let CodeSize = 2 in {
+let isTwoAddress = 1 in
+def NEG64r : RI<0xF7, MRM3r, (ops GR64:$dst, GR64:$src), "neg{q} $dst",
+                [(set GR64:$dst, (ineg GR64:$src))]>;
+def NEG64m : RI<0xF7, MRM3m, (ops i64mem:$dst), "neg{q} $dst",
+                [(store (ineg (loadi64 addr:$dst)), addr:$dst)]>;
+
+let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in
+def INC64r : RI<0xFF, MRM0r, (ops GR64:$dst, GR64:$src), "inc{q} $dst",
+                [(set GR64:$dst, (add GR64:$src, 1))]>;
+def INC64m : RI<0xFF, MRM0m, (ops i64mem:$dst), "inc{q} $dst",
+                [(store (add (loadi64 addr:$dst), 1), addr:$dst)]>;
+
+let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in
+def DEC64r : RI<0xFF, MRM1r, (ops GR64:$dst, GR64:$src), "dec{q} $dst",
+                [(set GR64:$dst, (add GR64:$src, -1))]>;
+def DEC64m : RI<0xFF, MRM1m, (ops i64mem:$dst), "dec{q} $dst",
+                [(store (add (loadi64 addr:$dst), -1), addr:$dst)]>;
+
+// In 64-bit mode, the one-byte INC and DEC encodings (0x40-0x4F) are taken by
+// the REX prefixes and cannot be used.
+let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in {
+// Can transform into LEA.
+def INC64_16r : I<0xFF, MRM0r, (ops GR16:$dst, GR16:$src), "inc{w} $dst",
+                  [(set GR16:$dst, (add GR16:$src, 1))]>,
+                OpSize, Requires<[In64BitMode]>;
+def INC64_32r : I<0xFF, MRM0r, (ops GR32:$dst, GR32:$src), "inc{l} $dst",
+                  [(set GR32:$dst, (add GR32:$src, 1))]>,
+                Requires<[In64BitMode]>;
+def DEC64_16r : I<0xFF, MRM1r, (ops GR16:$dst, GR16:$src), "dec{w} $dst",
+                  [(set GR16:$dst, (add GR16:$src, -1))]>,
+                OpSize, Requires<[In64BitMode]>;
+def DEC64_32r : I<0xFF, MRM1r, (ops GR32:$dst, GR32:$src), "dec{l} $dst",
+                  [(set GR32:$dst, (add GR32:$src, -1))]>,
+                Requires<[In64BitMode]>;
+} // isConvertibleToThreeAddress
+} // CodeSize
+
+
+// Shift instructions
+let isTwoAddress = 1 in {
+def SHL64rCL : RI<0xD3, MRM4r, (ops GR64:$dst, GR64:$src),
+                  "shl{q} {%cl, $dst|$dst, %CL}",
+                  [(set GR64:$dst, (shl GR64:$src, CL))]>,
+               Imp<[CL],[]>;
+def SHL64ri  : RIi8<0xC1, MRM4r, (ops GR64:$dst, GR64:$src1, i8imm:$src2),
+                    "shl{q} {$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))]>;
+def SHL64r1  : RI<0xD1, MRM4r, (ops GR64:$dst, GR64:$src1),
+                 "shl{q} $dst", []>;
+} // isTwoAddress
+
+def SHL64mCL : RI<0xD3, MRM4m, (ops i64mem:$dst),
+                  "shl{q} {%cl, $dst|$dst, %CL}",
+                  [(store (shl (loadi64 addr:$dst), CL), addr:$dst)]>,
+               Imp<[CL],[]>;
+def SHL64mi : RIi8<0xC1, MRM4m, (ops i64mem:$dst, i8imm:$src),
+                  "shl{q} {$src, $dst|$dst, $src}",
+                 [(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def SHL64m1 : RI<0xD1, MRM4m, (ops i64mem:$dst),
+                  "shl{q} $dst",
+                 [(store (shl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+def SHR64rCL : RI<0xD3, MRM5r, (ops GR64:$dst, GR64:$src),
+                  "shr{q} {%cl, $dst|$dst, %CL}",
+                  [(set GR64:$dst, (srl GR64:$src, CL))]>,
+               Imp<[CL],[]>;
+def SHR64ri : RIi8<0xC1, MRM5r, (ops GR64:$dst, GR64:$src1, i8imm:$src2),
+                  "shr{q} {$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (srl GR64:$src1, (i8 imm:$src2)))]>;
+def SHR64r1  : RI<0xD1, MRM5r, (ops GR64:$dst, GR64:$src1),
+                 "shr{q} $dst",
+                 [(set GR64:$dst, (srl GR64:$src1, (i8 1)))]>;
+} // isTwoAddress
+
+def SHR64mCL : RI<0xD3, MRM5m, (ops i64mem:$dst),
+                  "shr{q} {%cl, $dst|$dst, %CL}",
+                  [(store (srl (loadi64 addr:$dst), CL), addr:$dst)]>,
+               Imp<[CL],[]>;
+def SHR64mi : RIi8<0xC1, MRM5m, (ops i64mem:$dst, i8imm:$src),
+                  "shr{q} {$src, $dst|$dst, $src}",
+                 [(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def SHR64m1 : RI<0xD1, MRM5m, (ops i64mem:$dst),
+                  "shr{q} $dst",
+                 [(store (srl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+def SAR64rCL : RI<0xD3, MRM7r, (ops GR64:$dst, GR64:$src),
+                 "sar{q} {%cl, $dst|$dst, %CL}",
+                 [(set GR64:$dst, (sra GR64:$src, CL))]>, Imp<[CL],[]>;
+def SAR64ri  : RIi8<0xC1, MRM7r, (ops GR64:$dst, GR64:$src1, i8imm:$src2),
+                   "sar{q} {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (sra GR64:$src1, (i8 imm:$src2)))]>;
+def SAR64r1  : RI<0xD1, MRM7r, (ops GR64:$dst, GR64:$src1),
+                 "sar{q} $dst",
+                 [(set GR64:$dst, (sra GR64:$src1, (i8 1)))]>;
+} // isTwoAddress
+
+def SAR64mCL : RI<0xD3, MRM7m, (ops i64mem:$dst), 
+                 "sar{q} {%cl, $dst|$dst, %CL}",
+                 [(store (sra (loadi64 addr:$dst), CL), addr:$dst)]>,
+               Imp<[CL],[]>;
+def SAR64mi  : RIi8<0xC1, MRM7m, (ops i64mem:$dst, i8imm:$src),
+                    "sar{q} {$src, $dst|$dst, $src}",
+                 [(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def SAR64m1 : RI<0xD1, MRM7m, (ops i64mem:$dst),
+                  "sar{q} $dst",
+                 [(store (sra (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
+// Rotate instructions
+let isTwoAddress = 1 in {
+def ROL64rCL : RI<0xD3, MRM0r, (ops GR64:$dst, GR64:$src),
+                  "rol{q} {%cl, $dst|$dst, %CL}",
+                  [(set GR64:$dst, (rotl GR64:$src, CL))]>, Imp<[CL],[]>;
+def ROL64ri  : RIi8<0xC1, MRM0r, (ops GR64:$dst, GR64:$src1, i8imm:$src2),
+                    "rol{q} {$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))]>;
+def ROL64r1  : RI<0xD1, MRM0r, (ops GR64:$dst, GR64:$src1),
+                  "rol{q} $dst",
+                  [(set GR64:$dst, (rotl GR64:$src1, (i8 1)))]>;
+} // isTwoAddress
+
+def ROL64mCL :  I<0xD3, MRM0m, (ops i64mem:$dst),
+                  "rol{q} {%cl, $dst|$dst, %CL}",
+                  [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)]>,
+               Imp<[CL],[]>;
+def ROL64mi  : RIi8<0xC1, MRM0m, (ops i64mem:$dst, i8imm:$src),
+                    "rol{q} {$src, $dst|$dst, $src}",
+                [(store (rotl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def ROL64m1  : RI<0xD1, MRM0m, (ops i64mem:$dst),
+                 "rol{q} $dst",
+               [(store (rotl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+def ROR64rCL : RI<0xD3, MRM1r, (ops GR64:$dst, GR64:$src),
+                  "ror{q} {%cl, $dst|$dst, %CL}",
+                  [(set GR64:$dst, (rotr GR64:$src, CL))]>, Imp<[CL],[]>;
+def ROR64ri  : RIi8<0xC1, MRM1r, (ops GR64:$dst, GR64:$src1, i8imm:$src2),
+                    "ror{q} {$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))]>;
+def ROR64r1  : RI<0xD1, MRM1r, (ops GR64:$dst, GR64:$src1),
+                  "ror{q} $dst",
+                  [(set GR64:$dst, (rotr GR64:$src1, (i8 1)))]>;
+} // isTwoAddress
+
+def ROR64mCL : RI<0xD3, MRM1m, (ops i64mem:$dst), 
+                  "ror{q} {%cl, $dst|$dst, %CL}",
+                  [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)]>,
+               Imp<[CL],[]>;
+def ROR64mi  : RIi8<0xC1, MRM1m, (ops i64mem:$dst, i8imm:$src),
+                    "ror{q} {$src, $dst|$dst, $src}",
+                [(store (rotr (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def ROR64m1  : RI<0xD1, MRM1m, (ops i64mem:$dst),
+                 "ror{q} $dst",
+               [(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
+// Double shift instructions (generalizations of rotate)
+let isTwoAddress = 1 in {
+def SHLD64rrCL : RI<0xA5, MRMDestReg, (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                    "shld{q} {%cl, $src2, $dst|$dst, $src2, %CL}", []>,
+                 Imp<[CL],[]>, TB;
+def SHRD64rrCL : RI<0xAD, MRMDestReg, (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                    "shrd{q} {%cl, $src2, $dst|$dst, $src2, %CL}", []>,
+                 Imp<[CL],[]>, TB;
+
+let isCommutable = 1 in {  // FIXME: Update X86InstrInfo::commuteInstruction
+def SHLD64rri8 : RIi8<0xA4, MRMDestReg,
+                      (ops GR64:$dst, GR64:$src1, GR64:$src2, i8imm:$src3),
+                      "shld{q} {$src3, $src2, $dst|$dst, $src2, $src3}", []>,
+                      TB;
+def SHRD64rri8 : RIi8<0xAC, MRMDestReg,
+                      (ops GR64:$dst, GR64:$src1, GR64:$src2, i8imm:$src3),
+                      "shrd{q} {$src3, $src2, $dst|$dst, $src2, $src3}", []>,
+                 TB;
+} // isCommutable
+} // isTwoAddress
+
+// Temporary hack: there are no patterns associated with these instructions,
+// so we have to tell tblgen that they do not produce results.
+let noResults = 1 in {
+def SHLD64mrCL : RI<0xA5, MRMDestMem, (ops i64mem:$dst, GR64:$src2),
+                    "shld{q} {%cl, $src2, $dst|$dst, $src2, %CL}", []>,
+                 Imp<[CL],[]>, TB;
+def SHRD64mrCL : RI<0xAD, MRMDestMem, (ops i64mem:$dst, GR64:$src2),
+                    "shrd{q} {%cl, $src2, $dst|$dst, $src2, %CL}", []>,
+                 Imp<[CL],[]>, TB;
+def SHLD64mri8 : RIi8<0xA4, MRMDestMem,
+                      (ops i64mem:$dst, GR64:$src2, i8imm:$src3),
+                      "shld{q} {$src3, $src2, $dst|$dst, $src2, $src3}", []>,
+                 TB;
+def SHRD64mri8 : RIi8<0xAC, MRMDestMem, 
+                      (ops i64mem:$dst, GR64:$src2, i8imm:$src3),
+                      "shrd{q} {$src3, $src2, $dst|$dst, $src2, $src3}", []>,
+                 TB;
+} // noResults
+
+//===----------------------------------------------------------------------===//
+//  Logical Instructions...
+//
+
+let isTwoAddress = 1 in
+def NOT64r : RI<0xF7, MRM2r, (ops GR64:$dst, GR64:$src), "not{q} $dst",
+                [(set GR64:$dst, (not GR64:$src))]>;
+def NOT64m : RI<0xF7, MRM2m, (ops i64mem:$dst), "not{q} $dst",
+                [(store (not (loadi64 addr:$dst)), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+let isCommutable = 1 in
+def AND64rr  : RI<0x21, MRMDestReg, 
+                  (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                  "and{q} {$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (and GR64:$src1, GR64:$src2))]>;
+def AND64rm  : RI<0x23, MRMSrcMem,
+                  (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                  "and{q} {$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (and GR64:$src1, (load addr:$src2)))]>;
+def AND64ri32  : RIi32<0x81, MRM4r, 
+                       (ops GR64:$dst, GR64:$src1, i64i32imm:$src2),
+                       "and{q} {$src2, $dst|$dst, $src2}",
+                       [(set GR64:$dst, (and GR64:$src1, i64immSExt32:$src2))]>;
+def AND64ri8 : RIi8<0x83, MRM4r, 
+                    (ops GR64:$dst, GR64:$src1, i64i8imm:$src2),
+                    "and{q} {$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (and GR64:$src1, i64immSExt8:$src2))]>;
+} // isTwoAddress
+
+def AND64mr  : RI<0x21, MRMDestMem,
+                  (ops i64mem:$dst, GR64:$src),
+                  "and{q} {$src, $dst|$dst, $src}",
+                  [(store (and (load addr:$dst), GR64:$src), addr:$dst)]>;
+def AND64mi32  : RIi32<0x81, MRM4m,
+                       (ops i64mem:$dst, i64i32imm:$src),
+                       "and{q} {$src, $dst|$dst, $src}",
+             [(store (and (loadi64 addr:$dst), i64immSExt32:$src), addr:$dst)]>;
+def AND64mi8 : RIi8<0x83, MRM4m,
+                    (ops i64mem:$dst, i64i8imm :$src),
+                    "and{q} {$src, $dst|$dst, $src}",
+                 [(store (and (load addr:$dst), i64immSExt8:$src), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+let isCommutable = 1 in
+def OR64rr   : RI<0x09, MRMDestReg, (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                  "or{q} {$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (or GR64:$src1, GR64:$src2))]>;
+def OR64rm   : RI<0x0B, MRMSrcMem , (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                  "or{q} {$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (or GR64:$src1, (load addr:$src2)))]>;
+def OR64ri32 : RIi32<0x81, MRM1r, (ops GR64:$dst, GR64:$src1, i64i32imm:$src2),
+                     "or{q} {$src2, $dst|$dst, $src2}",
+                     [(set GR64:$dst, (or GR64:$src1, i64immSExt32:$src2))]>;
+def OR64ri8  : RIi8<0x83, MRM1r, (ops GR64:$dst, GR64:$src1, i64i8imm:$src2),
+                    "or{q} {$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (or GR64:$src1, i64immSExt8:$src2))]>;
+} // isTwoAddress
+
+def OR64mr : RI<0x09, MRMDestMem, (ops i64mem:$dst, GR64:$src),
+                "or{q} {$src, $dst|$dst, $src}",
+                [(store (or (load addr:$dst), GR64:$src), addr:$dst)]>;
+def OR64mi32 : RIi32<0x81, MRM1m, (ops i64mem:$dst, i64i32imm:$src),
+                     "or{q} {$src, $dst|$dst, $src}",
+              [(store (or (loadi64 addr:$dst), i64immSExt32:$src), addr:$dst)]>;
+def OR64mi8  : RIi8<0x83, MRM1m, (ops i64mem:$dst, i64i8imm:$src),
+                    "or{q} {$src, $dst|$dst, $src}",
+                  [(store (or (load addr:$dst), i64immSExt8:$src), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+let isCommutable = 1 in
+def XOR64rr  : RI<0x31, MRMDestReg,  (ops GR64:$dst, GR64:$src1, GR64:$src2), 
+                  "xor{q} {$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (xor GR64:$src1, GR64:$src2))]>;
+def XOR64rm  : RI<0x33, MRMSrcMem, (ops GR64:$dst, GR64:$src1, i64mem:$src2), 
+                  "xor{q} {$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (xor GR64:$src1, (load addr:$src2)))]>;
+def XOR64ri32 : RIi32<0x81, MRM6r, 
+                      (ops GR64:$dst, GR64:$src1, i64i32imm:$src2), 
+                      "xor{q} {$src2, $dst|$dst, $src2}",
+                      [(set GR64:$dst, (xor GR64:$src1, i64immSExt32:$src2))]>;
+def XOR64ri8 : RIi8<0x83, MRM6r,  (ops GR64:$dst, GR64:$src1, i64i8imm:$src2),
+                    "xor{q} {$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (xor GR64:$src1, i64immSExt8:$src2))]>;
+} // isTwoAddress
+
+def XOR64mr  : RI<0x31, MRMDestMem, (ops i64mem:$dst, GR64:$src),
+                  "xor{q} {$src, $dst|$dst, $src}",
+                  [(store (xor (load addr:$dst), GR64:$src), addr:$dst)]>;
+def XOR64mi32 : RIi32<0x81, MRM6m, (ops i64mem:$dst, i64i32imm:$src),
+                      "xor{q} {$src, $dst|$dst, $src}",
+             [(store (xor (loadi64 addr:$dst), i64immSExt32:$src), addr:$dst)]>;
+def XOR64mi8 : RIi8<0x83, MRM6m, (ops i64mem:$dst, i64i8imm :$src),
+                    "xor{q} {$src, $dst|$dst, $src}",
+                 [(store (xor (load addr:$dst), i64immSExt8:$src), addr:$dst)]>;
+
+//===----------------------------------------------------------------------===//
+//  Comparison Instructions...
+//
+
+// Integer comparison
+let isCommutable = 1 in
+def TEST64rr : RI<0x85, MRMDestReg, (ops GR64:$src1, GR64:$src2),
+                  "test{q} {$src2, $src1|$src1, $src2}",
+                  [(X86cmp (and GR64:$src1, GR64:$src2), 0)]>;
+def TEST64rm : RI<0x85, MRMSrcMem, (ops GR64:$src1, i64mem:$src2),
+                  "test{q} {$src2, $src1|$src1, $src2}",
+                  [(X86cmp (and GR64:$src1, (loadi64 addr:$src2)), 0)]>;
+def TEST64ri32 : RIi32<0xF7, MRM0r, (ops GR64:$src1, i64i32imm:$src2),
+                       "test{q} {$src2, $src1|$src1, $src2}",
+                       [(X86cmp (and GR64:$src1, i64immSExt32:$src2), 0)]>;
+def TEST64mi32 : RIi32<0xF7, MRM0m, (ops i64mem:$src1, i64i32imm:$src2),
+                       "test{q} {$src2, $src1|$src1, $src2}",
+                  [(X86cmp (and (loadi64 addr:$src1), i64immSExt32:$src2), 0)]>;
+
+def CMP64rr : RI<0x39, MRMDestReg, (ops GR64:$src1, GR64:$src2),
+                 "cmp{q} {$src2, $src1|$src1, $src2}",
+                 [(X86cmp GR64:$src1, GR64:$src2)]>;
+def CMP64mr : RI<0x39, MRMDestMem, (ops i64mem:$src1, GR64:$src2),
+                 "cmp{q} {$src2, $src1|$src1, $src2}",
+                 [(X86cmp (loadi64 addr:$src1), GR64:$src2)]>;
+def CMP64rm : RI<0x3B, MRMSrcMem, (ops GR64:$src1, i64mem:$src2),
+                 "cmp{q} {$src2, $src1|$src1, $src2}",
+                 [(X86cmp GR64:$src1, (loadi64 addr:$src2))]>;
+def CMP64ri32 : RIi32<0x81, MRM7r, (ops GR64:$src1, i64i32imm:$src2),
+                      "cmp{q} {$src2, $src1|$src1, $src2}",
+                      [(X86cmp GR64:$src1, i64immSExt32:$src2)]>;
+def CMP64mi32 : RIi32<0x81, MRM7m, (ops i64mem:$src1, i64i32imm:$src2),
+                      "cmp{q} {$src2, $src1|$src1, $src2}",
+                      [(X86cmp (loadi64 addr:$src1), i64immSExt32:$src2)]>;
+def CMP64mi8 : RIi8<0x83, MRM7m, (ops i64mem:$src1, i64i8imm:$src2),
+                    "cmp{q} {$src2, $src1|$src1, $src2}",
+                    [(X86cmp (loadi64 addr:$src1), i64immSExt8:$src2)]>;
+def CMP64ri8 : RIi8<0x83, MRM7r, (ops GR64:$src1, i64i8imm:$src2),
+                    "cmp{q} {$src2, $src1|$src1, $src2}",
+                    [(X86cmp GR64:$src1, i64immSExt8:$src2)]>;
+
+// Conditional moves
+let isTwoAddress = 1 in {
+def CMOVB64rr : RI<0x42, MRMSrcReg,       // if <u, GR64 = GR64
+                   (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                   "cmovb {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+                                     X86_COND_B))]>, TB;
+def CMOVB64rm : RI<0x42, MRMSrcMem,       // if <u, GR64 = [mem64]
+                   (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                   "cmovb {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+                                     X86_COND_B))]>, TB;
+def CMOVAE64rr: RI<0x43, MRMSrcReg,       // if >=u, GR64 = GR64
+                   (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                   "cmovae {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+                                     X86_COND_AE))]>, TB;
+def CMOVAE64rm: RI<0x43, MRMSrcMem,       // if >=u, GR64 = [mem64]
+                   (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                   "cmovae {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+                                     X86_COND_AE))]>, TB;
+def CMOVE64rr : RI<0x44, MRMSrcReg,       // if ==, GR64 = GR64
+                   (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                   "cmove {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+                                     X86_COND_E))]>, TB;
+def CMOVE64rm : RI<0x44, MRMSrcMem,       // if ==, GR64 = [mem64]
+                   (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                   "cmove {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+                                     X86_COND_E))]>, TB;
+def CMOVNE64rr: RI<0x45, MRMSrcReg,       // if !=, GR64 = GR64
+                   (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                   "cmovne {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+                                    X86_COND_NE))]>, TB;
+def CMOVNE64rm: RI<0x45, MRMSrcMem,       // if !=, GR64 = [mem64]
+                   (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                   "cmovne {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+                                    X86_COND_NE))]>, TB;
+def CMOVBE64rr: RI<0x46, MRMSrcReg,       // if <=u, GR64 = GR64
+                   (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                   "cmovbe {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+                                    X86_COND_BE))]>, TB;
+def CMOVBE64rm: RI<0x46, MRMSrcMem,       // if <=u, GR64 = [mem64]
+                   (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                   "cmovbe {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+                                    X86_COND_BE))]>, TB;
+def CMOVA64rr : RI<0x47, MRMSrcReg,       // if >u, GR64 = GR64
+                   (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                   "cmova {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+                                    X86_COND_A))]>, TB;
+def CMOVA64rm : RI<0x47, MRMSrcMem,       // if >u, GR64 = [mem64]
+                   (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                   "cmova {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+                                    X86_COND_A))]>, TB;
+def CMOVL64rr : RI<0x4C, MRMSrcReg,       // if <s, GR64 = GR64
+                   (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                   "cmovl {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+                                    X86_COND_L))]>, TB;
+def CMOVL64rm : RI<0x4C, MRMSrcMem,       // if <s, GR64 = [mem64]
+                   (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                   "cmovl {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+                                    X86_COND_L))]>, TB;
+def CMOVGE64rr: RI<0x4D, MRMSrcReg,       // if >=s, GR64 = GR64
+                   (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                   "cmovge {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+                                    X86_COND_GE))]>, TB;
+def CMOVGE64rm: RI<0x4D, MRMSrcMem,       // if >=s, GR64 = [mem64]
+                   (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                   "cmovge {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+                                    X86_COND_GE))]>, TB;
+def CMOVLE64rr: RI<0x4E, MRMSrcReg,       // if <=s, GR64 = GR64
+                   (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                   "cmovle {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+                                    X86_COND_LE))]>, TB;
+def CMOVLE64rm: RI<0x4E, MRMSrcMem,       // if <=s, GR64 = [mem64]
+                   (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                   "cmovle {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+                                    X86_COND_LE))]>, TB;
+def CMOVG64rr : RI<0x4F, MRMSrcReg,       // if >s, GR64 = GR64
+                   (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                   "cmovg {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+                                    X86_COND_G))]>, TB;
+def CMOVG64rm : RI<0x4F, MRMSrcMem,       // if >s, GR64 = [mem64]
+                   (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                   "cmovg {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+                                    X86_COND_G))]>, TB;
+def CMOVS64rr : RI<0x48, MRMSrcReg,       // if signed, GR64 = GR64
+                   (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                   "cmovs {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+                                    X86_COND_S))]>, TB;
+def CMOVS64rm : RI<0x48, MRMSrcMem,       // if signed, GR64 = [mem64]
+                   (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                   "cmovs {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+                                    X86_COND_S))]>, TB;
+def CMOVNS64rr: RI<0x49, MRMSrcReg,       // if !signed, GR64 = GR64
+                   (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                   "cmovns {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+                                    X86_COND_NS))]>, TB;
+def CMOVNS64rm: RI<0x49, MRMSrcMem,       // if !signed, GR64 = [mem64]
+                   (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                   "cmovns {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+                                    X86_COND_NS))]>, TB;
+def CMOVP64rr : RI<0x4A, MRMSrcReg,       // if parity, GR64 = GR64
+                   (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                   "cmovp {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+                                    X86_COND_P))]>, TB;
+def CMOVP64rm : RI<0x4A, MRMSrcMem,       // if parity, GR64 = [mem64]
+                   (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                   "cmovp {$src2, $dst|$dst, $src2}",
+                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+                                    X86_COND_P))]>, TB;
+def CMOVNP64rr : RI<0x4B, MRMSrcReg,       // if !parity, GR64 = GR64
+                   (ops GR64:$dst, GR64:$src1, GR64:$src2),
+                   "cmovnp {$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
+                                     X86_COND_NP))]>, TB;
+def CMOVNP64rm : RI<0x4B, MRMSrcMem,       // if !parity, GR64 = [mem64]
+                   (ops GR64:$dst, GR64:$src1, i64mem:$src2),
+                   "cmovnp {$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
+                                     X86_COND_NP))]>, TB;
+} // isTwoAddress
+
+//===----------------------------------------------------------------------===//
+//  Conversion Instructions...
+//
+
+// f64 -> signed i64
+def Int_CVTSD2SI64rr: RSDI<0x2D, MRMSrcReg, (ops GR64:$dst, VR128:$src),
+                           "cvtsd2si{q} {$src, $dst|$dst, $src}",
+                           []>; // TODO: add intrinsic
+def Int_CVTSD2SI64rm: RSDI<0x2D, MRMSrcMem, (ops GR64:$dst, f128mem:$src),
+                           "cvtsd2si{q} {$src, $dst|$dst, $src}",
+                           []>; // TODO: add intrinsic
+def CVTTSD2SI64rr: RSDI<0x2C, MRMSrcReg, (ops GR64:$dst, FR64:$src),
+                        "cvttsd2si{q} {$src, $dst|$dst, $src}",
+                        [(set GR64:$dst, (fp_to_sint FR64:$src))]>;
+def CVTTSD2SI64rm: RSDI<0x2C, MRMSrcMem, (ops GR64:$dst, f64mem:$src),
+                        "cvttsd2si{q} {$src, $dst|$dst, $src}",
+                        [(set GR64:$dst, (fp_to_sint (loadf64 addr:$src)))]>;
+def Int_CVTTSD2SI64rr: RSDI<0x2C, MRMSrcReg, (ops GR64:$dst, VR128:$src),
+                            "cvttsd2si{q} {$src, $dst|$dst, $src}",
+                            []>; // TODO: add intrinsic
+def Int_CVTTSD2SI64rm: RSDI<0x2C, MRMSrcMem, (ops GR64:$dst, f128mem:$src),
+                            "cvttsd2si{q} {$src, $dst|$dst, $src}",
+                            []>; // TODO: add intrinsic
+
+// Signed i64 -> f64
+def CVTSI2SD64rr: RSDI<0x2A, MRMSrcReg, (ops FR64:$dst, GR64:$src),
+                       "cvtsi2sd{q} {$src, $dst|$dst, $src}",
+                       [(set FR64:$dst, (sint_to_fp GR64:$src))]>;
+def CVTSI2SD64rm: RSDI<0x2A, MRMSrcMem, (ops FR64:$dst, i64mem:$src),
+                       "cvtsi2sd{q} {$src, $dst|$dst, $src}",
+                       [(set FR64:$dst, (sint_to_fp (loadi64 addr:$src)))]>;
+let isTwoAddress = 1 in {
+def Int_CVTSI2SD64rr: RSDI<0x2A, MRMSrcReg,
+                           (ops VR128:$dst, VR128:$src1, GR64:$src2),
+                           "cvtsi2sd{q} {$src2, $dst|$dst, $src2}",
+                           []>; // TODO: add intrinsic
+def Int_CVTSI2SD64rm: RSDI<0x2A, MRMSrcMem,
+                           (ops VR128:$dst, VR128:$src1, i64mem:$src2),
+                           "cvtsi2sd{q} {$src2, $dst|$dst, $src2}",
+                           []>; // TODO: add intrinsic
+} // isTwoAddress
+
+// Signed i64 -> f32
+def CVTSI2SS64rr: RSSI<0x2A, MRMSrcReg, (ops FR32:$dst, GR64:$src),
+                       "cvtsi2ss{q} {$src, $dst|$dst, $src}",
+                       [(set FR32:$dst, (sint_to_fp GR64:$src))]>;
+def CVTSI2SS64rm: RSSI<0x2A, MRMSrcMem, (ops FR32:$dst, i64mem:$src),
+                       "cvtsi2ss{q} {$src, $dst|$dst, $src}",
+                       [(set FR32:$dst, (sint_to_fp (loadi64 addr:$src)))]>;
+let isTwoAddress = 1 in {
+def Int_CVTSI2SS64rr: RSSI<0x2A, MRMSrcReg,
+                           (ops VR128:$dst, VR128:$src1, GR64:$src2),
+                           "cvtsi2ss{q} {$src2, $dst|$dst, $src2}",
+                           []>; // TODO: add intrinsic
+def Int_CVTSI2SS64rm: RSSI<0x2A, MRMSrcMem,
+                           (ops VR128:$dst, VR128:$src1, i64mem:$src2),
+                           "cvtsi2ss{q} {$src2, $dst|$dst, $src2}",
+                           []>; // TODO: add intrinsic
+} // isTwoAddress
+
+// f32 -> signed i64
+def Int_CVTSS2SI64rr: RSSI<0x2D, MRMSrcReg, (ops GR64:$dst, VR128:$src),
+                           "cvtss2si{q} {$src, $dst|$dst, $src}",
+                           []>; // TODO: add intrinsic
+def Int_CVTSS2SI64rm: RSSI<0x2D, MRMSrcMem, (ops GR64:$dst, f32mem:$src),
+                           "cvtss2si{q} {$src, $dst|$dst, $src}",
+                           []>; // TODO: add intrinsic
+def CVTTSS2SI64rr: RSSI<0x2C, MRMSrcReg, (ops GR64:$dst, FR32:$src),
+                        "cvttss2si{q} {$src, $dst|$dst, $src}",
+                        [(set GR64:$dst, (fp_to_sint FR32:$src))]>;
+def CVTTSS2SI64rm: RSSI<0x2C, MRMSrcMem, (ops GR64:$dst, f32mem:$src),
+                        "cvttss2si{q} {$src, $dst|$dst, $src}",
+                        [(set GR64:$dst, (fp_to_sint (loadf32 addr:$src)))]>;
+def Int_CVTTSS2SI64rr: RSSI<0x2C, MRMSrcReg, (ops GR64:$dst, VR128:$src),
+                            "cvttss2si{q} {$src, $dst|$dst, $src}",
+                            []>; // TODO: add intrinsic
+def Int_CVTTSS2SI64rm: RSSI<0x2C, MRMSrcMem, (ops GR64:$dst, f32mem:$src),
+                            "cvttss2si{q} {$src, $dst|$dst, $src}",
+                            []>; // TODO: add intrinsic
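+// Source-level forms that exercise these 64-bit conversions (a sketch with
+// illustrative names; the exact instruction chosen also depends on the
+// selected subtarget and calling convention):
+//
+//   #include <cstdint>
+//   double       i64_to_f64(std::int64_t X) { return double(X); }        // cvtsi2sd{q}
+//   float        i64_to_f32(std::int64_t X) { return float(X); }         // cvtsi2ss{q}
+//   std::int64_t f64_to_i64(double D)       { return std::int64_t(D); }  // cvttsd2si{q}
+//   std::int64_t f32_to_i64(float F)        { return std::int64_t(F); }  // cvttss2si{q}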
+
+//===----------------------------------------------------------------------===//
+// Alias Instructions
+//===----------------------------------------------------------------------===//
+
+// Truncate
+// In 64-bit mode, every 64-bit and 32-bit register has a low 8-bit sub-register.
+def TRUNC_64to8  : I<0x88, MRMDestReg, (ops GR8:$dst, GR64:$src),
+                     "mov{b} {${src:subreg8}, $dst|$dst, ${src:subreg8}}",
+                     [(set GR8:$dst, (trunc GR64:$src))]>;
+def TRUNC_32to8  : I<0x88, MRMDestReg, (ops GR8:$dst, GR32:$src),
+                     "mov{b} {${src:subreg8}, $dst|$dst, ${src:subreg8}}",
+                     [(set GR8:$dst, (trunc GR32:$src))]>,
+                   Requires<[In64BitMode]>;
+def TRUNC_16to8  : I<0x88, MRMDestReg, (ops GR8:$dst, GR16:$src),
+                     "mov{b} {${src:subreg8}, $dst|$dst, ${src:subreg8}}",
+                     [(set GR8:$dst, (trunc GR16:$src))]>,
+                   Requires<[In64BitMode]>;
+
+def TRUNC_64to16 : I<0x89, MRMDestReg, (ops GR16:$dst, GR64:$src),
+                     "mov{w} {${src:subreg16}, $dst|$dst, ${src:subreg16}}",
+                     [(set GR16:$dst, (trunc GR64:$src))]>;
+
+def TRUNC_64to32 : I<0x89, MRMDestReg, (ops GR32:$dst, GR64:$src),
+                     "mov{l} {${src:subreg32}, $dst|$dst, ${src:subreg32}}",
+                     [(set GR32:$dst, (trunc GR64:$src))]>;
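+// For illustration only (a sketch; the actual register assignment is up to the
+// allocator): at the source level a truncate is simply a use of the low
+// sub-register, so when source and destination land in the same register the
+// move can be dropped entirely, as the Intel asm printer added later in this
+// patch does for the TRUNC_* opcodes.  In C++ terms:
+//
+//   #include <cstdint>
+//   std::uint32_t trunc64to32(std::uint64_t X) { return std::uint32_t(X); } // e.g. RAX -> EAX
+//   std::uint8_t  trunc32to8 (std::uint32_t X) { return std::uint8_t(X);  } // e.g. EAX -> AL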
+
+// Zero-extension
+// TODO: Remove this after proper i32 -> i64 zext support.
+def PsMOVZX64rr32: I<0x89, MRMDestReg, (ops GR64:$dst, GR32:$src),
+                     "mov{l} {$src, ${dst:subreg32}|${dst:subreg32}, $src}",
+                     [(set GR64:$dst, (zext GR32:$src))]>;
+def PsMOVZX64rm32: I<0x8B, MRMSrcMem, (ops GR64:$dst, i32mem:$src),
+                     "mov{l} {$src, ${dst:subreg32}|${dst:subreg32}, $src}",
+                     [(set GR64:$dst, (zextloadi64i32 addr:$src))]>;
+
+
+// Alias instructions that map movr0 to xor.
+// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
+// FIXME: AddedComplexity gives MOV64r0 a higher priority than MOV64ri32. Remove
+// when we have a better way to specify isel priority.
+let AddedComplexity = 1 in
+def MOV64r0  : RI<0x31, MRMInitReg,  (ops GR64:$dst), 
+                 "xor{q} $dst, $dst",
+                 [(set GR64:$dst, 0)]>;
+
+// Materialize i64 constant where top 32-bits are zero.
+let AddedComplexity = 1 in
+def MOV64ri64i32 : Ii32<0xB8, AddRegFrm, (ops GR64:$dst, i64i32imm:$src),
+                        "mov{l} {$src, ${dst:subreg32}|${dst:subreg32}, $src}",
+                        [(set GR64:$dst, i64immZExt32:$src)]>;
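+// Why plain 32-bit moves are enough for the zero-extension, zero, and 32-bit
+// constant cases above (a sketch, not part of the patterns themselves): in
+// 64-bit mode every 32-bit operation implicitly clears bits 63..32 of its
+// destination, so a 32-bit register move, "xor reg, reg" and a 32-bit
+// immediate mov all leave a correctly zero-extended 64-bit value.  With a
+// typical x86-64 compiler:
+//
+//   #include <cstdint>
+//   std::uint64_t zext32(std::uint32_t X) { return X; }           // one movl, no movzx needed
+//   std::uint64_t zero()                  { return 0; }           // xorl %eax, %eax
+//   std::uint64_t small()                 { return 0x12345678; }  // movl $0x12345678, %eax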
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable
+def : Pat<(i64 (X86Wrapper tconstpool  :$dst)),
+          (MOV64ri tconstpool  :$dst)>, Requires<[NotSmallCode]>;
+def : Pat<(i64 (X86Wrapper tjumptable  :$dst)),
+          (MOV64ri tjumptable  :$dst)>, Requires<[NotSmallCode]>;
+def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
+          (MOV64ri tglobaladdr :$dst)>, Requires<[NotSmallCode]>;
+def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
+          (MOV64ri texternalsym:$dst)>, Requires<[NotSmallCode]>;
+
+def : Pat<(store (i64 (X86Wrapper tconstpool:$src)), addr:$dst),
+          (MOV64mi32 addr:$dst, tconstpool:$src)>,
+          Requires<[SmallCode, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper tjumptable:$src)), addr:$dst),
+          (MOV64mi32 addr:$dst, tjumptable:$src)>,
+          Requires<[SmallCode, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper tglobaladdr:$src)), addr:$dst),
+          (MOV64mi32 addr:$dst, tglobaladdr:$src)>,
+          Requires<[SmallCode, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper texternalsym:$src)), addr:$dst),
+          (MOV64mi32 addr:$dst, texternalsym:$src)>,
+          Requires<[SmallCode, IsStatic]>;
+
+// Calls
+// Direct PC-relative function call for the small code model. The 32-bit
+// displacement is sign-extended to 64 bits.
+def : Pat<(X86call (i64 tglobaladdr:$dst)),
+          (CALL64pcrel32 tglobaladdr:$dst)>;
+def : Pat<(X86call (i64 texternalsym:$dst)),
+          (CALL64pcrel32 texternalsym:$dst)>;
+
+def : Pat<(X86tailcall (i64 tglobaladdr:$dst)),
+          (CALL64pcrel32 tglobaladdr:$dst)>;
+def : Pat<(X86tailcall (i64 texternalsym:$dst)),
+          (CALL64pcrel32 texternalsym:$dst)>;
+
+def : Pat<(X86tailcall GR64:$dst),
+          (CALL64r GR64:$dst)>;
+
+// {s|z}extload bool -> {s|z}extload byte
+def : Pat<(sextloadi64i1 addr:$src), (MOVSX64rm8 addr:$src)>;
+def : Pat<(zextloadi64i1 addr:$src), (MOVZX64rm8 addr:$src)>;
+
+// extload
+def : Pat<(extloadi64i1 addr:$src),  (MOVZX64rm8  addr:$src)>;
+def : Pat<(extloadi64i8 addr:$src),  (MOVZX64rm8  addr:$src)>;
+def : Pat<(extloadi64i16 addr:$src), (MOVZX64rm16 addr:$src)>;
+def : Pat<(extloadi64i32 addr:$src), (PsMOVZX64rm32 addr:$src)>;
+
+// anyext -> zext
+def : Pat<(i64 (anyext GR8 :$src)), (MOVZX64rr8  GR8 :$src)>;
+def : Pat<(i64 (anyext GR16:$src)), (MOVZX64rr16 GR16:$src)>;
+def : Pat<(i64 (anyext GR32:$src)), (PsMOVZX64rr32 GR32:$src)>;
+def : Pat<(i64 (anyext (loadi8  addr:$src))), (MOVZX64rm8  addr:$src)>;
+def : Pat<(i64 (anyext (loadi16 addr:$src))), (MOVZX64rm16 addr:$src)>;
+def : Pat<(i64 (anyext (loadi32 addr:$src))), (PsMOVZX64rm32 addr:$src)>;
+
+//===----------------------------------------------------------------------===//
+// Some peepholes
+//===----------------------------------------------------------------------===//
+
+// (shl x, 1) ==> (add x, x)
+def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>;
+
+// (x >> c) | (y << (64 - c)) ==> (shrd64 x, y, c)
+def : Pat<(or (srl GR64:$src1, CL:$amt),
+              (shl GR64:$src2, (sub 64, CL:$amt))),
+          (SHRD64rrCL GR64:$src1, GR64:$src2)>;
+
+def : Pat<(store (or (srl (loadi64 addr:$dst), CL:$amt),
+                     (shl GR64:$src2, (sub 64, CL:$amt))), addr:$dst),
+          (SHRD64mrCL addr:$dst, GR64:$src2)>;
+
+// (x << c) | (y >> (64 - c)) ==> (shld64 x, y, c)
+def : Pat<(or (shl GR64:$src1, CL:$amt),
+              (srl GR64:$src2, (sub 64, CL:$amt))),
+          (SHLD64rrCL GR64:$src1, GR64:$src2)>;
+
+def : Pat<(store (or (shl (loadi64 addr:$dst), CL:$amt),
+                     (srl GR64:$src2, (sub 64, CL:$amt))), addr:$dst),
+          (SHLD64mrCL addr:$dst, GR64:$src2)>;
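+// A source-level shape that can feed these peepholes (a sketch with
+// illustrative names; whether shrd/shld is actually selected depends on what
+// earlier passes produce, and the C++ version assumes 0 < C < 64 so neither
+// shift is out of range):
+//
+//   #include <cstdint>
+//   std::uint64_t funnel_r(std::uint64_t X, std::uint64_t Y, unsigned C) {
+//     return (X >> C) | (Y << (64 - C));   // candidate for SHRD64rrCL
+//   }
+//   std::uint64_t funnel_l(std::uint64_t X, std::uint64_t Y, unsigned C) {
+//     return (X << C) | (Y >> (64 - C));   // candidate for SHLD64rrCL
+//   }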
+
+// X86-specific add which produces a flag.
+def : Pat<(addc GR64:$src1, GR64:$src2),
+          (ADD64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(addc GR64:$src1, (load addr:$src2)),
+          (ADD64rm GR64:$src1, addr:$src2)>;
+def : Pat<(addc GR64:$src1, i64immSExt32:$src2),
+          (ADD64ri32 GR64:$src1, imm:$src2)>;
+def : Pat<(addc GR64:$src1, i64immSExt8:$src2),
+          (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>;
+
+def : Pat<(subc GR64:$src1, GR64:$src2),
+          (SUB64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(subc GR64:$src1, (load addr:$src2)),
+          (SUB64rm GR64:$src1, addr:$src2)>;
+def : Pat<(subc GR64:$src1, i64immSExt32:$src2),
+          (SUB64ri32 GR64:$src1, imm:$src2)>;
+def : Pat<(subc GR64:$src1, i64immSExt8:$src2),
+          (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>;
+
+
+//===----------------------------------------------------------------------===//
+// X86-64 SSE Instructions
+//===----------------------------------------------------------------------===//
+
+// Move instructions...
+
+def MOV64toPQIrr : RPDI<0x6E, MRMSrcReg, (ops VR128:$dst, GR64:$src),
+                        "mov{d|q} {$src, $dst|$dst, $src}",
+                        [(set VR128:$dst,
+                          (v2i64 (scalar_to_vector GR64:$src)))]>;
+def MOV64toPQIrm : RPDI<0x6E, MRMSrcMem, (ops VR128:$dst, i64mem:$src),
+                        "mov{d|q} {$src, $dst|$dst, $src}",
+                        [(set VR128:$dst,
+                          (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>;
+
+def MOVPQIto64rr  : RPDI<0x7E, MRMDestReg, (ops GR64:$dst, VR128:$src),
+                         "mov{d|q} {$src, $dst|$dst, $src}",
+                         [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
+                                           (iPTR 0)))]>;
+def MOVPQIto64mr  : RPDI<0x7E, MRMDestMem, (ops i64mem:$dst, VR128:$src),
+                         "mov{d|q} {$src, $dst|$dst, $src}",
+                         [(store (i64 (vector_extract (v2i64 VR128:$src),
+                                       (iPTR 0))), addr:$dst)]>;
+
+def MOV64toSDrr : RPDI<0x6E, MRMSrcReg, (ops FR64:$dst, GR64:$src),
+                       "mov{d|q} {$src, $dst|$dst, $src}",
+                       [(set FR64:$dst, (bitconvert GR64:$src))]>;
+def MOV64toSDrm : RPDI<0x6E, MRMSrcMem, (ops FR64:$dst, i64mem:$src),
+                       "mov{d|q} {$src, $dst|$dst, $src}",
+                       [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>;
+
+def MOVSDto64rr  : RPDI<0x7E, MRMDestReg, (ops GR64:$dst, FR64:$src),
+                        "mov{d|q} {$src, $dst|$dst, $src}",
+                        [(set GR64:$dst, (bitconvert FR64:$src))]>;
+def MOVSDto64mr  : RPDI<0x7E, MRMDestMem, (ops i64mem:$dst, FR64:$src),
+                        "mov{d|q} {$src, $dst|$dst, $src}",
+                        [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>;
diff --git a/lib/Target/X86/X86IntelAsmPrinter.cpp b/lib/Target/X86/X86IntelAsmPrinter.cpp
new file mode 100755
index 0000000..39b65ee
--- /dev/null
+++ b/lib/Target/X86/X86IntelAsmPrinter.cpp
@@ -0,0 +1,533 @@
+//===-- X86IntelAsmPrinter.cpp - Convert X86 LLVM code to Intel assembly --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to Intel format assembly language.
+// This printer is the output mechanism used by `llc'.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "X86IntelAsmPrinter.h"
+#include "X86TargetAsmInfo.h"
+#include "X86.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/Module.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/Support/Mangler.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(EmittedInsts, "Number of machine instrs printed");
+
+std::string X86IntelAsmPrinter::getSectionForFunction(const Function &F) const {
+  // Intel asm always emits functions to _text.
+  return "_text";
+}
+
+/// runOnMachineFunction - This uses the printMachineInstruction()
+/// method to print assembly for each instruction.
+///
+bool X86IntelAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  SetupMachineFunction(MF);
+  O << "\n\n";
+
+  // Print out constants referenced by the function
+  EmitConstantPool(MF.getConstantPool());
+
+  // Print out labels for the function.
+  const Function *F = MF.getFunction();
+  unsigned CC = F->getCallingConv();
+
+  // Populate the function information map.  Actually, we only want to populate
+  // stdcall and fastcall functions' information right now.
+  if (CC == CallingConv::X86_StdCall || CC == CallingConv::X86_FastCall)
+    FunctionInfoMap[F] = *MF.getInfo<X86MachineFunctionInfo>();
+
+  X86SharedAsmPrinter::decorateName(CurrentFnName, F);
+
+  SwitchToTextSection(getSectionForFunction(*F).c_str(), F);
+
+  switch (F->getLinkage()) {
+  default: assert(0 && "Unsupported linkage type!");
+  case Function::InternalLinkage:
+    EmitAlignment(4);
+    break;    
+  case Function::DLLExportLinkage:
+    DLLExportedFns.insert(CurrentFnName);
+    // FALL THROUGH
+  case Function::ExternalLinkage:
+    O << "\tpublic " << CurrentFnName << "\n";
+    EmitAlignment(4);
+    break;    
+  }
+  
+  O << CurrentFnName << "\tproc near\n";
+  
+  // Print out code for the function.
+  for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+       I != E; ++I) {
+    // Print a label for the basic block if there are any predecessors.
+    if (I->pred_begin() != I->pred_end()) {
+      printBasicBlockLabel(I, true);
+      O << '\n';
+    }
+    for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
+         II != E; ++II) {
+      // Print the assembly for the instruction.
+      O << "\t";
+      printMachineInstruction(II);
+    }
+  }
+
+  // Print out jump tables referenced by the function.
+  EmitJumpTableInfo(MF.getJumpTableInfo(), MF);
+
+  O << CurrentFnName << "\tendp\n";
+
+  // We didn't modify anything.
+  return false;
+}
+
+void X86IntelAsmPrinter::printSSECC(const MachineInstr *MI, unsigned Op) {
+  unsigned char value = MI->getOperand(Op).getImmedValue();
+  assert(value <= 7 && "Invalid ssecc argument!");
+  switch (value) {
+  case 0: O << "eq"; break;
+  case 1: O << "lt"; break;
+  case 2: O << "le"; break;
+  case 3: O << "unord"; break;
+  case 4: O << "neq"; break;
+  case 5: O << "nlt"; break;
+  case 6: O << "nle"; break;
+  case 7: O << "ord"; break;
+  }
+}
+
+void X86IntelAsmPrinter::printOp(const MachineOperand &MO, 
+                                 const char *Modifier) {
+  const MRegisterInfo &RI = *TM.getRegisterInfo();
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register: {      
+    if (MRegisterInfo::isPhysicalRegister(MO.getReg())) {
+      unsigned Reg = MO.getReg();
+      if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) {
+        MVT::ValueType VT = (strcmp(Modifier,"subreg64") == 0) ?
+          MVT::i64 : ((strcmp(Modifier, "subreg32") == 0) ? MVT::i32 :
+                      ((strcmp(Modifier,"subreg16") == 0) ? MVT::i16 :MVT::i8));
+        Reg = getX86SubSuperRegister(Reg, VT);
+      }
+      O << RI.get(Reg).Name;
+    } else
+      O << "reg" << MO.getReg();
+    return;
+  }
+  case MachineOperand::MO_Immediate:
+    O << MO.getImmedValue();
+    return;
+  case MachineOperand::MO_MachineBasicBlock:
+    printBasicBlockLabel(MO.getMachineBasicBlock());
+    return;
+  case MachineOperand::MO_JumpTableIndex: {
+    bool isMemOp  = Modifier && !strcmp(Modifier, "mem");
+    if (!isMemOp) O << "OFFSET ";
+    O << TAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+      << "_" << MO.getJumpTableIndex();
+    return;
+  }    
+  case MachineOperand::MO_ConstantPoolIndex: {
+    bool isMemOp  = Modifier && !strcmp(Modifier, "mem");
+    if (!isMemOp) O << "OFFSET ";
+    O << "[" << TAI->getPrivateGlobalPrefix() << "CPI"
+      << getFunctionNumber() << "_" << MO.getConstantPoolIndex();
+    int Offset = MO.getOffset();
+    if (Offset > 0)
+      O << " + " << Offset;
+    else if (Offset < 0)
+      O << Offset;
+    O << "]";
+    return;
+  }
+  case MachineOperand::MO_GlobalAddress: {
+    bool isCallOp = Modifier && !strcmp(Modifier, "call");
+    bool isMemOp  = Modifier && !strcmp(Modifier, "mem");
+    GlobalValue *GV = MO.getGlobal();    
+    std::string Name = Mang->getValueName(GV);
+
+    X86SharedAsmPrinter::decorateName(Name, GV);
+
+    if (!isMemOp && !isCallOp) O << "OFFSET ";
+    if (GV->hasDLLImportLinkage()) {
+      // FIXME: This should be fixed with full support of stdcall & fastcall
+      // CC's
+      O << "__imp_";          
+    } 
+    O << Name;
+    int Offset = MO.getOffset();
+    if (Offset > 0)
+      O << " + " << Offset;
+    else if (Offset < 0)
+      O << Offset;
+    return;
+  }
+  case MachineOperand::MO_ExternalSymbol: {
+    bool isCallOp = Modifier && !strcmp(Modifier, "call");
+    if (!isCallOp) O << "OFFSET ";
+    O << TAI->getGlobalPrefix() << MO.getSymbolName();
+    return;
+  }
+  default:
+    O << "<unknown operand type>"; return;
+  }
+}
+
+void X86IntelAsmPrinter::printMemReference(const MachineInstr *MI, unsigned Op,
+                                           const char *Modifier) {
+  assert(isMem(MI, Op) && "Invalid memory reference!");
+
+  const MachineOperand &BaseReg  = MI->getOperand(Op);
+  int ScaleVal                   = MI->getOperand(Op+1).getImmedValue();
+  const MachineOperand &IndexReg = MI->getOperand(Op+2);
+  const MachineOperand &DispSpec = MI->getOperand(Op+3);
+
+  O << "[";
+  bool NeedPlus = false;
+  if (BaseReg.getReg()) {
+    printOp(BaseReg, Modifier);
+    NeedPlus = true;
+  }
+
+  if (IndexReg.getReg()) {
+    if (NeedPlus) O << " + ";
+    if (ScaleVal != 1)
+      O << ScaleVal << "*";
+    printOp(IndexReg, Modifier);
+    NeedPlus = true;
+  }
+
+  if (DispSpec.isGlobalAddress() || DispSpec.isConstantPoolIndex() ||
+      DispSpec.isJumpTableIndex()) {
+    if (NeedPlus)
+      O << " + ";
+    printOp(DispSpec, "mem");
+  } else {
+    int DispVal = DispSpec.getImmedValue();
+    if (DispVal || (!BaseReg.getReg() && !IndexReg.getReg())) {
+      if (NeedPlus)
+        if (DispVal > 0)
+          O << " + ";
+        else {
+          O << " - ";
+          DispVal = -DispVal;
+        }
+      O << DispVal;
+    }
+  }
+  O << "]";
+}
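+// For reference, a couple of operand combinations and the bracketed Intel form
+// the code above produces for them (illustrative register and offset values):
+//
+//   base = EAX, scale = 8, index = ECX, disp = -4   ->   [EAX + 8*ECX - 4]
+//   base = EBP, no index,               disp = 12   ->   [EBP + 12]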
+
+void X86IntelAsmPrinter::printPICLabel(const MachineInstr *MI, unsigned Op) {
+  O << "\"L" << getFunctionNumber() << "$pb\"\n";
+  O << "\"L" << getFunctionNumber() << "$pb\":";
+}
+
+bool X86IntelAsmPrinter::printAsmMRegister(const MachineOperand &MO,
+                                           const char Mode) {
+  const MRegisterInfo &RI = *TM.getRegisterInfo();
+  unsigned Reg = MO.getReg();
+  switch (Mode) {
+  default: return true;  // Unknown mode.
+  case 'b': // Print QImode register
+    Reg = getX86SubSuperRegister(Reg, MVT::i8);
+    break;
+  case 'h': // Print QImode high register
+    Reg = getX86SubSuperRegister(Reg, MVT::i8, true);
+    break;
+  case 'w': // Print HImode register
+    Reg = getX86SubSuperRegister(Reg, MVT::i16);
+    break;
+  case 'k': // Print SImode register
+    Reg = getX86SubSuperRegister(Reg, MVT::i32);
+    break;
+  }
+
+  O << '%' << RI.get(Reg).Name;
+  return false;
+}
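+// These are the same single-letter operand modifiers used by GCC-style inline
+// asm, so (illustration only, assuming an x86 host, GCC/Clang inline-asm
+// syntax, and a hypothetical helper name) "%b1" below asks for the 8-bit
+// sub-register of operand 1 and "%k0" for the 32-bit sub-register of operand 0:
+//
+//   static inline unsigned low_byte(unsigned X) {
+//     unsigned R;
+//     __asm__("movzbl %b1, %k0" : "=r"(R) : "q"(X));  // e.g. movzbl %cl, %eax
+//     return R;
+//   }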
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool X86IntelAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                         unsigned AsmVariant, 
+                                         const char *ExtraCode) {
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+    
+    switch (ExtraCode[0]) {
+    default: return true;  // Unknown modifier.
+    case 'b': // Print QImode register
+    case 'h': // Print QImode high register
+    case 'w': // Print HImode register
+    case 'k': // Print SImode register
+      return printAsmMRegister(MI->getOperand(OpNo), ExtraCode[0]);
+    }
+  }
+  
+  printOperand(MI, OpNo);
+  return false;
+}
+
+bool X86IntelAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                               unsigned OpNo,
+                                               unsigned AsmVariant, 
+                                               const char *ExtraCode) {
+  if (ExtraCode && ExtraCode[0])
+    return true; // Unknown modifier.
+  printMemReference(MI, OpNo);
+  return false;
+}
+
+/// printMachineInstruction - Print out a single X86 LLVM instruction MI in
+/// Intel syntax to the current output stream.
+///
+void X86IntelAsmPrinter::printMachineInstruction(const MachineInstr *MI) {
+  ++EmittedInsts;
+
+  // See if a truncate instruction can be turned into a nop.
+  switch (MI->getOpcode()) {
+  default: break;
+  case X86::TRUNC_64to32:
+  case X86::TRUNC_64to16:
+  case X86::TRUNC_32to16:
+  case X86::TRUNC_32to8:
+  case X86::TRUNC_16to8:
+  case X86::TRUNC_32_to8:
+  case X86::TRUNC_16_to8: {
+    const MachineOperand &MO0 = MI->getOperand(0);
+    const MachineOperand &MO1 = MI->getOperand(1);
+    unsigned Reg0 = MO0.getReg();
+    unsigned Reg1 = MO1.getReg();
+    unsigned Opc = MI->getOpcode();
+    if (Opc == X86::TRUNC_64to32)
+      Reg1 = getX86SubSuperRegister(Reg1, MVT::i32);
+    else if (Opc == X86::TRUNC_32to16 || Opc == X86::TRUNC_64to16)
+      Reg1 = getX86SubSuperRegister(Reg1, MVT::i16);
+    else
+      Reg1 = getX86SubSuperRegister(Reg1, MVT::i8);
+    O << TAI->getCommentString() << " TRUNCATE ";
+    if (Reg0 != Reg1)
+      O << "\n\t";
+    break;
+  }
+  case X86::PsMOVZX64rr32:
+    O << TAI->getCommentString() << " ZERO-EXTEND " << "\n\t";
+    break;
+  }
+
+  // Call the autogenerated instruction printer routines.
+  printInstruction(MI);
+}
+
+bool X86IntelAsmPrinter::doInitialization(Module &M) {
+  X86SharedAsmPrinter::doInitialization(M);
+  
+  Mang->markCharUnacceptable('.');
+
+  O << "\t.686\n\t.model flat\n\n";
+
+  // Emit declarations for external functions.
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+    if (I->isDeclaration()) {
+      std::string Name = Mang->getValueName(I);
+      X86SharedAsmPrinter::decorateName(Name, I);
+
+      O << "\textern " ;
+      if (I->hasDLLImportLinkage()) {
+        O << "__imp_";
+      }      
+      O << Name << ":near\n";
+    }
+  
+  // Emit declarations for external globals.  Note that VC++ always declares
+  // external globals to have type byte, and if that's good enough for VC++...
+  for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+       I != E; ++I) {
+    if (I->isDeclaration()) {
+      std::string Name = Mang->getValueName(I);
+
+      O << "\textern " ;
+      if (I->hasDLLImportLinkage()) {
+        O << "__imp_";
+      }      
+      O << Name << ":byte\n";
+    }
+  }
+
+  return false;
+}
+
+bool X86IntelAsmPrinter::doFinalization(Module &M) {
+  const TargetData *TD = TM.getTargetData();
+
+  // Print out module-level global variables here.
+  for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+       I != E; ++I) {
+    if (I->isDeclaration()) continue;   // External globals require no code.
+    
+    // Check to see if this is a special global used by LLVM, if so, emit it.
+    if (EmitSpecialLLVMGlobal(I))
+      continue;
+    
+    std::string name = Mang->getValueName(I);
+    Constant *C = I->getInitializer();
+    unsigned Align = TD->getPreferredAlignmentLog(I);
+    bool bCustomSegment = false;
+
+    switch (I->getLinkage()) {
+    case GlobalValue::LinkOnceLinkage:
+    case GlobalValue::WeakLinkage:
+      SwitchToDataSection("");
+      O << name << "?\tsegment common 'COMMON'\n";
+      bCustomSegment = true;
+      // FIXME: the default alignment is 16 bytes, but 1, 2, 4, and 256
+      // are also available.
+      break;
+    case GlobalValue::AppendingLinkage:
+      SwitchToDataSection("");
+      O << name << "?\tsegment public 'DATA'\n";
+      bCustomSegment = true;
+      // FIXME: the default alignment is 16 bytes, but 1, 2, 4, and 256
+      // are also available.
+      break;
+    case GlobalValue::DLLExportLinkage:
+      DLLExportedGVs.insert(name);
+      // FALL THROUGH
+    case GlobalValue::ExternalLinkage:
+      O << "\tpublic " << name << "\n";
+      // FALL THROUGH
+    case GlobalValue::InternalLinkage:
+      SwitchToDataSection(TAI->getDataSection(), I);
+      break;
+    default:
+      assert(0 && "Unknown linkage type!");
+    }
+
+    if (!bCustomSegment)
+      EmitAlignment(Align, I);
+
+    O << name << ":\t\t\t\t" << TAI->getCommentString()
+      << " " << I->getName() << '\n';
+
+    EmitGlobalConstant(C);
+
+    if (bCustomSegment)
+      O << name << "?\tends\n";
+  }
+
+  // Output linker support code for dllexported globals.
+  if ((DLLExportedGVs.begin() != DLLExportedGVs.end()) ||
+      (DLLExportedFns.begin() != DLLExportedFns.end())) {
+    SwitchToDataSection("");
+    O << "; WARNING: The following code is valid only with MASM v8.x and (possibly) higher\n"
+      << "; This version of MASM is usually shipped with Microsoft Visual Studio 2005\n"
+      << "; or (possibly) later versions. Unfortunately, there is no way to support\n"
+      << "; dllexported symbols in earlier versions of MASM in a fully automatic way\n\n";
+    O << "_drectve\t segment info alias('.drectve')\n";
+  }
+
+  for (std::set<std::string>::iterator i = DLLExportedGVs.begin(),
+         e = DLLExportedGVs.end();
+         i != e; ++i) {
+    O << "\t db ' /EXPORT:" << *i << ",data'\n";
+  }    
+
+  for (std::set<std::string>::iterator i = DLLExportedFns.begin(),
+         e = DLLExportedFns.end();
+         i != e; ++i) {
+    O << "\t db ' /EXPORT:" << *i << "'\n";
+  }    
+
+  if ((DLLExportedGVs.begin() != DLLExportedGVs.end()) ||
+      (DLLExportedFns.begin() != DLLExportedFns.end())) {
+    O << "_drectve\t ends\n";    
+  }
+  
+  // Bypass X86SharedAsmPrinter::doFinalization().
+  AsmPrinter::doFinalization(M);
+  SwitchToDataSection("");
+  O << "\tend\n";
+  return false; // success
+}
+
+void X86IntelAsmPrinter::EmitString(const ConstantArray *CVA) const {
+  unsigned NumElts = CVA->getNumOperands();
+  if (NumElts) {
+    // ML does not have escape sequences except '' for '.  It also has a maximum
+    // string length of 255.
+    unsigned len = 0;
+    bool inString = false;
+    for (unsigned i = 0; i < NumElts; i++) {
+      int n = cast<ConstantInt>(CVA->getOperand(i))->getZExtValue() & 255;
+      if (len == 0)
+        O << "\tdb ";
+
+      if (n >= 32 && n <= 127) {
+        if (!inString) {
+          if (len > 0) {
+            O << ",'";
+            len += 2;
+          } else {
+            O << "'";
+            len++;
+          }
+          inString = true;
+        }
+        if (n == '\'') {
+          O << "'";
+          len++;
+        }
+        O << char(n);
+      } else {
+        if (inString) {
+          O << "'";
+          len++;
+          inString = false;
+        }
+        if (len > 0) {
+          O << ",";
+          len++;
+        }
+        O << n;
+        len += 1 + (n > 9) + (n > 99);
+      }
+
+      if (len > 60) {
+        if (inString) {
+          O << "'";
+          inString = false;
+        }
+        O << "\n";
+        len = 0;
+      }
+    }
+
+    if (len > 0) {
+      if (inString)
+        O << "'";
+      O << "\n";
+    }
+  }
+}
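+// A standalone sketch of the quoting rules above (illustrative only; it skips
+// the 60-column wrapping that EmitString also performs).  Printable bytes are
+// grouped inside '...', a quote is doubled, and everything else is emitted as
+// a decimal byte value:
+//
+//   #include <cstdio>
+//   #include <string>
+//   int main() {
+//     std::string Out = "\tdb ";
+//     bool InStr = false;
+//     for (unsigned char C : std::string("it's\n")) {
+//       if (C >= 32 && C <= 127) {
+//         if (!InStr) { if (Out.back() != ' ') Out += ','; Out += '\''; InStr = true; }
+//         if (C == '\'') Out += '\'';        // double the quote, as EmitString does
+//         Out += C;
+//       } else {
+//         if (InStr) { Out += '\''; InStr = false; }
+//         if (Out.back() != ' ') Out += ',';
+//         Out += std::to_string(C);
+//       }
+//     }
+//     if (InStr) Out += '\'';
+//     std::puts(Out.c_str());                // prints: db 'it''s',10
+//   }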
+
+// Include the auto-generated portion of the assembly writer.
+#include "X86GenAsmWriter1.inc"
diff --git a/lib/Target/X86/X86IntelAsmPrinter.h b/lib/Target/X86/X86IntelAsmPrinter.h
new file mode 100755
index 0000000..9ad11ff
--- /dev/null
+++ b/lib/Target/X86/X86IntelAsmPrinter.h
@@ -0,0 +1,112 @@
+//===-- X86IntelAsmPrinter.h - Convert X86 LLVM code to Intel assembly ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Intel assembly code printer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86INTELASMPRINTER_H
+#define X86INTELASMPRINTER_H
+
+#include "X86AsmPrinter.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Target/MRegisterInfo.h"
+
+namespace llvm {
+
+struct X86IntelAsmPrinter : public X86SharedAsmPrinter {
+  X86IntelAsmPrinter(std::ostream &O, X86TargetMachine &TM,
+                     const TargetAsmInfo *T)
+      : X86SharedAsmPrinter(O, TM, T) {
+  }
+
+  virtual const char *getPassName() const {
+    return "X86 Intel-Style Assembly Printer";
+  }
+
+  /// printInstruction - This method is automatically generated by tablegen
+  /// from the instruction set description.  This method returns true if the
+  /// machine instruction was sufficiently described to print it, otherwise it
+  /// returns false.
+  bool printInstruction(const MachineInstr *MI);
+
+  // This method is used by the tablegen'erated instruction printer.
+  void printOperand(const MachineInstr *MI, unsigned OpNo,
+                    const char *Modifier = 0) {
+    const MachineOperand &MO = MI->getOperand(OpNo);
+    if (MO.isRegister()) {
+      assert(MRegisterInfo::isPhysicalRegister(MO.getReg()) && "Not physreg??");
+      O << TM.getRegisterInfo()->get(MO.getReg()).Name;
+    } else {
+      printOp(MO, Modifier);
+    }
+  }
+
+  void printi8mem(const MachineInstr *MI, unsigned OpNo) {
+    O << "BYTE PTR ";
+    printMemReference(MI, OpNo);
+  }
+  void printi16mem(const MachineInstr *MI, unsigned OpNo) {
+    O << "WORD PTR ";
+    printMemReference(MI, OpNo);
+  }
+  void printi32mem(const MachineInstr *MI, unsigned OpNo) {
+    O << "DWORD PTR ";
+    printMemReference(MI, OpNo);
+  }
+  void printi64mem(const MachineInstr *MI, unsigned OpNo) {
+    O << "QWORD PTR ";
+    printMemReference(MI, OpNo);
+  }
+  void printi128mem(const MachineInstr *MI, unsigned OpNo) {
+    O << "XMMWORD PTR ";
+    printMemReference(MI, OpNo);
+  }
+  void printf32mem(const MachineInstr *MI, unsigned OpNo) {
+    O << "DWORD PTR ";
+    printMemReference(MI, OpNo);
+  }
+  void printf64mem(const MachineInstr *MI, unsigned OpNo) {
+    O << "QWORD PTR ";
+    printMemReference(MI, OpNo);
+  }
+  void printf128mem(const MachineInstr *MI, unsigned OpNo) {
+    O << "XMMWORD PTR ";
+    printMemReference(MI, OpNo);
+  }
+  void printlea64_32mem(const MachineInstr *MI, unsigned OpNo) {
+    O << "QWORD PTR ";
+    printMemReference(MI, OpNo, "subreg64");
+  }
+
+  bool printAsmMRegister(const MachineOperand &MO, const char Mode);
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                       unsigned AsmVariant, const char *ExtraCode);
+  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                             unsigned AsmVariant, const char *ExtraCode);
+  void printMachineInstruction(const MachineInstr *MI);
+  void printOp(const MachineOperand &MO, const char *Modifier = 0);
+  void printSSECC(const MachineInstr *MI, unsigned Op);
+  void printMemReference(const MachineInstr *MI, unsigned Op,
+                         const char *Modifier=NULL);
+  void printPICLabel(const MachineInstr *MI, unsigned Op);
+  bool runOnMachineFunction(MachineFunction &F);
+  bool doInitialization(Module &M);
+  bool doFinalization(Module &M);
+  
+  /// getSectionForFunction - Return the section that we should emit the
+  /// specified function body into.
+  virtual std::string getSectionForFunction(const Function &F) const;
+
+  virtual void EmitString(const ConstantArray *CVA) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp
new file mode 100644
index 0000000..b9e5d5b
--- /dev/null
+++ b/lib/Target/X86/X86JITInfo.cpp
@@ -0,0 +1,372 @@
+//===-- X86JITInfo.cpp - Implement the JIT interfaces for the X86 target --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the JIT interfaces for the X86 target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "X86JITInfo.h"
+#include "X86Relocations.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineCodeEmitter.h"
+#include "llvm/Config/alloca.h"
+#include <cstdlib>
+using namespace llvm;
+
+#ifdef _MSC_VER
+  extern "C" void *_AddressOfReturnAddress(void);
+  #pragma intrinsic(_AddressOfReturnAddress)
+#endif
+
+void X86JITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
+  unsigned char *OldByte = (unsigned char *)Old;
+  *OldByte++ = 0xE9;                // Emit JMP opcode.
+  unsigned *OldWord = (unsigned *)OldByte;
+  unsigned NewAddr = (intptr_t)New;
+  unsigned OldAddr = (intptr_t)OldWord;
+  *OldWord = NewAddr - OldAddr - 4; // Emit PC-relative addr of New code.
+}
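+// Quick sanity check of the displacement arithmetic above (a standalone sketch
+// with made-up addresses): the patch site is a 5-byte "jmp rel32", which
+// transfers control to Old + 5 + rel32, so storing New - (Old + 1) - 4 lands
+// exactly on New.
+//
+//   #include <cassert>
+//   #include <cstdint>
+//   int main() {
+//     std::intptr_t Old = 0x1000, New = 0x2340;   // hypothetical addresses
+//     std::intptr_t Rel32 = New - (Old + 1) - 4;  // what the code above stores
+//     assert(Old + 5 + Rel32 == New);             // jmp target is exactly New
+//   }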
+
+
+/// JITCompilerFunction - This contains the address of the JIT function used to
+/// compile a function lazily.
+static TargetJITInfo::JITCompilerFn JITCompilerFunction;
+
+// Get the ASMPREFIX for the current host.  This is often '_'.
+#ifndef __USER_LABEL_PREFIX__
+#define __USER_LABEL_PREFIX__
+#endif
+#define GETASMPREFIX2(X) #X
+#define GETASMPREFIX(X) GETASMPREFIX2(X)
+#define ASMPREFIX GETASMPREFIX(__USER_LABEL_PREFIX__)
+
+// Provide a wrapper for X86CompilationCallback2 that saves non-traditional
+// callee saved registers, for the fastcc calling convention.
+extern "C" {
+#if defined(__x86_64__)
+  // No need to save EAX/EDX for X86-64.
+  void X86CompilationCallback(void);
+  asm(
+    ".text\n"
+    ".align 8\n"
+    ".globl " ASMPREFIX "X86CompilationCallback\n"
+  ASMPREFIX "X86CompilationCallback:\n"
+    // Save RBP
+    "pushq   %rbp\n"
+    // Save RSP
+    "movq    %rsp, %rbp\n"
+    // Save all int arg registers
+    "pushq   %rdi\n"
+    "pushq   %rsi\n"
+    "pushq   %rdx\n"
+    "pushq   %rcx\n"
+    "pushq   %r8\n"
+    "pushq   %r9\n"
+    // Align the stack on a 16-byte boundary. RSP might not be properly
+    // aligned (only 8-byte aligned) if this is called from an indirect stub.
+    "andq    $-16, %rsp\n"
+    // Save all XMM arg registers
+    "subq    $128, %rsp\n"
+    "movaps  %xmm0, (%rsp)\n"
+    "movaps  %xmm1, 16(%rsp)\n"
+    "movaps  %xmm2, 32(%rsp)\n"
+    "movaps  %xmm3, 48(%rsp)\n"
+    "movaps  %xmm4, 64(%rsp)\n"
+    "movaps  %xmm5, 80(%rsp)\n"
+    "movaps  %xmm6, 96(%rsp)\n"
+    "movaps  %xmm7, 112(%rsp)\n"
+    // JIT callee
+    "movq    %rbp, %rdi\n"    // Pass prev frame and return address
+    "movq    8(%rbp), %rsi\n"
+    "call    " ASMPREFIX "X86CompilationCallback2\n"
+    // Restore all XMM arg registers
+    "movaps  112(%rsp), %xmm7\n"
+    "movaps  96(%rsp), %xmm6\n"
+    "movaps  80(%rsp), %xmm5\n"
+    "movaps  64(%rsp), %xmm4\n"
+    "movaps  48(%rsp), %xmm3\n"
+    "movaps  32(%rsp), %xmm2\n"
+    "movaps  16(%rsp), %xmm1\n"
+    "movaps  (%rsp), %xmm0\n"
+    // Restore RSP
+    "movq    %rbp, %rsp\n"
+    // Restore all int arg registers
+    "subq    $48, %rsp\n"
+    "popq    %r9\n"
+    "popq    %r8\n"
+    "popq    %rcx\n"
+    "popq    %rdx\n"
+    "popq    %rsi\n"
+    "popq    %rdi\n"
+    // Restore RBP
+    "popq    %rbp\n"
+    "ret\n");
+#elif defined(__i386__) || defined(i386) || defined(_M_IX86)
+#ifndef _MSC_VER
+  void X86CompilationCallback(void);
+  asm(
+    ".text\n"
+    ".align 8\n"
+    ".globl " ASMPREFIX  "X86CompilationCallback\n"
+  ASMPREFIX "X86CompilationCallback:\n"
+    "pushl   %ebp\n"
+    "movl    %esp, %ebp\n"    // Standard prologue
+    "pushl   %eax\n"
+    "pushl   %edx\n"          // Save EAX/EDX/ECX
+    "pushl   %ecx\n"
+#if defined(__APPLE__)
+    "andl    $-16, %esp\n"    // Align ESP on 16-byte boundary
+#endif
+    "subl    $16, %esp\n"
+    "movl    4(%ebp), %eax\n" // Pass prev frame and return address
+    "movl    %eax, 4(%esp)\n"
+    "movl    %ebp, (%esp)\n"
+    "call    " ASMPREFIX "X86CompilationCallback2\n"
+    "movl    %ebp, %esp\n"    // Restore ESP
+    "subl    $12, %esp\n"
+    "popl    %ecx\n"
+    "popl    %edx\n"
+    "popl    %eax\n"
+    "popl    %ebp\n"
+    "ret\n");
+
+  // Same as X86CompilationCallback but also saves XMM argument registers.
+  void X86CompilationCallback_SSE(void);
+  asm(
+    ".text\n"
+    ".align 8\n"
+    ".globl " ASMPREFIX  "X86CompilationCallback_SSE\n"
+  ASMPREFIX "X86CompilationCallback_SSE:\n"
+    "pushl   %ebp\n"
+    "movl    %esp, %ebp\n"    // Standard prologue
+    "pushl   %eax\n"
+    "pushl   %edx\n"          // Save EAX/EDX/ECX
+    "pushl   %ecx\n"
+    "andl    $-16, %esp\n"    // Align ESP on 16-byte boundary
+    // Save all XMM arg registers
+    "subl    $64, %esp\n"
+    "movaps  %xmm0, (%esp)\n"
+    "movaps  %xmm1, 16(%esp)\n"
+    "movaps  %xmm2, 32(%esp)\n"
+    "movaps  %xmm3, 48(%esp)\n"
+    "subl    $16, %esp\n"
+    "movl    4(%ebp), %eax\n" // Pass prev frame and return address
+    "movl    %eax, 4(%esp)\n"
+    "movl    %ebp, (%esp)\n"
+    "call    " ASMPREFIX "X86CompilationCallback2\n"
+    "addl    $16, %esp\n"
+    "movaps  48(%esp), %xmm3\n"
+    "movaps  32(%esp), %xmm2\n"
+    "movaps  16(%esp), %xmm1\n"
+    "movaps  (%esp), %xmm0\n"
+    "movl    %ebp, %esp\n"    // Restore ESP
+    "subl    $12, %esp\n"
+    "popl    %ecx\n"
+    "popl    %edx\n"
+    "popl    %eax\n"
+    "popl    %ebp\n"
+    "ret\n");
+#else
+  void X86CompilationCallback2(void);
+
+  __declspec(naked) void X86CompilationCallback(void) {
+    __asm {
+      push  eax
+      push  edx
+      push  ecx
+      call  X86CompilationCallback2
+      pop   ecx
+      pop   edx
+      pop   eax
+      ret
+    }
+  }
+#endif // _MSC_VER
+
+#else // Not an i386 host
+  void X86CompilationCallback() {
+    assert(0 && "Cannot call X86CompilationCallback() on a non-x86 arch!\n");
+    abort();
+  }
+#endif
+}
+
+/// X86CompilationCallback2 - This is the target-specific function invoked (via
+/// the X86CompilationCallback wrapper above) by the function stub when we did
+/// not know the real target of a call.  This function must locate the start of
+/// the stub or call site and pass it into the JIT compiler function.
+#ifdef _MSC_VER
+extern "C" void X86CompilationCallback2() {
+  assert(sizeof(size_t) == 4); // FIXME: handle Win64
+  intptr_t *RetAddrLoc = (intptr_t *)_AddressOfReturnAddress();
+  RetAddrLoc += 4;  // skip over ret addr, edx, eax, ecx
+  intptr_t RetAddr = *RetAddrLoc;
+#else
+extern "C" void X86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr) {
+  intptr_t *RetAddrLoc = &StackPtr[1];
+#endif
+  assert(*RetAddrLoc == RetAddr &&
+         "Could not find return address on the stack!");
+
+  // It's a stub if there is an interrupt marker after the call.
+  bool isStub = ((unsigned char*)RetAddr)[0] == 0xCD;
+
+  // The call instruction should have pushed the return address onto the stack...
+#ifdef __x86_64__
+  RetAddr--;     // Backtrack to the reference itself...
+#else
+  RetAddr -= 4;  // Backtrack to the reference itself...
+#endif
+
+#if 0
+  DOUT << "In callback! Addr=" << (void*)RetAddr
+       << " ESP=" << (void*)StackPtr
+       << ": Resolving call to function: "
+       << TheVM->getFunctionReferencedName((void*)RetAddr) << "\n";
+#endif
+
+  // Sanity check to make sure this really is a call instruction.
+#ifdef __x86_64__
+  assert(((unsigned char*)RetAddr)[-2] == 0x41 &&"Not a call instr!");
+  assert(((unsigned char*)RetAddr)[-1] == 0xFF &&"Not a call instr!");
+#else
+  assert(((unsigned char*)RetAddr)[-1] == 0xE8 &&"Not a call instr!");
+#endif
+
+  intptr_t NewVal = (intptr_t)JITCompilerFunction((void*)RetAddr);
+
+  // Rewrite the call target... so that we don't end up here every time we
+  // execute the call.
+#ifdef __x86_64__
+  *(intptr_t *)(RetAddr - 0xa) = NewVal;
+#else
+  *(intptr_t *)RetAddr = (intptr_t)(NewVal-RetAddr-4);
+#endif
+
+  if (isStub) {
+    // If this is a stub, rewrite the call into an unconditional branch
+    // instruction so that two return addresses are not pushed onto the stack
+    // when the requested function finally gets called.  This also makes the
+    // 0xCD byte (interrupt) dead, so the marker doesn't affect anything.
+#ifdef __x86_64__
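+    // Rewrite the stub's ModRM byte from /2 (callq *%r10) to /4 (jmpq *%r10).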
+    ((unsigned char*)RetAddr)[0] = (2 | (4 << 3) | (3 << 6));
+#else
+    ((unsigned char*)RetAddr)[-1] = 0xE9;
+#endif
+  }
+
+  // Change the return address to reexecute the call instruction...
+#ifdef __x86_64__
+  *RetAddrLoc -= 0xd;
+#else
+  *RetAddrLoc -= 5;
+#endif
+}
+
+TargetJITInfo::LazyResolverFn
+X86JITInfo::getLazyResolverFunction(JITCompilerFn F) {
+  JITCompilerFunction = F;
+
+#if (defined(__i386__) || defined(i386) || defined(_M_IX86)) && \
+  !defined(_MSC_VER) && !defined(__x86_64__)
+  unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
+  union {
+    unsigned u[3];
+    char     c[12];
+  } text;
+
+  if (!X86::GetCpuIDAndInfo(0, &EAX, text.u+0, text.u+2, text.u+1)) {
+    // FIXME: support for AMD family of processors.
+    if (memcmp(text.c, "GenuineIntel", 12) == 0) {
+      X86::GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX);
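+      // CPUID leaf 1 reports SSE support in bit 25 of EDX.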
+      if ((EDX >> 25) & 0x1)
+        return X86CompilationCallback_SSE;
+    }
+  }
+#endif
+
+  return X86CompilationCallback;
+}
+
+void *X86JITInfo::emitFunctionStub(void *Fn, MachineCodeEmitter &MCE) {
+  // Note, we cast to intptr_t here to silence a -pedantic warning that 
+  // complains about casting a function pointer to a normal pointer.
+#if (defined(__i386__) || defined(i386) || defined(_M_IX86)) && \
+  !defined(_MSC_VER) && !defined(__x86_64__)
+  bool NotCC = (Fn != (void*)(intptr_t)X86CompilationCallback &&
+                Fn != (void*)(intptr_t)X86CompilationCallback_SSE);
+#else
+  bool NotCC = Fn != (void*)(intptr_t)X86CompilationCallback;
+#endif
+  if (NotCC) {
+#ifdef __x86_64__
+    MCE.startFunctionStub(13, 4);
+    MCE.emitByte(0x49);          // REX prefix
+    MCE.emitByte(0xB8+2);        // movabsq r10
+    MCE.emitWordLE(((unsigned *)&Fn)[0]);
+    MCE.emitWordLE(((unsigned *)&Fn)[1]);
+    MCE.emitByte(0x41);          // REX prefix
+    MCE.emitByte(0xFF);          // jmpq *r10
+    MCE.emitByte(2 | (4 << 3) | (3 << 6));
+#else
+    MCE.startFunctionStub(5, 4);
+    MCE.emitByte(0xE9);
+    MCE.emitWordLE((intptr_t)Fn-MCE.getCurrentPCValue()-4);
+#endif
+    return MCE.finishFunctionStub(0);
+  }
+
+#ifdef __x86_64__
+  MCE.startFunctionStub(14, 4);
+  MCE.emitByte(0x49);          // REX prefix
+  MCE.emitByte(0xB8+2);        // movabsq r10
+  MCE.emitWordLE(((unsigned *)&Fn)[0]);
+  MCE.emitWordLE(((unsigned *)&Fn)[1]);
+  MCE.emitByte(0x41);          // REX prefix
+  MCE.emitByte(0xFF);          // callq *r10
+  MCE.emitByte(2 | (2 << 3) | (3 << 6));
+#else
+  MCE.startFunctionStub(6, 4);
+  MCE.emitByte(0xE8);   // Call with 32 bit pc-rel destination...
+
+  MCE.emitWordLE((intptr_t)Fn-MCE.getCurrentPCValue()-4);
+#endif
+
+  MCE.emitByte(0xCD);   // Interrupt - Just a marker identifying the stub!
+  return MCE.finishFunctionStub(0);
+}
+
+/// relocate - Before the JIT can run a block of code that has been emitted,
+/// it must rewrite the code to contain the actual addresses of any
+/// referenced global symbols.
+void X86JITInfo::relocate(void *Function, MachineRelocation *MR,
+                          unsigned NumRelocs, unsigned char* GOTBase) {
+  for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
+    void *RelocPos = (char*)Function + MR->getMachineCodeOffset();
+    intptr_t ResultPtr = (intptr_t)MR->getResultPointer();
+    switch ((X86::RelocationType)MR->getRelocationType()) {
+    case X86::reloc_pcrel_word: {
+      // PC relative relocation, add the relocated value to the value already in
+      // memory, after we adjust it for where the PC is.
+      ResultPtr = ResultPtr-(intptr_t)RelocPos-4-MR->getConstantVal();
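+      // i.e. the word ends up holding the rel32 displacement from the end of
+      // this 4-byte field (RelocPos + 4) to the target, adjusted by any
+      // pre-existing addend and by MR->getConstantVal().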
+      *((unsigned*)RelocPos) += (unsigned)ResultPtr;
+      break;
+    }
+    case X86::reloc_absolute_word:
+      // Absolute relocation, just add the relocated value to the value already
+      // in memory.
+      *((unsigned*)RelocPos) += (unsigned)ResultPtr;
+      break;
+    case X86::reloc_absolute_dword:
+      *((intptr_t*)RelocPos) += ResultPtr;
+      break;
+    }
+  }
+}
diff --git a/lib/Target/X86/X86JITInfo.h b/lib/Target/X86/X86JITInfo.h
new file mode 100644
index 0000000..a4c731a
--- /dev/null
+++ b/lib/Target/X86/X86JITInfo.h
@@ -0,0 +1,50 @@
+//===- X86JITInfo.h - X86 implementation of the JIT interface  --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetJITInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86JITINFO_H
+#define X86JITINFO_H
+
+#include "llvm/Target/TargetJITInfo.h"
+
+namespace llvm {
+  class X86TargetMachine;
+
+  class X86JITInfo : public TargetJITInfo {
+    X86TargetMachine &TM;
+  public:
+    X86JITInfo(X86TargetMachine &tm) : TM(tm) {useGOT = 0;}
+
+    /// replaceMachineCodeForFunction - Make it so that calling the function
+    /// whose machine code is at OLD turns into a call to NEW, perhaps by
+    /// overwriting OLD with a branch to NEW.  This is used for self-modifying
+    /// code.
+    ///
+    virtual void replaceMachineCodeForFunction(void *Old, void *New);
+
+    /// emitFunctionStub - Use the specified MachineCodeEmitter object to emit a
+    /// small native function that simply calls the function at the specified
+    /// address.
+    virtual void *emitFunctionStub(void *Fn, MachineCodeEmitter &MCE);
+
+    /// getLazyResolverFunction - Expose the lazy resolver to the JIT.
+    virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn);
+
+    /// relocate - Before the JIT can run a block of code that has been emitted,
+    /// it must rewrite the code to contain the actual addresses of any
+    /// referenced global symbols.
+    virtual void relocate(void *Function, MachineRelocation *MR,
+                          unsigned NumRelocs, unsigned char* GOTBase);
+  };
+}
+
+#endif
diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h
new file mode 100644
index 0000000..7a21fb2
--- /dev/null
+++ b/lib/Target/X86/X86MachineFunctionInfo.h
@@ -0,0 +1,74 @@
+//====- X86MachineFunctionInfo.h - X86 machine function info ----*- C++ -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Evan Cheng and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file declares X86-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86MACHINEFUNCTIONINFO_H
+#define X86MACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+enum NameDecorationStyle {
+  None,
+  StdCall,
+  FastCall
+};
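+// On Windows, stdcall symbols are decorated as _name@N and fastcall symbols as
+// @name@N, where N is the number of bytes of arguments.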
+  
+/// X86MachineFunctionInfo - This class is derived from MachineFunctionInfo and
+/// contains private X86 target-specific information for each MachineFunction.
+class X86MachineFunctionInfo : public MachineFunctionInfo {
+  /// ForceFramePointer - True if the function is required to use a frame
+  /// pointer for reasons other than containing dynamic allocas or having frame
+  /// pointer elimination turned off. For example, the Cygwin main function
+  /// contains stack pointer re-alignment code which requires a frame pointer.
+  bool ForceFramePointer;
+
+  /// CalleeSavedFrameSize - Size of the callee-saved register portion of the
+  /// stack frame in bytes.
+  unsigned CalleeSavedFrameSize;
+
+  /// BytesToPopOnReturn - Number of bytes the function pops on return.
+  /// Used on Windows for stdcall & fastcall name decoration.
+  unsigned BytesToPopOnReturn;
+
+  /// DecorationStyle - If the function requires additional name decoration,
+  /// this holds the style to apply.
+  NameDecorationStyle DecorationStyle;
+  
+public:
+  X86MachineFunctionInfo() : ForceFramePointer(false),
+                             CalleeSavedFrameSize(0),
+                             BytesToPopOnReturn(0),
+                             DecorationStyle(None) {}
+  
+  X86MachineFunctionInfo(MachineFunction &MF) : ForceFramePointer(false),
+                                                CalleeSavedFrameSize(0),
+                                                BytesToPopOnReturn(0),
+                                                DecorationStyle(None) {}
+  
+  bool getForceFramePointer() const { return ForceFramePointer;} 
+  void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; }
+
+  unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
+  void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
+
+  unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; }
+  void setBytesToPopOnReturn (unsigned bytes) { BytesToPopOnReturn = bytes;}
+
+  NameDecorationStyle getDecorationStyle() const { return DecorationStyle; }
+  void setDecorationStyle(NameDecorationStyle style) { DecorationStyle = style;}
+  
+};
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
new file mode 100644
index 0000000..da65db0
--- /dev/null
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -0,0 +1,1613 @@
+//===- X86RegisterInfo.cpp - X86 Register Information -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the MRegisterInfo class.  This
+// file is responsible for the frame pointer elimination optimization on X86.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86RegisterInfo.h"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Type.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+using namespace llvm;
+
+namespace {
+  cl::opt<bool>
+  NoFusing("disable-spill-fusing",
+           cl::desc("Disable fusing of spill code into instructions"));
+  cl::opt<bool>
+  PrintFailedFusing("print-failed-fuse-candidates",
+                    cl::desc("Print instructions that the allocator wants to"
+                             " fuse, but the X86 backend currently can't"),
+                    cl::Hidden);
+}
+
+X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm,
+                                 const TargetInstrInfo &tii)
+  : X86GenRegisterInfo(X86::ADJCALLSTACKDOWN, X86::ADJCALLSTACKUP),
+    TM(tm), TII(tii) {
+  // Cache some information.
+  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
+  Is64Bit = Subtarget->is64Bit();
+  if (Is64Bit) {
+    SlotSize = 8;
+    StackPtr = X86::RSP;
+    FramePtr = X86::RBP;
+  } else {
+    SlotSize = 4;
+    StackPtr = X86::ESP;
+    FramePtr = X86::EBP;
+  }
+}
+
+bool X86RegisterInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                                MachineBasicBlock::iterator MI,
+                                const std::vector<CalleeSavedInfo> &CSI) const {
+  if (CSI.empty())
+    return false;
+
+  MachineFunction &MF = *MBB.getParent();
+  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+  X86FI->setCalleeSavedFrameSize(CSI.size() * SlotSize);
+  unsigned Opc = Is64Bit ? X86::PUSH64r : X86::PUSH32r;
+  for (unsigned i = CSI.size(); i != 0; --i) {
+    unsigned Reg = CSI[i-1].getReg();
+    // Add the callee-saved register as live-in. It's killed at the spill.
+    MBB.addLiveIn(Reg);
+    BuildMI(MBB, MI, TII.get(Opc)).addReg(Reg);
+  }
+  return true;
+}
+
+bool X86RegisterInfo::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                                 MachineBasicBlock::iterator MI,
+                                const std::vector<CalleeSavedInfo> &CSI) const {
+  if (CSI.empty())
+    return false;
+
+  unsigned Opc = Is64Bit ? X86::POP64r : X86::POP32r;
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    BuildMI(MBB, MI, TII.get(Opc), Reg);
+  }
+  return true;
+}
+
+void X86RegisterInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator MI,
+                                          unsigned SrcReg, int FrameIdx,
+                                          const TargetRegisterClass *RC) const {
+  unsigned Opc;
+  if (RC == &X86::GR64RegClass) {
+    Opc = X86::MOV64mr;
+  } else if (RC == &X86::GR32RegClass) {
+    Opc = X86::MOV32mr;
+  } else if (RC == &X86::GR16RegClass) {
+    Opc = X86::MOV16mr;
+  } else if (RC == &X86::GR8RegClass) {
+    Opc = X86::MOV8mr;
+  } else if (RC == &X86::GR32_RegClass) {
+    Opc = X86::MOV32_mr;
+  } else if (RC == &X86::GR16_RegClass) {
+    Opc = X86::MOV16_mr;
+  } else if (RC == &X86::RFP64RegClass || RC == &X86::RSTRegClass) {
+    Opc = X86::ST_Fp64m;
+  } else if (RC == &X86::RFP32RegClass) {
+    Opc = X86::ST_Fp32m;
+  } else if (RC == &X86::FR32RegClass) {
+    Opc = X86::MOVSSmr;
+  } else if (RC == &X86::FR64RegClass) {
+    Opc = X86::MOVSDmr;
+  } else if (RC == &X86::VR128RegClass) {
+    Opc = X86::MOVAPSmr;
+  } else if (RC == &X86::VR64RegClass) {
+    Opc = X86::MMX_MOVQ64mr;
+  } else {
+    assert(0 && "Unknown regclass");
+    abort();
+  }
+  addFrameReference(BuildMI(MBB, MI, TII.get(Opc)), FrameIdx)
+    .addReg(SrcReg, false, false, true);
+}
+
+void X86RegisterInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator MI,
+                                           unsigned DestReg, int FrameIdx,
+                                           const TargetRegisterClass *RC) const{
+  unsigned Opc;
+  if (RC == &X86::GR64RegClass) {
+    Opc = X86::MOV64rm;
+  } else if (RC == &X86::GR32RegClass) {
+    Opc = X86::MOV32rm;
+  } else if (RC == &X86::GR16RegClass) {
+    Opc = X86::MOV16rm;
+  } else if (RC == &X86::GR8RegClass) {
+    Opc = X86::MOV8rm;
+  } else if (RC == &X86::GR32_RegClass) {
+    Opc = X86::MOV32_rm;
+  } else if (RC == &X86::GR16_RegClass) {
+    Opc = X86::MOV16_rm;
+  } else if (RC == &X86::RFP64RegClass || RC == &X86::RSTRegClass) {
+    Opc = X86::LD_Fp64m;
+  } else if (RC == &X86::RFP32RegClass) {
+    Opc = X86::LD_Fp32m;
+  } else if (RC == &X86::FR32RegClass) {
+    Opc = X86::MOVSSrm;
+  } else if (RC == &X86::FR64RegClass) {
+    Opc = X86::MOVSDrm;
+  } else if (RC == &X86::VR128RegClass) {
+    Opc = X86::MOVAPSrm;
+  } else if (RC == &X86::VR64RegClass) {
+    Opc = X86::MMX_MOVQ64rm;
+  } else {
+    assert(0 && "Unknown regclass");
+    abort();
+  }
+  addFrameReference(BuildMI(MBB, MI, TII.get(Opc), DestReg), FrameIdx);
+}
+
+void X86RegisterInfo::copyRegToReg(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   unsigned DestReg, unsigned SrcReg,
+                                   const TargetRegisterClass *RC) const {
+  unsigned Opc;
+  if (RC == &X86::GR64RegClass) {
+    Opc = X86::MOV64rr;
+  } else if (RC == &X86::GR32RegClass) {
+    Opc = X86::MOV32rr;
+  } else if (RC == &X86::GR16RegClass) {
+    Opc = X86::MOV16rr;
+  } else if (RC == &X86::GR8RegClass) {
+    Opc = X86::MOV8rr;
+  } else if (RC == &X86::GR32_RegClass) {
+    Opc = X86::MOV32_rr;
+  } else if (RC == &X86::GR16_RegClass) {
+    Opc = X86::MOV16_rr;
+  } else if (RC == &X86::RFP32RegClass) {
+    Opc = X86::MOV_Fp3232;
+  } else if (RC == &X86::RFP64RegClass || RC == &X86::RSTRegClass) {
+    Opc = X86::MOV_Fp6464;
+  } else if (RC == &X86::FR32RegClass) {
+    Opc = X86::FsMOVAPSrr;
+  } else if (RC == &X86::FR64RegClass) {
+    Opc = X86::FsMOVAPDrr;
+  } else if (RC == &X86::VR128RegClass) {
+    Opc = X86::MOVAPSrr;
+  } else if (RC == &X86::VR64RegClass) {
+    Opc = X86::MMX_MOVQ64rr;
+  } else {
+    assert(0 && "Unknown regclass");
+    abort();
+  }
+  BuildMI(MBB, MI, TII.get(Opc), DestReg).addReg(SrcReg);
+}
+
+
+void X86RegisterInfo::reMaterialize(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator I,
+                                    unsigned DestReg,
+                                    const MachineInstr *Orig) const {
+  MachineInstr *MI = Orig->clone();
+  MI->getOperand(0).setReg(DestReg);
+  MBB.insert(I, MI);
+}
+
+static MachineInstr *FuseTwoAddrInst(unsigned Opcode, unsigned FrameIndex,
+                                     MachineInstr *MI,
+                                     const TargetInstrInfo &TII) {
+  unsigned NumOps = TII.getNumOperands(MI->getOpcode())-2;
+  // Create the base instruction with the memory operand as the first part.
+  MachineInstrBuilder MIB = addFrameReference(BuildMI(TII.get(Opcode)),
+                                              FrameIndex);
+  
+  // Loop over the rest of the ri operands, converting them over.
+  for (unsigned i = 0; i != NumOps; ++i) {
+    MachineOperand &MO = MI->getOperand(i+2);
+    if (MO.isReg())
+      MIB = MIB.addReg(MO.getReg(), false, MO.isImplicit());
+    else if (MO.isImm())
+      MIB = MIB.addImm(MO.getImm());
+    else if (MO.isGlobalAddress())
+      MIB = MIB.addGlobalAddress(MO.getGlobal(), MO.getOffset());
+    else if (MO.isJumpTableIndex())
+      MIB = MIB.addJumpTableIndex(MO.getJumpTableIndex());
+    else if (MO.isExternalSymbol())
+      MIB = MIB.addExternalSymbol(MO.getSymbolName());
+    else
+      assert(0 && "Unknown operand type!");
+  }
+  return MIB;
+}
+
+static MachineInstr *FuseInst(unsigned Opcode, unsigned OpNo,
+                              unsigned FrameIndex, MachineInstr *MI,
+                              const TargetInstrInfo &TII) {
+  MachineInstrBuilder MIB = BuildMI(TII.get(Opcode));
+  
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (i == OpNo) {
+      assert(MO.isReg() && "Expected to fold into reg operand!");
+      MIB = addFrameReference(MIB, FrameIndex);
+    } else if (MO.isReg())
+      MIB = MIB.addReg(MO.getReg(), MO.isDef(), MO.isImplicit());
+    else if (MO.isImm())
+      MIB = MIB.addImm(MO.getImm());
+    else if (MO.isGlobalAddress())
+      MIB = MIB.addGlobalAddress(MO.getGlobal(), MO.getOffset());
+    else if (MO.isJumpTableIndex())
+      MIB = MIB.addJumpTableIndex(MO.getJumpTableIndex());
+    else if (MO.isExternalSymbol())
+      MIB = MIB.addExternalSymbol(MO.getSymbolName());
+    else
+      assert(0 && "Unknown operand for FuseInst!");
+  }
+  return MIB;
+}
+
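+// MakeM0Inst - Build a "store immediate zero" form of the given opcode (e.g.
+// MOV32mi <fi#>, 0); used below to fold MOV*r0 into a store of zero.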
+static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII,
+                                unsigned Opcode, unsigned FrameIndex,
+                                MachineInstr *MI) {
+  return addFrameReference(BuildMI(TII.get(Opcode)), FrameIndex).addImm(0);
+}
+
+
+//===----------------------------------------------------------------------===//
+// Efficient Lookup Table Support
+//===----------------------------------------------------------------------===//
+
+namespace {
+  /// TableEntry - Maps a 'from' opcode to its memory-folded 'to' opcode.
+  ///
+  struct TableEntry {
+    unsigned from;                      // Original opcode.
+    unsigned to;                        // New opcode.
+                                        
+    // Comparison operators used by the STL binary search.
+    bool operator<(const TableEntry &TE) const { return from < TE.from; }
+    friend bool operator<(const TableEntry &TE, unsigned V) {
+      return TE.from < V;
+    }
+    friend bool operator<(unsigned V, const TableEntry &TE) {
+      return V < TE.from;
+    }
+  };
+}
+
+/// TableIsSorted - Return true if the table is in 'from' opcode order.
+///
+static bool TableIsSorted(const TableEntry *Table, unsigned NumEntries) {
+  for (unsigned i = 1; i != NumEntries; ++i)
+    if (!(Table[i-1] < Table[i])) {
+      cerr << "Entries out of order " << Table[i-1].from
+           << " " << Table[i].from << "\n";
+      return false;
+    }
+  return true;
+}
+
+/// TableLookup - Return the table entry matching the specified opcode.
+/// Otherwise return NULL.
+static const TableEntry *TableLookup(const TableEntry *Table, unsigned N,
+                                unsigned Opcode) {
+  const TableEntry *I = std::lower_bound(Table, Table+N, Opcode);
+  if (I != Table+N && I->from == Opcode)
+    return I;
+  return NULL;
+}
+
+#define ARRAY_SIZE(TABLE)  \
+   (sizeof(TABLE)/sizeof(TABLE[0]))
+
+#ifdef NDEBUG
+#define ASSERT_SORTED(TABLE)
+#else
+#define ASSERT_SORTED(TABLE)                                              \
+  { static bool TABLE##Checked = false;                                   \
+    if (!TABLE##Checked) {                                                \
+       assert(TableIsSorted(TABLE, ARRAY_SIZE(TABLE)) &&                  \
+              "All lookup tables must be sorted for efficient access!");  \
+       TABLE##Checked = true;                                             \
+    }                                                                     \
+  }
+#endif
+
+
+MachineInstr* X86RegisterInfo::foldMemoryOperand(MachineInstr *MI,
+                                                 unsigned i,
+                                                 int FrameIndex) const {
+  // Check switch flag 
+  if (NoFusing) return NULL;
+
+  // Table (and size) to search
+  const TableEntry *OpcodeTablePtr = NULL;
+  unsigned OpcodeTableSize = 0;
+  bool isTwoAddrFold = false;
+  unsigned NumOps = TII.getNumOperands(MI->getOpcode());
+  bool isTwoAddr = NumOps > 1 &&
+    MI->getInstrDescriptor()->getOperandConstraint(1, TOI::TIED_TO) != -1;
+
+  MachineInstr *NewMI = NULL;
+  // Folding a memory location into the two-address part of a two-address
+  // instruction is different from folding it elsewhere.  It requires
+  // replacing the *two* tied registers with the memory location.
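+  // e.g. folding the spilled, tied register pair of "ADD32rr %EAX, %EAX, %EBX"
+  // produces "ADD32mr <fi#>, %EBX", which adds %EBX directly into the slot.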
+  if (isTwoAddr && NumOps >= 2 && i < 2 &&
+      MI->getOperand(0).isReg() && 
+      MI->getOperand(1).isReg() &&
+      MI->getOperand(0).getReg() == MI->getOperand(1).getReg()) {
+    static const TableEntry OpcodeTable[] = {
+      { X86::ADC32ri,     X86::ADC32mi },
+      { X86::ADC32ri8,    X86::ADC32mi8 },
+      { X86::ADC32rr,     X86::ADC32mr },
+      { X86::ADC64ri32,   X86::ADC64mi32 },
+      { X86::ADC64ri8,    X86::ADC64mi8 },
+      { X86::ADC64rr,     X86::ADC64mr },
+      { X86::ADD16ri,     X86::ADD16mi },
+      { X86::ADD16ri8,    X86::ADD16mi8 },
+      { X86::ADD16rr,     X86::ADD16mr },
+      { X86::ADD32ri,     X86::ADD32mi },
+      { X86::ADD32ri8,    X86::ADD32mi8 },
+      { X86::ADD32rr,     X86::ADD32mr },
+      { X86::ADD64ri32,   X86::ADD64mi32 },
+      { X86::ADD64ri8,    X86::ADD64mi8 },
+      { X86::ADD64rr,     X86::ADD64mr },
+      { X86::ADD8ri,      X86::ADD8mi },
+      { X86::ADD8rr,      X86::ADD8mr },
+      { X86::AND16ri,     X86::AND16mi },
+      { X86::AND16ri8,    X86::AND16mi8 },
+      { X86::AND16rr,     X86::AND16mr },
+      { X86::AND32ri,     X86::AND32mi },
+      { X86::AND32ri8,    X86::AND32mi8 },
+      { X86::AND32rr,     X86::AND32mr },
+      { X86::AND64ri32,   X86::AND64mi32 },
+      { X86::AND64ri8,    X86::AND64mi8 },
+      { X86::AND64rr,     X86::AND64mr },
+      { X86::AND8ri,      X86::AND8mi },
+      { X86::AND8rr,      X86::AND8mr },
+      { X86::DEC16r,      X86::DEC16m },
+      { X86::DEC32r,      X86::DEC32m },
+      { X86::DEC64_16r,   X86::DEC16m },
+      { X86::DEC64_32r,   X86::DEC32m },
+      { X86::DEC64r,      X86::DEC64m },
+      { X86::DEC8r,       X86::DEC8m },
+      { X86::INC16r,      X86::INC16m },
+      { X86::INC32r,      X86::INC32m },
+      { X86::INC64_16r,   X86::INC16m },
+      { X86::INC64_32r,   X86::INC32m },
+      { X86::INC64r,      X86::INC64m },
+      { X86::INC8r,       X86::INC8m },
+      { X86::NEG16r,      X86::NEG16m },
+      { X86::NEG32r,      X86::NEG32m },
+      { X86::NEG64r,      X86::NEG64m },
+      { X86::NEG8r,       X86::NEG8m },
+      { X86::NOT16r,      X86::NOT16m },
+      { X86::NOT32r,      X86::NOT32m },
+      { X86::NOT64r,      X86::NOT64m },
+      { X86::NOT8r,       X86::NOT8m },
+      { X86::OR16ri,      X86::OR16mi },
+      { X86::OR16ri8,     X86::OR16mi8 },
+      { X86::OR16rr,      X86::OR16mr },
+      { X86::OR32ri,      X86::OR32mi },
+      { X86::OR32ri8,     X86::OR32mi8 },
+      { X86::OR32rr,      X86::OR32mr },
+      { X86::OR64ri32,    X86::OR64mi32 },
+      { X86::OR64ri8,     X86::OR64mi8 },
+      { X86::OR64rr,      X86::OR64mr },
+      { X86::OR8ri,       X86::OR8mi },
+      { X86::OR8rr,       X86::OR8mr },
+      { X86::ROL16r1,     X86::ROL16m1 },
+      { X86::ROL16rCL,    X86::ROL16mCL },
+      { X86::ROL16ri,     X86::ROL16mi },
+      { X86::ROL32r1,     X86::ROL32m1 },
+      { X86::ROL32rCL,    X86::ROL32mCL },
+      { X86::ROL32ri,     X86::ROL32mi },
+      { X86::ROL64r1,     X86::ROL64m1 },
+      { X86::ROL64rCL,    X86::ROL64mCL },
+      { X86::ROL64ri,     X86::ROL64mi },
+      { X86::ROL8r1,      X86::ROL8m1 },
+      { X86::ROL8rCL,     X86::ROL8mCL },
+      { X86::ROL8ri,      X86::ROL8mi },
+      { X86::ROR16r1,     X86::ROR16m1 },
+      { X86::ROR16rCL,    X86::ROR16mCL },
+      { X86::ROR16ri,     X86::ROR16mi },
+      { X86::ROR32r1,     X86::ROR32m1 },
+      { X86::ROR32rCL,    X86::ROR32mCL },
+      { X86::ROR32ri,     X86::ROR32mi },
+      { X86::ROR64r1,     X86::ROR64m1 },
+      { X86::ROR64rCL,    X86::ROR64mCL },
+      { X86::ROR64ri,     X86::ROR64mi },
+      { X86::ROR8r1,      X86::ROR8m1 },
+      { X86::ROR8rCL,     X86::ROR8mCL },
+      { X86::ROR8ri,      X86::ROR8mi },
+      { X86::SAR16r1,     X86::SAR16m1 },
+      { X86::SAR16rCL,    X86::SAR16mCL },
+      { X86::SAR16ri,     X86::SAR16mi },
+      { X86::SAR32r1,     X86::SAR32m1 },
+      { X86::SAR32rCL,    X86::SAR32mCL },
+      { X86::SAR32ri,     X86::SAR32mi },
+      { X86::SAR64r1,     X86::SAR64m1 },
+      { X86::SAR64rCL,    X86::SAR64mCL },
+      { X86::SAR64ri,     X86::SAR64mi },
+      { X86::SAR8r1,      X86::SAR8m1 },
+      { X86::SAR8rCL,     X86::SAR8mCL },
+      { X86::SAR8ri,      X86::SAR8mi },
+      { X86::SBB32ri,     X86::SBB32mi },
+      { X86::SBB32ri8,    X86::SBB32mi8 },
+      { X86::SBB32rr,     X86::SBB32mr },
+      { X86::SBB64ri32,   X86::SBB64mi32 },
+      { X86::SBB64ri8,    X86::SBB64mi8 },
+      { X86::SBB64rr,     X86::SBB64mr },
+      { X86::SHL16r1,     X86::SHL16m1 },
+      { X86::SHL16rCL,    X86::SHL16mCL },
+      { X86::SHL16ri,     X86::SHL16mi },
+      { X86::SHL32r1,     X86::SHL32m1 },
+      { X86::SHL32rCL,    X86::SHL32mCL },
+      { X86::SHL32ri,     X86::SHL32mi },
+      { X86::SHL64r1,     X86::SHL64m1 },
+      { X86::SHL64rCL,    X86::SHL64mCL },
+      { X86::SHL64ri,     X86::SHL64mi },
+      { X86::SHL8r1,      X86::SHL8m1 },
+      { X86::SHL8rCL,     X86::SHL8mCL },
+      { X86::SHL8ri,      X86::SHL8mi },
+      { X86::SHLD16rrCL,  X86::SHLD16mrCL },
+      { X86::SHLD16rri8,  X86::SHLD16mri8 },
+      { X86::SHLD32rrCL,  X86::SHLD32mrCL },
+      { X86::SHLD32rri8,  X86::SHLD32mri8 },
+      { X86::SHLD64rrCL,  X86::SHLD64mrCL },
+      { X86::SHLD64rri8,  X86::SHLD64mri8 },
+      { X86::SHR16r1,     X86::SHR16m1 },
+      { X86::SHR16rCL,    X86::SHR16mCL },
+      { X86::SHR16ri,     X86::SHR16mi },
+      { X86::SHR32r1,     X86::SHR32m1 },
+      { X86::SHR32rCL,    X86::SHR32mCL },
+      { X86::SHR32ri,     X86::SHR32mi },
+      { X86::SHR64r1,     X86::SHR64m1 },
+      { X86::SHR64rCL,    X86::SHR64mCL },
+      { X86::SHR64ri,     X86::SHR64mi },
+      { X86::SHR8r1,      X86::SHR8m1 },
+      { X86::SHR8rCL,     X86::SHR8mCL },
+      { X86::SHR8ri,      X86::SHR8mi },
+      { X86::SHRD16rrCL,  X86::SHRD16mrCL },
+      { X86::SHRD16rri8,  X86::SHRD16mri8 },
+      { X86::SHRD32rrCL,  X86::SHRD32mrCL },
+      { X86::SHRD32rri8,  X86::SHRD32mri8 },
+      { X86::SHRD64rrCL,  X86::SHRD64mrCL },
+      { X86::SHRD64rri8,  X86::SHRD64mri8 },
+      { X86::SUB16ri,     X86::SUB16mi },
+      { X86::SUB16ri8,    X86::SUB16mi8 },
+      { X86::SUB16rr,     X86::SUB16mr },
+      { X86::SUB32ri,     X86::SUB32mi },
+      { X86::SUB32ri8,    X86::SUB32mi8 },
+      { X86::SUB32rr,     X86::SUB32mr },
+      { X86::SUB64ri32,   X86::SUB64mi32 },
+      { X86::SUB64ri8,    X86::SUB64mi8 },
+      { X86::SUB64rr,     X86::SUB64mr },
+      { X86::SUB8ri,      X86::SUB8mi },
+      { X86::SUB8rr,      X86::SUB8mr },
+      { X86::XOR16ri,     X86::XOR16mi },
+      { X86::XOR16ri8,    X86::XOR16mi8 },
+      { X86::XOR16rr,     X86::XOR16mr },
+      { X86::XOR32ri,     X86::XOR32mi },
+      { X86::XOR32ri8,    X86::XOR32mi8 },
+      { X86::XOR32rr,     X86::XOR32mr },
+      { X86::XOR64ri32,   X86::XOR64mi32 },
+      { X86::XOR64ri8,    X86::XOR64mi8 },
+      { X86::XOR64rr,     X86::XOR64mr },
+      { X86::XOR8ri,      X86::XOR8mi },
+      { X86::XOR8rr,      X86::XOR8mr }
+    };
+    ASSERT_SORTED(OpcodeTable);
+    OpcodeTablePtr = OpcodeTable;
+    OpcodeTableSize = ARRAY_SIZE(OpcodeTable);
+    isTwoAddrFold = true;
+  } else if (i == 0) { // If operand 0
+    if (MI->getOpcode() == X86::MOV16r0)
+      NewMI = MakeM0Inst(TII, X86::MOV16mi, FrameIndex, MI);
+    else if (MI->getOpcode() == X86::MOV32r0)
+      NewMI = MakeM0Inst(TII, X86::MOV32mi, FrameIndex, MI);
+    else if (MI->getOpcode() == X86::MOV64r0)
+      NewMI = MakeM0Inst(TII, X86::MOV64mi32, FrameIndex, MI);
+    else if (MI->getOpcode() == X86::MOV8r0)
+      NewMI = MakeM0Inst(TII, X86::MOV8mi, FrameIndex, MI);
+    if (NewMI) {
+      NewMI->copyKillDeadInfo(MI);
+      return NewMI;
+    }
+    
+    static const TableEntry OpcodeTable[] = {
+      { X86::CMP16ri,     X86::CMP16mi },
+      { X86::CMP16ri8,    X86::CMP16mi8 },
+      { X86::CMP32ri,     X86::CMP32mi },
+      { X86::CMP32ri8,    X86::CMP32mi8 },
+      { X86::CMP8ri,      X86::CMP8mi },
+      { X86::DIV16r,      X86::DIV16m },
+      { X86::DIV32r,      X86::DIV32m },
+      { X86::DIV64r,      X86::DIV64m },
+      { X86::DIV8r,       X86::DIV8m },
+      { X86::FsMOVAPDrr,  X86::MOVSDmr },
+      { X86::FsMOVAPSrr,  X86::MOVSSmr },
+      { X86::IDIV16r,     X86::IDIV16m },
+      { X86::IDIV32r,     X86::IDIV32m },
+      { X86::IDIV64r,     X86::IDIV64m },
+      { X86::IDIV8r,      X86::IDIV8m },
+      { X86::IMUL16r,     X86::IMUL16m },
+      { X86::IMUL32r,     X86::IMUL32m },
+      { X86::IMUL64r,     X86::IMUL64m },
+      { X86::IMUL8r,      X86::IMUL8m },
+      { X86::MOV16ri,     X86::MOV16mi },
+      { X86::MOV16rr,     X86::MOV16mr },
+      { X86::MOV32ri,     X86::MOV32mi },
+      { X86::MOV32rr,     X86::MOV32mr },
+      { X86::MOV64ri32,   X86::MOV64mi32 },
+      { X86::MOV64rr,     X86::MOV64mr },
+      { X86::MOV8ri,      X86::MOV8mi },
+      { X86::MOV8rr,      X86::MOV8mr },
+      { X86::MOVAPDrr,    X86::MOVAPDmr },
+      { X86::MOVAPSrr,    X86::MOVAPSmr },
+      { X86::MOVPDI2DIrr, X86::MOVPDI2DImr },
+      { X86::MOVPQIto64rr,X86::MOVPQIto64mr },
+      { X86::MOVPS2SSrr,  X86::MOVPS2SSmr },
+      { X86::MOVSDrr,     X86::MOVSDmr },
+      { X86::MOVSDto64rr, X86::MOVSDto64mr },
+      { X86::MOVSS2DIrr,  X86::MOVSS2DImr },
+      { X86::MOVSSrr,     X86::MOVSSmr },
+      { X86::MOVUPDrr,    X86::MOVUPDmr },
+      { X86::MOVUPSrr,    X86::MOVUPSmr },
+      { X86::MUL16r,      X86::MUL16m },
+      { X86::MUL32r,      X86::MUL32m },
+      { X86::MUL64r,      X86::MUL64m },
+      { X86::MUL8r,       X86::MUL8m },
+      { X86::SETAEr,      X86::SETAEm },
+      { X86::SETAr,       X86::SETAm },
+      { X86::SETBEr,      X86::SETBEm },
+      { X86::SETBr,       X86::SETBm },
+      { X86::SETEr,       X86::SETEm },
+      { X86::SETGEr,      X86::SETGEm },
+      { X86::SETGr,       X86::SETGm },
+      { X86::SETLEr,      X86::SETLEm },
+      { X86::SETLr,       X86::SETLm },
+      { X86::SETNEr,      X86::SETNEm },
+      { X86::SETNPr,      X86::SETNPm },
+      { X86::SETNSr,      X86::SETNSm },
+      { X86::SETPr,       X86::SETPm },
+      { X86::SETSr,       X86::SETSm },
+      { X86::TEST16ri,    X86::TEST16mi },
+      { X86::TEST32ri,    X86::TEST32mi },
+      { X86::TEST64ri32,  X86::TEST64mi32 },
+      { X86::TEST8ri,     X86::TEST8mi },
+      { X86::XCHG16rr,    X86::XCHG16mr },
+      { X86::XCHG32rr,    X86::XCHG32mr },
+      { X86::XCHG64rr,    X86::XCHG64mr },
+      { X86::XCHG8rr,     X86::XCHG8mr }
+    };
+    ASSERT_SORTED(OpcodeTable);
+    OpcodeTablePtr = OpcodeTable;
+    OpcodeTableSize = ARRAY_SIZE(OpcodeTable);
+  } else if (i == 1) {
+    static const TableEntry OpcodeTable[] = {
+      { X86::CMP16rr,         X86::CMP16rm },
+      { X86::CMP32rr,         X86::CMP32rm },
+      { X86::CMP64ri32,       X86::CMP64mi32 },
+      { X86::CMP64ri8,        X86::CMP64mi8 },
+      { X86::CMP64rr,         X86::CMP64rm },
+      { X86::CMP8rr,          X86::CMP8rm },
+      { X86::CMPPDrri,        X86::CMPPDrmi },
+      { X86::CMPPSrri,        X86::CMPPSrmi },
+      { X86::CMPSDrr,         X86::CMPSDrm },
+      { X86::CMPSSrr,         X86::CMPSSrm },
+      { X86::CVTSD2SSrr,      X86::CVTSD2SSrm },
+      { X86::CVTSI2SD64rr,    X86::CVTSI2SD64rm },
+      { X86::CVTSI2SDrr,      X86::CVTSI2SDrm },
+      { X86::CVTSI2SS64rr,    X86::CVTSI2SS64rm },
+      { X86::CVTSI2SSrr,      X86::CVTSI2SSrm },
+      { X86::CVTSS2SDrr,      X86::CVTSS2SDrm },
+      { X86::CVTTSD2SI64rr,   X86::CVTTSD2SI64rm },
+      { X86::CVTTSD2SIrr,     X86::CVTTSD2SIrm },
+      { X86::CVTTSS2SI64rr,   X86::CVTTSS2SI64rm },
+      { X86::CVTTSS2SIrr,     X86::CVTTSS2SIrm },
+      { X86::FsMOVAPDrr,      X86::MOVSDrm },
+      { X86::FsMOVAPSrr,      X86::MOVSSrm },
+      { X86::IMUL16rri,       X86::IMUL16rmi },
+      { X86::IMUL16rri8,      X86::IMUL16rmi8 },
+      { X86::IMUL32rri,       X86::IMUL32rmi },
+      { X86::IMUL32rri8,      X86::IMUL32rmi8 },
+      { X86::IMUL64rr,        X86::IMUL64rm },
+      { X86::IMUL64rri32,     X86::IMUL64rmi32 },
+      { X86::IMUL64rri8,      X86::IMUL64rmi8 },
+      { X86::Int_CMPSDrr,     X86::Int_CMPSDrm },
+      { X86::Int_CMPSSrr,     X86::Int_CMPSSrm },
+      { X86::Int_COMISDrr,    X86::Int_COMISDrm },
+      { X86::Int_COMISSrr,    X86::Int_COMISSrm },
+      { X86::Int_CVTDQ2PDrr,  X86::Int_CVTDQ2PDrm },
+      { X86::Int_CVTDQ2PSrr,  X86::Int_CVTDQ2PSrm },
+      { X86::Int_CVTPD2DQrr,  X86::Int_CVTPD2DQrm },
+      { X86::Int_CVTPD2PSrr,  X86::Int_CVTPD2PSrm },
+      { X86::Int_CVTPS2DQrr,  X86::Int_CVTPS2DQrm },
+      { X86::Int_CVTPS2PDrr,  X86::Int_CVTPS2PDrm },
+      { X86::Int_CVTSD2SI64rr,X86::Int_CVTSD2SI64rm },
+      { X86::Int_CVTSD2SIrr,  X86::Int_CVTSD2SIrm },
+      { X86::Int_CVTSD2SSrr,  X86::Int_CVTSD2SSrm },
+      { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm },
+      { X86::Int_CVTSI2SDrr,  X86::Int_CVTSI2SDrm },
+      { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm },
+      { X86::Int_CVTSI2SSrr,  X86::Int_CVTSI2SSrm },
+      { X86::Int_CVTSS2SDrr,  X86::Int_CVTSS2SDrm },
+      { X86::Int_CVTSS2SI64rr,X86::Int_CVTSS2SI64rm },
+      { X86::Int_CVTSS2SIrr,  X86::Int_CVTSS2SIrm },
+      { X86::Int_CVTTPD2DQrr, X86::Int_CVTTPD2DQrm },
+      { X86::Int_CVTTPS2DQrr, X86::Int_CVTTPS2DQrm },
+      { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm },
+      { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm },
+      { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm },
+      { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm },
+      { X86::Int_UCOMISDrr,   X86::Int_UCOMISDrm },
+      { X86::Int_UCOMISSrr,   X86::Int_UCOMISSrm },
+      { X86::MOV16rr,         X86::MOV16rm },
+      { X86::MOV32rr,         X86::MOV32rm },
+      { X86::MOV64rr,         X86::MOV64rm },
+      { X86::MOV64toPQIrr,    X86::MOV64toPQIrm },
+      { X86::MOV64toSDrr,     X86::MOV64toSDrm },
+      { X86::MOV8rr,          X86::MOV8rm },
+      { X86::MOVAPDrr,        X86::MOVAPDrm },
+      { X86::MOVAPSrr,        X86::MOVAPSrm },
+      { X86::MOVDDUPrr,       X86::MOVDDUPrm },
+      { X86::MOVDI2PDIrr,     X86::MOVDI2PDIrm },
+      { X86::MOVDI2SSrr,      X86::MOVDI2SSrm },
+      { X86::MOVSD2PDrr,      X86::MOVSD2PDrm },
+      { X86::MOVSDrr,         X86::MOVSDrm },
+      { X86::MOVSHDUPrr,      X86::MOVSHDUPrm },
+      { X86::MOVSLDUPrr,      X86::MOVSLDUPrm },
+      { X86::MOVSS2PSrr,      X86::MOVSS2PSrm },
+      { X86::MOVSSrr,         X86::MOVSSrm },
+      { X86::MOVSX16rr8,      X86::MOVSX16rm8 },
+      { X86::MOVSX32rr16,     X86::MOVSX32rm16 },
+      { X86::MOVSX32rr8,      X86::MOVSX32rm8 },
+      { X86::MOVSX64rr16,     X86::MOVSX64rm16 },
+      { X86::MOVSX64rr32,     X86::MOVSX64rm32 },
+      { X86::MOVSX64rr8,      X86::MOVSX64rm8 },
+      { X86::MOVUPDrr,        X86::MOVUPDrm },
+      { X86::MOVUPSrr,        X86::MOVUPSrm },
+      { X86::MOVZX16rr8,      X86::MOVZX16rm8 },
+      { X86::MOVZX32rr16,     X86::MOVZX32rm16 },
+      { X86::MOVZX32rr8,      X86::MOVZX32rm8 },
+      { X86::MOVZX64rr16,     X86::MOVZX64rm16 },
+      { X86::MOVZX64rr8,      X86::MOVZX64rm8 },
+      { X86::PSHUFDri,        X86::PSHUFDmi },
+      { X86::PSHUFHWri,       X86::PSHUFHWmi },
+      { X86::PSHUFLWri,       X86::PSHUFLWmi },
+      { X86::PsMOVZX64rr32,   X86::PsMOVZX64rm32 },
+      { X86::TEST16rr,        X86::TEST16rm },
+      { X86::TEST32rr,        X86::TEST32rm },
+      { X86::TEST64rr,        X86::TEST64rm },
+      { X86::TEST8rr,         X86::TEST8rm },
+      // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0
+      { X86::UCOMISDrr,       X86::UCOMISDrm },
+      { X86::UCOMISSrr,       X86::UCOMISSrm },
+      { X86::XCHG16rr,        X86::XCHG16rm },
+      { X86::XCHG32rr,        X86::XCHG32rm },
+      { X86::XCHG64rr,        X86::XCHG64rm },
+      { X86::XCHG8rr,         X86::XCHG8rm }
+    };
+    ASSERT_SORTED(OpcodeTable);
+    OpcodeTablePtr = OpcodeTable;
+    OpcodeTableSize = ARRAY_SIZE(OpcodeTable);
+  } else if (i == 2) {
+    static const TableEntry OpcodeTable[] = {
+      { X86::ADC32rr,         X86::ADC32rm },
+      { X86::ADC64rr,         X86::ADC64rm },
+      { X86::ADD16rr,         X86::ADD16rm },
+      { X86::ADD32rr,         X86::ADD32rm },
+      { X86::ADD64rr,         X86::ADD64rm },
+      { X86::ADD8rr,          X86::ADD8rm },
+      { X86::ADDPDrr,         X86::ADDPDrm },
+      { X86::ADDPSrr,         X86::ADDPSrm },
+      { X86::ADDSDrr,         X86::ADDSDrm },
+      { X86::ADDSSrr,         X86::ADDSSrm },
+      { X86::ADDSUBPDrr,      X86::ADDSUBPDrm },
+      { X86::ADDSUBPSrr,      X86::ADDSUBPSrm },
+      { X86::AND16rr,         X86::AND16rm },
+      { X86::AND32rr,         X86::AND32rm },
+      { X86::AND64rr,         X86::AND64rm },
+      { X86::AND8rr,          X86::AND8rm },
+      { X86::ANDNPDrr,        X86::ANDNPDrm },
+      { X86::ANDNPSrr,        X86::ANDNPSrm },
+      { X86::ANDPDrr,         X86::ANDPDrm },
+      { X86::ANDPSrr,         X86::ANDPSrm },
+      { X86::CMOVA16rr,       X86::CMOVA16rm },
+      { X86::CMOVA32rr,       X86::CMOVA32rm },
+      { X86::CMOVA64rr,       X86::CMOVA64rm },
+      { X86::CMOVAE16rr,      X86::CMOVAE16rm },
+      { X86::CMOVAE32rr,      X86::CMOVAE32rm },
+      { X86::CMOVAE64rr,      X86::CMOVAE64rm },
+      { X86::CMOVB16rr,       X86::CMOVB16rm },
+      { X86::CMOVB32rr,       X86::CMOVB32rm },
+      { X86::CMOVB64rr,       X86::CMOVB64rm },
+      { X86::CMOVBE16rr,      X86::CMOVBE16rm },
+      { X86::CMOVBE32rr,      X86::CMOVBE32rm },
+      { X86::CMOVBE64rr,      X86::CMOVBE64rm },
+      { X86::CMOVE16rr,       X86::CMOVE16rm },
+      { X86::CMOVE32rr,       X86::CMOVE32rm },
+      { X86::CMOVE64rr,       X86::CMOVE64rm },
+      { X86::CMOVG16rr,       X86::CMOVG16rm },
+      { X86::CMOVG32rr,       X86::CMOVG32rm },
+      { X86::CMOVG64rr,       X86::CMOVG64rm },
+      { X86::CMOVGE16rr,      X86::CMOVGE16rm },
+      { X86::CMOVGE32rr,      X86::CMOVGE32rm },
+      { X86::CMOVGE64rr,      X86::CMOVGE64rm },
+      { X86::CMOVL16rr,       X86::CMOVL16rm },
+      { X86::CMOVL32rr,       X86::CMOVL32rm },
+      { X86::CMOVL64rr,       X86::CMOVL64rm },
+      { X86::CMOVLE16rr,      X86::CMOVLE16rm },
+      { X86::CMOVLE32rr,      X86::CMOVLE32rm },
+      { X86::CMOVLE64rr,      X86::CMOVLE64rm },
+      { X86::CMOVNE16rr,      X86::CMOVNE16rm },
+      { X86::CMOVNE32rr,      X86::CMOVNE32rm },
+      { X86::CMOVNE64rr,      X86::CMOVNE64rm },
+      { X86::CMOVNP16rr,      X86::CMOVNP16rm },
+      { X86::CMOVNP32rr,      X86::CMOVNP32rm },
+      { X86::CMOVNP64rr,      X86::CMOVNP64rm },
+      { X86::CMOVNS16rr,      X86::CMOVNS16rm },
+      { X86::CMOVNS32rr,      X86::CMOVNS32rm },
+      { X86::CMOVNS64rr,      X86::CMOVNS64rm },
+      { X86::CMOVP16rr,       X86::CMOVP16rm },
+      { X86::CMOVP32rr,       X86::CMOVP32rm },
+      { X86::CMOVP64rr,       X86::CMOVP64rm },
+      { X86::CMOVS16rr,       X86::CMOVS16rm },
+      { X86::CMOVS32rr,       X86::CMOVS32rm },
+      { X86::CMOVS64rr,       X86::CMOVS64rm },
+      { X86::DIVPDrr,         X86::DIVPDrm },
+      { X86::DIVPSrr,         X86::DIVPSrm },
+      { X86::DIVSDrr,         X86::DIVSDrm },
+      { X86::DIVSSrr,         X86::DIVSSrm },
+      { X86::HADDPDrr,        X86::HADDPDrm },
+      { X86::HADDPSrr,        X86::HADDPSrm },
+      { X86::HSUBPDrr,        X86::HSUBPDrm },
+      { X86::HSUBPSrr,        X86::HSUBPSrm },
+      { X86::IMUL16rr,        X86::IMUL16rm },
+      { X86::IMUL32rr,        X86::IMUL32rm },
+      { X86::MAXPDrr,         X86::MAXPDrm },
+      { X86::MAXPDrr_Int,     X86::MAXPDrm_Int },
+      { X86::MAXPSrr,         X86::MAXPSrm },
+      { X86::MAXPSrr_Int,     X86::MAXPSrm_Int },
+      { X86::MAXSDrr,         X86::MAXSDrm },
+      { X86::MAXSDrr_Int,     X86::MAXSDrm_Int },
+      { X86::MAXSSrr,         X86::MAXSSrm },
+      { X86::MAXSSrr_Int,     X86::MAXSSrm_Int },
+      { X86::MINPDrr,         X86::MINPDrm },
+      { X86::MINPDrr_Int,     X86::MINPDrm_Int },
+      { X86::MINPSrr,         X86::MINPSrm },
+      { X86::MINPSrr_Int,     X86::MINPSrm_Int },
+      { X86::MINSDrr,         X86::MINSDrm },
+      { X86::MINSDrr_Int,     X86::MINSDrm_Int },
+      { X86::MINSSrr,         X86::MINSSrm },
+      { X86::MINSSrr_Int,     X86::MINSSrm_Int },
+      { X86::MULPDrr,         X86::MULPDrm },
+      { X86::MULPSrr,         X86::MULPSrm },
+      { X86::MULSDrr,         X86::MULSDrm },
+      { X86::MULSSrr,         X86::MULSSrm },
+      { X86::OR16rr,          X86::OR16rm },
+      { X86::OR32rr,          X86::OR32rm },
+      { X86::OR64rr,          X86::OR64rm },
+      { X86::OR8rr,           X86::OR8rm },
+      { X86::ORPDrr,          X86::ORPDrm },
+      { X86::ORPSrr,          X86::ORPSrm },
+      { X86::PACKSSDWrr,      X86::PACKSSDWrm },
+      { X86::PACKSSWBrr,      X86::PACKSSWBrm },
+      { X86::PACKUSWBrr,      X86::PACKUSWBrm },
+      { X86::PADDBrr,         X86::PADDBrm },
+      { X86::PADDDrr,         X86::PADDDrm },
+      { X86::PADDQrr,         X86::PADDQrm },
+      { X86::PADDSBrr,        X86::PADDSBrm },
+      { X86::PADDSWrr,        X86::PADDSWrm },
+      { X86::PADDWrr,         X86::PADDWrm },
+      { X86::PANDNrr,         X86::PANDNrm },
+      { X86::PANDrr,          X86::PANDrm },
+      { X86::PAVGBrr,         X86::PAVGBrm },
+      { X86::PAVGWrr,         X86::PAVGWrm },
+      { X86::PCMPEQBrr,       X86::PCMPEQBrm },
+      { X86::PCMPEQDrr,       X86::PCMPEQDrm },
+      { X86::PCMPEQWrr,       X86::PCMPEQWrm },
+      { X86::PCMPGTBrr,       X86::PCMPGTBrm },
+      { X86::PCMPGTDrr,       X86::PCMPGTDrm },
+      { X86::PCMPGTWrr,       X86::PCMPGTWrm },
+      { X86::PINSRWrri,       X86::PINSRWrmi },
+      { X86::PMADDWDrr,       X86::PMADDWDrm },
+      { X86::PMAXSWrr,        X86::PMAXSWrm },
+      { X86::PMAXUBrr,        X86::PMAXUBrm },
+      { X86::PMINSWrr,        X86::PMINSWrm },
+      { X86::PMINUBrr,        X86::PMINUBrm },
+      { X86::PMULHUWrr,       X86::PMULHUWrm },
+      { X86::PMULHWrr,        X86::PMULHWrm },
+      { X86::PMULLWrr,        X86::PMULLWrm },
+      { X86::PMULUDQrr,       X86::PMULUDQrm },
+      { X86::PORrr,           X86::PORrm },
+      { X86::PSADBWrr,        X86::PSADBWrm },
+      { X86::PSLLDrr,         X86::PSLLDrm },
+      { X86::PSLLQrr,         X86::PSLLQrm },
+      { X86::PSLLWrr,         X86::PSLLWrm },
+      { X86::PSRADrr,         X86::PSRADrm },
+      { X86::PSRAWrr,         X86::PSRAWrm },
+      { X86::PSRLDrr,         X86::PSRLDrm },
+      { X86::PSRLQrr,         X86::PSRLQrm },
+      { X86::PSRLWrr,         X86::PSRLWrm },
+      { X86::PSUBBrr,         X86::PSUBBrm },
+      { X86::PSUBDrr,         X86::PSUBDrm },
+      { X86::PSUBSBrr,        X86::PSUBSBrm },
+      { X86::PSUBSWrr,        X86::PSUBSWrm },
+      { X86::PSUBWrr,         X86::PSUBWrm },
+      { X86::PUNPCKHBWrr,     X86::PUNPCKHBWrm },
+      { X86::PUNPCKHDQrr,     X86::PUNPCKHDQrm },
+      { X86::PUNPCKHQDQrr,    X86::PUNPCKHQDQrm },
+      { X86::PUNPCKHWDrr,     X86::PUNPCKHWDrm },
+      { X86::PUNPCKLBWrr,     X86::PUNPCKLBWrm },
+      { X86::PUNPCKLDQrr,     X86::PUNPCKLDQrm },
+      { X86::PUNPCKLQDQrr,    X86::PUNPCKLQDQrm },
+      { X86::PUNPCKLWDrr,     X86::PUNPCKLWDrm },
+      { X86::PXORrr,          X86::PXORrm },
+      { X86::RCPPSr,          X86::RCPPSm },
+      { X86::RCPPSr_Int,      X86::RCPPSm_Int },
+      { X86::RSQRTPSr,        X86::RSQRTPSm },
+      { X86::RSQRTPSr_Int,    X86::RSQRTPSm_Int },
+      { X86::RSQRTSSr,        X86::RSQRTSSm },
+      { X86::RSQRTSSr_Int,    X86::RSQRTSSm_Int },
+      { X86::SBB32rr,         X86::SBB32rm },
+      { X86::SBB64rr,         X86::SBB64rm },
+      { X86::SHUFPDrri,       X86::SHUFPDrmi },
+      { X86::SHUFPSrri,       X86::SHUFPSrmi },
+      { X86::SQRTPDr,         X86::SQRTPDm },
+      { X86::SQRTPDr_Int,     X86::SQRTPDm_Int },
+      { X86::SQRTPSr,         X86::SQRTPSm },
+      { X86::SQRTPSr_Int,     X86::SQRTPSm_Int },
+      { X86::SQRTSDr,         X86::SQRTSDm },
+      { X86::SQRTSDr_Int,     X86::SQRTSDm_Int },
+      { X86::SQRTSSr,         X86::SQRTSSm },
+      { X86::SQRTSSr_Int,     X86::SQRTSSm_Int },
+      { X86::SUB16rr,         X86::SUB16rm },
+      { X86::SUB32rr,         X86::SUB32rm },
+      { X86::SUB64rr,         X86::SUB64rm },
+      { X86::SUB8rr,          X86::SUB8rm },
+      { X86::SUBPDrr,         X86::SUBPDrm },
+      { X86::SUBPSrr,         X86::SUBPSrm },
+      { X86::SUBSDrr,         X86::SUBSDrm },
+      { X86::SUBSSrr,         X86::SUBSSrm },
+      // FIXME: TEST*rr -> swapped operand of TEST*mr.
+      { X86::UNPCKHPDrr,      X86::UNPCKHPDrm },
+      { X86::UNPCKHPSrr,      X86::UNPCKHPSrm },
+      { X86::UNPCKLPDrr,      X86::UNPCKLPDrm },
+      { X86::UNPCKLPSrr,      X86::UNPCKLPSrm },
+      { X86::XOR16rr,         X86::XOR16rm },
+      { X86::XOR32rr,         X86::XOR32rm },
+      { X86::XOR64rr,         X86::XOR64rm },
+      { X86::XOR8rr,          X86::XOR8rm },
+      { X86::XORPDrr,         X86::XORPDrm },
+      { X86::XORPSrr,         X86::XORPSrm }
+    };
+    ASSERT_SORTED(OpcodeTable);
+    OpcodeTablePtr = OpcodeTable;
+    OpcodeTableSize = ARRAY_SIZE(OpcodeTable);
+  }
+  
+  // If table selected...
+  if (OpcodeTablePtr) {
+    // Find the Opcode to fuse
+    unsigned fromOpcode = MI->getOpcode();
+    // Lookup fromOpcode in table
+    if (const TableEntry *Entry = TableLookup(OpcodeTablePtr, OpcodeTableSize,
+                                              fromOpcode)) {
+      if (isTwoAddrFold)
+        NewMI = FuseTwoAddrInst(Entry->to, FrameIndex, MI, TII);
+      else
+        NewMI = FuseInst(Entry->to, i, FrameIndex, MI, TII);
+      NewMI->copyKillDeadInfo(MI);
+      return NewMI;
+    }
+  }
+  
+  // No fusion 
+  if (PrintFailedFusing)
+    cerr << "We failed to fuse ("
+         << ((i == 1) ? "r" : "s") << "): " << *MI;
+  return NULL;
+}
+
+
+const unsigned *X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
+                                                                         const {
+  static const unsigned CalleeSavedRegs32Bit[] = {
+    X86::ESI, X86::EDI, X86::EBX, X86::EBP,  0
+  };
+
+  static const unsigned CalleeSavedRegs32EHRet[] = {
+    X86::EAX, X86::EDX, X86::ESI, X86::EDI, X86::EBX, X86::EBP,  0
+  };
+
+  static const unsigned CalleeSavedRegs64Bit[] = {
+    X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
+  };
+
+  if (Is64Bit)
+    return CalleeSavedRegs64Bit;
+  else {
+    if (MF) {
+        MachineFrameInfo *MFI = MF->getFrameInfo();
+        MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+        if (MMI && MMI->callsEHReturn())
+          return CalleeSavedRegs32EHRet;
+    }
+    return CalleeSavedRegs32Bit;
+  }
+}
+
+const TargetRegisterClass* const*
+X86RegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+  static const TargetRegisterClass * const CalleeSavedRegClasses32Bit[] = {
+    &X86::GR32RegClass, &X86::GR32RegClass,
+    &X86::GR32RegClass, &X86::GR32RegClass,  0
+  };
+  static const TargetRegisterClass * const CalleeSavedRegClasses32EHRet[] = {
+    &X86::GR32RegClass, &X86::GR32RegClass,
+    &X86::GR32RegClass, &X86::GR32RegClass,
+    &X86::GR32RegClass, &X86::GR32RegClass,  0
+  };
+  static const TargetRegisterClass * const CalleeSavedRegClasses64Bit[] = {
+    &X86::GR64RegClass, &X86::GR64RegClass,
+    &X86::GR64RegClass, &X86::GR64RegClass,
+    &X86::GR64RegClass, &X86::GR64RegClass, 0
+  };
+
+  if (Is64Bit)
+    return CalleeSavedRegClasses64Bit;
+  else {
+    if (MF) {
+        MachineFrameInfo *MFI = MF->getFrameInfo();
+        MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+        if (MMI && MMI->callsEHReturn())
+          return CalleeSavedRegClasses32EHRet;
+    }
+    return CalleeSavedRegClasses32Bit;
+  }
+
+}
+
+BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  Reserved.set(X86::RSP);
+  Reserved.set(X86::ESP);
+  Reserved.set(X86::SP);
+  Reserved.set(X86::SPL);
+  if (hasFP(MF)) {
+    Reserved.set(X86::RBP);
+    Reserved.set(X86::EBP);
+    Reserved.set(X86::BP);
+    Reserved.set(X86::BPL);
+  }
+  return Reserved;
+}
+
+//===----------------------------------------------------------------------===//
+// Stack Frame Processing methods
+//===----------------------------------------------------------------------===//
+
+// hasFP - Return true if the specified function should have a dedicated frame
+// pointer register.  This is true if the function has variable sized allocas or
+// if frame pointer elimination is disabled.
+//
+bool X86RegisterInfo::hasFP(const MachineFunction &MF) const {
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+
+  return (NoFramePointerElim || 
+          MF.getFrameInfo()->hasVarSizedObjects() ||
+          MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
+          (MMI && MMI->callsUnwindInit()));
+}
+
+void X86RegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  if (hasFP(MF)) {
+    // If we have a frame pointer, turn the adjcallstackdown instruction into a
+    // 'sub ESP, <amt>' and the adjcallstackup instruction into an 'add ESP,
+    // <amt>'.
+    MachineInstr *Old = I;
+    uint64_t Amount = Old->getOperand(0).getImm();
+    if (Amount != 0) {
+      // We need to keep the stack aligned properly.  To do this, we round the
+      // amount of space needed for the outgoing arguments up to the next
+      // alignment boundary.
+      unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
+      Amount = (Amount+Align-1)/Align*Align;
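+      // For example, with a 16-byte stack alignment an Amount of 20 becomes
+      // (20 + 15) / 16 * 16 == 32.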
+
+      MachineInstr *New = 0;
+      if (Old->getOpcode() == X86::ADJCALLSTACKDOWN) {
+        New=BuildMI(TII.get(Is64Bit ? X86::SUB64ri32 : X86::SUB32ri), StackPtr)
+          .addReg(StackPtr).addImm(Amount);
+      } else {
+        assert(Old->getOpcode() == X86::ADJCALLSTACKUP);
+        // factor out the amount the callee already popped.
+        uint64_t CalleeAmt = Old->getOperand(1).getImm();
+        Amount -= CalleeAmt;
+        if (Amount) {
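+          // Use the sign-extended 8-bit immediate form when the value fits.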
+          unsigned Opc = (Amount < 128) ?
+            (Is64Bit ? X86::ADD64ri8 : X86::ADD32ri8) :
+            (Is64Bit ? X86::ADD64ri32 : X86::ADD32ri);
+          New = BuildMI(TII.get(Opc),  StackPtr)
+                        .addReg(StackPtr).addImm(Amount);
+        }
+      }
+
+      // Replace the pseudo instruction with a new instruction...
+      if (New) MBB.insert(I, New);
+    }
+  } else if (I->getOpcode() == X86::ADJCALLSTACKUP) {
+    // If we are performing frame pointer elimination and if the callee pops
+    // something off the stack pointer, add it back.  We do this until we have
+    // more advanced stack pointer tracking ability.
+    if (uint64_t CalleeAmt = I->getOperand(1).getImm()) {
+      unsigned Opc = (CalleeAmt < 128) ?
+        (Is64Bit ? X86::SUB64ri8 : X86::SUB32ri8) :
+        (Is64Bit ? X86::SUB64ri32 : X86::SUB32ri);
+      MachineInstr *New =
+        BuildMI(TII.get(Opc), StackPtr).addReg(StackPtr).addImm(CalleeAmt);
+      MBB.insert(I, New);
+    }
+  }
+
+  MBB.erase(I);
+}
+
+void X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                          int SPAdj, RegScavenger *RS) const{
+  assert(SPAdj == 0 && "Unexpected");
+
+  unsigned i = 0;
+  MachineInstr &MI = *II;
+  MachineFunction &MF = *MI.getParent()->getParent();
+  while (!MI.getOperand(i).isFrameIndex()) {
+    ++i;
+    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+  }
+
+  int FrameIndex = MI.getOperand(i).getFrameIndex();
+  // This must be part of a four-operand memory reference.  Replace the
+  // FrameIndex with the base register (EBP or ESP), then fold the frame
+  // object offset into the existing displacement.
+  MI.getOperand(i).ChangeToRegister(hasFP(MF) ? FramePtr : StackPtr, false);
+
+  // Now add the frame object offset to the offset from EBP.
+  int64_t Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
+                   MI.getOperand(i+3).getImm()+SlotSize;
+
+  if (!hasFP(MF))
+    Offset += MF.getFrameInfo()->getStackSize();
+  else
+    Offset += SlotSize;  // Skip the saved EBP
+
+  MI.getOperand(i+3).ChangeToImmediate(Offset);
+}
+
+void
+X86RegisterInfo::processFunctionBeforeFrameFinalized(MachineFunction &MF) const{
+  if (hasFP(MF)) {
+    // Create a frame entry for the EBP register that must be saved.
+    int FrameIdx = MF.getFrameInfo()->CreateFixedObject(SlotSize,
+                                                        (int)SlotSize * -2);
+    assert(FrameIdx == MF.getFrameInfo()->getObjectIndexBegin() &&
+           "Slot for EBP register must be last in order to be found!");
+  }
+}
+
+/// emitSPUpdate - Emit a series of instructions to increment / decrement the
+/// stack pointer by a constant value.
+static
+void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+                  unsigned StackPtr, int64_t NumBytes, bool Is64Bit,
+                  const TargetInstrInfo &TII) {
+  bool isSub = NumBytes < 0;
+  uint64_t Offset = isSub ? -NumBytes : NumBytes;
+  unsigned Opc = isSub
+    ? ((Offset < 128) ?
+       (Is64Bit ? X86::SUB64ri8 : X86::SUB32ri8) :
+       (Is64Bit ? X86::SUB64ri32 : X86::SUB32ri))
+    : ((Offset < 128) ?
+       (Is64Bit ? X86::ADD64ri8 : X86::ADD32ri8) :
+       (Is64Bit ? X86::ADD64ri32 : X86::ADD32ri));
+  uint64_t Chunk = (1LL << 31) - 1;
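+  // The immediate is limited to 32 bits, so adjustments of 2GB or more are
+  // split across multiple instructions by the loop below.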
+
+  while (Offset) {
+    uint64_t ThisVal = (Offset > Chunk) ? Chunk : Offset;
+    BuildMI(MBB, MBBI, TII.get(Opc), StackPtr).addReg(StackPtr).addImm(ThisVal);
+    Offset -= ThisVal;
+  }
+}
+
+void X86RegisterInfo::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front();   // Prolog goes in entry BB
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
+  const Function* Fn = MF.getFunction();
+  const X86Subtarget* Subtarget = &MF.getTarget().getSubtarget<X86Subtarget>();
+  MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  
+  // Prepare for frame info.
+  unsigned FrameLabelId = 0, StartLabelId = 0;
+  
+  // Get the number of bytes to allocate from the FrameInfo
+  uint64_t StackSize = MFI->getStackSize();
+  uint64_t NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();
+
+  if (MMI && MMI->needsFrameInfo()) {
+    // Mark function start
+    StartLabelId = MMI->NextLabelID();
+    BuildMI(MBB, MBBI, TII.get(X86::LABEL)).addImm(StartLabelId);
+  }
+
+  if (hasFP(MF)) {
+    // Get the offset of the stack slot for the EBP register... which is
+    // guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
+    // Update the frame offset adjustment.
+    MFI->setOffsetAdjustment(SlotSize-NumBytes);
+
+    // Save EBP into the appropriate stack slot...
+    BuildMI(MBB, MBBI, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
+      .addReg(FramePtr);
+    NumBytes -= SlotSize;
+
+    if (MMI && MMI->needsFrameInfo()) {
+      // Mark effective beginning of when frame pointer becomes valid.
+      FrameLabelId = MMI->NextLabelID();
+      BuildMI(MBB, MBBI, TII.get(X86::LABEL)).addImm(FrameLabelId);
+    }
+
+    // Update EBP with the new base value...
+    BuildMI(MBB, MBBI, TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), FramePtr)
+      .addReg(StackPtr);
+  }
+  
+  unsigned ReadyLabelId = 0;
+  if (MMI && MMI->needsFrameInfo()) {
+    // Mark effective beginning of when frame pointer is ready.
+    ReadyLabelId = MMI->NextLabelID();
+    BuildMI(MBB, MBBI, TII.get(X86::LABEL)).addImm(ReadyLabelId);
+  }
+
+  // Skip the callee-saved push instructions.
+  while (MBBI != MBB.end() &&
+         (MBBI->getOpcode() == X86::PUSH32r ||
+          MBBI->getOpcode() == X86::PUSH64r))
+    ++MBBI;
+
+  if (NumBytes) {   // adjust stack pointer: ESP -= numbytes
+    if (NumBytes >= 4096 && Subtarget->isTargetCygMing()) {
+      // Check whether EAX is live-in for this function.
+      bool isEAXAlive = false;
+      for (MachineFunction::livein_iterator II = MF.livein_begin(),
+             EE = MF.livein_end(); (II != EE) && !isEAXAlive; ++II) {
+        unsigned Reg = II->first;
+        isEAXAlive = (Reg == X86::EAX || Reg == X86::AX ||
+                      Reg == X86::AH || Reg == X86::AL);
+      }
+
+      // The function prologue calls _alloca to probe the stack when allocating
+      // more than 4K bytes in one go. Touching the stack at 4K increments is
+      // necessary to ensure that the guard pages used by the OS virtual memory
+      // manager are allocated in the correct sequence.
+      if (!isEAXAlive) {
+        BuildMI(MBB, MBBI, TII.get(X86::MOV32ri), X86::EAX).addImm(NumBytes);
+        BuildMI(MBB, MBBI, TII.get(X86::CALLpcrel32))
+          .addExternalSymbol("_alloca");
+      } else {
+        // Save EAX
+        BuildMI(MBB, MBBI, TII.get(X86::PUSH32r), X86::EAX);
+        // Allocate NumBytes-4 bytes on stack. We'll also use 4 already
+        // allocated bytes for EAX.
+        BuildMI(MBB, MBBI, TII.get(X86::MOV32ri), X86::EAX).addImm(NumBytes-4);
+        BuildMI(MBB, MBBI, TII.get(X86::CALLpcrel32))
+          .addExternalSymbol("_alloca");
+        // Restore EAX
+        MachineInstr *MI = addRegOffset(BuildMI(TII.get(X86::MOV32rm),X86::EAX),
+                                        StackPtr, NumBytes-4);
+        MBB.insert(MBBI, MI);
+      }
+    } else {
+      // If there is an ADD32ri or SUB32ri of ESP immediately after this
+      // instruction, merge the two instructions.
+      if (MBBI != MBB.end()) {
+        MachineBasicBlock::iterator NI = next(MBBI);
+        unsigned Opc = MBBI->getOpcode();
+        if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
+             Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
+            MBBI->getOperand(0).getReg() == StackPtr) {
+          NumBytes -= MBBI->getOperand(2).getImm();
+          MBB.erase(MBBI);
+          MBBI = NI;
+        } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+                    Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
+                   MBBI->getOperand(0).getReg() == StackPtr) {
+          NumBytes += MBBI->getOperand(2).getImm();
+          MBB.erase(MBBI);
+          MBBI = NI;
+        }
+      }
+
+      if (NumBytes)
+        emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, TII);
+    }
+  }
+
+  if (MMI && MMI->needsFrameInfo()) {
+    std::vector<MachineMove> &Moves = MMI->getFrameMoves();
+    const TargetAsmInfo *TAI = MF.getTarget().getTargetAsmInfo();
+
+    // Calculate the number of bytes used to store the return address.
+    int stackGrowth =
+      (MF.getTarget().getFrameInfo()->getStackGrowthDirection() ==
+       TargetFrameInfo::StackGrowsUp ?
+       TAI->getAddressSize() : -TAI->getAddressSize());
+
+    if (StackSize) {
+      // Show update of SP.
+      if (hasFP(MF)) {
+        // Adjust SP
+        MachineLocation SPDst(MachineLocation::VirtualFP);
+        MachineLocation SPSrc(MachineLocation::VirtualFP, 2*stackGrowth);
+        Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
+      } else {
+        MachineLocation SPDst(MachineLocation::VirtualFP);
+        MachineLocation SPSrc(MachineLocation::VirtualFP, -StackSize+stackGrowth);
+        Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
+      }
+    } else {
+      //FIXME: Verify & implement for FP
+      MachineLocation SPDst(StackPtr);
+      MachineLocation SPSrc(StackPtr, stackGrowth);
+      Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
+    }
+            
+    // Add callee saved registers to move list.
+    const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+    for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+      int64_t Offset = MFI->getObjectOffset(CSI[I].getFrameIdx());
+      unsigned Reg = CSI[I].getReg();
+      MachineLocation CSDst(MachineLocation::VirtualFP, Offset);
+      MachineLocation CSSrc(Reg);
+      Moves.push_back(MachineMove(FrameLabelId, CSDst, CSSrc));
+    }
+    
+    if (hasFP(MF)) {
+      // Save FP
+      MachineLocation FPDst(MachineLocation::VirtualFP, 2*stackGrowth);
+      MachineLocation FPSrc(FramePtr);
+      Moves.push_back(MachineMove(ReadyLabelId, FPDst, FPSrc));
+    }
+    
+    MachineLocation FPDst(hasFP(MF) ? FramePtr : StackPtr);
+    MachineLocation FPSrc(MachineLocation::VirtualFP);
+    Moves.push_back(MachineMove(ReadyLabelId, FPDst, FPSrc));
+  }
+
+  // If it's main() on Cygwin/Mingw32, we should align the stack as well.
+  if (Fn->hasExternalLinkage() && Fn->getName() == "main" &&
+      Subtarget->isTargetCygMing()) {
+    BuildMI(MBB, MBBI, TII.get(X86::AND32ri), X86::ESP)
+                .addReg(X86::ESP).addImm(-Align);
+
+    // Probe the stack
+    BuildMI(MBB, MBBI, TII.get(X86::MOV32ri), X86::EAX).addImm(Align);
+    BuildMI(MBB, MBBI, TII.get(X86::CALLpcrel32)).addExternalSymbol("_alloca");
+  }
+}
+
+void X86RegisterInfo::emitEpilogue(MachineFunction &MF,
+                                   MachineBasicBlock &MBB) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+  MachineBasicBlock::iterator MBBI = prior(MBB.end());
+  unsigned RetOpcode = MBBI->getOpcode();
+
+  switch (RetOpcode) {
+  case X86::RET:
+  case X86::RETI:
+  case X86::EH_RETURN:
+  case X86::TAILJMPd:
+  case X86::TAILJMPr:
+  case X86::TAILJMPm: break;  // These are ok
+  default:
+    assert(0 && "Can only insert epilog into returning blocks");
+  }
+
+  // Get the number of bytes to allocate from the FrameInfo
+  uint64_t StackSize = MFI->getStackSize();
+  unsigned CSSize = X86FI->getCalleeSavedFrameSize();
+  uint64_t NumBytes = StackSize - CSSize;
+
+  if (hasFP(MF)) {
+    // pop EBP.
+    BuildMI(MBB, MBBI, TII.get(Is64Bit ? X86::POP64r : X86::POP32r), FramePtr);
+    NumBytes -= SlotSize;
+  }
+
+  // Skip the callee-saved pop instructions.
+  while (MBBI != MBB.begin()) {
+    MachineBasicBlock::iterator PI = prior(MBBI);      
+    if (PI->getOpcode() != X86::POP32r && PI->getOpcode() != X86::POP64r)
+      break;
+    --MBBI;
+  }
+
+  // If dynamic alloca is used, then reset esp to point to the last
+  // callee-saved slot before popping them off!
+  if (MFI->hasVarSizedObjects()) {
+    unsigned Opc = Is64Bit ? X86::LEA64r : X86::LEA32r;
+    MachineInstr *MI = addRegOffset(BuildMI(TII.get(Opc), StackPtr),
+                                    FramePtr, -CSSize);
+    MBB.insert(MBBI, MI);
+    NumBytes = 0;
+  }
+
+  if (NumBytes) {    // adjust stack pointer back: ESP += numbytes
+    // If there is an ADD32ri or SUB32ri of ESP immediately before this
+    // instruction, merge the two instructions.
+    if (MBBI != MBB.begin()) {
+      MachineBasicBlock::iterator PI = prior(MBBI);
+      unsigned Opc = PI->getOpcode();
+      if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
+           Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
+          PI->getOperand(0).getReg() == StackPtr) {
+        NumBytes += PI->getOperand(2).getImm();
+        MBB.erase(PI);
+      } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+                  Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
+                 PI->getOperand(0).getReg() == StackPtr) {
+        NumBytes -= PI->getOperand(2).getImm();
+        MBB.erase(PI);
+      }
+    }
+
+    if (NumBytes)
+      emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, TII);
+  }
+
+  // We're returning from function via eh_return.
+  if (RetOpcode == X86::EH_RETURN) {
+    MBBI = prior(MBB.end());
+    MachineOperand &DestAddr  = MBBI->getOperand(0);
+    assert(DestAddr.isReg() && "Offset should be in register!");
+    BuildMI(MBB, MBBI, TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr),StackPtr).
+      addReg(DestAddr.getReg());
+  }
+}
+
+unsigned X86RegisterInfo::getRARegister() const {
+  if (Is64Bit)
+    return X86::RIP;  // Should have dwarf #16
+  else
+    return X86::EIP;  // Should have dwarf #8
+}
+
+unsigned X86RegisterInfo::getFrameRegister(MachineFunction &MF) const {
+  return hasFP(MF) ? FramePtr : StackPtr;
+}
+
+void X86RegisterInfo::getInitialFrameState(std::vector<MachineMove> &Moves)
+                                                                         const {
+  // Calculate the number of bytes used to store the return address.
+  int stackGrowth = (Is64Bit ? -8 : -4);
+
+  // Initial state of the frame pointer is esp+4 (rsp+8 in 64-bit mode).
+  MachineLocation Dst(MachineLocation::VirtualFP);
+  MachineLocation Src(StackPtr, stackGrowth);
+  Moves.push_back(MachineMove(0, Dst, Src));
+
+  // Add return address to move list
+  MachineLocation CSDst(StackPtr, stackGrowth);
+  MachineLocation CSSrc(getRARegister());
+  Moves.push_back(MachineMove(0, CSDst, CSSrc));
+}
+
+unsigned X86RegisterInfo::getEHExceptionRegister() const {
+  assert(0 && "What is the exception register");
+  return 0;
+}
+
+unsigned X86RegisterInfo::getEHHandlerRegister() const {
+  assert(0 && "What is the exception handler register");
+  return 0;
+}
+
+namespace llvm {
+unsigned getX86SubSuperRegister(unsigned Reg, MVT::ValueType VT, bool High) {
+  switch (VT) {
+  default: return Reg;
+  case MVT::i8:
+    if (High) {
+      switch (Reg) {
+      default: return 0;
+      case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+        return X86::AH;
+      case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+        return X86::DH;
+      case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+        return X86::CH;
+      case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+        return X86::BH;
+      }
+    } else {
+      switch (Reg) {
+      default: return 0;
+      case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+        return X86::AL;
+      case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+        return X86::DL;
+      case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+        return X86::CL;
+      case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+        return X86::BL;
+      case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+        return X86::SIL;
+      case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+        return X86::DIL;
+      case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+        return X86::BPL;
+      case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+        return X86::SPL;
+      case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+        return X86::R8B;
+      case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+        return X86::R9B;
+      case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+        return X86::R10B;
+      case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+        return X86::R11B;
+      case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+        return X86::R12B;
+      case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+        return X86::R13B;
+      case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+        return X86::R14B;
+      case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+        return X86::R15B;
+      }
+    }
+  case MVT::i16:
+    switch (Reg) {
+    default: return Reg;
+    case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+      return X86::AX;
+    case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+      return X86::DX;
+    case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+      return X86::CX;
+    case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+      return X86::BX;
+    case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+      return X86::SI;
+    case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+      return X86::DI;
+    case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+      return X86::BP;
+    case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+      return X86::SP;
+    case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+      return X86::R8W;
+    case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+      return X86::R9W;
+    case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+      return X86::R10W;
+    case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+      return X86::R11W;
+    case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+      return X86::R12W;
+    case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+      return X86::R13W;
+    case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+      return X86::R14W;
+    case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+      return X86::R15W;
+    }
+  case MVT::i32:
+    switch (Reg) {
+    default: return Reg;
+    case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+      return X86::EAX;
+    case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+      return X86::EDX;
+    case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+      return X86::ECX;
+    case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+      return X86::EBX;
+    case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+      return X86::ESI;
+    case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+      return X86::EDI;
+    case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+      return X86::EBP;
+    case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+      return X86::ESP;
+    case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+      return X86::R8D;
+    case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+      return X86::R9D;
+    case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+      return X86::R10D;
+    case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+      return X86::R11D;
+    case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+      return X86::R12D;
+    case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+      return X86::R13D;
+    case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+      return X86::R14D;
+    case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+      return X86::R15D;
+    }
+  case MVT::i64:
+    switch (Reg) {
+    default: return Reg;
+    case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
+      return X86::RAX;
+    case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
+      return X86::RDX;
+    case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
+      return X86::RCX;
+    case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
+      return X86::RBX;
+    case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+      return X86::RSI;
+    case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+      return X86::RDI;
+    case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+      return X86::RBP;
+    case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+      return X86::RSP;
+    case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
+      return X86::R8;
+    case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
+      return X86::R9;
+    case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
+      return X86::R10;
+    case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
+      return X86::R11;
+    case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
+      return X86::R12;
+    case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
+      return X86::R13;
+    case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
+      return X86::R14;
+    case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
+      return X86::R15;
+    }
+  }
+
+  return Reg;
+}
+}
+
+#include "X86GenRegisterInfo.inc"
+
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
new file mode 100644
index 0000000..ab9e33f
--- /dev/null
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -0,0 +1,130 @@
+//===- X86RegisterInfo.h - X86 Register Information Impl --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the MRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86REGISTERINFO_H
+#define X86REGISTERINFO_H
+
+#include "llvm/Target/MRegisterInfo.h"
+#include "X86GenRegisterInfo.h.inc"
+
+namespace llvm {
+  class Type;
+  class TargetInstrInfo;
+  class X86TargetMachine;
+
+class X86RegisterInfo : public X86GenRegisterInfo {
+public:
+  X86TargetMachine &TM;
+  const TargetInstrInfo &TII;
+
+private:
+  /// Is64Bit - Is the target 64-bit?
+  bool Is64Bit;
+
+  /// SlotSize - Stack slot size in bytes.
+  unsigned SlotSize;
+
+  /// StackPtr - X86 physical register used as stack ptr.
+  unsigned StackPtr;
+
+  /// FramePtr - X86 physical register used as frame ptr.
+  unsigned FramePtr;
+
+public:
+  X86RegisterInfo(X86TargetMachine &tm, const TargetInstrInfo &tii);
+
+  /// Code Generation virtual methods...
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI) const;
+
+  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI) const;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI,
+                           unsigned SrcReg, int FrameIndex,
+                           const TargetRegisterClass *RC) const;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC) const;
+
+  void copyRegToReg(MachineBasicBlock &MBB,
+                    MachineBasicBlock::iterator MI,
+                    unsigned DestReg, unsigned SrcReg,
+                    const TargetRegisterClass *RC) const;
+ 
+  void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                     unsigned DestReg, const MachineInstr *Orig) const;
+
+  /// foldMemoryOperand - If this target supports it, fold a load or store of
+  /// the specified stack slot into the specified machine instruction for the
+  /// specified operand.  If this is possible, the target should perform the
+  /// folding and return true, otherwise it should return false.  If it folds
+  /// the instruction, it is likely that the MachineInstruction the iterator
+  /// references has been changed.
+  MachineInstr* foldMemoryOperand(MachineInstr* MI,
+                                  unsigned OpNum,
+                                  int FrameIndex) const;
+
+  /// getCalleeSavedRegs - Return a null-terminated list of all of the
+  /// callee-save registers on this target.
+  const unsigned *getCalleeSavedRegs(const MachineFunction* MF = 0) const;
+
+  /// getCalleeSavedRegClasses - Return a null-terminated list of the preferred
+  /// register classes to spill each callee-saved register with.  The order and
+  /// length of this list match the getCalleeSavedRegs() list.
+  const TargetRegisterClass* const* getCalleeSavedRegClasses(
+                                     const MachineFunction *MF = 0) const;
+
+  /// getReservedRegs - Returns a bitset indexed by physical register number
+  /// indicating if a register is a special register that has particular uses and
+  /// should be considered unavailable at all times, e.g. SP, RA. This is used
+  /// by the register scavenger to determine which registers are free.
+  BitVector getReservedRegs(const MachineFunction &MF) const;
+
+  bool hasFP(const MachineFunction &MF) const;
+
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MI) const;
+
+  void eliminateFrameIndex(MachineBasicBlock::iterator MI,
+                           int SPAdj, RegScavenger *RS = NULL) const;
+
+  void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
+
+  void emitPrologue(MachineFunction &MF) const;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+  // Debug information queries.
+  unsigned getRARegister() const;
+  unsigned getFrameRegister(MachineFunction &MF) const;
+  void getInitialFrameState(std::vector<MachineMove> &Moves) const;
+
+  // Exception handling queries.
+  unsigned getEHExceptionRegister() const;
+  unsigned getEHHandlerRegister() const;
+};
+
+// getX86SubSuperRegister - X86 utility function. It returns the sub or super
+// register of a specific X86 register.
+// e.g. getX86SubSuperRegister(X86::EAX, MVT::i16) returns X86::AX.
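+// Likewise, getX86SubSuperRegister(X86::RAX, MVT::i8, true) returns X86::AH.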
+unsigned getX86SubSuperRegister(unsigned, MVT::ValueType, bool High=false);
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
new file mode 100644
index 0000000..a1e7bb9
--- /dev/null
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -0,0 +1,468 @@
+//===- X86RegisterInfo.td - Describe the X86 Register File ------*- C++ -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 Register file, defining the registers themselves,
+// aliases between the registers, and the register classes built out of the
+// registers.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Register definitions...
+//
+let Namespace = "X86" in {
+
+  // In the register alias definitions below, we define which registers alias
+  // which others.  We only specify which registers the small registers alias,
+  // because the register file generator is smart enough to figure out that
+  // AL aliases AX if we tell it that AX aliased AL (for example).
+
+  // FIXME: X86-64 has different Dwarf numbers.
+  // 8-bit registers
+  // Low registers
+  def AL : Register<"AL">, DwarfRegNum<0>;
+  def CL : Register<"CL">, DwarfRegNum<1>;
+  def DL : Register<"DL">, DwarfRegNum<2>;
+  def BL : Register<"BL">, DwarfRegNum<3>;
+
+  // X86-64 only
+  def SIL : Register<"SIL">, DwarfRegNum<4>;
+  def DIL : Register<"DIL">, DwarfRegNum<5>;
+  def BPL : Register<"BPL">, DwarfRegNum<6>;
+  def SPL : Register<"SPL">, DwarfRegNum<7>;
+  def R8B  : Register<"R8B">,  DwarfRegNum<8>;
+  def R9B  : Register<"R9B">,  DwarfRegNum<9>;
+  def R10B : Register<"R10B">, DwarfRegNum<10>;
+  def R11B : Register<"R11B">, DwarfRegNum<11>;
+  def R12B : Register<"R12B">, DwarfRegNum<12>;
+  def R13B : Register<"R13B">, DwarfRegNum<13>;
+  def R14B : Register<"R14B">, DwarfRegNum<14>;
+  def R15B : Register<"R15B">, DwarfRegNum<15>;
+
+  // High registers X86-32 only
+  def AH : Register<"AH">, DwarfRegNum<0>;
+  def CH : Register<"CH">, DwarfRegNum<1>;
+  def DH : Register<"DH">, DwarfRegNum<2>;
+  def BH : Register<"BH">, DwarfRegNum<3>;
+
+  // 16-bit registers
+  def AX : RegisterWithSubRegs<"AX", [AH,AL]>, DwarfRegNum<0>;
+  def CX : RegisterWithSubRegs<"CX", [CH,CL]>, DwarfRegNum<1>;
+  def DX : RegisterWithSubRegs<"DX", [DH,DL]>, DwarfRegNum<2>;
+  def BX : RegisterWithSubRegs<"BX", [BH,BL]>, DwarfRegNum<3>;
+  def SP : RegisterWithSubRegs<"SP", [SPL]>, DwarfRegNum<4>;
+  def BP : RegisterWithSubRegs<"BP", [BPL]>, DwarfRegNum<5>;
+  def SI : RegisterWithSubRegs<"SI", [SIL]>, DwarfRegNum<6>;
+  def DI : RegisterWithSubRegs<"DI", [DIL]>, DwarfRegNum<7>;
+  def IP : Register<"IP">, DwarfRegNum<8>;  
+  
+  // X86-64 only
+  def R8W  : RegisterWithSubRegs<"R8W", [R8B]>, DwarfRegNum<8>;
+  def R9W  : RegisterWithSubRegs<"R9W", [R9B]>, DwarfRegNum<9>;
+  def R10W : RegisterWithSubRegs<"R10W", [R10B]>, DwarfRegNum<10>;
+  def R11W : RegisterWithSubRegs<"R11W", [R11B]>, DwarfRegNum<11>;
+  def R12W : RegisterWithSubRegs<"R12W", [R12B]>, DwarfRegNum<12>;
+  def R13W : RegisterWithSubRegs<"R13W", [R13B]>, DwarfRegNum<13>;
+  def R14W : RegisterWithSubRegs<"R14W", [R14B]>, DwarfRegNum<14>;
+  def R15W : RegisterWithSubRegs<"R15W", [R15B]>, DwarfRegNum<15>;
+
+  // 32-bit registers
+  def EAX : RegisterWithSubRegs<"EAX", [AX]>, DwarfRegNum<0>;
+  def ECX : RegisterWithSubRegs<"ECX", [CX]>, DwarfRegNum<1>;
+  def EDX : RegisterWithSubRegs<"EDX", [DX]>, DwarfRegNum<2>;
+  def EBX : RegisterWithSubRegs<"EBX", [BX]>, DwarfRegNum<3>;
+  def ESP : RegisterWithSubRegs<"ESP", [SP]>, DwarfRegNum<4>;
+  def EBP : RegisterWithSubRegs<"EBP", [BP]>, DwarfRegNum<5>;
+  def ESI : RegisterWithSubRegs<"ESI", [SI]>, DwarfRegNum<6>;
+  def EDI : RegisterWithSubRegs<"EDI", [DI]>, DwarfRegNum<7>;
+  def EIP : RegisterWithSubRegs<"EIP", [IP]>, DwarfRegNum<8>;  
+  
+  // X86-64 only
+  def R8D  : RegisterWithSubRegs<"R8D", [R8W]>, DwarfRegNum<8>;
+  def R9D  : RegisterWithSubRegs<"R9D", [R9W]>, DwarfRegNum<9>;
+  def R10D : RegisterWithSubRegs<"R10D", [R10W]>, DwarfRegNum<10>;
+  def R11D : RegisterWithSubRegs<"R11D", [R11W]>, DwarfRegNum<11>;
+  def R12D : RegisterWithSubRegs<"R12D", [R12W]>, DwarfRegNum<12>;
+  def R13D : RegisterWithSubRegs<"R13D", [R13W]>, DwarfRegNum<13>;
+  def R14D : RegisterWithSubRegs<"R14D", [R14W]>, DwarfRegNum<14>;
+  def R15D : RegisterWithSubRegs<"R15D", [R15W]>, DwarfRegNum<15>;
+
+  // 64-bit registers, X86-64 only
+  def RAX : RegisterWithSubRegs<"RAX", [EAX]>, DwarfRegNum<0>;
+  def RDX : RegisterWithSubRegs<"RDX", [EDX]>, DwarfRegNum<1>;
+  def RCX : RegisterWithSubRegs<"RCX", [ECX]>, DwarfRegNum<2>;
+  def RBX : RegisterWithSubRegs<"RBX", [EBX]>, DwarfRegNum<3>;
+  def RSI : RegisterWithSubRegs<"RSI", [ESI]>, DwarfRegNum<4>;
+  def RDI : RegisterWithSubRegs<"RDI", [EDI]>, DwarfRegNum<5>;
+  def RBP : RegisterWithSubRegs<"RBP", [EBP]>, DwarfRegNum<6>;
+  def RSP : RegisterWithSubRegs<"RSP", [ESP]>, DwarfRegNum<7>;
+
+  def R8  : RegisterWithSubRegs<"R8", [R8D]>, DwarfRegNum<8>;
+  def R9  : RegisterWithSubRegs<"R9", [R9D]>, DwarfRegNum<9>;
+  def R10 : RegisterWithSubRegs<"R10", [R10D]>, DwarfRegNum<10>;
+  def R11 : RegisterWithSubRegs<"R11", [R11D]>, DwarfRegNum<11>;
+  def R12 : RegisterWithSubRegs<"R12", [R12D]>, DwarfRegNum<12>;
+  def R13 : RegisterWithSubRegs<"R13", [R13D]>, DwarfRegNum<13>;
+  def R14 : RegisterWithSubRegs<"R14", [R14D]>, DwarfRegNum<14>;
+  def R15 : RegisterWithSubRegs<"R15", [R15D]>, DwarfRegNum<15>;
+  def RIP : RegisterWithSubRegs<"RIP", [EIP]>,  DwarfRegNum<16>;
+
+  // MMX Registers. These are actually aliased to ST0 .. ST7
+  def MM0 : Register<"MM0">, DwarfRegNum<29>;
+  def MM1 : Register<"MM1">, DwarfRegNum<30>;
+  def MM2 : Register<"MM2">, DwarfRegNum<31>;
+  def MM3 : Register<"MM3">, DwarfRegNum<32>;
+  def MM4 : Register<"MM4">, DwarfRegNum<33>;
+  def MM5 : Register<"MM5">, DwarfRegNum<34>;
+  def MM6 : Register<"MM6">, DwarfRegNum<35>;
+  def MM7 : Register<"MM7">, DwarfRegNum<36>;
+  
+  // Pseudo Floating Point registers
+  def FP0 : Register<"FP0">, DwarfRegNum<-1>;
+  def FP1 : Register<"FP1">, DwarfRegNum<-1>;
+  def FP2 : Register<"FP2">, DwarfRegNum<-1>;
+  def FP3 : Register<"FP3">, DwarfRegNum<-1>;
+  def FP4 : Register<"FP4">, DwarfRegNum<-1>;
+  def FP5 : Register<"FP5">, DwarfRegNum<-1>;
+  def FP6 : Register<"FP6">, DwarfRegNum<-1>; 
+
+  // XMM Registers, used by the various SSE instruction set extensions
+  def XMM0: Register<"XMM0">, DwarfRegNum<17>;
+  def XMM1: Register<"XMM1">, DwarfRegNum<18>;
+  def XMM2: Register<"XMM2">, DwarfRegNum<19>;
+  def XMM3: Register<"XMM3">, DwarfRegNum<20>;
+  def XMM4: Register<"XMM4">, DwarfRegNum<21>;
+  def XMM5: Register<"XMM5">, DwarfRegNum<22>;
+  def XMM6: Register<"XMM6">, DwarfRegNum<23>;
+  def XMM7: Register<"XMM7">, DwarfRegNum<24>;
+
+  // X86-64 only
+  def XMM8:  Register<"XMM8">,  DwarfRegNum<25>;
+  def XMM9:  Register<"XMM9">,  DwarfRegNum<26>;
+  def XMM10: Register<"XMM10">, DwarfRegNum<27>;
+  def XMM11: Register<"XMM11">, DwarfRegNum<28>;
+  def XMM12: Register<"XMM12">, DwarfRegNum<29>;
+  def XMM13: Register<"XMM13">, DwarfRegNum<30>;
+  def XMM14: Register<"XMM14">, DwarfRegNum<31>;
+  def XMM15: Register<"XMM15">, DwarfRegNum<32>;
+
+  // Floating point stack registers
+  def ST0 : Register<"ST(0)">, DwarfRegNum<11>;
+  def ST1 : Register<"ST(1)">, DwarfRegNum<12>;
+  def ST2 : Register<"ST(2)">, DwarfRegNum<13>;
+  def ST3 : Register<"ST(3)">, DwarfRegNum<14>;
+  def ST4 : Register<"ST(4)">, DwarfRegNum<15>;
+  def ST5 : Register<"ST(5)">, DwarfRegNum<16>;
+  def ST6 : Register<"ST(6)">, DwarfRegNum<17>;
+  def ST7 : Register<"ST(7)">, DwarfRegNum<18>; 
+}
+
+//===----------------------------------------------------------------------===//
+// Register Class Definitions... now that we have all of the pieces, define the
+// top-level register classes.  The order specified in the register list is
+// implicitly defined to be the register allocation order.
+//
+
+// List call-clobbered registers before callee-save registers. RBX, RBP, (and 
+// R12, R13, R14, and R15 for X86-64) are callee-save registers.
+// In 64-mode, there are 12 additional i8 registers, SIL, DIL, BPL, SPL, and
+// R8B, ... R15B. 
+// FIXME: Allow AH, CH, DH, BH in 64-mode for non-REX instructions,
+def GR8 : RegisterClass<"X86", [i8],  8,
+                        [AL, CL, DL, BL, AH, CH, DH, BH, SIL, DIL, BPL, SPL,
+                         R8B, R9B, R10B, R11B, R12B, R13B, R14B, R15B]> {
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+      // Does the function dedicate RBP / EBP to being a frame ptr?
+      // If so, don't allocate SPL or BPL.
+      static const unsigned X86_GR8_AO_64_fp[] =
+      {X86::AL, X86::CL, X86::DL, X86::SIL, X86::DIL,
+       X86::R8B, X86::R9B, X86::R10B, X86::R11B,
+       X86::BL, X86::R14B, X86::R15B, X86::R12B, X86::R13B};
+      // If not, just don't allocate SPL.
+      static const unsigned X86_GR8_AO_64[] =
+      {X86::AL, X86::CL, X86::DL, X86::SIL, X86::DIL,
+       X86::R8B, X86::R9B, X86::R10B, X86::R11B,
+       X86::BL, X86::R14B, X86::R15B, X86::R12B, X86::R13B, X86::BPL};
+      // In 32-mode, none of the 8-bit registers aliases EBP or ESP.
+      static const unsigned X86_GR8_AO_32[] =
+      {X86::AL, X86::CL, X86::DL, X86::AH, X86::CH, X86::DH, X86::BL, X86::BH};
+
+    GR8Class::iterator
+    GR8Class::allocation_order_begin(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const MRegisterInfo *RI = TM.getRegisterInfo();
+      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+      if (!Subtarget.is64Bit())
+        return X86_GR8_AO_32;
+      else if (RI->hasFP(MF))
+        return X86_GR8_AO_64_fp;
+      else
+        return X86_GR8_AO_64;
+    }
+
+    GR8Class::iterator
+    GR8Class::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const MRegisterInfo *RI = TM.getRegisterInfo();
+      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+      if (!Subtarget.is64Bit())
+        return X86_GR8_AO_32 + (sizeof(X86_GR8_AO_32) / sizeof(unsigned));
+      else if (RI->hasFP(MF))
+        return X86_GR8_AO_64_fp + (sizeof(X86_GR8_AO_64_fp) / sizeof(unsigned));
+      else
+        return X86_GR8_AO_64 + (sizeof(X86_GR8_AO_64) / sizeof(unsigned));
+    }
+  }];
+}
+
+
+def GR16 : RegisterClass<"X86", [i16], 16,
+                         [AX, CX, DX, SI, DI, BX, BP, SP,
+                          R8W, R9W, R10W, R11W, R12W, R13W, R14W, R15W]> {
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+      // Does the function dedicate RBP / EBP to being a frame ptr?
+      // If so, don't allocate SP or BP.
+      static const unsigned X86_GR16_AO_64_fp[] =
+      {X86::AX, X86::CX, X86::DX, X86::SI, X86::DI,
+       X86::R8W, X86::R9W, X86::R10W, X86::R11W,
+       X86::BX, X86::R14W, X86::R15W, X86::R12W, X86::R13W};
+      static const unsigned X86_GR16_AO_32_fp[] =
+      {X86::AX, X86::CX, X86::DX, X86::SI, X86::DI, X86::BX};
+      // If not, just don't allocate SP.
+      static const unsigned X86_GR16_AO_64[] =
+      {X86::AX, X86::CX, X86::DX, X86::SI, X86::DI,
+       X86::R8W, X86::R9W, X86::R10W, X86::R11W,
+       X86::BX, X86::R14W, X86::R15W, X86::R12W, X86::R13W, X86::BP};
+      static const unsigned X86_GR16_AO_32[] =
+      {X86::AX, X86::CX, X86::DX, X86::SI, X86::DI, X86::BX, X86::BP};
+
+    GR16Class::iterator
+    GR16Class::allocation_order_begin(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const MRegisterInfo *RI = TM.getRegisterInfo();
+      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+      if (Subtarget.is64Bit()) {
+        if (RI->hasFP(MF))
+          return X86_GR16_AO_64_fp;
+        else
+          return X86_GR16_AO_64;
+      } else {
+        if (RI->hasFP(MF))
+          return X86_GR16_AO_32_fp;
+        else
+          return X86_GR16_AO_32;
+      }
+    }
+
+    GR16Class::iterator
+    GR16Class::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const MRegisterInfo *RI = TM.getRegisterInfo();
+      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+      if (Subtarget.is64Bit()) {
+        if (RI->hasFP(MF))
+          return X86_GR16_AO_64_fp+(sizeof(X86_GR16_AO_64_fp)/sizeof(unsigned));
+        else
+          return X86_GR16_AO_64 + (sizeof(X86_GR16_AO_64) / sizeof(unsigned));
+      } else {
+        if (RI->hasFP(MF))
+          return X86_GR16_AO_32_fp+(sizeof(X86_GR16_AO_32_fp)/sizeof(unsigned));
+        else
+          return X86_GR16_AO_32 + (sizeof(X86_GR16_AO_32) / sizeof(unsigned));
+      }
+    }
+  }];
+}
+
+
+def GR32 : RegisterClass<"X86", [i32], 32, 
+                         [EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP,
+                          R8D, R9D, R10D, R11D, R12D, R13D, R14D, R15D]> {
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+      // Does the function dedicate RBP / EBP to being a frame ptr?
+      // If so, don't allocate ESP or EBP.
+      static const unsigned X86_GR32_AO_64_fp[] =
+      {X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI,
+       X86::R8D, X86::R9D, X86::R10D, X86::R11D,
+       X86::EBX, X86::R14D, X86::R15D, X86::R12D, X86::R13D};
+      static const unsigned X86_GR32_AO_32_fp[] =
+      {X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI, X86::EBX};
+      // If not, just don't allocate ESP.
+      static const unsigned X86_GR32_AO_64[] =
+      {X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI,
+       X86::R8D, X86::R9D, X86::R10D, X86::R11D,
+       X86::EBX, X86::R14D, X86::R15D, X86::R12D, X86::R13D, X86::EBP};
+      static const unsigned X86_GR32_AO_32[] =
+      {X86::EAX, X86::ECX, X86::EDX, X86::ESI, X86::EDI, X86::EBX, X86::EBP};
+
+    GR32Class::iterator
+    GR32Class::allocation_order_begin(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const MRegisterInfo *RI = TM.getRegisterInfo();
+      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+      if (Subtarget.is64Bit()) {
+        if (RI->hasFP(MF))
+          return X86_GR32_AO_64_fp;
+        else
+          return X86_GR32_AO_64;
+      } else {
+        if (RI->hasFP(MF))
+          return X86_GR32_AO_32_fp;
+        else
+          return X86_GR32_AO_32;
+      }
+    }
+
+    GR32Class::iterator
+    GR32Class::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const MRegisterInfo *RI = TM.getRegisterInfo();
+      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+      if (Subtarget.is64Bit()) {
+        if (RI->hasFP(MF))
+          return X86_GR32_AO_64_fp+(sizeof(X86_GR32_AO_64_fp)/sizeof(unsigned));
+        else
+          return X86_GR32_AO_64 + (sizeof(X86_GR32_AO_64) / sizeof(unsigned));
+      } else {
+        if (RI->hasFP(MF))
+          return X86_GR32_AO_32_fp+(sizeof(X86_GR32_AO_32_fp)/sizeof(unsigned));
+        else
+          return X86_GR32_AO_32 + (sizeof(X86_GR32_AO_32) / sizeof(unsigned));
+      }
+    }
+  }];
+}
+
+
+def GR64 : RegisterClass<"X86", [i64], 64, 
+                         [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+                          RBX, R14, R15, R12, R13, RBP, RSP]> {
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    GR64Class::iterator
+    GR64Class::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const MRegisterInfo *RI = TM.getRegisterInfo();
+      if (RI->hasFP(MF)) // Does the function dedicate RBP to being a frame ptr?
+        return end()-2;  // If so, don't allocate RSP or RBP
+      else
+        return end()-1;  // If not, just don't allocate RSP
+    }
+  }];
+}
+
+
+// GR16, GR32 subclasses which contain registers that have R8 sub-registers.
+// These should only be used for 32-bit mode.
+def GR16_ : RegisterClass<"X86", [i16], 16, [AX, CX, DX, BX]>;
+def GR32_ : RegisterClass<"X86", [i32], 32, [EAX, ECX, EDX, EBX]>;
+
+// Scalar SSE2 floating point registers.
+def FR32 : RegisterClass<"X86", [f32], 32,
+                         [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+                          XMM8, XMM9, XMM10, XMM11,
+                          XMM12, XMM13, XMM14, XMM15]> {
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    FR32Class::iterator
+    FR32Class::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+      if (!Subtarget.is64Bit())
+        return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode.
+      else
+        return end();
+    }
+  }];
+}
+
+def FR64 : RegisterClass<"X86", [f64], 64,
+                         [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+                          XMM8, XMM9, XMM10, XMM11,
+                          XMM12, XMM13, XMM14, XMM15]> {
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    FR64Class::iterator
+    FR64Class::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+      if (!Subtarget.is64Bit())
+        return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode.
+      else
+        return end();
+    }
+  }];
+}
+
+
+// FIXME: This sets up the floating point register files as though they are f64
+// values, though they really are f80 values.  This will cause us to spill
+// values as 64-bit quantities instead of 80-bit quantities, which is much, much
+// faster on common hardware.  In reality, this should be controlled by a
+// command line option or something.
+
+def RFP32 : RegisterClass<"X86", [f32], 32, [FP0, FP1, FP2, FP3, FP4, FP5, FP6]>;
+def RFP64 : RegisterClass<"X86", [f64], 32, [FP0, FP1, FP2, FP3, FP4, FP5, FP6]>;
+
+// Floating point stack registers (these are not allocatable by the
+// register allocator - the floating point stackifier is responsible
+// for transforming FPn allocations to STn registers)
+def RST : RegisterClass<"X86", [f64], 32,
+                        [ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7]> {
+    let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    RSTClass::iterator
+    RSTClass::allocation_order_end(const MachineFunction &MF) const {
+      return begin();
+    }
+  }];
+}
+
+// Generic vector registers: VR64 and VR128.
+def VR64  : RegisterClass<"X86", [v8i8, v4i16, v2i32, v1i64], 64,
+                          [MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7]>;
+def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],128,
+                          [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+                           XMM8, XMM9, XMM10, XMM11,
+                           XMM12, XMM13, XMM14, XMM15]> {
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    VR128Class::iterator
+    VR128Class::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+      if (!Subtarget.is64Bit())
+        return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode.
+      else
+        return end();
+    }
+  }];
+}
diff --git a/lib/Target/X86/X86Relocations.h b/lib/Target/X86/X86Relocations.h
new file mode 100644
index 0000000..3dd2b24
--- /dev/null
+++ b/lib/Target/X86/X86Relocations.h
@@ -0,0 +1,34 @@
+//===- X86Relocations.h - X86 Code Relocations ------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86 target-specific relocation types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86RELOCATIONS_H
+#define X86RELOCATIONS_H
+
+#include "llvm/CodeGen/MachineRelocation.h"
+
+namespace llvm {
+  namespace X86 {
+    enum RelocationType {
+      // reloc_pcrel_word - PC relative relocation, add the relocated value to
+      // the value already in memory, after we adjust it for where the PC is.
+      reloc_pcrel_word = 0,
+
+      // reloc_absolute_word, reloc_absolute_dword - Absolute relocation, just
+      // add the relocated value to the value already in memory.
+      reloc_absolute_word = 1,
+      reloc_absolute_dword = 2
+    };
+  }
+}
+
+#endif
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
new file mode 100644
index 0000000..1a75e04
--- /dev/null
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -0,0 +1,293 @@
+//===-- X86Subtarget.cpp - X86 Subtarget Information ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Nate Begeman and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the X86 specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86Subtarget.h"
+#include "X86GenSubtarget.inc"
+#include "llvm/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+cl::opt<X86Subtarget::AsmWriterFlavorTy>
+AsmWriterFlavor("x86-asm-syntax", cl::init(X86Subtarget::Unset),
+  cl::desc("Choose style of code to emit from X86 backend:"),
+  cl::values(
+    clEnumValN(X86Subtarget::ATT,   "att",   "  Emit AT&T-style assembly"),
+    clEnumValN(X86Subtarget::Intel, "intel", "  Emit Intel-style assembly"),
+    clEnumValEnd));
+
+
+/// True if accessing the GV requires an extra load. For Windows, dllimported
+/// symbols are indirect, loading the value at address GV rather than the
+/// value of GV itself. This means that the GlobalAddress must be in the base
+/// or index register of the address, not the GV offset field.
+bool X86Subtarget::GVRequiresExtraLoad(const GlobalValue* GV,
+                                       const TargetMachine& TM,
+                                       bool isDirectCall) const
+{
+  // FIXME: PIC
+  if (TM.getRelocationModel() != Reloc::Static)
+    if (isTargetDarwin()) {
+      return (!isDirectCall &&
+              (GV->hasWeakLinkage() || GV->hasLinkOnceLinkage() ||
+               (GV->isDeclaration() && !GV->hasNotBeenReadFromBitcode())));
+    } else if (TM.getRelocationModel() == Reloc::PIC_ && isPICStyleGOT()) {
+      // Extra load is needed for all non-statics.
+      return (!isDirectCall &&
+              (GV->isDeclaration() || !GV->hasInternalLinkage()));
+    } else if (isTargetCygMing() || isTargetWindows()) {
+      return (GV->hasDLLImportLinkage());
+    }
+  
+  return false;
+}
+
+/// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in the
+/// specified arguments.  If we can't run cpuid on the host, return true.
+bool X86::GetCpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX,
+                          unsigned *rECX, unsigned *rEDX) {
+#if defined(__x86_64__)
+  // gcc doesn't know that cpuid clobbers ebx/rbx. Preserve it manually.
+  asm ("movq\t%%rbx, %%rsi\n\t"
+       "cpuid\n\t"
+       "xchgq\t%%rbx, %%rsi\n\t"
+       : "=a" (*rEAX),
+         "=S" (*rEBX),
+         "=c" (*rECX),
+         "=d" (*rEDX)
+       :  "a" (value));
+  return false;
+#elif defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)
+#if defined(__GNUC__)
+  asm ("movl\t%%ebx, %%esi\n\t"
+       "cpuid\n\t"
+       "xchgl\t%%ebx, %%esi\n\t"
+       : "=a" (*rEAX),
+         "=S" (*rEBX),
+         "=c" (*rECX),
+         "=d" (*rEDX)
+       :  "a" (value));
+  return false;
+#elif defined(_MSC_VER)
+  __asm {
+    mov   eax,value
+    cpuid
+    mov   esi,rEAX
+    mov   dword ptr [esi],eax
+    mov   esi,rEBX
+    mov   dword ptr [esi],ebx
+    mov   esi,rECX
+    mov   dword ptr [esi],ecx
+    mov   esi,rEDX
+    mov   dword ptr [esi],edx
+  }
+  return false;
+#endif
+#endif
+  return true;
+}
+
+void X86Subtarget::AutoDetectSubtargetFeatures() {
+  unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
+  union {
+    unsigned u[3];
+    char     c[12];
+  } text;
+  
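+  // CPUID leaf 0 returns the vendor string across EBX, EDX, ECX, so the
+  // destination words are passed as u+0, u+2, u+1 to reassemble it in text.c.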
+  if (X86::GetCpuIDAndInfo(0, &EAX, text.u+0, text.u+2, text.u+1))
+    return;
+
+  X86::GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX);
+  
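+  // CPUID leaf 1 feature bits: EDX[23] = MMX, EDX[25] = SSE, EDX[26] = SSE2,
+  // ECX[0] = SSE3, ECX[9] = SSSE3.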
+  if ((EDX >> 23) & 0x1) X86SSELevel = MMX;
+  if ((EDX >> 25) & 0x1) X86SSELevel = SSE1;
+  if ((EDX >> 26) & 0x1) X86SSELevel = SSE2;
+  if (ECX & 0x1)         X86SSELevel = SSE3;
+  if ((ECX >> 9)  & 0x1) X86SSELevel = SSSE3;
+
+  if (memcmp(text.c, "GenuineIntel", 12) == 0 ||
+      memcmp(text.c, "AuthenticAMD", 12) == 0) {
+    X86::GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
+    HasX86_64 = (EDX >> 29) & 0x1;
+  }
+}
+
+static const char *GetCurrentX86CPU() {
+  unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
+  if (X86::GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX))
+    return "generic";
+  unsigned Family  = (EAX >> 8) & 0xf; // Bits 8 - 11
+  unsigned Model   = (EAX >> 4) & 0xf; // Bits 4 - 7
+  X86::GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
+  bool Em64T = (EDX >> 29) & 0x1;
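+  // For example, a leaf-1 EAX value of 0x06F6 decodes to Family 6, Model 15,
+  // which the Intel table below maps to "core2".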
+
+  union {
+    unsigned u[3];
+    char     c[12];
+  } text;
+
+  X86::GetCpuIDAndInfo(0, &EAX, text.u+0, text.u+2, text.u+1);
+  if (memcmp(text.c, "GenuineIntel", 12) == 0) {
+    switch (Family) {
+      case 3:
+        return "i386";
+      case 4:
+        return "i486";
+      case 5:
+        switch (Model) {
+        case 4:  return "pentium-mmx";
+        default: return "pentium";
+        }
+      case 6:
+        switch (Model) {
+        case 1:  return "pentiumpro";
+        case 3:
+        case 5:
+        case 6:  return "pentium2";
+        case 7:
+        case 8:
+        case 10:
+        case 11: return "pentium3";
+        case 9:
+        case 13: return "pentium-m";
+        case 14: return "yonah";
+        case 15: return "core2";
+        default: return "i686";
+        }
+      case 15: {
+        switch (Model) {
+        case 3:  
+        case 4:
+          return (Em64T) ? "nocona" : "prescott";
+        default:
+          return (Em64T) ? "x86-64" : "pentium4";
+        }
+      }
+        
+    default:
+      return "generic";
+    }
+  } else if (memcmp(text.c, "AuthenticAMD", 12) == 0) {
+    // FIXME: this poorly matches the generated SubtargetFeatureKV table.  There
+    // appears to be no way to generate the wide variety of AMD-specific targets
+    // from the information returned from CPUID.
+    switch (Family) {
+      case 4:
+        return "i486";
+      case 5:
+        switch (Model) {
+        case 6:
+        case 7:  return "k6";
+        case 8:  return "k6-2";
+        case 9:
+        case 13: return "k6-3";
+        default: return "pentium";
+        }
+      case 6:
+        switch (Model) {
+        case 4:  return "athlon-tbird";
+        case 6:
+        case 7:
+        case 8:  return "athlon-mp";
+        case 10: return "athlon-xp";
+        default: return "athlon";
+        }
+      case 15:
+        switch (Model) {
+        case 1:  return "opteron";
+        case 5:  return "athlon-fx"; // also opteron
+        default: return "athlon64";
+        }
+      default:
+        return "generic";
+    }
+  } else {
+    return "generic";
+  }
+}
+
+X86Subtarget::X86Subtarget(const Module &M, const std::string &FS, bool is64Bit)
+  : AsmFlavor(AsmWriterFlavor)
+  , PICStyle(PICStyle::None)
+  , X86SSELevel(NoMMXSSE)
+  , HasX86_64(false)
+  , stackAlignment(8)
+  // FIXME: this is a known good value for Yonah. How about others?
+  , MinRepStrSizeThreshold(128)
+  , Is64Bit(is64Bit)
+  , TargetType(isELF) { // Default to ELF unless otherwise specified.
+
+  // Determine default and user specified characteristics
+  if (!FS.empty()) {
+    // If feature string is not empty, parse features string.
+    std::string CPU = GetCurrentX86CPU();
+    ParseSubtargetFeatures(FS, CPU);
+    
+    if (Is64Bit && !HasX86_64)
+      cerr << "Warning: Generation of 64-bit code for a 32-bit processor "
+           << "requested.\n";
+    if (Is64Bit && X86SSELevel < SSE2)
+      cerr << "Warning: 64-bit processors all have at least SSE2.\n";
+  } else {
+    // Otherwise, use CPUID to auto-detect feature set.
+    AutoDetectSubtargetFeatures();
+  }
+    
+  // If requesting codegen for X86-64, make sure that 64-bit and SSE2 features
+  // are enabled.  These are available on all x86-64 CPUs.
+  if (Is64Bit) {
+    HasX86_64 = true;
+    if (X86SSELevel < SSE2)
+      X86SSELevel = SSE2;
+  }
+
+  // Determine the target OS from the module's target triple, falling back to
+  // the host platform if the triple is empty.
+  const std::string& TT = M.getTargetTriple();
+  if (TT.length() > 5) {
+    if (TT.find("cygwin") != std::string::npos)
+      TargetType = isCygwin;
+    else if (TT.find("mingw") != std::string::npos)
+      TargetType = isMingw;
+    else if (TT.find("darwin") != std::string::npos)
+      TargetType = isDarwin;
+    else if (TT.find("win32") != std::string::npos)
+      TargetType = isWindows;
+  } else if (TT.empty()) {
+#if defined(__CYGWIN__)
+    TargetType = isCygwin;
+#elif defined(__MINGW32__)
+    TargetType = isMingw;
+#elif defined(__APPLE__)
+    TargetType = isDarwin;
+#elif defined(_WIN32)
+    TargetType = isWindows;
+#endif
+  }
+
+  // If the asm syntax hasn't been overridden on the command line, use whatever
+  // the target wants.
+  if (AsmFlavor == X86Subtarget::Unset) {
+    if (TargetType == isWindows) {
+      AsmFlavor = X86Subtarget::Intel;
+    } else {
+      AsmFlavor = X86Subtarget::ATT;
+    }
+  }
+
+  if (TargetType == isDarwin ||
+      TargetType == isCygwin ||
+      TargetType == isMingw  ||
+      (TargetType == isELF && Is64Bit))
+    stackAlignment = 16;
+}
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
new file mode 100644
index 0000000..2cda970
--- /dev/null
+++ b/lib/Target/X86/X86Subtarget.h
@@ -0,0 +1,156 @@
+//=====---- X86Subtarget.h - Define Subtarget for the X86 -----*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Nate Begeman and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the X86 specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86SUBTARGET_H
+#define X86SUBTARGET_H
+
+#include "llvm/Target/TargetSubtarget.h"
+
+#include <string>
+
+namespace llvm {
+class Module;
+class GlobalValue;
+class TargetMachine;
+  
+namespace PICStyle {
+enum Style {
+  Stub, GOT, RIPRel, WinPIC, None
+};
+}
+
+class X86Subtarget : public TargetSubtarget {
+public:
+  enum AsmWriterFlavorTy {
+    // Note: This numbering has to match the GCC assembler dialects for inline
+    // asm alternatives to work right.
+    ATT = 0, Intel = 1, Unset
+  };
+protected:
+  enum X86SSEEnum {
+    NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3
+  };
+
+  enum X863DNowEnum {
+    NoThreeDNow, ThreeDNow, ThreeDNowA
+  };
+
+  /// AsmFlavor - Which x86 asm dialect to use.
+  AsmWriterFlavorTy AsmFlavor;
+
+  /// PICStyle - Which PIC style to use
+  PICStyle::Style PICStyle;
+  
+  /// X86SSELevel - MMX, SSE1, SSE2, SSE3, SSSE3, or none supported.
+  X86SSEEnum X86SSELevel;
+
+  /// X863DNowLevel - 3DNow or 3DNow Athlon, or none supported.
+  X863DNowEnum X863DNowLevel;
+
+  /// HasX86_64 - True if the processor supports X86-64 instructions.
+  bool HasX86_64;
+
+  /// stackAlignment - The minimum alignment known to hold for the stack frame
+  /// on entry to the function, which must be maintained by every function.
+  unsigned stackAlignment;
+
+  /// Min. memset / memcpy size that is turned into rep/movs, rep/stos ops.
+  unsigned MinRepStrSizeThreshold;
+
+private:
+  /// Is64Bit - True if the processor supports 64-bit instructions and module
+  /// pointer size is 64 bit.
+  bool Is64Bit;
+
+public:
+  enum {
+    isELF, isCygwin, isDarwin, isWindows, isMingw
+  } TargetType;
+
+  /// This constructor initializes the data members to match that
+  /// of the specified module.
+  ///
+  X86Subtarget(const Module &M, const std::string &FS, bool is64Bit);
+
+  /// getStackAlignment - Returns the minimum alignment known to hold for the
+  /// stack frame on entry to the function, which must be maintained by every
+  /// function for this subtarget.
+  unsigned getStackAlignment() const { return stackAlignment; }
+
+  /// getMinRepStrSizeThreshold - Returns the minimum memset / memcpy size
+  /// required to turn the operation into an X86 rep/movs or rep/stos
+  /// instruction. This is only used if the src / dst alignment is not DWORD
+  /// aligned.
+  unsigned getMinRepStrSizeThreshold() const { return MinRepStrSizeThreshold; }
+
+  /// ParseSubtargetFeatures - Parses features string setting specified
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(const std::string &FS, const std::string &CPU);
+
+  /// AutoDetectSubtargetFeatures - Auto-detect CPU features using CPUID
+  /// instruction.
+  void AutoDetectSubtargetFeatures();
+
+  bool is64Bit() const { return Is64Bit; }
+
+  PICStyle::Style getPICStyle() const { return PICStyle; }
+  void setPICStyle(PICStyle::Style Style)  { PICStyle = Style; }
+
+  bool hasMMX() const { return X86SSELevel >= MMX; }
+  bool hasSSE1() const { return X86SSELevel >= SSE1; }
+  bool hasSSE2() const { return X86SSELevel >= SSE2; }
+  bool hasSSE3() const { return X86SSELevel >= SSE3; }
+  bool hasSSSE3() const { return X86SSELevel >= SSSE3; }
+  bool has3DNow() const { return X863DNowLevel >= ThreeDNow; }
+  bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; }
+
+  unsigned getAsmFlavor() const {
+    return AsmFlavor != Unset ? unsigned(AsmFlavor) : 0;
+  }
+
+  bool isFlavorAtt() const { return AsmFlavor == ATT; }
+  bool isFlavorIntel() const { return AsmFlavor == Intel; }
+
+  bool isTargetDarwin() const { return TargetType == isDarwin; }
+  bool isTargetELF() const { return TargetType == isELF; }
+  bool isTargetWindows() const { return TargetType == isWindows; }
+  bool isTargetMingw() const { return TargetType == isMingw; }
+  bool isTargetCygMing() const { return (TargetType == isMingw ||
+                                         TargetType == isCygwin); }
+  bool isTargetCygwin() const { return TargetType == isCygwin; }
+
+  bool isPICStyleSet() const { return PICStyle != PICStyle::None; }
+  bool isPICStyleGOT() const { return PICStyle == PICStyle::GOT; }
+  bool isPICStyleStub() const { return PICStyle == PICStyle::Stub; }
+  bool isPICStyleRIPRel() const { return PICStyle == PICStyle::RIPRel; }
+  bool isPICStyleWinPIC() const { return PICStyle == PICStyle::WinPIC; }
+    
+  /// True if accessing the GV requires an extra load. For Windows, dllimported
+  /// symbols are indirect, loading the value at address GV rather than the
+  /// value of GV itself. This means that the GlobalAddress must be in the base
+  /// or index register of the address, not the GV offset field.
+  bool GVRequiresExtraLoad(const GlobalValue* GV, const TargetMachine& TM,
+                           bool isDirectCall) const;
+
+};
+
+namespace X86 {
+  /// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in
+  /// the specified arguments.  If we can't run cpuid on the host, return true.
+  bool GetCpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX,
+                       unsigned *rECX, unsigned *rEDX);
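+
+  // Typical use (sketch):
+  //   unsigned A, B, C, D;
+  //   if (!X86::GetCpuIDAndInfo(0x1, &A, &B, &C, &D))
+  //     ; // C and D now hold the CPUID leaf-1 feature bits.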
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/X86/X86TargetAsmInfo.cpp b/lib/Target/X86/X86TargetAsmInfo.cpp
new file mode 100644
index 0000000..4bb854e
--- /dev/null
+++ b/lib/Target/X86/X86TargetAsmInfo.cpp
@@ -0,0 +1,280 @@
+//===-- X86TargetAsmInfo.cpp - X86 asm properties ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by James M. Laskey and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the X86TargetAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetAsmInfo.h"
+#include "X86TargetMachine.h"
+#include "X86Subtarget.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/InlineAsm.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Module.h"
+#include "llvm/ADT/StringExtras.h"
+using namespace llvm;
+
+static const char* x86_asm_table[] = {"{si}", "S",
+                                      "{di}", "D",
+                                      "{ax}", "a",
+                                      "{cx}", "c",
+                                      "{memory}", "memory",
+                                      "{flags}", "",
+                                      "{dirflag}", "",
+                                      "{fpsr}", "",
+                                      "{cc}", "cc",
+                                      0,0};
+
+X86TargetAsmInfo::X86TargetAsmInfo(const X86TargetMachine &TM) {
+  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
+  
+  // FIXME - Should be simplified.
+
+  AsmTransCBE = x86_asm_table;
+  
+  switch (Subtarget->TargetType) {
+  case X86Subtarget::isDarwin:
+    AlignmentIsInBytes = false;
+    GlobalPrefix = "_";
+    if (!Subtarget->is64Bit())
+      Data64bitsDirective = 0;       // we can't emit a 64-bit unit
+    ZeroDirective = "\t.space\t";  // ".space N" emits N zeros.
+    PrivateGlobalPrefix = "L";     // Marker for constant pool idxs
+    BSSSection = 0;                       // no BSS section.
+    ZeroFillDirective = "\t.zerofill\t";  // Uses .zerofill
+    ConstantPoolSection = "\t.const\n";
+    JumpTableDataSection = "\t.const\n";
+    CStringSection = "\t.cstring";
+    FourByteConstantSection = "\t.literal4\n";
+    EightByteConstantSection = "\t.literal8\n";
+    if (Subtarget->is64Bit())
+      SixteenByteConstantSection = "\t.literal16\n";
+    ReadOnlySection = "\t.const\n";
+    LCOMMDirective = "\t.lcomm\t";
+    COMMDirectiveTakesAlignment = false;
+    HasDotTypeDotSizeDirective = false;
+    if (TM.getRelocationModel() == Reloc::Static) {
+      StaticCtorsSection = ".constructor";
+      StaticDtorsSection = ".destructor";
+    } else {
+      StaticCtorsSection = ".mod_init_func";
+      StaticDtorsSection = ".mod_term_func";
+    }
+    InlineAsmStart = "# InlineAsm Start";
+    InlineAsmEnd = "# InlineAsm End";
+    SetDirective = "\t.set";
+    UsedDirective = "\t.no_dead_strip\t";
+    WeakRefDirective = "\t.weak_reference\t";
+    HiddenDirective = "\t.private_extern\t";
+    
+    // In non-PIC modes, emit a special label before jump tables so that the
+    // linker can perform more accurate dead code stripping.
+    if (TM.getRelocationModel() != Reloc::PIC_) {
+      // Emit a local label that is preserved until the linker runs.
+      JumpTableSpecialLabelPrefix = "l";
+    }
+
+    SupportsDebugInformation = true;
+    NeedsSet = true;
+    DwarfAbbrevSection = ".section __DWARF,__debug_abbrev,regular,debug";
+    DwarfInfoSection = ".section __DWARF,__debug_info,regular,debug";
+    DwarfLineSection = ".section __DWARF,__debug_line,regular,debug";
+    DwarfFrameSection = ".section __DWARF,__debug_frame,regular,debug";
+    DwarfPubNamesSection = ".section __DWARF,__debug_pubnames,regular,debug";
+    DwarfPubTypesSection = ".section __DWARF,__debug_pubtypes,regular,debug";
+    DwarfStrSection = ".section __DWARF,__debug_str,regular,debug";
+    DwarfLocSection = ".section __DWARF,__debug_loc,regular,debug";
+    DwarfARangesSection = ".section __DWARF,__debug_aranges,regular,debug";
+    DwarfRangesSection = ".section __DWARF,__debug_ranges,regular,debug";
+    DwarfMacInfoSection = ".section __DWARF,__debug_macinfo,regular,debug";
+    break;
+
+  case X86Subtarget::isELF:
+    ReadOnlySection = "\t.section\t.rodata";
+    FourByteConstantSection = "\t.section\t.rodata.cst4,\"aM\",@progbits,4";
+    EightByteConstantSection = "\t.section\t.rodata.cst8,\"aM\",@progbits,8";
+    SixteenByteConstantSection = "\t.section\t.rodata.cst16,\"aM\",@progbits,16";
+    CStringSection = "\t.section\t.rodata.str1.1,\"aMS\",@progbits,1";
+    PrivateGlobalPrefix = ".L";
+    WeakRefDirective = "\t.weak\t";
+    SetDirective = "\t.set\t";
+    PCSymbol = ".";
+
+    // Set up DWARF directives
+    HasLEB128 = true;  // Target asm supports leb128 directives (little-endian)
+    AbsoluteDebugSectionOffsets = true;
+    AbsoluteEHSectionOffsets = false;
+    SupportsDebugInformation = true;
+    DwarfAbbrevSection =  "\t.section\t.debug_abbrev,\"\",@progbits";
+    DwarfInfoSection =    "\t.section\t.debug_info,\"\",@progbits";
+    DwarfLineSection =    "\t.section\t.debug_line,\"\",@progbits";
+    DwarfFrameSection =   "\t.section\t.debug_frame,\"\",@progbits";
+    DwarfPubNamesSection ="\t.section\t.debug_pubnames,\"\",@progbits";
+    DwarfPubTypesSection ="\t.section\t.debug_pubtypes,\"\",@progbits";
+    DwarfStrSection =     "\t.section\t.debug_str,\"\",@progbits";
+    DwarfLocSection =     "\t.section\t.debug_loc,\"\",@progbits";
+    DwarfARangesSection = "\t.section\t.debug_aranges,\"\",@progbits";
+    DwarfRangesSection =  "\t.section\t.debug_ranges,\"\",@progbits";
+    DwarfMacInfoSection = "\t.section\t.debug_macinfo,\"\",@progbits";
+
+    if (!Subtarget->is64Bit())
+      SupportsExceptionHandling = true;
+    DwarfEHFrameSection = "\t.section\t.eh_frame,\"aw\",@progbits";
+    DwarfExceptionSection = "\t.section\t.gcc_except_table,\"a\",@progbits";
+    break;
+
+  case X86Subtarget::isCygwin:
+  case X86Subtarget::isMingw:
+    GlobalPrefix = "_";
+    LCOMMDirective = "\t.lcomm\t";
+    COMMDirectiveTakesAlignment = false;
+    HasDotTypeDotSizeDirective = false;
+    StaticCtorsSection = "\t.section .ctors,\"aw\"";
+    StaticDtorsSection = "\t.section .dtors,\"aw\"";
+    HiddenDirective = NULL;
+    PrivateGlobalPrefix = "L";  // Prefix for private global symbols
+    WeakRefDirective = "\t.weak\t";
+    SetDirective = "\t.set\t";
+
+    // Set up DWARF directives
+    HasLEB128 = true;  // Target asm supports leb128 directives (little-endian)
+    AbsoluteDebugSectionOffsets = true;
+    AbsoluteEHSectionOffsets = false;
+    SupportsDebugInformation = true;
+    DwarfSectionOffsetDirective = "\t.secrel32\t";
+    DwarfAbbrevSection =  "\t.section\t.debug_abbrev,\"dr\"";
+    DwarfInfoSection =    "\t.section\t.debug_info,\"dr\"";
+    DwarfLineSection =    "\t.section\t.debug_line,\"dr\"";
+    DwarfFrameSection =   "\t.section\t.debug_frame,\"dr\"";
+    DwarfPubNamesSection ="\t.section\t.debug_pubnames,\"dr\"";
+    DwarfPubTypesSection ="\t.section\t.debug_pubtypes,\"dr\"";
+    DwarfStrSection =     "\t.section\t.debug_str,\"dr\"";
+    DwarfLocSection =     "\t.section\t.debug_loc,\"dr\"";
+    DwarfARangesSection = "\t.section\t.debug_aranges,\"dr\"";
+    DwarfRangesSection =  "\t.section\t.debug_ranges,\"dr\"";
+    DwarfMacInfoSection = "\t.section\t.debug_macinfo,\"dr\"";
+    break;
+
+  case X86Subtarget::isWindows:
+    GlobalPrefix = "_";
+    HasDotTypeDotSizeDirective = false;
+    break;
+
+  default: break;
+  }
+  
+  if (Subtarget->isFlavorIntel()) {
+    GlobalPrefix = "_";
+    CommentString = ";";
+  
+    PrivateGlobalPrefix = "$";
+    AlignDirective = "\talign\t";
+    ZeroDirective = "\tdb\t";
+    ZeroDirectiveSuffix = " dup(0)";
+    AsciiDirective = "\tdb\t";
+    AscizDirective = 0;
+    Data8bitsDirective = "\tdb\t";
+    Data16bitsDirective = "\tdw\t";
+    Data32bitsDirective = "\tdd\t";
+    Data64bitsDirective = "\tdq\t";
+    HasDotTypeDotSizeDirective = false;
+    
+    TextSection = "_text";
+    DataSection = "_data";
+    JumpTableDataSection = NULL;
+    SwitchToSectionDirective = "";
+    TextSectionStartSuffix = "\tsegment 'CODE'";
+    DataSectionStartSuffix = "\tsegment 'DATA'";
+    SectionEndDirectiveSuffix = "\tends\n";
+  }
+
+  AssemblerDialect = Subtarget->getAsmFlavor();
+}
+
+bool X86TargetAsmInfo::LowerToBSwap(CallInst *CI) const {
+  // FIXME: this should verify that we are targeting a 486 or better.  If not,
+  // we will turn this bswap into something that will be lowered to logical ops
+  // instead of emitting the bswap asm.  For now, we don't support 486 or lower
+  // so don't worry about this.
+  
+  // Verify this is a simple bswap.
+  if (CI->getNumOperands() != 2 ||
+      CI->getType() != CI->getOperand(1)->getType() ||
+      !CI->getType()->isInteger())
+    return false;
+  
+  const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+  if (!Ty || Ty->getBitWidth() % 16 != 0)
+    return false;
+  
+  // Okay, we can do this xform, do so now.
+  const Type *Tys[] = { Ty, Ty };
+  Module *M = CI->getParent()->getParent()->getParent();
+  Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 2);
+  
+  Value *Op = CI->getOperand(1);
+  Op = new CallInst(Int, Op, CI->getName(), CI);
+  
+  CI->replaceAllUsesWith(Op);
+  CI->eraseFromParent();
+  return true;
+}
+
+
+bool X86TargetAsmInfo::ExpandInlineAsm(CallInst *CI) const {
+  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
+  std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints();
+  
+  std::string AsmStr = IA->getAsmString();
+  
+  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
+  std::vector<std::string> AsmPieces;
+  SplitString(AsmStr, AsmPieces, "\n");  // ; as separator?
+  
+  switch (AsmPieces.size()) {
+  default: return false;    
+  case 1:
+    AsmStr = AsmPieces[0];
+    AsmPieces.clear();
+    SplitString(AsmStr, AsmPieces, " \t");  // Split with whitespace.
+    
+    // bswap $0
+    if (AsmPieces.size() == 2 && 
+        AsmPieces[0] == "bswap" && AsmPieces[1] == "$0") {
+      // No need to check constraints, nothing other than the equivalent of
+      // "=r,0" would be valid here.
+      return LowerToBSwap(CI);
+    }
+    break;
+  case 3:
+    if (CI->getType() == Type::Int64Ty && Constraints.size() >= 2 &&
+        Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
+        Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
+      // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
+      std::vector<std::string> Words;
+      SplitString(AsmPieces[0], Words, " \t");
+      if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") {
+        Words.clear();
+        SplitString(AsmPieces[1], Words, " \t");
+        if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") {
+          Words.clear();
+          SplitString(AsmPieces[2], Words, " \t,");
+          if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" &&
+              Words[2] == "%edx") {
+            return LowerToBSwap(CI);
+          }
+        }
+      }
+    }
+    break;
+  }
+  return false;
+}
diff --git a/lib/Target/X86/X86TargetAsmInfo.h b/lib/Target/X86/X86TargetAsmInfo.h
new file mode 100644
index 0000000..cc509d1
--- /dev/null
+++ b/lib/Target/X86/X86TargetAsmInfo.h
@@ -0,0 +1,33 @@
+//=====-- X86TargetAsmInfo.h - X86 asm properties -------------*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by James M. Laskey and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the X86TargetAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86TARGETASMINFO_H
+#define X86TARGETASMINFO_H
+
+#include "llvm/Target/TargetAsmInfo.h"
+
+namespace llvm {
+
+  // Forward declaration.
+  class X86TargetMachine;
+
+  struct X86TargetAsmInfo : public TargetAsmInfo {
+    X86TargetAsmInfo(const X86TargetMachine &TM);
+    
+    virtual bool ExpandInlineAsm(CallInst *CI) const;
+  private:
+    bool LowerToBSwap(CallInst *CI) const;
+  };
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
new file mode 100644
index 0000000..4d4bd3f
--- /dev/null
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -0,0 +1,190 @@
+//===-- X86TargetMachine.cpp - Define TargetMachine for the X86 -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetAsmInfo.h"
+#include "X86TargetMachine.h"
+#include "X86.h"
+#include "llvm/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetMachineRegistry.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+/// X86TargetMachineModule - Note that this is used on hosts that cannot link
+/// in a library unless there are references into the library.  In particular,
+/// it seems that it is not possible to get things to work on Win32 without
+/// this.  Though it is unused, do not remove it.
+extern "C" int X86TargetMachineModule;
+int X86TargetMachineModule = 0;
+
+namespace {
+  // Register the target.
+  RegisterTarget<X86_32TargetMachine>
+  X("x86",    "  32-bit X86: Pentium-Pro and above");
+  RegisterTarget<X86_64TargetMachine>
+  Y("x86-64", "  64-bit X86: EM64T and AMD64");
+}
+
+const TargetAsmInfo *X86TargetMachine::createTargetAsmInfo() const {
+  return new X86TargetAsmInfo(*this);
+}
+
+unsigned X86_32TargetMachine::getJITMatchQuality() {
+#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)
+  return 10;
+#endif
+  return 0;
+}
+
+unsigned X86_64TargetMachine::getJITMatchQuality() {
+#if defined(__x86_64__)
+  return 10;
+#endif
+  return 0;
+}
+
+unsigned X86_32TargetMachine::getModuleMatchQuality(const Module &M) {
+  // We strongly match "i[3-9]86-*".
+  std::string TT = M.getTargetTriple();
+  if (TT.size() >= 5 && TT[0] == 'i' && TT[2] == '8' && TT[3] == '6' &&
+      TT[4] == '-' && TT[1] - '3' < 6)
+    return 20;
+  // If the target triple is something non-X86, we don't match.
+  if (!TT.empty()) return 0;
+
+  if (M.getEndianness()  == Module::LittleEndian &&
+      M.getPointerSize() == Module::Pointer32)
+    return 10;                                   // Weak match
+  else if (M.getEndianness() != Module::AnyEndianness ||
+           M.getPointerSize() != Module::AnyPointerSize)
+    return 0;                                    // Match for some other target
+
+  return getJITMatchQuality()/2;
+}
+
+unsigned X86_64TargetMachine::getModuleMatchQuality(const Module &M) {
+  // We strongly match "x86_64-*".
+  std::string TT = M.getTargetTriple();
+  if (TT.size() >= 7 && TT[0] == 'x' && TT[1] == '8' && TT[2] == '6' &&
+      TT[3] == '_' && TT[4] == '6' && TT[5] == '4' && TT[6] == '-')
+    return 20;
+
+  // We strongly match "amd64-*".
+  if (TT.size() >= 6 && TT[0] == 'a' && TT[1] == 'm' && TT[2] == 'd' &&
+      TT[3] == '6' && TT[4] == '4' && TT[5] == '-')
+    return 20;
+  
+  // If the target triple is something non-X86-64, we don't match.
+  if (!TT.empty()) return 0;
+
+  if (M.getEndianness()  == Module::LittleEndian &&
+      M.getPointerSize() == Module::Pointer64)
+    return 10;                                   // Weak match
+  else if (M.getEndianness() != Module::AnyEndianness ||
+           M.getPointerSize() != Module::AnyPointerSize)
+    return 0;                                    // Match for some other target
+
+  return getJITMatchQuality()/2;
+}
+
+X86_32TargetMachine::X86_32TargetMachine(const Module &M, const std::string &FS) 
+  : X86TargetMachine(M, FS, false) {
+}
+
+
+X86_64TargetMachine::X86_64TargetMachine(const Module &M, const std::string &FS)
+  : X86TargetMachine(M, FS, true) {
+}
+
+/// X86TargetMachine ctor - Create an X86 architecture model, 32-bit or
+/// 64-bit depending on is64Bit.
+///
+X86TargetMachine::X86TargetMachine(const Module &M, const std::string &FS,
+                                   bool is64Bit)
+  : Subtarget(M, FS, is64Bit),
+    DataLayout(Subtarget.is64Bit() ?
+               std::string("e-p:64:64-f64:32:64-i64:32:64") :
+               std::string("e-p:32:32-f64:32:64-i64:32:64")),
+    FrameInfo(TargetFrameInfo::StackGrowsDown,
+              Subtarget.getStackAlignment(), Subtarget.is64Bit() ? -8 : -4),
+    InstrInfo(*this), JITInfo(*this), TLInfo(*this) {
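+  // Pick a default relocation model: DynamicNoPIC on Darwin and Cygwin/MinGW,
+  // static elsewhere.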
+  if (getRelocationModel() == Reloc::Default)
+    if (Subtarget.isTargetDarwin() || Subtarget.isTargetCygMing())
+      setRelocationModel(Reloc::DynamicNoPIC);
+    else
+      setRelocationModel(Reloc::Static);
+  if (Subtarget.is64Bit()) {
+    // No DynamicNoPIC support under X86-64.
+    if (getRelocationModel() == Reloc::DynamicNoPIC)
+      setRelocationModel(Reloc::PIC_);
+    // Default X86-64 code model is small.
+    if (getCodeModel() == CodeModel::Default)
+      setCodeModel(CodeModel::Small);
+  }
+
+  if (Subtarget.isTargetCygMing())
+    Subtarget.setPICStyle(PICStyle::WinPIC);
+  else if (Subtarget.isTargetDarwin())
+    if (Subtarget.is64Bit())
+      Subtarget.setPICStyle(PICStyle::RIPRel);
+    else
+      Subtarget.setPICStyle(PICStyle::Stub);
+  else if (Subtarget.isTargetELF())
+    if (Subtarget.is64Bit())
+      Subtarget.setPICStyle(PICStyle::RIPRel);
+    else
+      Subtarget.setPICStyle(PICStyle::GOT);
+}
+
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+bool X86TargetMachine::addInstSelector(FunctionPassManager &PM, bool Fast) {
+  // Install an instruction selector.
+  PM.add(createX86ISelDag(*this, Fast));
+  return false;
+}
+
+bool X86TargetMachine::addPostRegAlloc(FunctionPassManager &PM, bool Fast) {
+  PM.add(createX86FloatingPointStackifierPass());
+  return true;  // -print-machineinstr should print after this.
+}
+
+bool X86TargetMachine::addAssemblyEmitter(FunctionPassManager &PM, bool Fast, 
+                                          std::ostream &Out) {
+  PM.add(createX86CodePrinterPass(Out, *this));
+  return false;
+}
+
+bool X86TargetMachine::addCodeEmitter(FunctionPassManager &PM, bool Fast,
+                                      MachineCodeEmitter &MCE) {
+  // FIXME: Move this to TargetJITInfo!
+  setRelocationModel(Reloc::Static);
+  Subtarget.setPICStyle(PICStyle::None);
+  
+  // The JIT cannot ensure globals are placed in the lower 4G of the address
+  // space.
+  if (Subtarget.is64Bit())
+    setCodeModel(CodeModel::Large);
+
+  PM.add(createX86CodeEmitterPass(*this, MCE));
+  return false;
+}
+
+bool X86TargetMachine::addSimpleCodeEmitter(FunctionPassManager &PM, bool Fast,
+                                            MachineCodeEmitter &MCE) {
+  PM.add(createX86CodeEmitterPass(*this, MCE));
+  return false;
+}
diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
new file mode 100644
index 0000000..0a4f1b5
--- /dev/null
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -0,0 +1,95 @@
+//===-- X86TargetMachine.h - Define TargetMachine for the X86 ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the X86 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86TARGETMACHINE_H
+#define X86TARGETMACHINE_H
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "X86.h"
+#include "X86ELFWriterInfo.h"
+#include "X86InstrInfo.h"
+#include "X86JITInfo.h"
+#include "X86Subtarget.h"
+#include "X86ISelLowering.h"
+
+namespace llvm {
+
+class X86TargetMachine : public LLVMTargetMachine {
+  X86Subtarget      Subtarget;
+  const TargetData  DataLayout; // Calculates type size & alignment
+  TargetFrameInfo   FrameInfo;
+  X86InstrInfo      InstrInfo;
+  X86JITInfo        JITInfo;
+  X86TargetLowering TLInfo;
+  X86ELFWriterInfo  ELFWriterInfo;
+
+protected:
+  virtual const TargetAsmInfo *createTargetAsmInfo() const;
+  
+public:
+  X86TargetMachine(const Module &M, const std::string &FS, bool is64Bit);
+
+  virtual const X86InstrInfo     *getInstrInfo() const { return &InstrInfo; }
+  virtual const TargetFrameInfo  *getFrameInfo() const { return &FrameInfo; }
+  virtual       TargetJITInfo    *getJITInfo()         { return &JITInfo; }
+  virtual const TargetSubtarget  *getSubtargetImpl() const{ return &Subtarget; }
+  virtual       X86TargetLowering *getTargetLowering() const { 
+    return const_cast<X86TargetLowering*>(&TLInfo); 
+  }
+  virtual const MRegisterInfo    *getRegisterInfo() const {
+    return &InstrInfo.getRegisterInfo();
+  }
+  virtual const TargetData       *getTargetData() const { return &DataLayout; }
+  virtual const X86ELFWriterInfo *getELFWriterInfo() const {
+    return Subtarget.isTargetELF() ? &ELFWriterInfo : 0;
+  }
+
+  static unsigned getModuleMatchQuality(const Module &M);
+  static unsigned getJITMatchQuality();
+  
+  // Set up the pass pipeline.
+  virtual bool addInstSelector(FunctionPassManager &PM, bool Fast);  
+  virtual bool addPostRegAlloc(FunctionPassManager &PM, bool Fast);
+  virtual bool addAssemblyEmitter(FunctionPassManager &PM, bool Fast, 
+                                  std::ostream &Out);
+  virtual bool addCodeEmitter(FunctionPassManager &PM, bool Fast,
+                              MachineCodeEmitter &MCE);
+  virtual bool addSimpleCodeEmitter(FunctionPassManager &PM, bool Fast,
+                                    MachineCodeEmitter &MCE);
+};
+
+/// X86_32TargetMachine - X86 32-bit target machine.
+///
+class X86_32TargetMachine : public X86TargetMachine {
+public:
+  X86_32TargetMachine(const Module &M, const std::string &FS);
+  
+  static unsigned getJITMatchQuality();
+  static unsigned getModuleMatchQuality(const Module &M);
+};
+
+/// X86_64TargetMachine - X86 64-bit target machine.
+///
+class X86_64TargetMachine : public X86TargetMachine {
+public:
+  X86_64TargetMachine(const Module &M, const std::string &FS);
+  
+  static unsigned getJITMatchQuality();
+  static unsigned getModuleMatchQuality(const Module &M);
+};
+
+} // End llvm namespace
+
+#endif