Massive rewrite of MMX:  
The x86_mmx type is used for MMX intrinsics, and for parameters and
return values where these use MMX registers; it is also
supported in load, store, and bitcast.

Only the above operations generate MMX instructions, and optimizations
do not operate on or produce MMX intrinsics. 
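
As an illustration only (this IR is a sketch, not part of this patch;
the padd.w intrinsic is one of the x86 MMX intrinsics handled below),
the forms that should still reach MMX registers are an intrinsic call
plus load, store, and bitcast on x86_mmx:

  declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx)

  define void @add_mmx(x86_mmx* %p, x86_mmx* %q) {
    %a = load x86_mmx* %p                ; load of x86_mmx is supported
    %b = load x86_mmx* %q
    %s = call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %a, x86_mmx %b)
    store x86_mmx %s, x86_mmx* %p        ; store of x86_mmx is supported
    ret void
  }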

MMX-sized vectors <2 x i32> etc. are lowered to XMM or split into
smaller pieces.  Optimizations may occur on these forms and the
result cast back to x86_mmx, provided the result feeds into a
previously existing x86_mmx operation.
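
A sketch of that case (again illustrative, not from this patch): the
generic <2 x i32> add below is lowered in XMM or split, and only the
bitcast back to x86_mmx feeding an existing x86_mmx use (here a store)
ends up in an MMX register.

  define void @vec_then_mmx(<2 x i32> %a, <2 x i32> %b, x86_mmx* %p) {
    %sum = add <2 x i32> %a, %b          ; lowered via XMM or split up
    %m   = bitcast <2 x i32> %sum to x86_mmx
    store x86_mmx %m, x86_mmx* %p        ; pre-existing x86_mmx operation
    ret void
  }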

The point of all this is to prevent optimizations from introducing
MMX operations, which is unsafe due to the EMMS problem: the MMX
registers alias the x87 floating-point stack, so stray MMX instructions
require an EMMS that the optimizers cannot reliably place.



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@115243 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index adc225b..2c77050 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4322,6 +4322,66 @@
     return 0;
   }
 
+  case Intrinsic::x86_mmx_pslli_w:
+  case Intrinsic::x86_mmx_pslli_d:
+  case Intrinsic::x86_mmx_pslli_q:
+  case Intrinsic::x86_mmx_psrli_w:
+  case Intrinsic::x86_mmx_psrli_d:
+  case Intrinsic::x86_mmx_psrli_q:
+  case Intrinsic::x86_mmx_psrai_w:
+  case Intrinsic::x86_mmx_psrai_d: {
+    SDValue ShAmt = getValue(I.getArgOperand(1));
+    if (isa<ConstantSDNode>(ShAmt)) {
+      visitTargetIntrinsic(I, Intrinsic);
+      return 0;
+    }
+    unsigned NewIntrinsic = 0;
+    EVT ShAmtVT = MVT::v2i32;
+    switch (Intrinsic) {
+    case Intrinsic::x86_mmx_pslli_w:
+      NewIntrinsic = Intrinsic::x86_mmx_psll_w;
+      break;
+    case Intrinsic::x86_mmx_pslli_d:
+      NewIntrinsic = Intrinsic::x86_mmx_psll_d;
+      break;
+    case Intrinsic::x86_mmx_pslli_q:
+      NewIntrinsic = Intrinsic::x86_mmx_psll_q;
+      break;
+    case Intrinsic::x86_mmx_psrli_w:
+      NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
+      break;
+    case Intrinsic::x86_mmx_psrli_d:
+      NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
+      break;
+    case Intrinsic::x86_mmx_psrli_q:
+      NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
+      break;
+    case Intrinsic::x86_mmx_psrai_w:
+      NewIntrinsic = Intrinsic::x86_mmx_psra_w;
+      break;
+    case Intrinsic::x86_mmx_psrai_d:
+      NewIntrinsic = Intrinsic::x86_mmx_psra_d;
+      break;
+    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
+    }
+
+    // The vector shift intrinsics with scalars use 32-bit shift amounts but
+    // the SSE2/MMX shift instructions read 64 bits. Set the upper 32 bits
+    // to zero.
+    // We must do this early because v2i32 is not a legal type.
+    DebugLoc dl = getCurDebugLoc();
+    SDValue ShOps[2];
+    ShOps[0] = ShAmt;
+    ShOps[1] = DAG.getConstant(0, MVT::i32);
+    ShAmt =  DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2);
+    EVT DestVT = TLI.getValueType(I.getType());
+    ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, DestVT, ShAmt);
+    Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
+                       DAG.getConstant(NewIntrinsic, MVT::i32),
+                       getValue(I.getArgOperand(0)), ShAmt);
+    setValue(&I, Res);
+    return 0;
+  }
   case Intrinsic::convertff:
   case Intrinsic::convertfsi:
   case Intrinsic::convertfui:
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index e3409ef..487e39f 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -48,7 +48,7 @@
 
   // MMX vector types are always returned in MM0. If the target doesn't have
   // MM0, it doesn't support these vector types.
-  CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToReg<[MM0]>>,
+  CCIfType<[x86mmx, v1i64], CCAssignToReg<[MM0]>>,
 
   // Long double types are always returned in ST0 (even with SSE).
   CCIfType<[f80], CCAssignToReg<[ST0, ST1]>>
@@ -95,14 +95,14 @@
   // returned in RAX. This disagrees with ABI documentation but is bug
   // compatible with gcc.
   CCIfType<[v1i64], CCAssignToReg<[RAX]>>,
-  CCIfType<[v8i8, v4i16, v2i32], CCAssignToReg<[XMM0, XMM1]>>,
+  CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1]>>,
   CCDelegateTo<RetCC_X86Common>
 ]>;
 
 // X86-Win64 C return-value convention.
 def RetCC_X86_Win64_C : CallingConv<[
   // The X86-Win64 calling convention always returns __m64 values in RAX.
-  CCIfType<[v8i8, v4i16, v2i32, v1i64], CCBitConvertToType<i64>>,
+  CCIfType<[x86mmx, v1i64], CCBitConvertToType<i64>>,
 
   // And FP in XMM0 only.
   CCIfType<[f32], CCAssignToReg<[XMM0]>>,
@@ -161,7 +161,7 @@
 
   // The first 8 MMX (except for v1i64) vector arguments are passed in XMM
   // registers on Darwin.
-  CCIfType<[v8i8, v4i16, v2i32],
+  CCIfType<[x86mmx],
             CCIfSubtarget<"isTargetDarwin()",
             CCIfSubtarget<"hasSSE2()",
             CCPromoteToType<v2i64>>>>,
@@ -192,7 +192,7 @@
            CCAssignToStack<32, 32>>,
 
   // __m64 vectors get 8-byte stack slots that are 8-byte aligned.
-  CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 8>>
+  CCIfType<[x86mmx, v1i64], CCAssignToStack<8, 8>>
 ]>;
 
 // Calling convention used on Win64
@@ -210,8 +210,7 @@
   CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCPassIndirect<i64>>,
 
   // The first 4 MMX vector arguments are passed in GPRs.
-  CCIfType<[v8i8, v4i16, v2i32, v1i64],
-           CCBitConvertToType<i64>>,
+  CCIfType<[x86mmx, v1i64], CCBitConvertToType<i64>>,
 
   // The first 4 integer arguments are passed in integer registers.
   CCIfType<[i32], CCAssignToRegWithShadow<[ECX , EDX , R8D , R9D ],
@@ -233,7 +232,7 @@
   CCIfType<[f80], CCAssignToStack<0, 0>>,
 
   // __m64 vectors get 8-byte stack slots that are 8-byte aligned.
-  CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 8>>
+  CCIfType<[x86mmx, v1i64], CCAssignToStack<8, 8>>
 ]>;
 
 def CC_X86_64_GHC : CallingConv<[
@@ -269,7 +268,7 @@
 
   // The first 3 __m64 (except for v1i64) vector arguments are passed in mmx
   // registers if the call is not a vararg call.
-  CCIfNotVarArg<CCIfType<[v8i8, v4i16, v2i32],
+  CCIfNotVarArg<CCIfType<[x86mmx],
                 CCAssignToReg<[MM0, MM1, MM2]>>>,
 
   // Integer/Float values get stored in stack slots that are 4 bytes in
@@ -300,7 +299,7 @@
 
   // __m64 vectors get 8-byte stack slots that are 4-byte aligned. They are
   // passed in the parameter area.
-  CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 4>>]>;
+  CCIfType<[x86mmx, v1i64], CCAssignToStack<8, 4>>]>;
 
 def CC_X86_32_C : CallingConv<[
   // Promote i8/i16 arguments to i32.
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 55daf76..7da9ad6 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -218,11 +218,8 @@
     setOperationAction(ISD::BIT_CONVERT      , MVT::i32  , Expand);
     if (Subtarget->is64Bit()) {
       setOperationAction(ISD::BIT_CONVERT    , MVT::f64  , Expand);
-      // Without SSE, i64->f64 goes through memory; i64->MMX is Legal.
-      if (Subtarget->hasMMX() && !DisableMMX)
-        setOperationAction(ISD::BIT_CONVERT    , MVT::i64  , Custom);
-      else 
-        setOperationAction(ISD::BIT_CONVERT    , MVT::i64  , Expand);
+      // Without SSE, i64->f64 goes through memory.
+      setOperationAction(ISD::BIT_CONVERT    , MVT::i64  , Expand);
     }
   }
 
@@ -615,91 +612,41 @@
   // with -msoft-float, disable use of MMX as well.
   if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) {
     addRegisterClass(MVT::x86mmx, X86::VR64RegisterClass, false);
-
-    // FIXME: Remove the rest of this stuff.
-    addRegisterClass(MVT::v8i8,  X86::VR64RegisterClass, false);
-    addRegisterClass(MVT::v4i16, X86::VR64RegisterClass, false);
-    addRegisterClass(MVT::v2i32, X86::VR64RegisterClass, false);
-    
-    addRegisterClass(MVT::v1i64, X86::VR64RegisterClass, false);
-
-    setOperationAction(ISD::ADD,                MVT::v8i8,  Legal);
-    setOperationAction(ISD::ADD,                MVT::v4i16, Legal);
-    setOperationAction(ISD::ADD,                MVT::v2i32, Legal);
-    setOperationAction(ISD::ADD,                MVT::v1i64, Legal);
-
-    setOperationAction(ISD::SUB,                MVT::v8i8,  Legal);
-    setOperationAction(ISD::SUB,                MVT::v4i16, Legal);
-    setOperationAction(ISD::SUB,                MVT::v2i32, Legal);
-    setOperationAction(ISD::SUB,                MVT::v1i64, Legal);
-
-    setOperationAction(ISD::MULHS,              MVT::v4i16, Legal);
-    setOperationAction(ISD::MUL,                MVT::v4i16, Legal);
-
-    setOperationAction(ISD::AND,                MVT::v8i8,  Promote);
-    AddPromotedToType (ISD::AND,                MVT::v8i8,  MVT::v1i64);
-    setOperationAction(ISD::AND,                MVT::v4i16, Promote);
-    AddPromotedToType (ISD::AND,                MVT::v4i16, MVT::v1i64);
-    setOperationAction(ISD::AND,                MVT::v2i32, Promote);
-    AddPromotedToType (ISD::AND,                MVT::v2i32, MVT::v1i64);
-    setOperationAction(ISD::AND,                MVT::v1i64, Legal);
-
-    setOperationAction(ISD::OR,                 MVT::v8i8,  Promote);
-    AddPromotedToType (ISD::OR,                 MVT::v8i8,  MVT::v1i64);
-    setOperationAction(ISD::OR,                 MVT::v4i16, Promote);
-    AddPromotedToType (ISD::OR,                 MVT::v4i16, MVT::v1i64);
-    setOperationAction(ISD::OR,                 MVT::v2i32, Promote);
-    AddPromotedToType (ISD::OR,                 MVT::v2i32, MVT::v1i64);
-    setOperationAction(ISD::OR,                 MVT::v1i64, Legal);
-
-    setOperationAction(ISD::XOR,                MVT::v8i8,  Promote);
-    AddPromotedToType (ISD::XOR,                MVT::v8i8,  MVT::v1i64);
-    setOperationAction(ISD::XOR,                MVT::v4i16, Promote);
-    AddPromotedToType (ISD::XOR,                MVT::v4i16, MVT::v1i64);
-    setOperationAction(ISD::XOR,                MVT::v2i32, Promote);
-    AddPromotedToType (ISD::XOR,                MVT::v2i32, MVT::v1i64);
-    setOperationAction(ISD::XOR,                MVT::v1i64, Legal);
-
-    setOperationAction(ISD::LOAD,               MVT::v8i8,  Promote);
-    AddPromotedToType (ISD::LOAD,               MVT::v8i8,  MVT::v1i64);
-    setOperationAction(ISD::LOAD,               MVT::v4i16, Promote);
-    AddPromotedToType (ISD::LOAD,               MVT::v4i16, MVT::v1i64);
-    setOperationAction(ISD::LOAD,               MVT::v2i32, Promote);
-    AddPromotedToType (ISD::LOAD,               MVT::v2i32, MVT::v1i64);
-    setOperationAction(ISD::LOAD,               MVT::v1i64, Legal);
-
-    setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i8,  Custom);
-    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4i16, Custom);
-    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i32, Custom);
-    setOperationAction(ISD::BUILD_VECTOR,       MVT::v1i64, Custom);
-
-    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v8i8,  Custom);
-    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4i16, Custom);
-    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i32, Custom);
-    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v1i64, Custom);
-
-    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Custom);
-    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Custom);
-    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Custom);
-
-    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i16, Custom);
-
-    setOperationAction(ISD::SELECT,             MVT::v8i8, Promote);
-    setOperationAction(ISD::SELECT,             MVT::v4i16, Promote);
-    setOperationAction(ISD::SELECT,             MVT::v2i32, Promote);
-    setOperationAction(ISD::SELECT,             MVT::v1i64, Custom);
-    setOperationAction(ISD::VSETCC,             MVT::v8i8, Custom);
-    setOperationAction(ISD::VSETCC,             MVT::v4i16, Custom);
-    setOperationAction(ISD::VSETCC,             MVT::v2i32, Custom);
-
-    if (!X86ScalarSSEf64 && Subtarget->is64Bit()) {
-      setOperationAction(ISD::BIT_CONVERT,        MVT::v8i8,  Custom);
-      setOperationAction(ISD::BIT_CONVERT,        MVT::v4i16, Custom);
-      setOperationAction(ISD::BIT_CONVERT,        MVT::v2i32, Custom);
-      setOperationAction(ISD::BIT_CONVERT,        MVT::v1i64, Custom);
-    }
+    // No operations on x86mmx are supported; everything uses intrinsics.
   }
 
+  // MMX-sized vectors (other than x86mmx) are expected to be expanded
+  // into smaller operations.
+  setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
+  setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
+  setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
+  setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
+  setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
+  setOperationAction(ISD::AND,                MVT::v4i16, Expand);
+  setOperationAction(ISD::AND,                MVT::v2i32, Expand);
+  setOperationAction(ISD::AND,                MVT::v1i64, Expand);
+  setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
+  setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
+  setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
+  setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
+  setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
+  setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
+  setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
+  setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
+  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
+  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
+  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
+  setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
+  setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
+  setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
+  setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
+  setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
+  setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
+  setOperationAction(ISD::BIT_CONVERT,        MVT::v8i8,  Expand);
+  setOperationAction(ISD::BIT_CONVERT,        MVT::v4i16, Expand);
+  setOperationAction(ISD::BIT_CONVERT,        MVT::v2i32, Expand);
+  setOperationAction(ISD::BIT_CONVERT,        MVT::v1i64, Expand);
+
   if (!UseSoftFloat && Subtarget->hasSSE1()) {
     addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);
 
@@ -821,10 +768,6 @@
 
     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
-    if (!DisableMMX && Subtarget->hasMMX()) {
-      setOperationAction(ISD::FP_TO_SINT,         MVT::v2i32, Custom);
-      setOperationAction(ISD::SINT_TO_FP,         MVT::v2i32, Custom);
-    }
   }
 
   if (Subtarget->hasSSE41()) {
@@ -1210,8 +1153,7 @@
     RRC = (Subtarget->is64Bit()
            ? X86::GR64RegisterClass : X86::GR32RegisterClass);
     break;
-  case MVT::v8i8: case MVT::v4i16:
-  case MVT::v2i32: case MVT::v1i64: 
+  case MVT::x86mmx:
     RRC = X86::VR64RegisterClass;
     break;
   case MVT::f32: case MVT::f64:
@@ -1345,12 +1287,11 @@
     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
     // which is returned in RAX / RDX.
     if (Subtarget->is64Bit()) {
-      if (ValVT.isVector() && ValVT.getSizeInBits() == 64) {
-        ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy);
+      if (ValVT == MVT::x86mmx) {
         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
+          ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy);
           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                   ValToCopy);
-          
           // If we don't have SSE2 available, convert to v4f32 so the generated
           // register is legal.
           if (!Subtarget->hasSSE2())
@@ -1650,7 +1591,7 @@
         RC = X86::VR256RegisterClass;
       else if (RegVT.isVector() && RegVT.getSizeInBits() == 128)
         RC = X86::VR128RegisterClass;
-      else if (RegVT.isVector() && RegVT.getSizeInBits() == 64)
+      else if (RegVT == MVT::x86mmx)
         RC = X86::VR64RegisterClass;
       else
         llvm_unreachable("Unknown argument type!");
@@ -1673,9 +1614,8 @@
       if (VA.isExtInLoc()) {
         // Handle MMX values passed in XMM regs.
         if (RegVT.isVector()) {
-          ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
-                                 ArgValue, DAG.getConstant(0, MVT::i64));
-          ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);
+          ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(),
+                                 ArgValue);
         } else
           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
       }
@@ -2876,7 +2816,7 @@
 /// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
 /// the second operand.
 static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) {
-  if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16)
+  if (VT == MVT::v4f32 || VT == MVT::v4i32)
     return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
   if (VT == MVT::v2f64 || VT == MVT::v2i64)
     return (Mask[0] < 2 && Mask[1] < 2);
@@ -3548,13 +3488,10 @@
                              DebugLoc dl) {
   assert(VT.isVector() && "Expected a vector type");
 
-  // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted
+  // Always build SSE zero vectors as <4 x i32> bitcasted
   // to their dest type. This ensures they get CSE'd.
   SDValue Vec;
-  if (VT.getSizeInBits() == 64) { // MMX
-    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
-    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
-  } else if (VT.getSizeInBits() == 128) {
+  if (VT.getSizeInBits() == 128) {  // SSE
     if (HasSSE2) {  // SSE2
       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
@@ -3582,10 +3519,7 @@
   // type.  This ensures they get CSE'd.
   SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
   SDValue Vec;
-  if (VT.getSizeInBits() == 64) // MMX
-    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
-  else // SSE
-    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+  Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
   return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
 }
 
@@ -4025,8 +3959,7 @@
 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
                          unsigned NumBits, SelectionDAG &DAG,
                          const TargetLowering &TLI, DebugLoc dl) {
-  bool isMMX = VT.getSizeInBits() == 64;
-  EVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64;
+  EVT ShVT = MVT::v2i64;
   unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
   SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp);
   return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
@@ -4180,10 +4113,10 @@
   if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
       (Op.getValueType().getSizeInBits() != 256 &&
        ISD::isBuildVectorAllOnes(Op.getNode()))) {
-    // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to
+    // Canonicalize this to <4 x i32> (SSE) to
     // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
     // eliminated on x86-32 hosts.
-    if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32)
+    if (Op.getValueType() == MVT::v4i32)
       return Op;
 
     if (ISD::isBuildVectorAllOnes(Op.getNode()))
@@ -4234,9 +4167,10 @@
     if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
         (!IsAllConstants || Idx == 0)) {
       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
-        // Handle MMX and SSE both.
-        EVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32;
-        unsigned VecElts = VT == MVT::v2i64 ? 4 : 2;
+        // Handle SSE only.
+        assert(VT == MVT::v2i64 && "Expected an SSE value type!");
+        EVT VecVT = MVT::v4i32;
+        unsigned VecElts = 4;
 
         // Truncate the value (which may itself be a constant) to i32, and
         // convert it to a vector with movd (S2V+shuffle to zero extend).
@@ -4275,7 +4209,8 @@
                                            DAG);
       } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
-        EVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32;
+        assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!");
+        EVT MiddleVT = MVT::v4i32;
         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item);
         Item = getShuffleVectorZeroOrUndef(Item, 0, true,
                                            Subtarget->hasSSE2(), DAG);
@@ -5418,11 +5353,8 @@
   MachineFunction &MF = DAG.getMachineFunction();
   bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize);
 
-  // FIXME: this is somehow handled during isel by MMX pattern fragments. Remove
-  // the check or come up with another solution when all MMX move to intrinsics,
-  // but don't allow this to be considered legal, we don't want vector_shuffle
-  // operations to be matched during isel anymore.
-  if (isMMX && SVOp->isSplat())
+  // Shuffle operations on MMX are not supported.
+  if (isMMX)
     return Op;
 
   // Vector shuffle lowering takes 3 steps:
@@ -5456,10 +5388,10 @@
       return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG);
 
   if (X86::isMOVDDUPMask(SVOp) && HasSSE3 && V2IsUndef &&
-      RelaxedMayFoldVectorLoad(V1) && !isMMX)
+      RelaxedMayFoldVectorLoad(V1))
     return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
 
-  if (!isMMX && X86::isMOVHLPS_v_undef_Mask(SVOp))
+  if (X86::isMOVHLPS_v_undef_Mask(SVOp))
     return getMOVHighToLow(Op, dl, DAG);
 
   // Use to match splats
@@ -5507,7 +5439,7 @@
       return V2;
     if (ISD::isBuildVectorAllZeros(V1.getNode()))
       return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
-    if (!isMMX && !X86::isMOVLPMask(SVOp)) {
+    if (!X86::isMOVLPMask(SVOp)) {
       if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
         return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
 
@@ -5517,22 +5449,20 @@
   }
 
   // FIXME: fold these into legal mask.
-  if (!isMMX) {
-    if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp))
-      return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
+  if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp))
+    return getMOVLowToHigh(Op, dl, DAG, HasSSE2);
 
-    if (X86::isMOVHLPSMask(SVOp))
-      return getMOVHighToLow(Op, dl, DAG);
+  if (X86::isMOVHLPSMask(SVOp))
+    return getMOVHighToLow(Op, dl, DAG);
 
-    if (X86::isMOVSHDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4)
-      return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
+  if (X86::isMOVSHDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4)
+    return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);
 
-    if (X86::isMOVSLDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4)
-      return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
+  if (X86::isMOVSLDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4)
+    return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);
 
-    if (X86::isMOVLPMask(SVOp))
-      return getMOVLP(Op, dl, DAG, HasSSE2);
-  }
+  if (X86::isMOVLPMask(SVOp))
+    return getMOVLP(Op, dl, DAG, HasSSE2);
 
   if (ShouldXformToMOVHLPS(SVOp) ||
       ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp))
@@ -5573,12 +5503,10 @@
   }
 
   if (X86::isUNPCKLMask(SVOp))
-    return (isMMX) ?
-      Op : getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V2, DAG);
+    return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V2, DAG);
 
   if (X86::isUNPCKHMask(SVOp))
-    return (isMMX) ?
-      Op : getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V2, DAG);
+    return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V2, DAG);
 
   if (V2IsSplat) {
     // Normalize mask so all entries that point to V2 points to its first
@@ -5602,18 +5530,14 @@
     ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp);
 
     if (X86::isUNPCKLMask(NewSVOp))
-      return (isMMX) ?
-        NewOp : getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V2, V1, DAG);
+      return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V2, V1, DAG);
 
     if (X86::isUNPCKHMask(NewSVOp))
-      return (isMMX) ?
-        NewOp : getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V2, V1, DAG);
+      return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V2, V1, DAG);
   }
 
-  // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle.
-
   // Normalize the node to match x86 shuffle ops if needed
-  if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp))
+  if (V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp))
     return CommuteVectorShuffle(SVOp, DAG);
 
   // The checks below are all present in isShuffleMaskLegal, but they are
@@ -5627,12 +5551,6 @@
                                 X86::getShufflePALIGNRImmediate(SVOp),
                                 DAG);
 
-  // Only a few shuffle masks are handled for 64-bit vectors (MMX), and
-  // 64-bit vectors which made to this point can't be handled, they are
-  // expanded.
-  if (isMMX)
-    return SDValue();
-
   if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
       SVOp->getSplatIndex() == 0 && V2IsUndef) {
     if (VT == MVT::v2f64)
@@ -5681,8 +5599,8 @@
       return NewOp;
   }
 
-  // Handle all 4 wide cases with a number of shuffles except for MMX.
-  if (NumElems == 4 && !isMMX)
+  // Handle all 4 wide cases with a number of shuffles.
+  if (NumElems == 4)
     return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG);
 
   return SDValue();
@@ -5824,8 +5742,6 @@
     unsigned Opc;
     if (VT == MVT::v8i16)
       Opc = X86ISD::PINSRW;
-    else if (VT == MVT::v4i16)
-      Opc = X86ISD::MMX_PINSRW;
     else if (VT == MVT::v16i8)
       Opc = X86ISD::PINSRB;
     else
@@ -5881,8 +5797,7 @@
       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
     if (N2.getValueType() != MVT::i32)
       N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
-    return DAG.getNode(VT == MVT::v8i16 ? X86ISD::PINSRW : X86ISD::MMX_PINSRW,
-                       dl, VT, N0, N1, N2);
+    return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
   }
   return SDValue();
 }
@@ -5896,16 +5811,10 @@
     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
 
   SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
-  EVT VT = MVT::v2i32;
-  switch (Op.getValueType().getSimpleVT().SimpleTy) {
-  default: break;
-  case MVT::v16i8:
-  case MVT::v8i16:
-    VT = MVT::v4i32;
-    break;
-  }
+  assert(Op.getValueType().getSimpleVT().getSizeInBits() == 128 &&
+         "Expected an SSE type!");
   return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(),
-                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt));
+                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
 }
 
 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
@@ -6322,11 +6231,8 @@
                                            SelectionDAG &DAG) const {
   EVT SrcVT = Op.getOperand(0).getValueType();
 
-  if (SrcVT.isVector()) {
-    if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64)
-      return Op;
+  if (SrcVT.isVector())
     return SDValue();
-  }
 
   assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
          "Unknown SINT_TO_FP to lower!");
@@ -6702,13 +6608,8 @@
 
 SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
                                            SelectionDAG &DAG) const {
-  if (Op.getValueType().isVector()) {
-    if (Op.getValueType() == MVT::v2i32 &&
-        Op.getOperand(0).getValueType() == MVT::v2f64) {
-      return Op;
-    }
+  if (Op.getValueType().isVector())
     return SDValue();
-  }
 
   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true);
   SDValue FIST = Vals.first, StackSlot = Vals.second;
@@ -7211,11 +7112,8 @@
 
   switch (VT.getSimpleVT().SimpleTy) {
   default: break;
-  case MVT::v8i8:
   case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break;
-  case MVT::v4i16:
   case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break;
-  case MVT::v2i32:
   case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break;
   case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break;
   }
@@ -7930,6 +7828,7 @@
       ShAmt =  DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4);
     } else {
       ShAmt =  DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2);
+      // FIXME: this must be lowered to get rid of the invalid type.
     }
 
     EVT VT = Op.getValueType();
@@ -8840,7 +8739,6 @@
   case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
   case X86ISD::PINSRB:             return "X86ISD::PINSRB";
   case X86ISD::PINSRW:             return "X86ISD::PINSRW";
-  case X86ISD::MMX_PINSRW:         return "X86ISD::MMX_PINSRW";
   case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
   case X86ISD::FMAX:               return "X86ISD::FMAX";
   case X86ISD::FMIN:               return "X86ISD::FMIN";
@@ -9711,7 +9609,6 @@
   case X86::TLSCall_64:
     return EmitLoweredTLSCall(MI, BB);
   case X86::CMOV_GR8:
-  case X86::CMOV_V1I64:
   case X86::CMOV_FR32:
   case X86::CMOV_FR64:
   case X86::CMOV_V4F32:
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 2a86fa8..1a2da74 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -128,11 +128,15 @@
       /// relative displacements.
       WrapperRIP,
 
-      /// MOVQ2DQ - Copies a 64-bit value from a vector to another vector.
-      /// Can be used to move a vector value from a MMX register to a XMM
-      /// register.
+      /// MOVQ2DQ - Copies a 64-bit value from an MMX vector to the low word
+      /// of an XMM vector, with the high word zero filled.
       MOVQ2DQ,
 
+      /// MOVDQ2Q - Copies a 64-bit value from the low word of an XMM vector
+      /// to an MMX vector.  If you think this is too close to the previous
+      /// mnemonic, so do I; blame Intel.
+      MOVDQ2Q,
+
       /// PEXTRB - Extract an 8-bit value from a vector and zero extend it to
       /// i32, corresponds to X86::PEXTRB.
       PEXTRB,
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 104f880..70c3d07 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -15,51 +15,8 @@
 // MMX Pattern Fragments
 //===----------------------------------------------------------------------===//
 
-def load_mmx : PatFrag<(ops node:$ptr), (v1i64 (load node:$ptr))>;
-
-def bc_v8i8  : PatFrag<(ops node:$in), (v8i8  (bitconvert node:$in))>;
-def bc_v4i16 : PatFrag<(ops node:$in), (v4i16 (bitconvert node:$in))>;
-def bc_v2i32 : PatFrag<(ops node:$in), (v2i32 (bitconvert node:$in))>;
-def bc_v1i64 : PatFrag<(ops node:$in), (v1i64 (bitconvert node:$in))>;
-
-//===----------------------------------------------------------------------===//
-// MMX Masks
-//===----------------------------------------------------------------------===//
-
-// MMX_SHUFFLE_get_shuf_imm xform function: convert vector_shuffle mask to
-// PSHUFW imm.
-def MMX_SHUFFLE_get_shuf_imm : SDNodeXForm<vector_shuffle, [{
-  return getI8Imm(X86::getShuffleSHUFImmediate(N));
-}]>;
-
-// Patterns for: vector_shuffle v1, v2, <2, 6, 3, 7, ...>
-def mmx_unpckh : PatFrag<(ops node:$lhs, node:$rhs),
-                         (vector_shuffle node:$lhs, node:$rhs), [{
-  return X86::isUNPCKHMask(cast<ShuffleVectorSDNode>(N));
-}]>;
-
-// Patterns for: vector_shuffle v1, v2, <0, 4, 2, 5, ...>
-def mmx_unpckl : PatFrag<(ops node:$lhs, node:$rhs),
-                         (vector_shuffle node:$lhs, node:$rhs), [{
-  return X86::isUNPCKLMask(cast<ShuffleVectorSDNode>(N));
-}]>;
-
-// Patterns for: vector_shuffle v1, <undef>, <0, 0, 1, 1, ...>
-def mmx_unpckh_undef : PatFrag<(ops node:$lhs, node:$rhs),
-                               (vector_shuffle node:$lhs, node:$rhs), [{
-  return X86::isUNPCKH_v_undef_Mask(cast<ShuffleVectorSDNode>(N));
-}]>;
-
-// Patterns for: vector_shuffle v1, <undef>, <2, 2, 3, 3, ...>
-def mmx_unpckl_undef : PatFrag<(ops node:$lhs, node:$rhs),
-                               (vector_shuffle node:$lhs, node:$rhs), [{
-  return X86::isUNPCKL_v_undef_Mask(cast<ShuffleVectorSDNode>(N));
-}]>;
-
-def mmx_pshufw : PatFrag<(ops node:$lhs, node:$rhs),
-                         (vector_shuffle node:$lhs, node:$rhs), [{
-  return X86::isPSHUFDMask(cast<ShuffleVectorSDNode>(N));
-}], MMX_SHUFFLE_get_shuf_imm>;
+def load_mmx : PatFrag<(ops node:$ptr), (x86mmx (load node:$ptr))>;
+def bc_mmx  : PatFrag<(ops node:$in), (x86mmx  (bitconvert node:$in))>;
 
 //===----------------------------------------------------------------------===//
 // SSE specific DAG Nodes.
@@ -286,9 +243,7 @@
   return cast<LoadSDNode>(N)->getAlignment() >= 8;
 }]>;
 
-def memopv8i8  : PatFrag<(ops node:$ptr), (v8i8  (memop64 node:$ptr))>;
-def memopv4i16 : PatFrag<(ops node:$ptr), (v4i16 (memop64 node:$ptr))>;
-def memopv2i32 : PatFrag<(ops node:$ptr), (v2i32 (memop64 node:$ptr))>;
+def memopmmx  : PatFrag<(ops node:$ptr), (x86mmx  (memop64 node:$ptr))>;
 
 // MOVNT Support
 // Like 'store', but requires the non-temporal bit to be set
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index b9416be..a927761 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -21,28 +21,7 @@
 //===----------------------------------------------------------------------===//
 
 let Constraints = "$src1 = $dst" in {
-  // MMXI_binop_rm - Simple MMX binary operator based on llvm operator.
-  multiclass MMXI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
-                           ValueType OpVT, bit Commutable = 0> {
-    def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
-                  (ins VR64:$src1, VR64:$src2),
-                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
-                  [(set VR64:$dst, (OpVT (OpNode VR64:$src1, VR64:$src2)))]> {
-      let isCommutable = Commutable;
-    }
-    def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
-                  (ins VR64:$src1, i64mem:$src2),
-                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
-                  [(set VR64:$dst, (OpVT (OpNode VR64:$src1,
-                                         (bitconvert
-                                          (load_mmx addr:$src2)))))]>;
-  }
-
-  // MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic, with a
-  // different name for the generated instructions than MMXI_binop_rm uses.
-  // Thus int and rm can coexist for different implementations of the same
-  // instruction.  This is temporary during transition to intrinsic-only 
-  // implementation; eventually the non-intrinsic forms will go away.  When
+  // MMXI_binop_rm_int - Simple MMX binary operator based on intrinsic.
   // When this is cleaned up, remove the FIXME from X86RecognizableInstr.cpp.
   multiclass MMXI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
                                bit Commutable = 0> {
@@ -59,26 +38,6 @@
                                    (bitconvert (load_mmx addr:$src2))))]>;
   }
 
-  // MMXI_binop_rm_v1i64 - Simple MMX binary operator whose type is v1i64.
-  //
-  // FIXME: we could eliminate this and use MMXI_binop_rm instead if tblgen knew
-  // to collapse (bitconvert VT to VT) into its operand.
-  //
-  multiclass MMXI_binop_rm_v1i64<bits<8> opc, string OpcodeStr, SDNode OpNode,
-                                 bit Commutable = 0> {
-    def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
-                                  (ins VR64:$src1, VR64:$src2),
-                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
-                  [(set VR64:$dst, (v1i64 (OpNode VR64:$src1, VR64:$src2)))]> {
-      let isCommutable = Commutable;
-    }
-    def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
-                                  (ins VR64:$src1, i64mem:$src2),
-                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
-                  [(set VR64:$dst,
-                    (OpNode VR64:$src1,(load_mmx addr:$src2)))]>;
-  }
-
   multiclass MMXI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
                                 string OpcodeStr, Intrinsic IntId,
                                 Intrinsic IntId2> {
@@ -100,7 +59,7 @@
 
 /// Unary MMX instructions requiring SSSE3.
 multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr,
-                               PatFrag mem_frag64, Intrinsic IntId64> {
+                               Intrinsic IntId64> {
   def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR64:$dst, (IntId64 VR64:$src))]>;
@@ -108,13 +67,13 @@
   def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR64:$dst,
-                     (IntId64 (bitconvert (mem_frag64 addr:$src))))]>;
+                     (IntId64 (bitconvert (memopmmx addr:$src))))]>;
 }
 
 /// Binary MMX instructions requiring SSSE3.
 let ImmT = NoImm, Constraints = "$src1 = $dst" in {
 multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr,
-                             PatFrag mem_frag64, Intrinsic IntId64> {
+                             Intrinsic IntId64> {
   let isCommutable = 0 in
   def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
        (ins VR64:$src1, VR64:$src2),
@@ -125,18 +84,12 @@
         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
        [(set VR64:$dst,
          (IntId64 VR64:$src1,
-          (bitconvert (mem_frag64 addr:$src2))))]>;
+          (bitconvert (memopmmx addr:$src2))))]>;
 }
 }
 
 /// PALIGN MMX instructions (require SSSE3).
 multiclass ssse3_palign_mm<string asm, Intrinsic IntId> {
-  def R64rr  : SS3AI<0x0F, MRMSrcReg, (outs VR64:$dst),
-      (ins VR64:$src1, VR64:$src2, i8imm:$src3),
-      !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>;
-  def R64rm  : SS3AI<0x0F, MRMSrcMem, (outs VR64:$dst),
-      (ins VR64:$src1, i64mem:$src2, i8imm:$src3),
-      !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>;
   def R64irr  : SS3AI<0x0F, MRMSrcReg, (outs VR64:$dst),
       (ins VR64:$src1, VR64:$src2, i8imm:$src3),
       !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), 
@@ -184,12 +137,12 @@
 def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
                         "movd\t{$src, $dst|$dst, $src}",
                         [(set VR64:$dst, 
-                         (v2i32 (scalar_to_vector GR32:$src)))]>;
-let canFoldAsLoad = 1, isReMaterializable = 1 in
+                         (x86mmx (scalar_to_vector GR32:$src)))]>;
+let canFoldAsLoad = 1 in
 def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src),
                         "movd\t{$src, $dst|$dst, $src}",
               [(set VR64:$dst,
-               (v2i32 (scalar_to_vector (loadi32 addr:$src))))]>;
+               (x86mmx (scalar_to_vector (loadi32 addr:$src))))]>;
 let mayStore = 1 in
 def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src),
                         "movd\t{$src, $dst|$dst, $src}", []>;
@@ -201,42 +154,41 @@
                              "movd\t{$src, $dst|$dst, $src}",
                              []>;
 
-let neverHasSideEffects = 1 in
 // These are 64 bit moves, but since the OS X assembler doesn't
 // recognize a register-register movq, we write them as
 // movd.
 def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg,
                                (outs GR64:$dst), (ins VR64:$src),
-                               "movd\t{$src, $dst|$dst, $src}", []>;
+                               "movd\t{$src, $dst|$dst, $src}", 
+                             [(set GR64:$dst,
+                              (bitconvert VR64:$src))]>;
 def MMX_MOVD64rrv164 : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
                              "movd\t{$src, $dst|$dst, $src}",
                              [(set VR64:$dst,
-                              (v1i64 (scalar_to_vector GR64:$src)))]>;
-
+                              (bitconvert GR64:$src))]>;
 let neverHasSideEffects = 1 in
 def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
                         "movq\t{$src, $dst|$dst, $src}", []>;
-let canFoldAsLoad = 1, isReMaterializable = 1 in
+let canFoldAsLoad = 1 in
 def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(set VR64:$dst, (load_mmx addr:$src))]>;
 def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
                         "movq\t{$src, $dst|$dst, $src}",
-                        [(store (v1i64 VR64:$src), addr:$dst)]>;
+                        [(store (x86mmx VR64:$src), addr:$dst)]>;
 
 def MMX_MOVDQ2Qrr : SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
                           "movdq2q\t{$src, $dst|$dst, $src}",
                           [(set VR64:$dst,
-                            (v1i64 (bitconvert
+                            (x86mmx (bitconvert
                             (i64 (vector_extract (v2i64 VR128:$src),
                                   (iPTR 0))))))]>;
 
 def MMX_MOVQ2DQrr : SSDIi8<0xD6, MRMSrcReg, (outs VR128:$dst), (ins VR64:$src),
                            "movq2dq\t{$src, $dst|$dst, $src}",
           [(set VR128:$dst,
-            (movl immAllZerosV,
-                  (v2i64 (scalar_to_vector
-                              (i64 (bitconvert (v1i64 VR64:$src)))))))]>;
+            (v2i64 (scalar_to_vector
+                              (i64 (bitconvert (x86mmx VR64:$src))))))]>;
 
 let neverHasSideEffects = 1 in
 def MMX_MOVQ2FR64rr: SSDIi8<0xD6, MRMSrcReg, (outs FR64:$dst), (ins VR64:$src),
@@ -254,54 +206,40 @@
 def MMX_MOVZDI2PDIrr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
                              "movd\t{$src, $dst|$dst, $src}",
               [(set VR64:$dst,
-                    (v2i32 (X86vzmovl (v2i32 (scalar_to_vector GR32:$src)))))]>;
+                    (x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))))]>;
 let AddedComplexity = 20 in
 def MMX_MOVZDI2PDIrm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst),
                            (ins i32mem:$src),
                              "movd\t{$src, $dst|$dst, $src}",
           [(set VR64:$dst,
-                (v2i32 (X86vzmovl (v2i32
+                (x86mmx (X86vzmovl (x86mmx
                                    (scalar_to_vector (loadi32 addr:$src))))))]>;
 
 // Arithmetic Instructions
-defm MMX_PABSB : SS3I_unop_rm_int_mm<0x1C, "pabsb", memopv8i8,
-                                 int_x86_ssse3_pabs_b>;
-defm MMX_PABSW : SS3I_unop_rm_int_mm<0x1D, "pabsw", memopv4i16,
-                                 int_x86_ssse3_pabs_w>;
-defm MMX_PABSD : SS3I_unop_rm_int_mm<0x1E, "pabsd", memopv2i32,
-                                 int_x86_ssse3_pabs_d>;
+defm MMX_PABSB : SS3I_unop_rm_int_mm<0x1C, "pabsb", int_x86_ssse3_pabs_b>;
+defm MMX_PABSW : SS3I_unop_rm_int_mm<0x1D, "pabsw", int_x86_ssse3_pabs_w>;
+defm MMX_PABSD : SS3I_unop_rm_int_mm<0x1E, "pabsd", int_x86_ssse3_pabs_d>;
 // -- Addition
-defm MMX_PADDB : MMXI_binop_rm<0xFC, "paddb", add, v8i8,  1>,
-                 MMXI_binop_rm_int<0xFC, "paddb", int_x86_mmx_padd_b, 1>;
-defm MMX_PADDW : MMXI_binop_rm<0xFD, "paddw", add, v4i16, 1>,
-                 MMXI_binop_rm_int<0xFD, "paddw", int_x86_mmx_padd_w, 1>;
-defm MMX_PADDD : MMXI_binop_rm<0xFE, "paddd", add, v2i32, 1>,
-                 MMXI_binop_rm_int<0xFE, "paddd", int_x86_mmx_padd_d, 1>;
-defm MMX_PADDQ : MMXI_binop_rm<0xD4, "paddq", add, v1i64, 1>,
-                 MMXI_binop_rm_int<0xD4, "paddq", int_x86_mmx_padd_q, 1>;
+defm MMX_PADDB : MMXI_binop_rm_int<0xFC, "paddb", int_x86_mmx_padd_b, 1>;
+defm MMX_PADDW : MMXI_binop_rm_int<0xFD, "paddw", int_x86_mmx_padd_w, 1>;
+defm MMX_PADDD : MMXI_binop_rm_int<0xFE, "paddd", int_x86_mmx_padd_d, 1>;
+defm MMX_PADDQ : MMXI_binop_rm_int<0xD4, "paddq", int_x86_mmx_padd_q, 1>;
 defm MMX_PADDSB  : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b, 1>;
 defm MMX_PADDSW  : MMXI_binop_rm_int<0xED, "paddsw" , int_x86_mmx_padds_w, 1>;
 
 defm MMX_PADDUSB : MMXI_binop_rm_int<0xDC, "paddusb", int_x86_mmx_paddus_b, 1>;
 defm MMX_PADDUSW : MMXI_binop_rm_int<0xDD, "paddusw", int_x86_mmx_paddus_w, 1>;
 
-defm MMX_PHADDW  : SS3I_binop_rm_int_mm<0x01, "phaddw", memopv4i16,
-                                     int_x86_ssse3_phadd_w>;
-defm MMX_PHADD   : SS3I_binop_rm_int_mm<0x02, "phaddd", memopv2i32,
-                                     int_x86_ssse3_phadd_d>;
-defm MMX_PHADDSW : SS3I_binop_rm_int_mm<0x03, "phaddsw", memopv4i16,
-                                     int_x86_ssse3_phadd_sw>;
+defm MMX_PHADDW  : SS3I_binop_rm_int_mm<0x01, "phaddw", int_x86_ssse3_phadd_w>;
+defm MMX_PHADD   : SS3I_binop_rm_int_mm<0x02, "phaddd", int_x86_ssse3_phadd_d>;
+defm MMX_PHADDSW : SS3I_binop_rm_int_mm<0x03, "phaddsw",int_x86_ssse3_phadd_sw>;
 
 
 // -- Subtraction
-defm MMX_PSUBB : MMXI_binop_rm<0xF8, "psubb", sub, v8i8>,
-                 MMXI_binop_rm_int<0xF8, "psubb", int_x86_mmx_psub_b>;
-defm MMX_PSUBW : MMXI_binop_rm<0xF9, "psubw", sub, v4i16>,
-                 MMXI_binop_rm_int<0xF9, "psubw", int_x86_mmx_psub_w>;
-defm MMX_PSUBD : MMXI_binop_rm<0xFA, "psubd", sub, v2i32>,
-                 MMXI_binop_rm_int<0xFA, "psubd", int_x86_mmx_psub_d>;
-defm MMX_PSUBQ : MMXI_binop_rm<0xFB, "psubq", sub, v1i64>,
-                 MMXI_binop_rm_int<0xFB, "psubq", int_x86_mmx_psub_q>;
+defm MMX_PSUBB : MMXI_binop_rm_int<0xF8, "psubb", int_x86_mmx_psub_b>;
+defm MMX_PSUBW : MMXI_binop_rm_int<0xF9, "psubw", int_x86_mmx_psub_w>;
+defm MMX_PSUBD : MMXI_binop_rm_int<0xFA, "psubd", int_x86_mmx_psub_d>;
+defm MMX_PSUBQ : MMXI_binop_rm_int<0xFB, "psubq", int_x86_mmx_psub_q>;
 
 defm MMX_PSUBSB  : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b>;
 defm MMX_PSUBSW  : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w>;
@@ -309,28 +247,24 @@
 defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b>;
 defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w>;
 
-defm MMX_PHSUBW  : SS3I_binop_rm_int_mm<0x05, "phsubw", memopv4i16,
-                                     int_x86_ssse3_phsub_w>;
-defm MMX_PHSUBD  : SS3I_binop_rm_int_mm<0x06, "phsubd", memopv2i32,
-                                     int_x86_ssse3_phsub_d>;
-defm MMX_PHSUBSW : SS3I_binop_rm_int_mm<0x07, "phsubsw", memopv4i16,
-                                     int_x86_ssse3_phsub_sw>;
+defm MMX_PHSUBW  : SS3I_binop_rm_int_mm<0x05, "phsubw", int_x86_ssse3_phsub_w>;
+defm MMX_PHSUBD  : SS3I_binop_rm_int_mm<0x06, "phsubd", int_x86_ssse3_phsub_d>;
+defm MMX_PHSUBSW : SS3I_binop_rm_int_mm<0x07, "phsubsw",int_x86_ssse3_phsub_sw>;
 
 // -- Multiplication
-defm MMX_PMULLW  : MMXI_binop_rm<0xD5, "pmullw", mul, v4i16, 1>,
-                   MMXI_binop_rm_int<0xD5, "pmullw", int_x86_mmx_pmull_w, 1>;
+defm MMX_PMULLW  : MMXI_binop_rm_int<0xD5, "pmullw", int_x86_mmx_pmull_w, 1>;
 
 defm MMX_PMULHW  : MMXI_binop_rm_int<0xE5, "pmulhw",  int_x86_mmx_pmulh_w,  1>;
 defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w, 1>;
 defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq, 1>;
 let isCommutable = 1 in
-defm MMX_PMULHRSW : SS3I_binop_rm_int_mm<0x0B, "pmulhrsw", memopv4i16,
+defm MMX_PMULHRSW : SS3I_binop_rm_int_mm<0x0B, "pmulhrsw",
                                      int_x86_ssse3_pmul_hr_sw>;
 
 // -- Miscellanea
 defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd, 1>;
 
-defm MMX_PMADDUBSW : SS3I_binop_rm_int_mm<0x04, "pmaddubsw", memopv8i8,
+defm MMX_PMADDUBSW : SS3I_binop_rm_int_mm<0x04, "pmaddubsw",
                                      int_x86_ssse3_pmadd_ub_sw>;
 defm MMX_PAVGB   : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b, 1>;
 defm MMX_PAVGW   : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w, 1>;
@@ -343,57 +277,18 @@
 
 defm MMX_PSADBW  : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw, 1>;
 
-defm MMX_PSIGNB :   SS3I_binop_rm_int_mm<0x08, "psignb", memopv8i8,
-                                     int_x86_ssse3_psign_b>;
-defm MMX_PSIGNW :   SS3I_binop_rm_int_mm<0x09, "psignw", memopv4i16,
-                                     int_x86_ssse3_psign_w>;
-defm MMX_PSIGND :   SS3I_binop_rm_int_mm<0x0A, "psignd", memopv2i32,
-                                     int_x86_ssse3_psign_d>;
+defm MMX_PSIGNB :  SS3I_binop_rm_int_mm<0x08, "psignb", int_x86_ssse3_psign_b>;
+defm MMX_PSIGNW :  SS3I_binop_rm_int_mm<0x09, "psignw", int_x86_ssse3_psign_w>;
+defm MMX_PSIGND :  SS3I_binop_rm_int_mm<0x0A, "psignd", int_x86_ssse3_psign_d>;
 let Constraints = "$src1 = $dst" in
   defm MMX_PALIGN : ssse3_palign_mm<"palignr", int_x86_mmx_palignr_b>;
 
-let AddedComplexity = 5 in {
-
-def : Pat<(v1i64 (palign:$src3 VR64:$src1, VR64:$src2)),
-          (MMX_PALIGNR64rr VR64:$src2, VR64:$src1,
-                       (SHUFFLE_get_palign_imm VR64:$src3))>,
-          Requires<[HasSSSE3]>;
-def : Pat<(v2i32 (palign:$src3 VR64:$src1, VR64:$src2)),
-          (MMX_PALIGNR64rr VR64:$src2, VR64:$src1,
-                       (SHUFFLE_get_palign_imm VR64:$src3))>,
-          Requires<[HasSSSE3]>;
-def : Pat<(v4i16 (palign:$src3 VR64:$src1, VR64:$src2)),
-          (MMX_PALIGNR64rr VR64:$src2, VR64:$src1,
-                       (SHUFFLE_get_palign_imm VR64:$src3))>,
-          Requires<[HasSSSE3]>;
-def : Pat<(v8i8 (palign:$src3 VR64:$src1, VR64:$src2)),
-          (MMX_PALIGNR64rr VR64:$src2, VR64:$src1,
-                       (SHUFFLE_get_palign_imm VR64:$src3))>,
-          Requires<[HasSSSE3]>;
-}
-
 // Logical Instructions
-defm MMX_PAND : MMXI_binop_rm_v1i64<0xDB, "pand", and, 1>,
-                MMXI_binop_rm_int<0xDB, "pand", int_x86_mmx_pand, 1>;
-defm MMX_POR  : MMXI_binop_rm_v1i64<0xEB, "por" , or,  1>,
-                MMXI_binop_rm_int<0xEB, "por" , int_x86_mmx_por,  1>;
-defm MMX_PXOR : MMXI_binop_rm_v1i64<0xEF, "pxor", xor, 1>,
-                MMXI_binop_rm_int<0xEF, "pxor", int_x86_mmx_pxor, 1>;
+defm MMX_PAND : MMXI_binop_rm_int<0xDB, "pand", int_x86_mmx_pand, 1>;
+defm MMX_POR  : MMXI_binop_rm_int<0xEB, "por" , int_x86_mmx_por,  1>;
+defm MMX_PXOR : MMXI_binop_rm_int<0xEF, "pxor", int_x86_mmx_pxor, 1>;
 defm MMX_PANDN : MMXI_binop_rm_int<0xDF, "pandn", int_x86_mmx_pandn, 1>;
 
-let Constraints = "$src1 = $dst" in {
-  def MMX_PANDNrr : MMXI<0xDF, MRMSrcReg,
-                         (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
-                         "pandn\t{$src2, $dst|$dst, $src2}",
-                         [(set VR64:$dst, (v1i64 (and (vnot VR64:$src1),
-                                                  VR64:$src2)))]>;
-  def MMX_PANDNrm : MMXI<0xDF, MRMSrcMem,
-                         (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
-                         "pandn\t{$src2, $dst|$dst, $src2}",
-                         [(set VR64:$dst, (v1i64 (and (vnot VR64:$src1),
-                                                  (load addr:$src2))))]>;
-}
-
 // Shift Instructions
 defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw",
                                     int_x86_mmx_psrl_w, int_x86_mmx_psrli_w>;
@@ -414,12 +309,6 @@
 defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
                                     int_x86_mmx_psra_d, int_x86_mmx_psrai_d>;
 
-// Shift up / down and insert zero's.
-def : Pat<(v1i64 (X86vshl     VR64:$src, (i8 imm:$amt))),
-          (MMX_PSLLQri VR64:$src, (GetLo32XForm imm:$amt))>;
-def : Pat<(v1i64 (X86vshr     VR64:$src, (i8 imm:$amt))),
-          (MMX_PSRLQri VR64:$src, (GetLo32XForm imm:$amt))>;
-
 // Comparison Instructions
 defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b>;
 defm MMX_PCMPEQW : MMXI_binop_rm_int<0x75, "pcmpeqw", int_x86_mmx_pcmpeq_w>;
@@ -429,84 +318,7 @@
 defm MMX_PCMPGTW : MMXI_binop_rm_int<0x65, "pcmpgtw", int_x86_mmx_pcmpgt_w>;
 defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d>;
 
-// Conversion Instructions
-
 // -- Unpack Instructions
-let Constraints = "$src1 = $dst" in {
-  // Unpack High Packed Data Instructions
-  def MMX_PUNPCKHBWrr : MMXI<0x68, MRMSrcReg,
-                             (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
-                             "punpckhbw\t{$src2, $dst|$dst, $src2}",
-                             [(set VR64:$dst,
-                               (v8i8 (mmx_unpckh VR64:$src1, VR64:$src2)))]>;
-  def MMX_PUNPCKHBWrm : MMXI<0x68, MRMSrcMem,
-                             (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
-                             "punpckhbw\t{$src2, $dst|$dst, $src2}",
-                             [(set VR64:$dst,
-                               (v8i8 (mmx_unpckh VR64:$src1,
-                                      (bc_v8i8 (load_mmx addr:$src2)))))]>;
-
-  def MMX_PUNPCKHWDrr : MMXI<0x69, MRMSrcReg,
-                             (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
-                             "punpckhwd\t{$src2, $dst|$dst, $src2}",
-                             [(set VR64:$dst,
-                               (v4i16 (mmx_unpckh VR64:$src1, VR64:$src2)))]>;
-  def MMX_PUNPCKHWDrm : MMXI<0x69, MRMSrcMem,
-                             (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
-                             "punpckhwd\t{$src2, $dst|$dst, $src2}",
-                             [(set VR64:$dst,
-                               (v4i16 (mmx_unpckh VR64:$src1,
-                                       (bc_v4i16 (load_mmx addr:$src2)))))]>;
-
-  def MMX_PUNPCKHDQrr : MMXI<0x6A, MRMSrcReg,
-                             (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
-                             "punpckhdq\t{$src2, $dst|$dst, $src2}",
-                             [(set VR64:$dst,
-                               (v2i32 (mmx_unpckh VR64:$src1, VR64:$src2)))]>;
-  def MMX_PUNPCKHDQrm : MMXI<0x6A, MRMSrcMem,
-                             (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
-                             "punpckhdq\t{$src2, $dst|$dst, $src2}",
-                             [(set VR64:$dst,
-                               (v2i32 (mmx_unpckh VR64:$src1,
-                                       (bc_v2i32 (load_mmx addr:$src2)))))]>;
-
-  // Unpack Low Packed Data Instructions
-  def MMX_PUNPCKLBWrr : MMXI<0x60, MRMSrcReg,
-                             (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
-                             "punpcklbw\t{$src2, $dst|$dst, $src2}",
-                             [(set VR64:$dst,
-                               (v8i8 (mmx_unpckl VR64:$src1, VR64:$src2)))]>;
-  def MMX_PUNPCKLBWrm : MMXI<0x60, MRMSrcMem,
-                             (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
-                             "punpcklbw\t{$src2, $dst|$dst, $src2}",
-                             [(set VR64:$dst,
-                               (v8i8 (mmx_unpckl VR64:$src1,
-                                      (bc_v8i8 (load_mmx addr:$src2)))))]>;
-
-  def MMX_PUNPCKLWDrr : MMXI<0x61, MRMSrcReg,
-                             (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
-                             "punpcklwd\t{$src2, $dst|$dst, $src2}",
-                             [(set VR64:$dst,
-                               (v4i16 (mmx_unpckl VR64:$src1, VR64:$src2)))]>;
-  def MMX_PUNPCKLWDrm : MMXI<0x61, MRMSrcMem,
-                             (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
-                             "punpcklwd\t{$src2, $dst|$dst, $src2}",
-                             [(set VR64:$dst,
-                               (v4i16 (mmx_unpckl VR64:$src1,
-                                       (bc_v4i16 (load_mmx addr:$src2)))))]>;
-
-  def MMX_PUNPCKLDQrr : MMXI<0x62, MRMSrcReg,
-                             (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
-                             "punpckldq\t{$src2, $dst|$dst, $src2}",
-                             [(set VR64:$dst,
-                               (v2i32 (mmx_unpckl VR64:$src1, VR64:$src2)))]>;
-  def MMX_PUNPCKLDQrm : MMXI<0x62, MRMSrcMem,
-                             (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
-                             "punpckldq\t{$src2, $dst|$dst, $src2}",
-                             [(set VR64:$dst,
-                               (v2i32 (mmx_unpckl VR64:$src1,
-                                       (bc_v2i32 (load_mmx addr:$src2)))))]>;
-}
 defm MMX_PUNPCKHBW : MMXI_binop_rm_int<0x68, "punpckhbw", 
                                        int_x86_mmx_punpckhbw>;
 defm MMX_PUNPCKHWD : MMXI_binop_rm_int<0x69, "punpckhwd", 
@@ -526,61 +338,9 @@
 defm MMX_PACKUSWB : MMXI_binop_rm_int<0x67, "packuswb", int_x86_mmx_packuswb>;
 
 // -- Shuffle Instructions
-def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg,
-                          (outs VR64:$dst), (ins VR64:$src1, i8imm:$src2),
-                          "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                          [(set VR64:$dst,
-                            (v4i16 (mmx_pshufw:$src2 VR64:$src1, (undef))))]>;
-def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem,
-                          (outs VR64:$dst), (ins i64mem:$src1, i8imm:$src2),
-                          "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                          [(set VR64:$dst,
-                            (mmx_pshufw:$src2 (bc_v4i16 (load_mmx addr:$src1)),
-                                              (undef)))]>;
-
-defm MMX_PSHUFB : SS3I_binop_rm_int_mm<0x00, "pshufb", memopv8i8,
-                                     int_x86_ssse3_pshuf_b>;
-// Shuffle with PALIGN
-def : Pat<(v1i64 (X86PAlign VR64:$src1, VR64:$src2, (i8 imm:$imm))),
-          (MMX_PALIGNR64rr VR64:$src2, VR64:$src1, imm:$imm)>;
-def : Pat<(v2i32 (X86PAlign VR64:$src1, VR64:$src2, (i8 imm:$imm))),
-          (MMX_PALIGNR64rr VR64:$src2, VR64:$src1, imm:$imm)>;
-def : Pat<(v4i16 (X86PAlign VR64:$src1, VR64:$src2, (i8 imm:$imm))),
-          (MMX_PALIGNR64rr VR64:$src2, VR64:$src1, imm:$imm)>;
-def : Pat<(v8i8 (X86PAlign VR64:$src1, VR64:$src2, (i8 imm:$imm))),
-          (MMX_PALIGNR64rr VR64:$src2, VR64:$src1, imm:$imm)>;
+defm MMX_PSHUFB : SS3I_binop_rm_int_mm<0x00, "pshufb", int_x86_ssse3_pshuf_b>;
 
 // -- Conversion Instructions
-let neverHasSideEffects = 1 in {
-def MMX_CVTPI2PDrr  : MMX2I<0x2A, MRMSrcReg, (outs VR128:$dst), (ins VR64:$src),
-                            "cvtpi2pd\t{$src, $dst|$dst, $src}", []>;
-let mayLoad = 1 in
-def MMX_CVTPI2PDrm  : MMX2I<0x2A, MRMSrcMem, (outs VR128:$dst),
-                            (ins i64mem:$src),
-                            "cvtpi2pd\t{$src, $dst|$dst, $src}", []>;
-
-def MMX_CVTPI2PSrr  : MMXI<0x2A, MRMSrcReg, (outs VR128:$dst), (ins VR64:$src),
-                           "cvtpi2ps\t{$src, $dst|$dst, $src}", []>;
-let mayLoad = 1 in
-def MMX_CVTPI2PSrm  : MMXI<0x2A, MRMSrcMem, (outs VR128:$dst),
-                           (ins i64mem:$src),
-                           "cvtpi2ps\t{$src, $dst|$dst, $src}", []>;
-
-def MMX_CVTTPD2PIrr : MMX2I<0x2C, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
-                            "cvttpd2pi\t{$src, $dst|$dst, $src}", []>;
-let mayLoad = 1 in
-def MMX_CVTTPD2PIrm : MMX2I<0x2C, MRMSrcMem, (outs VR64:$dst),
-                            (ins f128mem:$src),
-                            "cvttpd2pi\t{$src, $dst|$dst, $src}", []>;
-
-def MMX_CVTTPS2PIrr : MMXI<0x2C, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
-                           "cvttps2pi\t{$src, $dst|$dst, $src}", []>;
-let mayLoad = 1 in
-def MMX_CVTTPS2PIrm : MMXI<0x2C, MRMSrcMem, (outs VR64:$dst), (ins f64mem:$src),
-                           "cvttps2pi\t{$src, $dst|$dst, $src}", []>;
-} // end neverHasSideEffects
-
-// Intrinsic forms.
 defm MMX_CVTPS2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtps2pi,
                       f64mem, load, "cvtps2pi\t{$src, $dst|$dst, $src}",
                       SSEPackedSingle>, TB;
@@ -602,43 +362,14 @@
                          i64mem, load, "cvtpi2ps\t{$src2, $dst|$dst, $src2}",
                          SSEPackedSingle>, TB;
 }
-// MMX->MMX vector casts.
-def : Pat<(v2f64 (sint_to_fp (v2i32 VR64:$src))),
-          (MMX_CVTPI2PDrr VR64:$src)>, Requires<[HasSSE2]>;
-def : Pat<(v2i32 (fp_to_sint (v2f64 VR128:$src))),
-          (MMX_CVTTPD2PIrr VR128:$src)>, Requires<[HasSSE2]>;
 
 // Extract / Insert
-def MMX_X86pinsrw : SDNode<"X86ISD::MMX_PINSRW",
-                    SDTypeProfile<1, 3, [SDTCisVT<0, v4i16>, SDTCisSameAs<0,1>,
-                                         SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
-
-
-def MMX_PEXTRWri  : MMXIi8<0xC5, MRMSrcReg,
-                           (outs GR32:$dst), (ins VR64:$src1, i16i8imm:$src2),
-                           "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                           [(set GR32:$dst, (X86pextrw (v4i16 VR64:$src1),
-                                             (iPTR imm:$src2)))]>;
 def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg,
                            (outs GR32:$dst), (ins VR64:$src1, i32i8imm:$src2),
                            "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                            [(set GR32:$dst, (int_x86_mmx_pextr_w VR64:$src1,
                                              (iPTR imm:$src2)))]>;
 let Constraints = "$src1 = $dst" in {
-  def MMX_PINSRWrri : MMXIi8<0xC4, MRMSrcReg,
-                      (outs VR64:$dst), 
-                      (ins VR64:$src1, GR32:$src2,i16i8imm:$src3),
-                      "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
-                      [(set VR64:$dst, (v4i16 (MMX_X86pinsrw (v4i16 VR64:$src1),
-                                               GR32:$src2,(iPTR imm:$src3))))]>;
-  def MMX_PINSRWrmi : MMXIi8<0xC4, MRMSrcMem,
-                     (outs VR64:$dst),
-                     (ins VR64:$src1, i16mem:$src2, i16i8imm:$src3),
-                     "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
-                     [(set VR64:$dst,
-                       (v4i16 (MMX_X86pinsrw (v4i16 VR64:$src1),
-                               (i32 (anyext (loadi16 addr:$src2))),
-                               (iPTR imm:$src3))))]>;
   def MMX_PINSRWirri : MMXIi8<0xC4, MRMSrcReg,
                       (outs VR64:$dst), 
                       (ins VR64:$src1, GR32:$src2, i32i8imm:$src3),
@@ -655,9 +386,16 @@
                                        (iPTR imm:$src3)))]>;
 }
 
+// Mask creation
+def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR64:$src),
+                          "pmovmskb\t{$src, $dst|$dst, $src}",
+                          [(set GR32:$dst, 
+                                (int_x86_mmx_pmovmskb VR64:$src))]>;
+
+
 // MMX to XMM for vector types
 def MMX_X86movq2dq : SDNode<"X86ISD::MOVQ2DQ", SDTypeProfile<1, 1,
-                            [SDTCisVT<0, v2i64>, SDTCisVT<1, v1i64>]>>;
+                            [SDTCisVT<0, v2i64>, SDTCisVT<1, x86mmx>]>>;
 
 def : Pat<(v2i64 (MMX_X86movq2dq VR64:$src)),
           (v2i64 (MMX_MOVQ2DQrr VR64:$src))>;
@@ -665,14 +403,19 @@
 def : Pat<(v2i64 (MMX_X86movq2dq (load_mmx addr:$src))),
           (v2i64 (MOVQI2PQIrm addr:$src))>;
 
-def : Pat<(v2i64 (MMX_X86movq2dq (v1i64 (bitconvert
-                            (v2i32 (scalar_to_vector (loadi32 addr:$src))))))),
+def : Pat<(v2i64 (MMX_X86movq2dq 
+                    (x86mmx (scalar_to_vector (loadi32 addr:$src))))),
           (v2i64 (MOVDI2PDIrm addr:$src))>;
 
-// Mask creation
-def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR64:$src),
-                          "pmovmskb\t{$src, $dst|$dst, $src}",
-                          [(set GR32:$dst, (int_x86_mmx_pmovmskb VR64:$src))]>;
+// Low word of XMM to MMX.
+def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1,
+                            [SDTCisVT<0, x86mmx>, SDTCisVT<1, v2i64>]>>;
+
+def : Pat<(x86mmx (MMX_X86movdq2q VR128:$src)),
+          (x86mmx (MMX_MOVDQ2Qrr VR128:$src))>;
+
+def : Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))),
+          (x86mmx (MMX_MOVQ64rm addr:$src))>;
 
 // Misc.
 let Uses = [EDI] in
@@ -684,181 +427,14 @@
                            "maskmovq\t{$mask, $src|$src, $mask}",
                            [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, RDI)]>;
 
-//===----------------------------------------------------------------------===//
-// Alias Instructions
-//===----------------------------------------------------------------------===//
-
-// Alias instructions that map zero vector to pxor.
-let isReMaterializable = 1, isCodeGenOnly = 1 in {
-  // FIXME: Change encoding to pseudo.
-  def MMX_V_SET0       : MMXI<0xEF, MRMInitReg, (outs VR64:$dst), (ins), "",
-                              [(set VR64:$dst, (v2i32 immAllZerosV))]>;
-  def MMX_V_SETALLONES : MMXI<0x76, MRMInitReg, (outs VR64:$dst), (ins), "",
-                              [(set VR64:$dst, (v2i32 immAllOnesV))]>;
-}
-
-let Predicates = [HasMMX] in {
-  def : Pat<(v1i64 immAllZerosV), (MMX_V_SET0)>;
-  def : Pat<(v4i16 immAllZerosV), (MMX_V_SET0)>;
-  def : Pat<(v8i8  immAllZerosV), (MMX_V_SET0)>;
-}
-
-//===----------------------------------------------------------------------===//
-// Non-Instruction Patterns
-//===----------------------------------------------------------------------===//
-
-// Store 64-bit integer vector values.
-def : Pat<(store (v8i8  VR64:$src), addr:$dst),
-          (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
-def : Pat<(store (v4i16 VR64:$src), addr:$dst),
-          (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
-def : Pat<(store (v2i32 VR64:$src), addr:$dst),
-          (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
-def : Pat<(store (v1i64 VR64:$src), addr:$dst),
-          (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
-
-// Bit convert.
-def : Pat<(v8i8  (bitconvert (v1i64 VR64:$src))), (v8i8  VR64:$src)>;
-def : Pat<(v8i8  (bitconvert (v2i32 VR64:$src))), (v8i8  VR64:$src)>;
-def : Pat<(v8i8  (bitconvert (v4i16 VR64:$src))), (v8i8  VR64:$src)>;
-def : Pat<(v4i16 (bitconvert (v1i64 VR64:$src))), (v4i16 VR64:$src)>;
-def : Pat<(v4i16 (bitconvert (v2i32 VR64:$src))), (v4i16 VR64:$src)>;
-def : Pat<(v4i16 (bitconvert (v8i8  VR64:$src))), (v4i16 VR64:$src)>;
-def : Pat<(v2i32 (bitconvert (v1i64 VR64:$src))), (v2i32 VR64:$src)>;
-def : Pat<(v2i32 (bitconvert (v4i16 VR64:$src))), (v2i32 VR64:$src)>;
-def : Pat<(v2i32 (bitconvert (v8i8  VR64:$src))), (v2i32 VR64:$src)>;
-def : Pat<(v1i64 (bitconvert (v2i32 VR64:$src))), (v1i64 VR64:$src)>;
-def : Pat<(v1i64 (bitconvert (v4i16 VR64:$src))), (v1i64 VR64:$src)>;
-def : Pat<(v1i64 (bitconvert (v8i8  VR64:$src))), (v1i64 VR64:$src)>;
-
 // 64-bit bit convert.
-def : Pat<(v1i64 (bitconvert (i64 GR64:$src))),
+def : Pat<(x86mmx (bitconvert (i64 GR64:$src))),
           (MMX_MOVD64to64rr GR64:$src)>;
-def : Pat<(v2i32 (bitconvert (i64 GR64:$src))),
-          (MMX_MOVD64to64rr GR64:$src)>;
-def : Pat<(v4i16 (bitconvert (i64 GR64:$src))),
-          (MMX_MOVD64to64rr GR64:$src)>;
-def : Pat<(v8i8  (bitconvert (i64 GR64:$src))),
-          (MMX_MOVD64to64rr GR64:$src)>;
-def : Pat<(i64 (bitconvert (v1i64 VR64:$src))),
+def : Pat<(i64 (bitconvert (x86mmx VR64:$src))),
           (MMX_MOVD64from64rr VR64:$src)>;
-def : Pat<(i64 (bitconvert (v2i32 VR64:$src))),
-          (MMX_MOVD64from64rr VR64:$src)>;
-def : Pat<(i64 (bitconvert (v4i16 VR64:$src))),
-          (MMX_MOVD64from64rr VR64:$src)>;
-def : Pat<(i64  (bitconvert (v8i8 VR64:$src))),
-          (MMX_MOVD64from64rr VR64:$src)>;
-def : Pat<(f64 (bitconvert (v1i64 VR64:$src))),
+def : Pat<(f64 (bitconvert (x86mmx VR64:$src))),
           (MMX_MOVQ2FR64rr VR64:$src)>;
-def : Pat<(f64 (bitconvert (v2i32 VR64:$src))),
-          (MMX_MOVQ2FR64rr VR64:$src)>;
-def : Pat<(f64 (bitconvert (v4i16 VR64:$src))),
-          (MMX_MOVQ2FR64rr VR64:$src)>;
-def : Pat<(f64 (bitconvert (v8i8 VR64:$src))),
-          (MMX_MOVQ2FR64rr VR64:$src)>;
-def : Pat<(v1i64 (bitconvert (f64 FR64:$src))),
-          (MMX_MOVFR642Qrr FR64:$src)>;
-def : Pat<(v2i32 (bitconvert (f64 FR64:$src))),
-          (MMX_MOVFR642Qrr FR64:$src)>;
-def : Pat<(v4i16 (bitconvert (f64 FR64:$src))),
-          (MMX_MOVFR642Qrr FR64:$src)>;
-def : Pat<(v8i8 (bitconvert (f64 FR64:$src))),
+def : Pat<(x86mmx (bitconvert (f64 FR64:$src))),
           (MMX_MOVFR642Qrr FR64:$src)>;
 
-let AddedComplexity = 20 in {
-  def : Pat<(v2i32 (X86vzmovl (bc_v2i32 (load_mmx addr:$src)))),
-            (MMX_MOVZDI2PDIrm addr:$src)>;
-}
 
-// Clear top half.
-let AddedComplexity = 15 in {
-  def : Pat<(v2i32 (X86vzmovl VR64:$src)),
-            (MMX_PUNPCKLDQrr VR64:$src, (v2i32 (MMX_V_SET0)))>;
-}
-
-// Patterns to perform canonical versions of vector shuffling.
-let AddedComplexity = 10 in {
-  def : Pat<(v8i8  (mmx_unpckl_undef VR64:$src, (undef))),
-            (MMX_PUNPCKLBWrr VR64:$src, VR64:$src)>;
-  def : Pat<(v4i16 (mmx_unpckl_undef VR64:$src, (undef))),
-            (MMX_PUNPCKLWDrr VR64:$src, VR64:$src)>;
-  def : Pat<(v2i32 (mmx_unpckl_undef VR64:$src, (undef))),
-            (MMX_PUNPCKLDQrr VR64:$src, VR64:$src)>;
-}
-
-let AddedComplexity = 10 in {
-  def : Pat<(v8i8  (mmx_unpckh_undef VR64:$src, (undef))),
-            (MMX_PUNPCKHBWrr VR64:$src, VR64:$src)>;
-  def : Pat<(v4i16 (mmx_unpckh_undef VR64:$src, (undef))),
-            (MMX_PUNPCKHWDrr VR64:$src, VR64:$src)>;
-  def : Pat<(v2i32 (mmx_unpckh_undef VR64:$src, (undef))),
-            (MMX_PUNPCKHDQrr VR64:$src, VR64:$src)>;
-}
-
-// Some special case PANDN patterns.
-// FIXME: Get rid of these.
-def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v2i32 immAllOnesV))),
-                  VR64:$src2)),
-          (MMX_PANDNrr VR64:$src1, VR64:$src2)>;
-def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v2i32 immAllOnesV))),
-                  (load addr:$src2))),
-          (MMX_PANDNrm VR64:$src1, addr:$src2)>;
-
-// Move MMX to lower 64-bit of XMM
-def : Pat<(v2i64 (scalar_to_vector (i64 (bitconvert (v8i8 VR64:$src))))),
-          (v2i64 (MMX_MOVQ2DQrr VR64:$src))>;
-def : Pat<(v2i64 (scalar_to_vector (i64 (bitconvert (v4i16 VR64:$src))))),
-          (v2i64 (MMX_MOVQ2DQrr VR64:$src))>;
-def : Pat<(v2i64 (scalar_to_vector (i64 (bitconvert (v2i32 VR64:$src))))),
-          (v2i64 (MMX_MOVQ2DQrr VR64:$src))>;
-def : Pat<(v2i64 (scalar_to_vector (i64 (bitconvert (v1i64 VR64:$src))))),
-          (v2i64 (MMX_MOVQ2DQrr VR64:$src))>;
-
-// Move lower 64-bit of XMM to MMX.
-def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
-                                                  (iPTR 0))))),
-          (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
-def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
-                                                  (iPTR 0))))),
-          (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
-def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
-                                                  (iPTR 0))))),
-          (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
-
-// Patterns for vector comparisons
-def : Pat<(v8i8 (X86pcmpeqb VR64:$src1, VR64:$src2)),
-          (MMX_PCMPEQBirr VR64:$src1, VR64:$src2)>;
-def : Pat<(v8i8 (X86pcmpeqb VR64:$src1, (bitconvert (load_mmx addr:$src2)))),
-          (MMX_PCMPEQBirm VR64:$src1, addr:$src2)>;
-def : Pat<(v4i16 (X86pcmpeqw VR64:$src1, VR64:$src2)),
-          (MMX_PCMPEQWirr VR64:$src1, VR64:$src2)>;
-def : Pat<(v4i16 (X86pcmpeqw VR64:$src1, (bitconvert (load_mmx addr:$src2)))),
-          (MMX_PCMPEQWirm VR64:$src1, addr:$src2)>;
-def : Pat<(v2i32 (X86pcmpeqd VR64:$src1, VR64:$src2)),
-          (MMX_PCMPEQDirr VR64:$src1, VR64:$src2)>;
-def : Pat<(v2i32 (X86pcmpeqd VR64:$src1, (bitconvert (load_mmx addr:$src2)))),
-          (MMX_PCMPEQDirm VR64:$src1, addr:$src2)>;
-
-def : Pat<(v8i8 (X86pcmpgtb VR64:$src1, VR64:$src2)),
-          (MMX_PCMPGTBirr VR64:$src1, VR64:$src2)>;
-def : Pat<(v8i8 (X86pcmpgtb VR64:$src1, (bitconvert (load_mmx addr:$src2)))),
-          (MMX_PCMPGTBirm VR64:$src1, addr:$src2)>;
-def : Pat<(v4i16 (X86pcmpgtw VR64:$src1, VR64:$src2)),
-          (MMX_PCMPGTWirr VR64:$src1, VR64:$src2)>;
-def : Pat<(v4i16 (X86pcmpgtw VR64:$src1, (bitconvert (load_mmx addr:$src2)))),
-          (MMX_PCMPGTWirm VR64:$src1, addr:$src2)>;
-def : Pat<(v2i32 (X86pcmpgtd VR64:$src1, VR64:$src2)),
-          (MMX_PCMPGTDirr VR64:$src1, VR64:$src2)>;
-def : Pat<(v2i32 (X86pcmpgtd VR64:$src1, (bitconvert (load_mmx addr:$src2)))),
-          (MMX_PCMPGTDirm VR64:$src1, addr:$src2)>;
-
-// CMOV* - Used to implement the SELECT DAG operation.  Expanded after
-// instruction selection into a branch sequence.
-let Uses = [EFLAGS], usesCustomInserter = 1 in {
-  def CMOV_V1I64 : I<0, Pseudo,
-                    (outs VR64:$dst), (ins VR64:$t, VR64:$f, i8imm:$cond),
-                    "#CMOV_V1I64 PSEUDO!",
-                    [(set VR64:$dst,
-                      (v1i64 (X86cmov VR64:$t, VR64:$f, imm:$cond,
-                                          EFLAGS)))]>;
-}
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index 9527964..29ee2bf 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -377,9 +377,6 @@
   case X86::SETB_C64r:    LowerUnaryToTwoAddr(OutMI, X86::SBB64rr); break;
   case X86::MOV8r0:       LowerUnaryToTwoAddr(OutMI, X86::XOR8rr); break;
   case X86::MOV32r0:      LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); break;
-  case X86::MMX_V_SET0:   LowerUnaryToTwoAddr(OutMI, X86::MMX_PXORrr); break;
-  case X86::MMX_V_SETALLONES:
-    LowerUnaryToTwoAddr(OutMI, X86::MMX_PCMPEQDirr); break;
   case X86::FsFLD0SS:      LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break;
   case X86::FsFLD0SD:      LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break;
   case X86::V_SET0PS:      LowerUnaryToTwoAddr(OutMI, X86::XORPSrr); break;
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index 7b34391..959c01e 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -792,7 +792,7 @@
 }
 
 // Generic vector registers: VR64 and VR128.
-def VR64  : RegisterClass<"X86", [v8i8, v4i16, v2i32, v1i64], 64,
+def VR64  : RegisterClass<"X86", [x86mmx], 64,
                           [MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7]>;
 def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],128,
                           [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp
index 47df235..e8b168e 100644
--- a/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -791,7 +791,8 @@
       } else if (New->getType()->isVoidTy()) {
         // Our return value has uses, but they will get removed later on.
         // Replace by null for now.
-        Call->replaceAllUsesWith(Constant::getNullValue(Call->getType()));
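+        // Skip x86_mmx values; null x86_mmx constants are not supported.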
+        if (!Call->getType()->isX86_MMXTy())
+          Call->replaceAllUsesWith(Constant::getNullValue(Call->getType()));
       } else {
         assert(RetTy->isStructTy() &&
                "Return type changed, but not into a void. The old return type"
@@ -854,7 +855,8 @@
     } else {
       // If this argument is dead, replace any uses of it with null constants
       // (these are guaranteed to become unused later on).
-      I->replaceAllUsesWith(Constant::getNullValue(I->getType()));
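+      // As above, skip x86_mmx values.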
+      if (!I->getType()->isX86_MMXTy())
+        I->replaceAllUsesWith(Constant::getNullValue(I->getType()));
     }
 
   // If we change the return value of the function we must rewrite any return
diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
index fee317d..33ecb5b 100644
--- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp
+++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
@@ -321,6 +321,9 @@
       // Don't break volatile loads.
       if (LI->isVolatile())
         return false;
+      // Don't touch MMX operations.
+      if (LI->getType()->isX86_MMXTy())
+        return false;
       MergeInType(LI->getType(), Offset);
       continue;
     }
@@ -328,6 +331,9 @@
     if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
       // Storing the pointer, not into the value?
       if (SI->getOperand(0) == V || SI->isVolatile()) return false;
+      // Don't touch MMX operations.
+      if (SI->getOperand(0)->getType()->isX86_MMXTy())
+        return false;
       MergeInType(SI->getOperand(0)->getType(), Offset);
       continue;
     }
diff --git a/lib/VMCore/AutoUpgrade.cpp b/lib/VMCore/AutoUpgrade.cpp
index 9330e14..5b5955d 100644
--- a/lib/VMCore/AutoUpgrade.cpp
+++ b/lib/VMCore/AutoUpgrade.cpp
@@ -288,37 +288,224 @@
     break;
   case 'x': 
     // This fixes all MMX shift intrinsic instructions to take a
-    // v1i64 instead of a v2i32 as the second parameter.
-    if (Name.compare(5,10,"x86.mmx.ps",10) == 0 &&
-        (Name.compare(13,4,"psll", 4) == 0 ||
-         Name.compare(13,4,"psra", 4) == 0 ||
-         Name.compare(13,4,"psrl", 4) == 0) && Name[17] != 'i') {
-      
-      const llvm::Type *VT =
-                    VectorType::get(IntegerType::get(FTy->getContext(), 64), 1);
-      
-      // We don't have to do anything if the parameter already has
-      // the correct type.
-      if (FTy->getParamType(1) == VT)
-        break;
-      
-      //  We first need to change the name of the old (bad) intrinsic, because 
-      //  its type is incorrect, but we cannot overload that name. We 
-      //  arbitrarily unique it here allowing us to construct a correctly named 
-      //  and typed function below.
-      F->setName("");
+    // x86_mmx instead of a v1i64, v2i32, v4i16, or v8i8.  The same is done
+    // for the rest of the MMX intrinsics.
+    if (Name.compare(5, 8, "x86.mmx.", 8) == 0) {
+      const Type *X86_MMXTy = VectorType::getX86_MMXTy(FTy->getContext());
 
-      assert(FTy->getNumParams() == 2 && "MMX shift intrinsics take 2 args!");
-      
-      //  Now construct the new intrinsic with the correct name and type. We 
-      //  leave the old function around in order to query its type, whatever it 
-      //  may be, and correctly convert up to the new type.
-      NewFn = cast<Function>(M->getOrInsertFunction(Name, 
-                                                    FTy->getReturnType(),
-                                                    FTy->getParamType(0),
-                                                    VT,
-                                                    (Type *)0));
-      return true;
+      if (Name.compare(13, 4, "padd", 4) == 0   ||
+          Name.compare(13, 4, "psub", 4) == 0   ||
+          Name.compare(13, 4, "pmul", 4) == 0   ||
+          Name.compare(13, 5, "pmadd", 5) == 0  ||
+          Name.compare(13, 4, "pand", 4) == 0   ||
+          Name.compare(13, 3, "por", 3) == 0    ||
+          Name.compare(13, 4, "pxor", 4) == 0   ||
+          Name.compare(13, 4, "pavg", 4) == 0   ||
+          Name.compare(13, 4, "pmax", 4) == 0   ||
+          Name.compare(13, 4, "pmin", 4) == 0   ||
+          Name.compare(13, 4, "psad", 4) == 0   ||
+          Name.compare(13, 4, "psll", 4) == 0   ||
+          Name.compare(13, 4, "psrl", 4) == 0   ||
+          Name.compare(13, 4, "psra", 4) == 0   ||
+          Name.compare(13, 4, "pack", 4) == 0   ||
+          Name.compare(13, 6, "punpck", 6) == 0 ||
+          Name.compare(13, 4, "pcmp", 4) == 0) {
+        assert(FTy->getNumParams() == 2 && "MMX intrinsic takes 2 args!");
+        const Type *SecondParamTy = X86_MMXTy;
+
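+        // The shift-by-immediate forms keep their i32 shift amount.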
+        if (Name.compare(13, 5, "pslli", 5) == 0 ||
+            Name.compare(13, 5, "psrli", 5) == 0 ||
+            Name.compare(13, 5, "psrai", 5) == 0)
+          SecondParamTy = FTy->getParamType(1);
+
+        // Don't do anything if it has the correct types.
+        if (FTy->getReturnType() == X86_MMXTy &&
+            FTy->getParamType(0) == X86_MMXTy &&
+            FTy->getParamType(1) == SecondParamTy)
+          break;
+
+        // We first need to change the name of the old (bad) intrinsic, because
+        // its type is incorrect, but we cannot overload that name. We
+        // arbitrarily unique it here allowing us to construct a correctly named
+        // and typed function below.
+        F->setName("");
+
+        // Now construct the new intrinsic with the correct name and type. We
+        // leave the old function around in order to query its type, whatever it
+        // may be, and correctly convert up to the new type.
+        NewFn = cast<Function>(M->getOrInsertFunction(Name, 
+                                                      X86_MMXTy, X86_MMXTy,
+                                                      SecondParamTy, (Type*)0));
+        return true;
+      }
+
+      if (Name.compare(13, 8, "maskmovq", 8) == 0) {
+        // Don't do anything if it has the correct types.
+        if (FTy->getParamType(0) == X86_MMXTy &&
+            FTy->getParamType(1) == X86_MMXTy)
+          break;
+
+        F->setName("");
+        NewFn = cast<Function>(M->getOrInsertFunction(Name, 
+                                                      FTy->getReturnType(),
+                                                      X86_MMXTy,
+                                                      X86_MMXTy,
+                                                      FTy->getParamType(2),
+                                                      (Type*)0));
+        return true;
+      }
+
+      if (Name.compare(13, 8, "pmovmskb", 8) == 0) {
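+        // Don't do anything if it already has the correct type.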
+        if (FTy->getParamType(0) == X86_MMXTy)
+          break;
+
+        F->setName("");
+        NewFn = cast<Function>(M->getOrInsertFunction(Name, 
+                                                      FTy->getReturnType(),
+                                                      X86_MMXTy,
+                                                      (Type*)0));
+        return true;
+      }
+
+      if (Name.compare(13, 5, "movnt", 5) == 0) {
+        if (FTy->getParamType(1) == X86_MMXTy)
+          break;
+
+        F->setName("");
+        NewFn = cast<Function>(M->getOrInsertFunction(Name, 
+                                                      FTy->getReturnType(),
+                                                      FTy->getParamType(0),
+                                                      X86_MMXTy,
+                                                      (Type*)0));
+        return true;
+      }
+
+      if (Name.compare(13, 7, "palignr", 7) == 0) {
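+        // Don't do anything if it has the correct types.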
+        if (FTy->getReturnType() == X86_MMXTy &&
+            FTy->getParamType(0) == X86_MMXTy &&
+            FTy->getParamType(1) == X86_MMXTy)
+          break;
+
+        F->setName("");
+        NewFn = cast<Function>(M->getOrInsertFunction(Name, 
+                                                      X86_MMXTy,
+                                                      X86_MMXTy,
+                                                      X86_MMXTy,
+                                                      FTy->getParamType(2),
+                                                      (Type*)0));
+        return true;
+      }
+
+      if (Name.compare(13, 5, "pextr", 5) == 0) {
+        if (FTy->getParamType(0) == X86_MMXTy)
+          break;
+
+        F->setName("");
+        NewFn = cast<Function>(M->getOrInsertFunction(Name, 
+                                                      FTy->getReturnType(),
+                                                      X86_MMXTy,
+                                                      FTy->getParamType(1),
+                                                      (Type*)0));
+        return true;
+      }
+
+      if (Name.compare(13, 5, "pinsr", 5) == 0) {
+        if (FTy->getReturnType() == X86_MMXTy &&
+            FTy->getParamType(0) == X86_MMXTy)
+          break;
+
+        F->setName("");
+        NewFn = cast<Function>(M->getOrInsertFunction(Name, 
+                                                      X86_MMXTy,
+                                                      X86_MMXTy,
+                                                      FTy->getParamType(1),
+                                                      FTy->getParamType(2),
+                                                      (Type*)0));
+        return true;
+      }
+
+      if (Name.compare(13, 12, "cvtsi32.si64", 12) == 0) {
+        if (FTy->getReturnType() == X86_MMXTy)
+          break;
+
+        F->setName("");
+        NewFn = cast<Function>(M->getOrInsertFunction(Name, 
+                                                      X86_MMXTy,
+                                                      FTy->getParamType(0),
+                                                      (Type*)0));
+        return true;
+      }
+
+      if (Name.compare(13, 12, "cvtsi64.si32", 12) == 0) {
+        if (FTy->getParamType(0) == X86_MMXTy)
+          break;
+
+        F->setName("");
+        NewFn = cast<Function>(M->getOrInsertFunction(Name, 
+                                                      FTy->getReturnType(),
+                                                      X86_MMXTy,
+                                                      (Type*)0));
+        return true;
+      }
+
+      if (Name.compare(13, 8, "vec.init", 8) == 0) {
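+        // Only the return type needs to be changed.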
+        if (FTy->getReturnType() == X86_MMXTy)
+          break;
+
+        F->setName("");
+
+        if (Name.compare(21, 2, ".b", 2) == 0)
+          NewFn = cast<Function>(M->getOrInsertFunction(Name, 
+                                                        X86_MMXTy,
+                                                        FTy->getParamType(0),
+                                                        FTy->getParamType(1),
+                                                        FTy->getParamType(2),
+                                                        FTy->getParamType(3),
+                                                        FTy->getParamType(4),
+                                                        FTy->getParamType(5),
+                                                        FTy->getParamType(6),
+                                                        FTy->getParamType(7),
+                                                        (Type*)0));
+        else if (Name.compare(21, 2, ".w", 2) == 0)
+          NewFn = cast<Function>(M->getOrInsertFunction(Name, 
+                                                        X86_MMXTy,
+                                                        FTy->getParamType(0),
+                                                        FTy->getParamType(1),
+                                                        FTy->getParamType(2),
+                                                        FTy->getParamType(3),
+                                                        (Type*)0));
+        else if (Name.compare(21, 2, ".d", 2) == 0)
+          NewFn = cast<Function>(M->getOrInsertFunction(Name, 
+                                                        X86_MMXTy,
+                                                        FTy->getParamType(0),
+                                                        FTy->getParamType(1),
+                                                        (Type*)0));
+        return true;
+      }
+
+
+      if (Name.compare(13, 9, "vec.ext.d", 9) == 0) {
+        if (FTy->getReturnType() == X86_MMXTy &&
+            FTy->getParamType(0) == X86_MMXTy)
+          break;
+
+        F->setName("");
+        NewFn = cast<Function>(M->getOrInsertFunction(Name, 
+                                                      X86_MMXTy,
+                                                      X86_MMXTy,
+                                                      FTy->getParamType(1),
+                                                      (Type*)0));
+        return true;
+      }
+
+      if (Name.compare(13, 4, "emms", 4) == 0 ||
+          Name.compare(13, 5, "femms", 5) == 0) {
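+        // These take no MMX-typed arguments; nothing needs to change.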
+        NewFn = 0;
+        break;
+      }
+
+      // We really shouldn't get here ever.
+      assert(0 && "Invalid MMX intrinsic!");
+      break;
     } else if (Name.compare(5,17,"x86.sse2.loadh.pd",17) == 0 ||
                Name.compare(5,17,"x86.sse2.loadl.pd",17) == 0 ||
                Name.compare(5,16,"x86.sse2.movl.dq",16) == 0 ||
@@ -432,6 +619,38 @@
                           "upgraded."+CI->getName(), CI);
 }
 
+/// ConstructNewCallInst - Construct a new CallInst with the signature of NewFn.
+static void ConstructNewCallInst(Function *NewFn, CallInst *OldCI,
+                                 Value **Operands, unsigned NumOps,
+                                 bool AssignName = true) {
+  // Construct a new CallInst.
+  CallInst *NewCI =
+    CallInst::Create(NewFn, Operands, Operands + NumOps,
+                     AssignName ? "upgraded." + OldCI->getName() : "", OldCI);
+
+  NewCI->setTailCall(OldCI->isTailCall());
+  NewCI->setCallingConv(OldCI->getCallingConv());
+
+  // Handle any uses of the old CallInst.
+  if (!OldCI->use_empty()) {
+    // If the type has changed, add a cast.
+    Instruction *I = OldCI;
+    if (OldCI->getType() != NewCI->getType()) {
+      Function *OldFn = OldCI->getCalledFunction();
+      CastInst *RetCast =
+        CastInst::Create(CastInst::getCastOpcode(NewCI, true,
+                                                 OldFn->getReturnType(), true),
+                         NewCI, OldFn->getReturnType(), NewCI->getName(),OldCI);
+      I = RetCast;
+    }
+    // Replace all uses of the old call with the new cast which has the 
+    // correct type.
+    OldCI->replaceAllUsesWith(I);
+  }
+  // Clean up the old call now that it has been completely upgraded.
+  OldCI->eraseFromParent();
+}
+
 // UpgradeIntrinsicCall - Upgrade a call to an old intrinsic to be a call the 
 // upgraded intrinsic. All argument and return casting must be provided in 
 // order to seamlessly integrate with existing context.
@@ -759,40 +978,246 @@
     break;
   }        
 
+  case Intrinsic::x86_mmx_padd_b:
+  case Intrinsic::x86_mmx_padd_w:
+  case Intrinsic::x86_mmx_padd_d:
+  case Intrinsic::x86_mmx_padd_q:
+  case Intrinsic::x86_mmx_padds_b:
+  case Intrinsic::x86_mmx_padds_w:
+  case Intrinsic::x86_mmx_paddus_b:
+  case Intrinsic::x86_mmx_paddus_w:
+  case Intrinsic::x86_mmx_psub_b:
+  case Intrinsic::x86_mmx_psub_w:
+  case Intrinsic::x86_mmx_psub_d:
+  case Intrinsic::x86_mmx_psub_q:
+  case Intrinsic::x86_mmx_psubs_b:
+  case Intrinsic::x86_mmx_psubs_w:
+  case Intrinsic::x86_mmx_psubus_b:
+  case Intrinsic::x86_mmx_psubus_w:
+  case Intrinsic::x86_mmx_pmulh_w:
+  case Intrinsic::x86_mmx_pmull_w:
+  case Intrinsic::x86_mmx_pmulhu_w:
+  case Intrinsic::x86_mmx_pmulu_dq:
+  case Intrinsic::x86_mmx_pmadd_wd:
+  case Intrinsic::x86_mmx_pand:
+  case Intrinsic::x86_mmx_pandn:
+  case Intrinsic::x86_mmx_por:
+  case Intrinsic::x86_mmx_pxor:
+  case Intrinsic::x86_mmx_pavg_b:
+  case Intrinsic::x86_mmx_pavg_w:
+  case Intrinsic::x86_mmx_pmaxu_b:
+  case Intrinsic::x86_mmx_pmaxs_w:
+  case Intrinsic::x86_mmx_pminu_b:
+  case Intrinsic::x86_mmx_pmins_w:
+  case Intrinsic::x86_mmx_psad_bw:
+  case Intrinsic::x86_mmx_psll_w:
   case Intrinsic::x86_mmx_psll_d:
   case Intrinsic::x86_mmx_psll_q:
-  case Intrinsic::x86_mmx_psll_w:
-  case Intrinsic::x86_mmx_psra_d:
-  case Intrinsic::x86_mmx_psra_w:
+  case Intrinsic::x86_mmx_pslli_w:
+  case Intrinsic::x86_mmx_pslli_d:
+  case Intrinsic::x86_mmx_pslli_q:
+  case Intrinsic::x86_mmx_psrl_w:
   case Intrinsic::x86_mmx_psrl_d:
   case Intrinsic::x86_mmx_psrl_q:
-  case Intrinsic::x86_mmx_psrl_w: {
+  case Intrinsic::x86_mmx_psrli_w:
+  case Intrinsic::x86_mmx_psrli_d:
+  case Intrinsic::x86_mmx_psrli_q:
+  case Intrinsic::x86_mmx_psra_w:
+  case Intrinsic::x86_mmx_psra_d:
+  case Intrinsic::x86_mmx_psrai_w:
+  case Intrinsic::x86_mmx_psrai_d:
+  case Intrinsic::x86_mmx_packsswb:
+  case Intrinsic::x86_mmx_packssdw:
+  case Intrinsic::x86_mmx_packuswb:
+  case Intrinsic::x86_mmx_punpckhbw:
+  case Intrinsic::x86_mmx_punpckhwd:
+  case Intrinsic::x86_mmx_punpckhdq:
+  case Intrinsic::x86_mmx_punpcklbw:
+  case Intrinsic::x86_mmx_punpcklwd:
+  case Intrinsic::x86_mmx_punpckldq:
+  case Intrinsic::x86_mmx_pcmpeq_b:
+  case Intrinsic::x86_mmx_pcmpeq_w:
+  case Intrinsic::x86_mmx_pcmpeq_d:
+  case Intrinsic::x86_mmx_pcmpgt_b:
+  case Intrinsic::x86_mmx_pcmpgt_w:
+  case Intrinsic::x86_mmx_pcmpgt_d: {
     Value *Operands[2];
     
-    Operands[0] = CI->getArgOperand(0);
-    
-    // Cast the second parameter to the correct type.
-    BitCastInst *BC = new BitCastInst(CI->getArgOperand(1), 
-                                      NewFn->getFunctionType()->getParamType(1),
-                                      "upgraded.", CI);
-    Operands[1] = BC;
-    
-    //  Construct a new CallInst
-    CallInst *NewCI = CallInst::Create(NewFn, Operands, Operands+2, 
-                                       "upgraded."+CI->getName(), CI);
-    NewCI->setTailCall(CI->isTailCall());
-    NewCI->setCallingConv(CI->getCallingConv());
-    
-    //  Handle any uses of the old CallInst.
-    if (!CI->use_empty())
-      //  Replace all uses of the old call with the new cast which has the 
-      //  correct type.
-      CI->replaceAllUsesWith(NewCI);
-    
-    //  Clean up the old call now that it has been completely upgraded.
-    CI->eraseFromParent();
+    // Cast the operand to the X86 MMX type.
+    Operands[0] = new BitCastInst(CI->getArgOperand(0), 
+                                  NewFn->getFunctionType()->getParamType(0),
+                                  "upgraded.", CI);
+
+    switch (NewFn->getIntrinsicID()) {
+    default:
+      // Cast to the X86 MMX type.
+      Operands[1] = new BitCastInst(CI->getArgOperand(1), 
+                                    NewFn->getFunctionType()->getParamType(1),
+                                    "upgraded.", CI);
+      break;
+    case Intrinsic::x86_mmx_pslli_w:
+    case Intrinsic::x86_mmx_pslli_d:
+    case Intrinsic::x86_mmx_pslli_q:
+    case Intrinsic::x86_mmx_psrli_w:
+    case Intrinsic::x86_mmx_psrli_d:
+    case Intrinsic::x86_mmx_psrli_q:
+    case Intrinsic::x86_mmx_psrai_w:
+    case Intrinsic::x86_mmx_psrai_d:
+      // These take an i32 as their second parameter.
+      Operands[1] = CI->getArgOperand(1);
+      break;
+    }
+
+    ConstructNewCallInst(NewFn, CI, Operands, 2);
     break;
-  }        
+  }
+  case Intrinsic::x86_mmx_maskmovq: {
+    Value *Operands[3];
+
+    // Cast the operands to the X86 MMX type.
+    Operands[0] = new BitCastInst(CI->getArgOperand(0), 
+                                  NewFn->getFunctionType()->getParamType(0),
+                                  "upgraded.", CI);
+    Operands[1] = new BitCastInst(CI->getArgOperand(1), 
+                                  NewFn->getFunctionType()->getParamType(1),
+                                  "upgraded.", CI);
+    Operands[2] = CI->getArgOperand(2);
+
+    ConstructNewCallInst(NewFn, CI, Operands, 3, false);
+    break;
+  }
+  case Intrinsic::x86_mmx_pmovmskb: {
+    Value *Operands[1];
+
+    // Cast the operand to the X86 MMX type.
+    Operands[0] = new BitCastInst(CI->getArgOperand(0), 
+                                  NewFn->getFunctionType()->getParamType(0),
+                                  "upgraded.", CI);
+
+    ConstructNewCallInst(NewFn, CI, Operands, 1);
+    break;
+  }
+  case Intrinsic::x86_mmx_movnt_dq: {
+    Value *Operands[2];
+
+    Operands[0] = CI->getArgOperand(0);
+
+    // Cast the operand to the X86 MMX type.
+    Operands[1] = new BitCastInst(CI->getArgOperand(1),
+                                  NewFn->getFunctionType()->getParamType(1),
+                                  "upgraded.", CI);
+
+    ConstructNewCallInst(NewFn, CI, Operands, 2, false);
+    break;
+  }
+  case Intrinsic::x86_mmx_palignr_b: {
+    Value *Operands[3];
+
+    // Cast the operands to the X86 MMX type.
+    Operands[0] = new BitCastInst(CI->getArgOperand(0),
+                                  NewFn->getFunctionType()->getParamType(0),
+                                  "upgraded.", CI);
+    Operands[1] = new BitCastInst(CI->getArgOperand(1),
+                                  NewFn->getFunctionType()->getParamType(1),
+                                  "upgraded.", CI);
+    Operands[2] = CI->getArgOperand(2);
+
+    ConstructNewCallInst(NewFn, CI, Operands, 3);
+    break;
+  }
+  case Intrinsic::x86_mmx_pextr_w: {
+    Value *Operands[2];
+
+    // Cast the operands to the X86 MMX type.
+    Operands[0] = new BitCastInst(CI->getArgOperand(0),
+                                  NewFn->getFunctionType()->getParamType(0),
+                                  "upgraded.", CI);
+    Operands[1] = CI->getArgOperand(1);
+
+    ConstructNewCallInst(NewFn, CI, Operands, 2);
+    break;
+  }
+  case Intrinsic::x86_mmx_pinsr_w: {
+    Value *Operands[3];
+
+    // Cast the operands to the X86 MMX type.
+    Operands[0] = new BitCastInst(CI->getArgOperand(0),
+                                  NewFn->getFunctionType()->getParamType(0),
+                                  "upgraded.", CI);
+    Operands[1] = CI->getArgOperand(1);
+    Operands[2] = CI->getArgOperand(2);
+
+    ConstructNewCallInst(NewFn, CI, Operands, 3);
+    break;
+  }
+#if 0
+  case Intrinsic::x86_mmx_cvtsi32_si64: {
+    // The return type needs to be changed.
+    Value *Operands[1];
+    Operands[0] = CI->getArgOperand(0);
+    ConstructNewCallInst(NewFn, CI, Operands, 1);
+    break;
+  }
+  case Intrinsic::x86_mmx_cvtsi64_si32: {
+    Value *Operands[1];
+
+    // Cast the operand to the X86 MMX type.
+    Operands[0] = new BitCastInst(CI->getArgOperand(0),
+                                  NewFn->getFunctionType()->getParamType(0),
+                                  "upgraded.", CI);
+
+    ConstructNewCallInst(NewFn, CI, Operands, 1);
+    break;
+  }
+  case Intrinsic::x86_mmx_vec_init_b:
+  case Intrinsic::x86_mmx_vec_init_w:
+  case Intrinsic::x86_mmx_vec_init_d: {
+    // The return type needs to be changed.
+    Value *Operands[8];
+    unsigned NumOps = 0;
+
+    switch (NewFn->getIntrinsicID()) {
+    default: break;
+    case Intrinsic::x86_mmx_vec_init_b: NumOps = 8; break;
+    case Intrinsic::x86_mmx_vec_init_w: NumOps = 4; break;
+    case Intrinsic::x86_mmx_vec_init_d: NumOps = 2; break;
+    }
+
+    switch (NewFn->getIntrinsicID()) {
+    default: break;
+    case Intrinsic::x86_mmx_vec_init_b:
+      Operands[7] = CI->getArgOperand(7);
+      Operands[6] = CI->getArgOperand(6);
+      Operands[5] = CI->getArgOperand(5);
+      Operands[4] = CI->getArgOperand(4);
+      // FALLTHRU
+    case Intrinsic::x86_mmx_vec_init_w:
+      Operands[3] = CI->getArgOperand(3);
+      Operands[2] = CI->getArgOperand(2);
+      // FALLTHRU
+    case Intrinsic::x86_mmx_vec_init_d:
+      Operands[1] = CI->getArgOperand(1);
+      Operands[0] = CI->getArgOperand(0);
+      break;
+    }
+
+    ConstructNewCallInst(NewFn, CI, Operands, NumOps);
+    break;
+  }
+  case Intrinsic::x86_mmx_vec_ext_d: {
+    Value *Operands[2];
+
+    // Cast the operand to the X86 MMX type.
+    Operands[0] = new BitCastInst(CI->getArgOperand(0),
+                                  NewFn->getFunctionType()->getParamType(0),
+                                  "upgraded.", CI);
+    Operands[1] = CI->getArgOperand(1);
+
+    ConstructNewCallInst(NewFn, CI, Operands, 2);
+    break;
+  }
+#endif
+
   case Intrinsic::ctlz:
   case Intrinsic::ctpop:
   case Intrinsic::cttz: {
diff --git a/lib/VMCore/Instructions.cpp b/lib/VMCore/Instructions.cpp
index 7f8df58..754d992 100644
--- a/lib/VMCore/Instructions.cpp
+++ b/lib/VMCore/Instructions.cpp
@@ -2360,6 +2360,8 @@
     } else {                                    // Casting from something else
       return false;
     }
+  } else if (DestTy->isX86_MMXTy()) {
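+    // Casting to x86_mmx is only possible from 64-bit values.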
+    return SrcBits == 64;
   } else {                                      // Casting to something else
     return false;
   }
@@ -2441,6 +2443,10 @@
       return BitCast;                             // vector -> vector
     } else if (DestPTy->getBitWidth() == SrcBits) {
       return BitCast;                               // float/int -> vector
+    } else if (SrcTy->isX86_MMXTy()) {
+      assert(DestPTy->getBitWidth()==64 &&
+             "Casting X86_MMX to vector of wrong width");
+      return BitCast;                             // MMX to 64-bit vector
     } else {
       assert(!"Illegal cast to vector (wrong type or size)");
     }
@@ -2452,6 +2458,14 @@
     } else {
       assert(!"Casting pointer to other than pointer or int");
     }
+  } else if (DestTy->isX86_MMXTy()) {
+    if (const VectorType *SrcPTy = dyn_cast<VectorType>(SrcTy)) {
+      assert(SrcPTy->getBitWidth()==64 &&
+             "Casting vector of wrong width to X86_MMX");
+      return BitCast;                               // 64-bit vector to MMX
+    } else {
+      assert(!"Illegal cast to X86_MMX");
+    }
   } else {
     assert(!"Casting to type that is not first-class");
   }
diff --git a/lib/VMCore/Type.cpp b/lib/VMCore/Type.cpp
index 3704999..ba50e07 100644
--- a/lib/VMCore/Type.cpp
+++ b/lib/VMCore/Type.cpp
@@ -173,10 +173,20 @@
     return false;
 
   // Vector -> Vector conversions are always lossless if the two vector types
-  // have the same size, otherwise not.
-  if (const VectorType *thisPTy = dyn_cast<VectorType>(this))
+  // have the same size, otherwise not.  Also, 64-bit vector types can be
+  // converted to x86mmx.
+  if (const VectorType *thisPTy = dyn_cast<VectorType>(this)) {
     if (const VectorType *thatPTy = dyn_cast<VectorType>(Ty))
       return thisPTy->getBitWidth() == thatPTy->getBitWidth();
+    if (Ty->getTypeID() == Type::X86_MMXTyID &&
+        thisPTy->getBitWidth() == 64)
+      return true;
+  }
+
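+  // Conversely, x86mmx can be converted to any 64-bit vector type.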
+  if (this->getTypeID() == Type::X86_MMXTyID)
+    if (const VectorType *thatPTy = dyn_cast<VectorType>(Ty))
+      if (thatPTy->getBitWidth() == 64)
+        return true;
 
   // At this point we have only various mismatches of the first class types
   // remaining and ptr->ptr. Just select the lossless conversions. Everything