diff --git a/include/llvm/IntrinsicsX86.td b/include/llvm/IntrinsicsX86.td
index 28a4472..0230790 100644
--- a/include/llvm/IntrinsicsX86.td
+++ b/include/llvm/IntrinsicsX86.td
@@ -674,6 +674,156 @@
 }
 
 //===----------------------------------------------------------------------===//
+// SSE4.1
+
+// FP rounding ops; the i32 operand is matched as an immediate by ROUND*.
+let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
+  def int_x86_sse41_round_ss        : GCCBuiltin<"__builtin_ia32_roundss">,
+              Intrinsic<[llvm_v4f32_ty, llvm_v4f32_ty,
+                         llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_sse41_round_ps        : GCCBuiltin<"__builtin_ia32_roundps">,
+              Intrinsic<[llvm_v4f32_ty, llvm_v4f32_ty,
+                         llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_sse41_round_sd        : GCCBuiltin<"__builtin_ia32_roundsd">,
+              Intrinsic<[llvm_v2f64_ty, llvm_v2f64_ty,
+                         llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_sse41_round_pd        : GCCBuiltin<"__builtin_ia32_roundpd">,
+              Intrinsic<[llvm_v2f64_ty, llvm_v2f64_ty,
+                         llvm_i32_ty], [IntrNoMem]>;
+}
+
+// Vector sign and zero extend (value-only operands, hence IntrNoMem)
+let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
+  def int_x86_sse41_pmovsxbd        : GCCBuiltin<"__builtin_ia32_pmovsxbd128">,
+              Intrinsic<[llvm_v4i32_ty, llvm_v16i8_ty], [IntrNoMem]>;
+  def int_x86_sse41_pmovsxbq        : GCCBuiltin<"__builtin_ia32_pmovsxbq128">,
+              Intrinsic<[llvm_v2i64_ty, llvm_v16i8_ty], [IntrNoMem]>;
+  def int_x86_sse41_pmovsxbw        : GCCBuiltin<"__builtin_ia32_pmovsxbw128">,
+              Intrinsic<[llvm_v8i16_ty, llvm_v16i8_ty], [IntrNoMem]>;
+  def int_x86_sse41_pmovsxdq        : GCCBuiltin<"__builtin_ia32_pmovsxdq128">,
+              Intrinsic<[llvm_v2i64_ty, llvm_v4i32_ty], [IntrNoMem]>;
+  def int_x86_sse41_pmovsxwd        : GCCBuiltin<"__builtin_ia32_pmovsxwd128">,
+              Intrinsic<[llvm_v4i32_ty, llvm_v8i16_ty], [IntrNoMem]>;
+  def int_x86_sse41_pmovsxwq        : GCCBuiltin<"__builtin_ia32_pmovsxwq128">,
+              Intrinsic<[llvm_v2i64_ty, llvm_v8i16_ty], [IntrNoMem]>;
+  def int_x86_sse41_pmovzxbd        : GCCBuiltin<"__builtin_ia32_pmovzxbd128">,
+              Intrinsic<[llvm_v4i32_ty, llvm_v16i8_ty], [IntrNoMem]>;
+  def int_x86_sse41_pmovzxbq        : GCCBuiltin<"__builtin_ia32_pmovzxbq128">,
+              Intrinsic<[llvm_v2i64_ty, llvm_v16i8_ty], [IntrNoMem]>;
+  def int_x86_sse41_pmovzxbw        : GCCBuiltin<"__builtin_ia32_pmovzxbw128">,
+              Intrinsic<[llvm_v8i16_ty, llvm_v16i8_ty], [IntrNoMem]>;
+  def int_x86_sse41_pmovzxdq        : GCCBuiltin<"__builtin_ia32_pmovzxdq128">,
+              Intrinsic<[llvm_v2i64_ty, llvm_v4i32_ty], [IntrNoMem]>;
+  def int_x86_sse41_pmovzxwd        : GCCBuiltin<"__builtin_ia32_pmovzxwd128">,
+              Intrinsic<[llvm_v4i32_ty, llvm_v8i16_ty], [IntrNoMem]>;
+  def int_x86_sse41_pmovzxwq        : GCCBuiltin<"__builtin_ia32_pmovzxwq128">,
+              Intrinsic<[llvm_v2i64_ty, llvm_v8i16_ty], [IntrNoMem]>;
+}
+
+// Vector min element (value-only operands, hence IntrNoMem)
+let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
+  def int_x86_sse41_phminposuw     : GCCBuiltin<"__builtin_ia32_phminposuw128">,
+              Intrinsic<[llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+}
+
+// Vector compare, min, max (value-only operands, hence IntrNoMem)
+let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
+  def int_x86_sse41_pcmpeqq         : GCCBuiltin<"__builtin_ia32_pcmpeqq">,
+          Intrinsic<[llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
+  def int_x86_sse41_pmaxsb          : GCCBuiltin<"__builtin_ia32_pmaxsb128">,
+          Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+  def int_x86_sse41_pmaxsd          : GCCBuiltin<"__builtin_ia32_pmaxsd128">,
+          Intrinsic<[llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+  def int_x86_sse41_pmaxud          : GCCBuiltin<"__builtin_ia32_pmaxud128">,
+          Intrinsic<[llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+  def int_x86_sse41_pmaxuw          : GCCBuiltin<"__builtin_ia32_pmaxuw128">,
+          Intrinsic<[llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+  def int_x86_sse41_pminsb          : GCCBuiltin<"__builtin_ia32_pminsb128">,
+          Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+  def int_x86_sse41_pminsd          : GCCBuiltin<"__builtin_ia32_pminsd128">,
+          Intrinsic<[llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+  def int_x86_sse41_pminud          : GCCBuiltin<"__builtin_ia32_pminud128">,
+          Intrinsic<[llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+  def int_x86_sse41_pminuw          : GCCBuiltin<"__builtin_ia32_pminuw128">,
+          Intrinsic<[llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
+}
+
+// Vector pack (value-only operands, hence IntrNoMem)
+let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
+  def int_x86_sse41_packusdw        : GCCBuiltin<"__builtin_ia32_packusdw128">,
+          Intrinsic<[llvm_v8i16_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+}
+
+// Vector multiply (value-only operands, hence IntrNoMem)
+let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
+  def int_x86_sse41_pmuldq          : GCCBuiltin<"__builtin_ia32_pmuldq128">,
+          Intrinsic<[llvm_v2i64_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+  def int_x86_sse41_pmulld          : GCCBuiltin<"__builtin_ia32_pmulld128">,
+          Intrinsic<[llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
+}
+
+// Vector extract (value-only operands, hence IntrNoMem)
+let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
+  def int_x86_sse41_pextrb         : GCCBuiltin<"__builtin_ia32_vec_ext_v16qi">,
+              Intrinsic<[llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_sse41_pextrd         : GCCBuiltin<"__builtin_ia32_vec_ext_v4si">,
+              Intrinsic<[llvm_i32_ty, llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_sse41_pextrq         : GCCBuiltin<"__builtin_ia32_vec_ext_v2di">,
+              Intrinsic<[llvm_i64_ty, llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_sse41_extractps      : GCCBuiltin<"__builtin_ia32_extractps128">,
+              Intrinsic<[llvm_i32_ty, llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>;
+}
+
+// Vector insert. NOTE(review): value-only operands; consider [IntrNoMem].
+let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
+  def int_x86_sse41_pinsrb         : GCCBuiltin<"__builtin_ia32_vec_set_v16qi">,
+          Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty, llvm_i32_ty]>;
+  def int_x86_sse41_pinsrd         : GCCBuiltin<"__builtin_ia32_vec_set_v4si">,
+          Intrinsic<[llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty, llvm_i32_ty]>;
+  def int_x86_sse41_pinsrq         : GCCBuiltin<"__builtin_ia32_vec_set_v2di">,
+          Intrinsic<[llvm_v2i64_ty, llvm_v2i64_ty, llvm_i64_ty, llvm_i32_ty]>;
+  def int_x86_sse41_insertps       : GCCBuiltin<"__builtin_ia32_insertps128">,
+          Intrinsic<[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty]>;
+}
+
+// Vector blend. NOTE(review): value-only operands; consider [IntrNoMem].
+let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
+  def int_x86_sse41_pblendvb         : GCCBuiltin<"__builtin_ia32_pblendvb128">,
+        Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty]>;
+  def int_x86_sse41_pblendw          : GCCBuiltin<"__builtin_ia32_pblendw128">,
+        Intrinsic<[llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty]>;
+  def int_x86_sse41_blendpd          : GCCBuiltin<"__builtin_ia32_blendpd">,
+        Intrinsic<[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty]>;
+  def int_x86_sse41_blendps          : GCCBuiltin<"__builtin_ia32_blendps">,
+        Intrinsic<[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty]>;
+  def int_x86_sse41_blendvpd         : GCCBuiltin<"__builtin_ia32_blendvpd">,
+        Intrinsic<[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty]>;
+  def int_x86_sse41_blendvps         : GCCBuiltin<"__builtin_ia32_blendvps">,
+        Intrinsic<[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty]>;
+}
+
+// Vector dot product. NOTE(review): value-only; consider [IntrNoMem].
+let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
+  def int_x86_sse41_dppd            : GCCBuiltin<"__builtin_ia32_dppd">,
+          Intrinsic<[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty]>;
+  def int_x86_sse41_dpps            : GCCBuiltin<"__builtin_ia32_dpps">,
+          Intrinsic<[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty]>;
+}
+
+// Vector sum of absolute differences. NOTE(review): consider [IntrNoMem].
+let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
+  def int_x86_sse41_mpsadbw         : GCCBuiltin<"__builtin_ia32_mpsadbw128">,
+          Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty]>;
+}
+
+// Non-temporal (streaming) vector load; only reads memory via its pointer.
+let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
+  def int_x86_sse41_movntdqa        : GCCBuiltin<"__builtin_ia32_movntdqa">,
+          Intrinsic<[llvm_v2i64_ty, llvm_ptr_ty], [IntrReadMem]>;
+}
+
+
+//===----------------------------------------------------------------------===//
 // MMX
 
 // Empty MMX state op.
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 905c704..9347310 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -34,6 +34,12 @@
 def FeatureSSSE3   : SubtargetFeature<"ssse3", "X86SSELevel", "SSSE3",
                                       "Enable SSSE3 instructions",
                                       [FeatureSSE3]>;
+def FeatureSSE41   : SubtargetFeature<"sse41", "X86SSELevel", "SSE41",
+                                      "Enable SSE 4.1 instructions",
+                                      [FeatureSSSE3]>;  // implies SSSE3
+def FeatureSSE42   : SubtargetFeature<"sse42", "X86SSELevel", "SSE42",
+                                      "Enable SSE 4.2 instructions",
+                                      [FeatureSSE41]>;  // implies SSE 4.1
 def Feature3DNow   : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow",
                                       "Enable 3DNow! instructions">;
 def Feature3DNowA  : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA",
@@ -66,6 +72,7 @@
 def : Proc<"prescott",        [FeatureSSE3]>;
 def : Proc<"nocona",          [FeatureSSE3]>;
 def : Proc<"core2",           [FeatureSSSE3]>;
+def : Proc<"penryn",          [FeatureSSE41]>;
 
 def : Proc<"k6",              [FeatureMMX]>;
 def : Proc<"k6-2",            [FeatureMMX,    Feature3DNow]>;
diff --git a/lib/Target/X86/X86Instr64bit.td b/lib/Target/X86/X86Instr64bit.td
index a55e731..9528dbd 100644
--- a/lib/Target/X86/X86Instr64bit.td
+++ b/lib/Target/X86/X86Instr64bit.td
@@ -1266,3 +1266,13 @@
 def MOVSDto64mr  : RPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
                         "mov{d|q}\t{$src, $dst|$dst, $src}",
                         [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>;
+
+//===----------------------------------------------------------------------===//
+// X86-64 SSE4.1 Instructions
+//===----------------------------------------------------------------------===//
+
+// PEXTRB, unary, TA, 0x14, REX.W
+// PEXTRW, unary, TA, 0x15, REX.W
+// PEXTRQ, unary, TA, 0x16, REX.W
+// EXTRACTPS, unary, TA, 0x17, REX.W
+// PINSRQ, 2addr, binary, TA, 0x22, REX.W
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 478007b..a79947b 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -166,6 +166,8 @@
 def HasSSE2      : Predicate<"Subtarget->hasSSE2()">;
 def HasSSE3      : Predicate<"Subtarget->hasSSE3()">;
 def HasSSSE3     : Predicate<"Subtarget->hasSSSE3()">;
+def HasSSE41     : Predicate<"Subtarget->hasSSE41()">;
+def HasSSE42     : Predicate<"Subtarget->hasSSE42()">;
 def FPStackf32   : Predicate<"!Subtarget->hasSSE1()">;
 def FPStackf64   : Predicate<"!Subtarget->hasSSE2()">;
 def In32BitMode  : Predicate<"!Subtarget->is64Bit()">;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 30d088c..5cf9a9c 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -3038,3 +3038,98 @@
           (MOVUPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
 def : Pat<(store (v16i8 VR128:$src), addr:$dst),
           (MOVUPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+          
+//===----------------------------------------------------------------------===//
+// SSE4.1 Instructions
+//===----------------------------------------------------------------------===//
+
+// SSE4.1 Instruction Templates:
+//
+//   SS418I - SSE 4.1 instructions with T8 prefix.
+//   SS41AI - SSE 4.1 instructions with OpSize + TA prefix and an imm8 operand.
+//   NOTE(review): SSE4.1 T8 opcodes also need 0x66; add OpSize at their defs.
+class SS418I<bits<8> o, Format F, dag outs, dag ins, string asm,
+             list<dag> pattern>
+      : I<o, F, outs, ins, asm, pattern>, T8, Requires<[HasSSE41]>;
+class SS41AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+             list<dag> pattern>
+      : Ii8<o, F, outs, ins, asm, pattern>, TA, OpSize, Requires<[HasSSE41]>;
+
+// sse41_fp_unop_rm - ss/ps/sd/pd intrinsic forms (reg and mem) of an FP op.
+multiclass sse41_fp_unop_rm<bits<8> opcss, bits<8> opcps, 
+                            bits<8> opcsd, bits<8> opcpd, 
+                            string OpcodeStr,
+                            Intrinsic F32Int,
+                            Intrinsic V4F32Int,
+                            Intrinsic F64Int,
+                            Intrinsic V2F64Int,
+                            bit Commutable = 0> {
+  // Scalar single (ss) intrinsic operation, reg.
+  def SSr_Int : SS41AI<opcss, MRMSrcReg,
+                    (outs VR128:$dst), (ins VR128:$src1, i32imm:$src2),
+                    !strconcat(OpcodeStr,
+                    "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set VR128:$dst, (F32Int VR128:$src1, imm:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Scalar single (ss) intrinsic operation, mem.
+  def SSm_Int : SS41AI<opcss, MRMSrcMem, 
+                    (outs VR128:$dst), (ins ssmem:$src1, i32imm:$src2),
+                    !strconcat(OpcodeStr, 
+                    "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set VR128:$dst, (F32Int sse_load_f32:$src1, imm:$src2))]>;
+
+  // Packed single (ps) intrinsic operation, reg.
+  def PSr_Int : SS41AI<opcps, MRMSrcReg, 
+                    (outs VR128:$dst), (ins VR128:$src1, i32imm:$src2),
+                    !strconcat(OpcodeStr,
+                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set VR128:$dst, (V4F32Int VR128:$src1, imm:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Packed single (ps) intrinsic operation, mem.
+  def PSm_Int : SS41AI<opcps, MRMSrcMem,
+                    (outs VR128:$dst), (ins f128mem:$src1, i32imm:$src2),
+                    !strconcat(OpcodeStr,
+                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set VR128:$dst, (V4F32Int (load addr:$src1), imm:$src2))]>;
+
+  // Scalar double (sd) intrinsic operation, reg.
+  def SDr_Int : SS41AI<opcsd, MRMSrcReg,
+                    (outs VR128:$dst), (ins VR128:$src1, i32imm:$src2),
+                    !strconcat(OpcodeStr,
+                    "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set VR128:$dst, (F64Int VR128:$src1, imm:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Scalar double (sd) intrinsic operation, mem.
+  def SDm_Int : SS41AI<opcsd, MRMSrcMem,
+                    (outs VR128:$dst), (ins sdmem:$src1, i32imm:$src2),
+                    !strconcat(OpcodeStr,
+                    "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set VR128:$dst, (F64Int sse_load_f64:$src1, imm:$src2))]>;
+
+  // Packed double (pd) intrinsic operation, reg.
+  def PDr_Int : SS41AI<opcpd, MRMSrcReg,
+                    (outs VR128:$dst), (ins VR128:$src1, i32imm:$src2),
+                    !strconcat(OpcodeStr,
+                    "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set VR128:$dst, (V2F64Int VR128:$src1, imm:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Packed double (pd) intrinsic operation, mem.
+  def PDm_Int : SS41AI<opcpd, MRMSrcMem,
+                    (outs VR128:$dst), (ins f128mem:$src1, i32imm:$src2),
+                    !strconcat(OpcodeStr,
+                    "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set VR128:$dst, (V2F64Int (load addr:$src1), imm:$src2))]>;
+}
+
+// FP round - roundss (0x0A), roundps (0x08), roundsd (0x0B), roundpd (0x09)
+defm ROUND  : sse41_fp_unop_rm<0x0A, 0x08, 0x0B, 0x09, "round",
+                               int_x86_sse41_round_ss, int_x86_sse41_round_ps,
+                               int_x86_sse41_round_sd, int_x86_sse41_round_pd>;
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 1480332..35a83e4 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -114,6 +114,8 @@
   if ((EDX >> 26) & 0x1) X86SSELevel = SSE2;
   if (ECX & 0x1)         X86SSELevel = SSE3;
   if ((ECX >> 9)  & 0x1) X86SSELevel = SSSE3;
+  if ((ECX >> 19) & 0x1) X86SSELevel = SSE41;
+  if ((ECX >> 20) & 0x1) X86SSELevel = SSE42;
 
   if (memcmp(text.c, "GenuineIntel", 12) == 0 ||
       memcmp(text.c, "AuthenticAMD", 12) == 0) {
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index ee193cf..c268726 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -38,7 +38,7 @@
   };
 protected:
   enum X86SSEEnum {
-    NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3
+    NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42  // keep ascending
   };
 
   enum X863DNowEnum {
@@ -127,6 +127,8 @@
   bool hasSSE2() const { return X86SSELevel >= SSE2; }
   bool hasSSE3() const { return X86SSELevel >= SSE3; }
   bool hasSSSE3() const { return X86SSELevel >= SSSE3; }
+  bool hasSSE41() const { return X86SSELevel >= SSE41; }  // SSE4.1 or higher
+  bool hasSSE42() const { return X86SSELevel >= SSE42; }  // SSE4.2 or higher
   bool has3DNow() const { return X863DNowLevel >= ThreeDNow; }
   bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; }
 
