Add generic 'cmp' SSE builtins (cmpps, cmpss, cmppd, cmpsd) and get rid of the per-predicate compare builtins they replace.

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@72032 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/CodeGen/CGBuiltin.cpp b/lib/CodeGen/CGBuiltin.cpp
index 93901fd..bb0d59a 100644
--- a/lib/CodeGen/CGBuiltin.cpp
+++ b/lib/CodeGen/CGBuiltin.cpp
@@ -816,77 +816,13 @@
   case X86::BI__builtin_ia32_vec_ext_v4hi:
   case X86::BI__builtin_ia32_vec_ext_v2df:
     return Builder.CreateExtractElement(Ops[0], Ops[1], "result");
-  case X86::BI__builtin_ia32_cmpordss:
-  case X86::BI__builtin_ia32_cmpordsd:
-  case X86::BI__builtin_ia32_cmpunordss:
-  case X86::BI__builtin_ia32_cmpunordsd:
-  case X86::BI__builtin_ia32_cmpeqss:
-  case X86::BI__builtin_ia32_cmpeqsd:
-  case X86::BI__builtin_ia32_cmpltss:
-  case X86::BI__builtin_ia32_cmpltsd:
-  case X86::BI__builtin_ia32_cmpless:
-  case X86::BI__builtin_ia32_cmplesd:
-  case X86::BI__builtin_ia32_cmpneqss:
-  case X86::BI__builtin_ia32_cmpneqsd:
-  case X86::BI__builtin_ia32_cmpnltss:
-  case X86::BI__builtin_ia32_cmpnltsd:
-  case X86::BI__builtin_ia32_cmpnless:
-  case X86::BI__builtin_ia32_cmpnlesd: {
-    unsigned i = 0;
-    const char *name = 0;
-    switch (BuiltinID) {
-    default: assert(0 && "Unknown compare builtin!");
-    case X86::BI__builtin_ia32_cmpeqss:
-    case X86::BI__builtin_ia32_cmpeqsd:
-      i = 0;
-      name = "cmpeq";
-      break;
-    case X86::BI__builtin_ia32_cmpltss:
-    case X86::BI__builtin_ia32_cmpltsd:
-      i = 1;
-      name = "cmplt";
-      break;
-    case X86::BI__builtin_ia32_cmpless:
-    case X86::BI__builtin_ia32_cmplesd:
-      i = 2;
-      name = "cmple";
-      break;
-    case X86::BI__builtin_ia32_cmpunordss:
-    case X86::BI__builtin_ia32_cmpunordsd:
-      i = 3;
-      name = "cmpunord";
-      break;
-    case X86::BI__builtin_ia32_cmpneqss:
-    case X86::BI__builtin_ia32_cmpneqsd:
-      i = 4;
-      name = "cmpneq";
-      break;
-    case X86::BI__builtin_ia32_cmpnltss:
-    case X86::BI__builtin_ia32_cmpnltsd:
-      i = 5;
-      name = "cmpntl";
-      break;
-    case X86::BI__builtin_ia32_cmpnless:
-    case X86::BI__builtin_ia32_cmpnlesd:
-      i = 6;
-      name = "cmpnle";
-      break;
-    case X86::BI__builtin_ia32_cmpordss:
-    case X86::BI__builtin_ia32_cmpordsd:
-      i = 7;
-      name = "cmpord";
-      break;
-    }
-
-    llvm::Function *F;
-    if (cast<llvm::VectorType>(Ops[0]->getType())->getElementType() ==
-        llvm::Type::FloatTy)
-      F = CGM.getIntrinsic(Intrinsic::x86_sse_cmp_ss);
-    else
-      F = CGM.getIntrinsic(Intrinsic::x86_sse2_cmp_sd);
-
-    Ops.push_back(llvm::ConstantInt::get(llvm::Type::Int8Ty, i));
-    return Builder.CreateCall(F, &Ops[0], &Ops[0] + Ops.size(), name);
+  case X86::BI__builtin_ia32_cmpps: {
+    llvm::Function *F = CGM.getIntrinsic(Intrinsic::x86_sse_cmp_ps);
+    return Builder.CreateCall(F, &Ops[0], &Ops[0] + Ops.size(), "cmpps");
+  }
+  case X86::BI__builtin_ia32_cmpss: {
+    llvm::Function *F = CGM.getIntrinsic(Intrinsic::x86_sse_cmp_ss);
+    return Builder.CreateCall(F, &Ops[0], &Ops[0] + Ops.size(), "cmpss");
   }
   case X86::BI__builtin_ia32_ldmxcsr: {
     llvm::Type *PtrTy = llvm::PointerType::getUnqual(llvm::Type::Int8Ty);
@@ -904,89 +840,13 @@
                              Builder.CreateBitCast(Tmp, PtrTy));
     return Builder.CreateLoad(Tmp, "stmxcsr");
   }
-  case X86::BI__builtin_ia32_cmpordps:
-  case X86::BI__builtin_ia32_cmpordpd:
-  case X86::BI__builtin_ia32_cmpunordps:
-  case X86::BI__builtin_ia32_cmpunordpd:
-  case X86::BI__builtin_ia32_cmpeqps: 
-  case X86::BI__builtin_ia32_cmpeqpd: 
-  case X86::BI__builtin_ia32_cmpltps: 
-  case X86::BI__builtin_ia32_cmpltpd: 
-  case X86::BI__builtin_ia32_cmpleps:
-  case X86::BI__builtin_ia32_cmplepd:
-  case X86::BI__builtin_ia32_cmpneqps:
-  case X86::BI__builtin_ia32_cmpneqpd:
-  case X86::BI__builtin_ia32_cmpngtps:
-  case X86::BI__builtin_ia32_cmpngtpd:
-  case X86::BI__builtin_ia32_cmpnltps: 
-  case X86::BI__builtin_ia32_cmpnltpd: 
-  case X86::BI__builtin_ia32_cmpgtps:
-  case X86::BI__builtin_ia32_cmpgtpd:
-  case X86::BI__builtin_ia32_cmpgeps:
-  case X86::BI__builtin_ia32_cmpgepd:
-  case X86::BI__builtin_ia32_cmpngeps:
-  case X86::BI__builtin_ia32_cmpngepd:
-  case X86::BI__builtin_ia32_cmpnleps: 
-  case X86::BI__builtin_ia32_cmpnlepd: {
-    unsigned i = 0;
-    const char *name = 0;
-    bool ShouldSwap = false;
-    switch (BuiltinID) {
-    default: assert(0 && "Unknown compare builtin!");
-    case X86::BI__builtin_ia32_cmpeqps:
-    case X86::BI__builtin_ia32_cmpeqpd:    i = 0; name = "cmpeq"; break;
-    case X86::BI__builtin_ia32_cmpltps:
-    case X86::BI__builtin_ia32_cmpltpd:    i = 1; name = "cmplt"; break;
-    case X86::BI__builtin_ia32_cmpleps:
-    case X86::BI__builtin_ia32_cmplepd:    i = 2; name = "cmple"; break;
-    case X86::BI__builtin_ia32_cmpunordps:
-    case X86::BI__builtin_ia32_cmpunordpd: i = 3; name = "cmpunord"; break;
-    case X86::BI__builtin_ia32_cmpneqps:
-    case X86::BI__builtin_ia32_cmpneqpd:   i = 4; name = "cmpneq"; break;
-    case X86::BI__builtin_ia32_cmpnltps:
-    case X86::BI__builtin_ia32_cmpnltpd:   i = 5; name = "cmpntl"; break;
-    case X86::BI__builtin_ia32_cmpnleps:
-    case X86::BI__builtin_ia32_cmpnlepd:   i = 6; name = "cmpnle"; break;
-    case X86::BI__builtin_ia32_cmpordps:
-    case X86::BI__builtin_ia32_cmpordpd:   i = 7; name = "cmpord"; break;
-    case X86::BI__builtin_ia32_cmpgtps:
-    case X86::BI__builtin_ia32_cmpgtpd:
-      ShouldSwap = true;
-      i = 1;
-      name = "cmpgt";
-      break;
-    case X86::BI__builtin_ia32_cmpgeps:
-    case X86::BI__builtin_ia32_cmpgepd:
-      i = 2;
-      name = "cmpge";
-      ShouldSwap = true;
-      break;
-    case X86::BI__builtin_ia32_cmpngtps:
-    case X86::BI__builtin_ia32_cmpngtpd:
-      i = 5;
-      name = "cmpngt";
-      ShouldSwap = true;
-      break;
-    case X86::BI__builtin_ia32_cmpngeps:
-    case X86::BI__builtin_ia32_cmpngepd:
-      i = 6;
-      name = "cmpnge";
-      ShouldSwap = true;
-      break;
-    }
-
-    if (ShouldSwap)
-      std::swap(Ops[0], Ops[1]);
-
-    llvm::Function *F;
-    if (cast<llvm::VectorType>(Ops[0]->getType())->getElementType() ==
-        llvm::Type::FloatTy)
-      F = CGM.getIntrinsic(Intrinsic::x86_sse_cmp_ps);
-    else
-      F = CGM.getIntrinsic(Intrinsic::x86_sse2_cmp_pd);
-    
-    Ops.push_back(llvm::ConstantInt::get(llvm::Type::Int8Ty, i));
-    return Builder.CreateCall(F, &Ops[0], &Ops[0] + Ops.size(), name);
+  case X86::BI__builtin_ia32_cmppd: {
+    llvm::Function *F = CGM.getIntrinsic(Intrinsic::x86_sse2_cmp_pd);
+    return Builder.CreateCall(F, &Ops[0], &Ops[0] + Ops.size(), "cmppd");
+  }
+  case X86::BI__builtin_ia32_cmpsd: {
+    llvm::Function *F = CGM.getIntrinsic(Intrinsic::x86_sse2_cmp_sd);
+    return Builder.CreateCall(F, &Ops[0], &Ops[0] + Ops.size(), "cmpsd");
   }
   case X86::BI__builtin_ia32_movss:
     return EmitShuffleVector(Ops[0], Ops[1], 4, 1, 2, 3, "movss");
diff --git a/lib/Headers/emmintrin.h b/lib/Headers/emmintrin.h
index 84ce06a..12b548d 100644
--- a/lib/Headers/emmintrin.h
+++ b/lib/Headers/emmintrin.h
@@ -149,145 +149,145 @@
 static inline __m128d __attribute__((__always_inline__, __nodebug__))
 _mm_cmpeq_pd(__m128d a, __m128d b)
 {
-  return (__m128d)__builtin_ia32_cmpeqpd(a, b);
+  return (__m128d)__builtin_ia32_cmppd(a, b, 0);
 }
 
 static inline __m128d __attribute__((__always_inline__, __nodebug__))
 _mm_cmplt_pd(__m128d a, __m128d b)
 {
-  return (__m128d)__builtin_ia32_cmpltpd(a, b);
+  return (__m128d)__builtin_ia32_cmppd(a, b, 1);
 }
 
 static inline __m128d __attribute__((__always_inline__, __nodebug__))
 _mm_cmple_pd(__m128d a, __m128d b)
 {
-  return (__m128d)__builtin_ia32_cmplepd(a, b);
+  return (__m128d)__builtin_ia32_cmppd(a, b, 2);
 }
 
 static inline __m128d __attribute__((__always_inline__, __nodebug__))
 _mm_cmpgt_pd(__m128d a, __m128d b)
 {
-  return (__m128d)__builtin_ia32_cmpltpd(b, a);
+  return (__m128d)__builtin_ia32_cmppd(b, a, 1);
 }
 
 static inline __m128d __attribute__((__always_inline__, __nodebug__))
 _mm_cmpge_pd(__m128d a, __m128d b)
 {
-  return (__m128d)__builtin_ia32_cmplepd(b, a);
+  return (__m128d)__builtin_ia32_cmppd(b, a, 2);
 }
 
 static inline __m128d __attribute__((__always_inline__, __nodebug__))
 _mm_cmpord_pd(__m128d a, __m128d b)
 {
-  return (__m128d)__builtin_ia32_cmpordpd(a, b);
+  return (__m128d)__builtin_ia32_cmppd(a, b, 7);
 }
 
 static inline __m128d __attribute__((__always_inline__, __nodebug__))
 _mm_cmpunord_pd(__m128d a, __m128d b)
 {
-  return (__m128d)__builtin_ia32_cmpunordpd(a, b);
+  return (__m128d)__builtin_ia32_cmppd(a, b, 3);
 }
 
 static inline __m128d __attribute__((__always_inline__, __nodebug__))
 _mm_cmpneq_pd(__m128d a, __m128d b)
 {
-  return (__m128d)__builtin_ia32_cmpneqpd(a, b);
+  return (__m128d)__builtin_ia32_cmppd(a, b, 4);
 }
 
 static inline __m128d __attribute__((__always_inline__, __nodebug__))
 _mm_cmpnlt_pd(__m128d a, __m128d b)
 {
-  return (__m128d)__builtin_ia32_cmpnltpd(a, b);
+  return (__m128d)__builtin_ia32_cmppd(a, b, 5);
 }
 
 static inline __m128d __attribute__((__always_inline__, __nodebug__))
 _mm_cmpnle_pd(__m128d a, __m128d b)
 {
-  return (__m128d)__builtin_ia32_cmpnlepd(a, b);
+  return (__m128d)__builtin_ia32_cmppd(a, b, 6);
 }
 
 static inline __m128d __attribute__((__always_inline__, __nodebug__))
 _mm_cmpngt_pd(__m128d a, __m128d b)
 {
-  return (__m128d)__builtin_ia32_cmpnltpd(b, a);
+  return (__m128d)__builtin_ia32_cmppd(b, a, 5);
 }
 
 static inline __m128d __attribute__((__always_inline__, __nodebug__))
 _mm_cmpnge_pd(__m128d a, __m128d b)
 {
-  return (__m128d)__builtin_ia32_cmpnlepd(b, a);
+  return (__m128d)__builtin_ia32_cmppd(b, a, 6);
 }
 
 static inline __m128d __attribute__((__always_inline__, __nodebug__))
 _mm_cmpeq_sd(__m128d a, __m128d b)
 {
-  return (__m128d)__builtin_ia32_cmpeqsd(a, b);
+  return (__m128d)__builtin_ia32_cmpsd(a, b, 0);
 }
 
 static inline __m128d __attribute__((__always_inline__, __nodebug__))
 _mm_cmplt_sd(__m128d a, __m128d b)
 {
-  return (__m128d)__builtin_ia32_cmpltsd(a, b);
+  return (__m128d)__builtin_ia32_cmpsd(a, b, 1);
 }
 
 static inline __m128d __attribute__((__always_inline__, __nodebug__))
 _mm_cmple_sd(__m128d a, __m128d b)
 {
-  return (__m128d)__builtin_ia32_cmplesd(a, b);
+  return (__m128d)__builtin_ia32_cmpsd(a, b, 2);
 }
 
 static inline __m128d __attribute__((__always_inline__, __nodebug__))
 _mm_cmpgt_sd(__m128d a, __m128d b)
 {
-  return (__m128d)__builtin_ia32_cmpltsd(b, a);
+  return (__m128d)__builtin_ia32_cmpsd(b, a, 1);
 }
 
 static inline __m128d __attribute__((__always_inline__, __nodebug__))
 _mm_cmpge_sd(__m128d a, __m128d b)
 {
-  return (__m128d)__builtin_ia32_cmplesd(b, a);
+  return (__m128d)__builtin_ia32_cmpsd(b, a, 2);
 }
 
 static inline __m128d __attribute__((__always_inline__, __nodebug__))
 _mm_cmpord_sd(__m128d a, __m128d b)
 {
-  return (__m128d)__builtin_ia32_cmpordsd(a, b);
+  return (__m128d)__builtin_ia32_cmpsd(a, b, 7);
 }
 
 static inline __m128d __attribute__((__always_inline__, __nodebug__))
 _mm_cmpunord_sd(__m128d a, __m128d b)
 {
-  return (__m128d)__builtin_ia32_cmpunordsd(a, b);
+  return (__m128d)__builtin_ia32_cmpsd(a, b, 3);
 }
 
 static inline __m128d __attribute__((__always_inline__, __nodebug__))
 _mm_cmpneq_sd(__m128d a, __m128d b)
 {
-  return (__m128d)__builtin_ia32_cmpneqsd(a, b);
+  return (__m128d)__builtin_ia32_cmpsd(a, b, 4);
 }
 
 static inline __m128d __attribute__((__always_inline__, __nodebug__))
 _mm_cmpnlt_sd(__m128d a, __m128d b)
 {
-  return (__m128d)__builtin_ia32_cmpnltsd(a, b);
+  return (__m128d)__builtin_ia32_cmpsd(a, b, 5);
 }
 
 static inline __m128d __attribute__((__always_inline__, __nodebug__))
 _mm_cmpnle_sd(__m128d a, __m128d b)
 {
-  return (__m128d)__builtin_ia32_cmpnlesd(a, b);
+  return (__m128d)__builtin_ia32_cmpsd(a, b, 6);
 }
 
 static inline __m128d __attribute__((__always_inline__, __nodebug__))
 _mm_cmpngt_sd(__m128d a, __m128d b)
 {
-  return (__m128d)__builtin_ia32_cmpnltsd(b, a);
+  return (__m128d)__builtin_ia32_cmpsd(b, a, 5);
 }
 
 static inline __m128d __attribute__((__always_inline__, __nodebug__))
 _mm_cmpnge_sd(__m128d a, __m128d b)
 {
-  return (__m128d)__builtin_ia32_cmpnlesd(b, a);
+  return (__m128d)__builtin_ia32_cmpsd(b, a, 6);
 }
 
 static inline int __attribute__((__always_inline__, __nodebug__))
diff --git a/lib/Headers/xmmintrin.h b/lib/Headers/xmmintrin.h
index c863144..264d2d6 100644
--- a/lib/Headers/xmmintrin.h
+++ b/lib/Headers/xmmintrin.h
@@ -170,145 +170,145 @@
 static inline __m128 __attribute__((__always_inline__, __nodebug__))
 _mm_cmpeq_ss(__m128 a, __m128 b)
 {
-  return (__m128)__builtin_ia32_cmpeqss(a, b);
+  return (__m128)__builtin_ia32_cmpss(a, b, 0);
 }
 
 static inline __m128 __attribute__((__always_inline__, __nodebug__))
 _mm_cmpeq_ps(__m128 a, __m128 b)
 {
-  return (__m128)__builtin_ia32_cmpeqps(a, b);
+  return (__m128)__builtin_ia32_cmpps(a, b, 0);
 }
 
 static inline __m128 __attribute__((__always_inline__, __nodebug__))
 _mm_cmplt_ss(__m128 a, __m128 b)
 {
-  return (__m128)__builtin_ia32_cmpltss(a, b);
+  return (__m128)__builtin_ia32_cmpss(a, b, 1);
 }
 
 static inline __m128 __attribute__((__always_inline__, __nodebug__))
 _mm_cmplt_ps(__m128 a, __m128 b)
 {
-  return (__m128)__builtin_ia32_cmpltps(a, b);
+  return (__m128)__builtin_ia32_cmpps(a, b, 1);
 }
 
 static inline __m128 __attribute__((__always_inline__, __nodebug__))
 _mm_cmple_ss(__m128 a, __m128 b)
 {
-  return (__m128)__builtin_ia32_cmpless(a, b);
+  return (__m128)__builtin_ia32_cmpss(a, b, 2);
 }
 
 static inline __m128 __attribute__((__always_inline__, __nodebug__))
 _mm_cmple_ps(__m128 a, __m128 b)
 {
-  return (__m128)__builtin_ia32_cmpleps(a, b);
+  return (__m128)__builtin_ia32_cmpps(a, b, 2);
 }
 
 static inline __m128 __attribute__((__always_inline__, __nodebug__))
 _mm_cmpgt_ss(__m128 a, __m128 b)
 {
-  return (__m128)__builtin_ia32_cmpltss(b, a);
+  return (__m128)__builtin_ia32_cmpss(b, a, 1);
 }
 
 static inline __m128 __attribute__((__always_inline__, __nodebug__))
 _mm_cmpgt_ps(__m128 a, __m128 b)
 {
-  return (__m128)__builtin_ia32_cmpltps(b, a);
+  return (__m128)__builtin_ia32_cmpps(b, a, 1);
 }
 
 static inline __m128 __attribute__((__always_inline__, __nodebug__))
 _mm_cmpge_ss(__m128 a, __m128 b)
 {
-  return (__m128)__builtin_ia32_cmpless(b, a);
+  return (__m128)__builtin_ia32_cmpss(b, a, 2);
 }
 
 static inline __m128 __attribute__((__always_inline__, __nodebug__))
 _mm_cmpge_ps(__m128 a, __m128 b)
 {
-  return (__m128)__builtin_ia32_cmpleps(b, a);
+  return (__m128)__builtin_ia32_cmpps(b, a, 2);
 }
 
 static inline __m128 __attribute__((__always_inline__, __nodebug__))
 _mm_cmpneq_ss(__m128 a, __m128 b)
 {
-  return (__m128)__builtin_ia32_cmpneqss(a, b);
+  return (__m128)__builtin_ia32_cmpss(a, b, 4);
 }
 
 static inline __m128 __attribute__((__always_inline__, __nodebug__))
 _mm_cmpneq_ps(__m128 a, __m128 b)
 {
-  return (__m128)__builtin_ia32_cmpneqps(a, b);
+  return (__m128)__builtin_ia32_cmpps(a, b, 4);
 }
 
 static inline __m128 __attribute__((__always_inline__, __nodebug__))
 _mm_cmpnlt_ss(__m128 a, __m128 b)
 {
-  return (__m128)__builtin_ia32_cmpnltss(a, b);
+  return (__m128)__builtin_ia32_cmpss(a, b, 5);
 }
 
 static inline __m128 __attribute__((__always_inline__, __nodebug__))
 _mm_cmpnlt_ps(__m128 a, __m128 b)
 {
-  return (__m128)__builtin_ia32_cmpnltps(a, b);
+  return (__m128)__builtin_ia32_cmpps(a, b, 5);
 }
 
 static inline __m128 __attribute__((__always_inline__, __nodebug__))
 _mm_cmpnle_ss(__m128 a, __m128 b)
 {
-  return (__m128)__builtin_ia32_cmpnless(a, b);
+  return (__m128)__builtin_ia32_cmpss(a, b, 6);
 }
 
 static inline __m128 __attribute__((__always_inline__, __nodebug__))
 _mm_cmpnle_ps(__m128 a, __m128 b)
 {
-  return (__m128)__builtin_ia32_cmpnleps(a, b);
+  return (__m128)__builtin_ia32_cmpps(a, b, 6);
 }
 
 static inline __m128 __attribute__((__always_inline__, __nodebug__))
 _mm_cmpngt_ss(__m128 a, __m128 b)
 {
-  return (__m128)__builtin_ia32_cmpnltss(b, a);
+  return (__m128)__builtin_ia32_cmpss(b, a, 5);
 }
 
 static inline __m128 __attribute__((__always_inline__, __nodebug__))
 _mm_cmpngt_ps(__m128 a, __m128 b)
 {
-  return (__m128)__builtin_ia32_cmpnltps(b, a);
+  return (__m128)__builtin_ia32_cmpps(b, a, 5);
 }
 
 static inline __m128 __attribute__((__always_inline__, __nodebug__))
 _mm_cmpnge_ss(__m128 a, __m128 b)
 {
-  return (__m128)__builtin_ia32_cmpnless(b, a);
+  return (__m128)__builtin_ia32_cmpss(b, a, 6);
 }
 
 static inline __m128 __attribute__((__always_inline__, __nodebug__))
 _mm_cmpnge_ps(__m128 a, __m128 b)
 {
-  return (__m128)__builtin_ia32_cmpnleps(b, a);
+  return (__m128)__builtin_ia32_cmpps(b, a, 6);
 }
 
 static inline __m128 __attribute__((__always_inline__, __nodebug__))
 _mm_cmpord_ss(__m128 a, __m128 b)
 {
-  return (__m128)__builtin_ia32_cmpordss(a, b);
+  return (__m128)__builtin_ia32_cmpss(a, b, 7);
 }
 
 static inline __m128 __attribute__((__always_inline__, __nodebug__))
 _mm_cmpord_ps(__m128 a, __m128 b)
 {
-  return (__m128)__builtin_ia32_cmpordps(a, b);
+  return (__m128)__builtin_ia32_cmpps(a, b, 7);
 }
 
 static inline __m128 __attribute__((__always_inline__, __nodebug__))
 _mm_cmpunord_ss(__m128 a, __m128 b)
 {
-  return (__m128)__builtin_ia32_cmpunordss(a, b);
+  return (__m128)__builtin_ia32_cmpss(a, b, 3);
 }
 
 static inline __m128 __attribute__((__always_inline__, __nodebug__))
 _mm_cmpunord_ps(__m128 a, __m128 b)
 {
-  return (__m128)__builtin_ia32_cmpunordps(a, b);
+  return (__m128)__builtin_ia32_cmpps(a, b, 3);
 }
 
 static inline int __attribute__((__always_inline__, __nodebug__))