[X86] Lowering addus/subus intrinsics to native IR

Summary: This is the patch that lowers x86 intrinsics to native IR in order to enable optimizations.

Reviewers: craig.topper, spatel, RKSimon

Reviewed By: craig.topper

Subscribers: cfe-commits

Differential Revision: https://reviews.llvm.org/D46892

llvm-svn: 339651
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 7456e18..bbd2904 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -8907,6 +8907,39 @@
   return CGF.Builder.CreateSExt(Mask, DstTy, "vpmovm2");
 }
 
+// Emit addition or subtraction with saturation.
+// Handles both signed and unsigned intrinsics.
+static Value *EmitX86AddSubSatExpr(CodeGenFunction &CGF, const CallExpr *E,
+                                   SmallVectorImpl<Value *> &Ops,
+                                   bool IsAddition) {
+
+  // Collect vector elements and type data.
+  llvm::Type *ResultType = CGF.ConvertType(E->getType());
+
+  Value *Res;
+  if (IsAddition) {
+    // ADDUS: a > (a+b) ? ~0 : (a+b)
+    // If Ops[0] > Add, overflow occured.
+    Value *Add = CGF.Builder.CreateAdd(Ops[0], Ops[1]);
+    Value *ICmp = CGF.Builder.CreateICmp(ICmpInst::ICMP_UGT, Ops[0], Add);
+    Value *Max = llvm::Constant::getAllOnesValue(ResultType);
+    Res = CGF.Builder.CreateSelect(ICmp, Max, Add);
+  } else {
+    // SUBUS: max(a, b) - b
+    Value *ICmp = CGF.Builder.CreateICmp(ICmpInst::ICMP_UGT, Ops[0], Ops[1]);
+    Value *Select = CGF.Builder.CreateSelect(ICmp, Ops[0], Ops[1]);
+    Res = CGF.Builder.CreateSub(Select, Ops[1]);
+  }
+
+  if (E->getNumArgs() == 4) { // For masked intrinsics.
+    Value *VecSRC = Ops[2];
+    Value *Mask = Ops[3];
+    return EmitX86Select(CGF, Mask, Res, VecSRC);
+  }
+
+  return Res;
+}
+
 Value *CodeGenFunction::EmitX86CpuIs(const CallExpr *E) {
   const Expr *CPUExpr = E->getArg(0)->IgnoreParenCasts();
   StringRef CPUStr = cast<clang::StringLiteral>(CPUExpr)->getString();
@@ -10530,10 +10563,23 @@
     Load->setVolatile(true);
     return Load;
   }
+  case X86::BI__builtin_ia32_paddusb512_mask:
+  case X86::BI__builtin_ia32_paddusw512_mask:
+  case X86::BI__builtin_ia32_paddusb256:
+  case X86::BI__builtin_ia32_paddusw256:
+  case X86::BI__builtin_ia32_paddusb128:
+  case X86::BI__builtin_ia32_paddusw128:
+    return EmitX86AddSubSatExpr(*this, E, Ops, true /* IsAddition */);
+  case X86::BI__builtin_ia32_psubusb512_mask:
+  case X86::BI__builtin_ia32_psubusw512_mask:
+  case X86::BI__builtin_ia32_psubusb256:
+  case X86::BI__builtin_ia32_psubusw256:
+  case X86::BI__builtin_ia32_psubusb128:
+  case X86::BI__builtin_ia32_psubusw128:
+    return EmitX86AddSubSatExpr(*this, E, Ops, false /* IsAddition */);
   }
 }
 
-
 Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
                                            const CallExpr *E) {
   SmallVector<Value*, 4> Ops;