[VectorLegalizer] Expansion of CTLZ using CTPOP when possible

This patch avoids scalarization of CTLZ by instead expanding to use CTPOP (ref: "Hacker's Delight") when the necessary operations are available.

This also adds the necessary cost models for X86 SSE2 targets (the main beneficiary) to ensure vectorization only happens when its useful.

Differential Revision: https://reviews.llvm.org/D25910

llvm-svn: 286233
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index fd433e3..72d2626 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -106,7 +106,8 @@
   SDValue ExpandStore(SDValue Op);
   SDValue ExpandFNEG(SDValue Op);
   SDValue ExpandBITREVERSE(SDValue Op);
-  SDValue ExpandCTLZ_CTTZ_ZERO_UNDEF(SDValue Op);
+  SDValue ExpandCTLZ(SDValue Op);
+  SDValue ExpandCTTZ_ZERO_UNDEF(SDValue Op);
 
   /// \brief Implements vector promotion.
   ///
@@ -693,9 +694,11 @@
     return UnrollVSETCC(Op);
   case ISD::BITREVERSE:
     return ExpandBITREVERSE(Op);
+  case ISD::CTLZ:
   case ISD::CTLZ_ZERO_UNDEF:
+    return ExpandCTLZ(Op);
   case ISD::CTTZ_ZERO_UNDEF:
-    return ExpandCTLZ_CTTZ_ZERO_UNDEF(Op);
+    return ExpandCTTZ_ZERO_UNDEF(Op);
   default:
     return DAG.UnrollVectorOp(Op.getNode());
   }
@@ -1022,12 +1025,53 @@
   return DAG.UnrollVectorOp(Op.getNode());
 }
 
-SDValue VectorLegalizer::ExpandCTLZ_CTTZ_ZERO_UNDEF(SDValue Op) {
+SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) {
+  EVT VT = Op.getValueType();
+  unsigned NumBitsPerElt = VT.getScalarSizeInBits();
+
   // If the non-ZERO_UNDEF version is supported we can use that instead.
-  unsigned Opc = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ? ISD::CTLZ : ISD::CTTZ;
-  if (TLI.isOperationLegalOrCustom(Opc, Op.getValueType())) {
+  if (Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF &&
+      TLI.isOperationLegalOrCustom(ISD::CTLZ, VT)) {
     SDLoc DL(Op);
-    return DAG.getNode(Opc, DL, Op.getValueType(), Op.getOperand(0));
+    return DAG.getNode(ISD::CTLZ, DL, Op.getValueType(), Op.getOperand(0));
+  }
+
+  // If CTPOP is available we can lower with a CTPOP based method:
+  // u16 ctlz(u16 x) {
+  //   x |= (x >> 1);
+  //   x |= (x >> 2);
+  //   x |= (x >> 4);
+  //   x |= (x >> 8);
+  //   return ctpop(~x);
+  // }
+  // Ref: "Hacker's Delight" by Henry Warren
+  if (isPowerOf2_32(NumBitsPerElt) &&
+      TLI.isOperationLegalOrCustom(ISD::CTPOP, VT) &&
+      TLI.isOperationLegalOrCustom(ISD::SRL, VT) &&
+      TLI.isOperationLegalOrCustomOrPromote(ISD::OR, VT) &&
+      TLI.isOperationLegalOrCustomOrPromote(ISD::XOR, VT)) {
+    SDLoc DL(Op);
+    SDValue Res = Op.getOperand(0);
+    EVT ShiftTy = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
+
+    for (unsigned i = 1; i != NumBitsPerElt; i *= 2)
+      Res = DAG.getNode(
+          ISD::OR, DL, VT, Res,
+          DAG.getNode(ISD::SRL, DL, VT, Res, DAG.getConstant(i, DL, ShiftTy)));
+
+    Res = DAG.getNOT(DL, Res, VT);
+    return DAG.getNode(ISD::CTPOP, DL, VT, Res);
+  }
+
+  // Otherwise go ahead and unroll.
+  return DAG.UnrollVectorOp(Op.getNode());
+}
+
+SDValue VectorLegalizer::ExpandCTTZ_ZERO_UNDEF(SDValue Op) {
+  // If the non-ZERO_UNDEF version is supported we can use that instead.
+  if (TLI.isOperationLegalOrCustom(ISD::CTTZ, Op.getValueType())) {
+    SDLoc DL(Op);
+    return DAG.getNode(ISD::CTTZ, DL, Op.getValueType(), Op.getOperand(0));
   }
 
   // Otherwise go ahead and unroll.
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index afa8e00..7029a02 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1214,7 +1214,10 @@
     { ISD::BSWAP,      MVT::v2i64,   7 },
     { ISD::BSWAP,      MVT::v4i32,   7 },
     { ISD::BSWAP,      MVT::v8i16,   7 },
-    /* ISD::CTLZ - currently scalarized pre-SSSE3 */
+    { ISD::CTLZ,       MVT::v2i64,  25 },
+    { ISD::CTLZ,       MVT::v4i32,  26 },
+    { ISD::CTLZ,       MVT::v8i16,  20 },
+    { ISD::CTLZ,       MVT::v16i8,  17 },
     { ISD::CTPOP,      MVT::v2i64,  12 },
     { ISD::CTPOP,      MVT::v4i32,  15 },
     { ISD::CTPOP,      MVT::v8i16,  13 },