[X86] Take advantage of the lzcnt instruction on btver2 architectures when ORing comparisons to zero.

This change adds transformations such as:
  zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
  To:
  srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
This optimisation is beneficial on Jaguar architecture only, where lzcnt has a good reciprocal throughput.
Other architectures such as Intel's Haswell/Broadwell or AMD's Bulldozer/PileDriver do not benefit from it.
For this reason the change also adds a "HasFastLZCNT" feature which gets enabled for Jaguar.

Differential Revision: https://reviews.llvm.org/D23446

llvm-svn: 284248
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index a5cd83d..434edf6 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -215,6 +215,9 @@
   /// 64-bit divisions and should be used when possible.
   bool HasSlowDivide64;
 
+  /// True if LZCNT instruction is fast.
+  bool HasFastLZCNT;
+
   /// True if the short functions should be padded to prevent
   /// a stall when returning too early.
   bool PadShortFunctions;
@@ -444,6 +447,7 @@
   bool hasFastPartialYMMWrite() const { return HasFastPartialYMMWrite; }
   bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
   bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
+  bool hasFastLZCNT() const { return HasFastLZCNT; }
   bool hasSlowDivide32() const { return HasSlowDivide32; }
   bool hasSlowDivide64() const { return HasSlowDivide64; }
   bool padShortFunctions() const { return PadShortFunctions; }