[X86] Take advantage of the lzcnt instruction on btver2 architectures when ORing comparisons to zero.
This change adds transformations such as:
zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
To:
srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
This optimisation is beneficial on Jaguar architecture only, where lzcnt has a good reciprocal throughput.
Other architectures such as Intel's Haswell/Broadwell or AMD's Bulldozer/PileDriver do not benefit from it.
For this reason the change also adds a "HasFastLZCNT" feature which gets enabled for Jaguar.
Differential Revision: https://reviews.llvm.org/D23446
llvm-svn: 284248
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index a5cd83d..434edf6 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -215,6 +215,9 @@
/// 64-bit divisions and should be used when possible.
bool HasSlowDivide64;
+ /// True if LZCNT instruction is fast.
+ bool HasFastLZCNT;
+
/// True if the short functions should be padded to prevent
/// a stall when returning too early.
bool PadShortFunctions;
@@ -444,6 +447,7 @@
bool hasFastPartialYMMWrite() const { return HasFastPartialYMMWrite; }
bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
+ bool hasFastLZCNT() const { return HasFastLZCNT; }
bool hasSlowDivide32() const { return HasSlowDivide32; }
bool hasSlowDivide64() const { return HasSlowDivide64; }
bool padShortFunctions() const { return PadShortFunctions; }