[fast-isel] Fold "urem x, pow2" -> "and x, pow2-1".  This should fix the 271%
execution-time regression for nsieve-bits on the ARMv7 -O0 -g nightly tester.
This may also improve compile-time on architectures that would otherwise 
generate a libcall for urem (e.g., ARM) or fall back to the DAG selector.
rdar://10810716


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@153230 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp
index 9f4a44a..4db10b7 100644
--- a/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -395,6 +395,13 @@
       ISDOpcode = ISD::SRA;
     }
 
+    // Transform "urem x, pow2" -> "and x, pow2-1".
+    if (ISDOpcode == ISD::UREM && isa<BinaryOperator>(I) &&
+        isPowerOf2_64(Imm)) {
+      --Imm;
+      ISDOpcode = ISD::AND;
+    }
+
     unsigned ResultReg = FastEmit_ri_(VT.getSimpleVT(), ISDOpcode, Op0,
                                       Op0IsKill, Imm, VT.getSimpleVT());
     if (ResultReg == 0) return false;