[AArch64][GlobalISel] Inline tiny memcpy et al at -O0.

FastISel already does this since the initial arm64 port was upstreamed, so
it seems there are no issues with doing this at -O0 for very small memcpys.

Gives a 0.2% geomean code size improvement on CTMark.

Differential Revision: https://reviews.llvm.org/D65758

llvm-svn: 367919
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 373d52f..9149813 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -861,7 +861,7 @@
   return true;
 }
 
-bool CombinerHelper::tryCombineMemCpyFamily(MachineInstr &MI) {
+bool CombinerHelper::tryCombineMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
   // This combine is fairly complex so it's not written with a separate
   // matcher function.
   assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
@@ -900,6 +900,9 @@
     return true;
   }
 
+  if (MaxLen && KnownLen > MaxLen)
+    return false;
+
   if (ID == Intrinsic::memcpy)
     return optimizeMemcpy(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
   if (ID == Intrinsic::memmove)