ARM: align loops to 4 bytes on Cortex-M3 and Cortex-M4.

The Technical Reference Manuals for these two CPUs state that branching
to an unaligned 32-bit instruction incurs an extra pipeline reload
penalty. That's bad.
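
For example, if a loop header begins with a 32-bit Thumb-2 instruction
at an address that is 2 mod 4, the instruction straddles two
instruction-fetch words and every taken backedge pays that reload;
padding the header out to a 4-byte boundary avoids it.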

This also enables the optimization at -Os, since it costs on average
one byte per loop in return for one cycle saved per iteration, which
is pretty good going.
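
Concretely: aligning a loop header inserts at most one 2-byte Thumb
padding NOP, and only when the header would otherwise land at a
2-mod-4 address, so the expected size cost is one byte per loop, while
a loop that runs N iterations saves roughly N cycles.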

llvm-svn: 342127
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index 21350df6..624d336 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -2497,7 +2497,8 @@
   // exclusively on the loop info here so that we can align backedges in
   // unnatural CFGs and backedges that were introduced purely because of the
   // loop rotations done during this layout pass.
-  if (F->getFunction().optForSize())
+  if (F->getFunction().optForMinSize() ||
+      (F->getFunction().optForSize() && !TLI->alignLoopsWithOptSize()))
     return;
   BlockChain &FunctionChain = *BlockToChain[&F->front()];
   if (FunctionChain.begin() == FunctionChain.end())
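
The guard above calls through a new TargetLowering hook whose base
declaration is not part of this excerpt; presumably it defaults to
false so targets that do not override it keep the old -Os behavior.
A minimal sketch of the assumed declaration:

    // Assumed base-class hook (TargetLowering.h, not in this diff):
    // return true to keep aligning loops when the function is
    // optForSize(); the default preserves the previous behavior of
    // skipping loop alignment entirely when optimizing for size.
    virtual bool alignLoopsWithOptSize() const { return false; }

ARM's override further down returns true for every M-class core, which
is what lets this MachineBlockPlacement change fire under -Os.
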
diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td
index 276ea78..2d0f270 100644
--- a/llvm/lib/Target/ARM/ARM.td
+++ b/llvm/lib/Target/ARM/ARM.td
@@ -265,6 +265,9 @@
 def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true",
                                              "Prefer 32-bit Thumb instrs">;
 
+def FeaturePrefLoopAlign32 : SubtargetFeature<"loop-align", "PrefLoopAlignment", "2",
+                                              "Prefer 32-bit alignment for loops">;
+
 /// Some instructions update CPSR partially, which can add false dependency for
 /// out-of-order implementation, e.g. Cortex-A9, unless each individual bit is
 /// mapped to a separate physical register. Avoid partial CPSR update for these
@@ -936,6 +939,7 @@
 
 def : ProcessorModel<"cortex-m3", CortexM3Model,        [ARMv7m,
                                                          ProcM3,
+                                                         FeaturePrefLoopAlign32,
                                                          FeatureHasNoBranchPredictor]>;
 
 def : ProcessorModel<"sc300",     CortexM3Model,        [ARMv7m,
@@ -946,6 +950,7 @@
                                                          FeatureVFP4,
                                                          FeatureVFPOnlySP,
                                                          FeatureD16,
+                                                         FeaturePrefLoopAlign32,
                                                          FeatureHasNoBranchPredictor]>;
 
 def : ProcNoItin<"cortex-m7",                           [ARMv7em,
@@ -960,6 +965,7 @@
                                                          FeatureFPARMv8,
                                                          FeatureD16,
                                                          FeatureVFPOnlySP,
+                                                         FeaturePrefLoopAlign32,
                                                          FeatureHasNoBranchPredictor]>;
 
 def : ProcNoItin<"cortex-a32",                           [ARMv8a,
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 4fed202..8b7b6b3 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1199,6 +1199,8 @@
   // Prefer likely predicted branches to selects on out-of-order cores.
   PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
 
+  setPrefLoopAlignment(Subtarget->getPrefLoopAlignment());
+
   setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
 }
 
@@ -14695,6 +14697,11 @@
               Addr});
 }
 
+
+bool ARMTargetLowering::alignLoopsWithOptSize() const {
+  return Subtarget->isMClass();
+}
+
 /// A helper function for determining the number of interleaved accesses we
 /// will generate when lowering accesses of the given type.
 unsigned
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 734b1ee..bf652bc 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -575,6 +575,8 @@
     bool isLegalInterleavedAccessType(VectorType *VecTy,
                                       const DataLayout &DL) const;
 
+    bool alignLoopsWithOptSize() const override;
+
     /// Returns the number of interleaved accesses that will be generated when
     /// lowering accesses of the given type.
     unsigned getNumInterleavedAccesses(VectorType *VecTy,
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
index d7bfa89..34938c3 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -438,6 +438,9 @@
   /// operand cycle returned by the itinerary data for pre-ISel operands.
   int PreISelOperandLatencyAdjustment = 2;
 
+  /// What alignment is preferred for loop bodies, in log2(bytes).
+  unsigned PrefLoopAlignment = 0;
+
   /// IsLittle - The target is Little Endian
   bool IsLittle;
 
@@ -804,6 +807,10 @@
   bool allowPositionIndependentMovt() const {
     return isROPI() || !isTargetELF();
   }
+
+  unsigned getPrefLoopAlignment() const {
+    return PrefLoopAlignment;
+  }
 };
 
 } // end namespace llvm
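
A quick way to observe the effect (hypothetical test input, not part
of the commit):

    ; loop.ll -- a trivial counted loop (assumes %n >= 1)
    define void @f(i32* %p, i32 %n) {
    entry:
      br label %loop
    loop:
      %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
      %addr = getelementptr i32, i32* %p, i32 %i
      store i32 %i, i32* %addr
      %i.next = add i32 %i, 1
      %done = icmp eq i32 %i.next, %n
      br i1 %done, label %exit, label %loop
    exit:
      ret void
    }

Compiled with llc -mtriple=thumbv7m-none-eabi -mcpu=cortex-m3 -O2,
the output should now carry a .p2align 2 directive ahead of the loop
header; and because FeaturePrefLoopAlign32 is an ordinary
SubtargetFeature, other cores can opt in with -mattr=+loop-align.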