Making use of the VFP / NEON floating point multiply-accumulate / multiply-subtract
instructions is difficult on current ARM implementations, for a few reasons.
1. Even though a single vmla has latency that is one cycle shorter than a pair
   of vmul + vadd, a RAW hazard during the first few cycles (4 on Cortex-A8?)
   can cause additional pipeline stalls. So it's frequently better to simply
   codegen vmul + vadd.
2. A vmla followed by a vmul, vadd, or vsub causes the second fp instruction
   to stall for 4 cycles. We need to schedule them apart.
3. A vmla followed by a vmla is a special case. Obviously, issuing a
   back-to-back RAW-dependent vmla + vmla is very bad. But this isn't ideal
   either:
     vmul
     vadd
     vmla
   Instead, we want to expand the second vmla:
     vmla
     vmul
     vadd
   Even with the 4-cycle vmul stall, the second sequence is still 2 cycles
   faster (a source-level sketch follows this list).
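
For illustration, here is a minimal source-level sketch (hypothetical, not part
of this patch) of code whose straightforward selection produces the
RAW-dependent vmla chain from #3:

  // Hypothetical example: each iteration's acc += a[i] * b[i] is an
  // (fadd (fmul ...)); the accumulator carried across iterations makes
  // every vmla RAW-dependent on the previous one.
  float dot4(const float *a, const float *b) {
    float acc = 0.0f;
    for (int i = 0; i < 4; ++i)
      acc += a[i] * b[i];
    return acc;
  }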

Up to now, isel has simply avoided codegen'ing fp vmla / vmls. This works well
enough, but it isn't the optimal solution. This patch attempts to make it
possible to use vmla / vmls in the cases where it is profitable.

A. Add missing isel predicates which cause vmla to be codegen'ed.
B. Make sure the fmul in (fadd (fmul)) has a single use; we don't want to end
   up computing both a fmul and a fmla (see the sketch after this list).
C. Add additional isel checks for vmla, avoiding cases where a vmla feeds into
   other fp instructions (except for the exceptional case described in #3).
D. Add an ARM hazard recognizer to model the vmla / vmls hazards.
E. Add a special pre-regalloc pass (MLxExpansion) to expand vmla / vmls when
   it's likely that the vmla / vmls will trigger one of the special hazards.
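
To illustrate B, a hypothetical fragment (names made up) where the fmul result
has a second use, so folding it into a vmla would not eliminate the vmul:

  float muladd(float a, float b, float c, float *out) {
    float m = a * b;  // fmul with two uses
    *out = m;         // use 1: a standalone vmul is still needed for the store
    return m + c;     // use 2: selecting vmla here would duplicate the multiply
  }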

Work in progress, only A+B are enabled.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@120960 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 96bdf48..bfba114 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -13,6 +13,7 @@
 
 #define DEBUG_TYPE "arm-isel"
 #include "ARM.h"
+#include "ARMBaseInstrInfo.h"
 #include "ARMAddressingModes.h"
 #include "ARMTargetMachine.h"
 #include "llvm/CallingConv.h"
@@ -41,6 +42,11 @@
   cl::desc("Disable isel of shifter-op"),
   cl::init(false));
 
+static cl::opt<bool>
+CheckVMLxHazard("check-vmlx-hazard", cl::Hidden,
+  cl::desc("Check fp vmla / vmls hazard at isel time"),
+  cl::init(false));
+
 //===--------------------------------------------------------------------===//
 /// ARMDAGToDAGISel - ARM specific code to select ARM machine
 /// instructions for SelectionDAG operations.
@@ -54,6 +60,7 @@
 
 class ARMDAGToDAGISel : public SelectionDAGISel {
   ARMBaseTargetMachine &TM;
+  const ARMBaseInstrInfo *TII;
 
   /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
   /// make the right decision when generating code for different targets.
@@ -63,7 +70,8 @@
   explicit ARMDAGToDAGISel(ARMBaseTargetMachine &tm,
                            CodeGenOpt::Level OptLevel)
     : SelectionDAGISel(tm, OptLevel), TM(tm),
-    Subtarget(&TM.getSubtarget<ARMSubtarget>()) {
+      TII(static_cast<const ARMBaseInstrInfo*>(TM.getInstrInfo())),
+      Subtarget(&TM.getSubtarget<ARMSubtarget>()) {
   }
 
   virtual const char *getPassName() const {
@@ -78,6 +86,8 @@
 
   SDNode *Select(SDNode *N);
 
+
+  bool hasNoVMLxHazardUse(SDNode *N) const;
   bool isShifterOpProfitable(const SDValue &Shift,
                              ARM_AM::ShiftOpc ShOpcVal, unsigned ShAmt);
   bool SelectShifterOperandReg(SDValue N, SDValue &A,
@@ -272,6 +282,50 @@
          isInt32Immediate(N->getOperand(1).getNode(), Imm);
 }
 
+/// hasNoVMLxHazardUse - Return true if it's desirable to select an FP MLA /
+/// MLS node. VFP / NEON fp VMLA / VMLS instructions have special RAW hazards
+/// (at least on current ARM implementations) which should be avoided.
+bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const {
+  if (OptLevel == CodeGenOpt::None)
+    return true;
+
+  if (!CheckVMLxHazard)
+    return true;
+
+  if (!Subtarget->isCortexA8() && !Subtarget->isCortexA9())
+    return true;
+
+  if (!N->hasOneUse())
+    return false;
+
+  SDNode *Use = *N->use_begin();
+  if (Use->getOpcode() == ISD::CopyToReg)
+    return true;
+  if (Use->isMachineOpcode()) {
+    const TargetInstrDesc &TID = TII->get(Use->getMachineOpcode());
+    if (TID.mayStore())
+      return true;
+    unsigned Opcode = TID.getOpcode();
+    if (Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD)
+      return true;
+    // vmlx feeding into another vmlx. We actually want to unfold
+    // the second vmlx later, in the MLxExpansion pass. e.g.
+    //   vmla
+    //   vmla (stall 8 cycles)
+    // is very bad, but expanding the first vmla:
+    //   vmul (5 cycles)
+    //   vadd (5 cycles)
+    //   vmla
+    // adds up to about 18 - 19 cycles, while expanding the second:
+    //   vmla
+    //   vmul (stall 4 cycles)
+    //   vadd
+    // adds up to only about 14 cycles.
+    return TII->isFpMLxInstruction(Opcode);
+  }
+
+  return false;
+}
 
 bool ARMDAGToDAGISel::isShifterOpProfitable(const SDValue &Shift,
                                             ARM_AM::ShiftOpc ShOpcVal,