Generate a VTBL instruction instead of a series of loads and stores when we
can. As Nate pointed out, VTBL isn't super performant, but it *has* to be better
than this:

_shuf:
@ BB#0:       @ %entry
  push        {r4, r7, lr}
  add         r7, sp, #4
  sub         sp, #12
  mov         r4, sp
  bic         r4, r4, #7
  mov         sp, r4
  mov         r2, sp
  vmov        d16, r0, r1
  orr         r0, r2, #6
  orr         r3, r2, #7
  vst1.8      {d16[0]}, [r3]
  vst1.8      {d16[5]}, [r0]
  subs        r4, r7, #4
  orr         r0, r2, #5
  vst1.8      {d16[4]}, [r0]
  orr         r0, r2, #4
  vst1.8      {d16[4]}, [r0]
  orr         r0, r2, #3
  vst1.8      {d16[0]}, [r0]
  orr         r0, r2, #2
  vst1.8      {d16[2]}, [r0]
  orr         r0, r2, #1
  vst1.8      {d16[1]}, [r0]
  vst1.8      {d16[3]}, [r2]
  vldr.64     d16, [sp]
  vmov        r0, r1, d16
  mov         sp, r4
  pop         {r4, r7, pc}

The "illegal" testcase in vext.ll is no longer illegal.
<rdar://problem/9078775>


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@127630 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 8ef44d1..c7d847a 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -2842,6 +2842,35 @@
     break;
   }
 
+  case ARMISD::VTBL1: {
+    DebugLoc dl = N->getDebugLoc();
+    EVT VT = N->getValueType(0);
+    SmallVector<SDValue, 6> Ops;
+
+    Ops.push_back(N->getOperand(0));
+    Ops.push_back(N->getOperand(1));
+    Ops.push_back(getAL(CurDAG));                    // Predicate
+    Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // Predicate Register
+    return CurDAG->getMachineNode(ARM::VTBL1, dl, VT, Ops.data(), Ops.size());
+  }
+  case ARMISD::VTBL2: {
+    DebugLoc dl = N->getDebugLoc();
+    EVT VT = N->getValueType(0);
+
+    // Form a REG_SEQUENCE to force register allocation.
+    SDValue V0 = N->getOperand(0);
+    SDValue V1 = N->getOperand(1);
+    SDValue RegSeq = SDValue(PairDRegs(MVT::v16i8, V0, V1), 0);
+
+    SmallVector<SDValue, 6> Ops;
+    Ops.push_back(RegSeq);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(getAL(CurDAG));                    // Predicate
+    Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // Predicate Register
+    return CurDAG->getMachineNode(ARM::VTBL2Pseudo, dl, VT,
+                                  Ops.data(), Ops.size());
+  }
+
   case ISD::CONCAT_VECTORS:
     return SelectConcatVector(N);
   }