[Hexagon] Generate HVX code for vector construction and access

Support for:
  - build vector,
  - extract vector element, subvector,
  - insert vector element, subvector,
  - shuffle.

llvm-svn: 319901
diff --git a/llvm/lib/Target/Hexagon/CMakeLists.txt b/llvm/lib/Target/Hexagon/CMakeLists.txt
index ac6a5fc..9f30f2b 100644
--- a/llvm/lib/Target/Hexagon/CMakeLists.txt
+++ b/llvm/lib/Target/Hexagon/CMakeLists.txt
@@ -35,7 +35,9 @@
   HexagonHazardRecognizer.cpp
   HexagonInstrInfo.cpp
   HexagonISelDAGToDAG.cpp
+  HexagonISelDAGToDAGHVX.cpp
   HexagonISelLowering.cpp
+  HexagonISelLoweringHVX.cpp
   HexagonLoopIdiomRecognition.cpp
   HexagonMachineFunctionInfo.cpp
   HexagonMachineScheduler.cpp
diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index 2551fe5..d0cd143 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -754,7 +754,6 @@
   CurDAG->RemoveDeadNode(N);
 }
 
-
 void HexagonDAGToDAGISel::Select(SDNode *N) {
   if (N->isMachineOpcode())
     return N->setNodeId(-1);  // Already selected.
@@ -772,6 +771,13 @@
   case ISD::INTRINSIC_WO_CHAIN:   return SelectIntrinsicWOChain(N);
   }
 
+  if (HST->useHVXOps()) {
+    switch (N->getOpcode()) {
+    case ISD::VECTOR_SHUFFLE:     return SelectHvxShuffle(N);
+    case HexagonISD::VROR:        return SelectHvxRor(N);
+    }
+  }
+
   SelectCode(N);
 }
 
diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.h b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.h
index 4a7f4b7..e3e22a3 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.h
+++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.h
@@ -26,6 +26,7 @@
 class MachineFunction;
 class HexagonInstrInfo;
 class HexagonRegisterInfo;
+class HexagonTargetLowering;
 
 class HexagonDAGToDAGISel : public SelectionDAGISel {
   const HexagonSubtarget *HST;
@@ -100,13 +101,25 @@
   void SelectConstant(SDNode *N);
   void SelectConstantFP(SDNode *N);
   void SelectBitcast(SDNode *N);
-  void SelectVectorShuffle(SDNode *N);
 
-  // Include the pieces autogenerated from the target description.
+  // Include the declarations autogenerated from the selection patterns.
   #define GET_DAGISEL_DECL
   #include "HexagonGenDAGISel.inc"
 
 private:
+  // This is really only to get access to ReplaceNode (which is a protected
+  // member). Any other members used by HvxSelector can be moved around to
+  // make them accessible).
+  friend struct HvxSelector;
+
+  SDValue selectUndef(const SDLoc &dl, MVT ResTy) {
+    SDNode *U = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, ResTy);
+    return SDValue(U, 0);
+  }
+
+  void SelectHvxShuffle(SDNode *N);
+  void SelectHvxRor(SDNode *N);
+
   bool keepsLowBits(const SDValue &Val, unsigned NumBits, SDValue &Src);
   bool isOrEquivalentToAdd(const SDNode *N) const;
   bool isAlignedMemNode(const MemSDNode *N) const;
diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
new file mode 100644
index 0000000..a636e4e
--- /dev/null
+++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
@@ -0,0 +1,1924 @@
+//===-- HexagonISelDAGToDAGHVX.cpp ----------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Hexagon.h"
+#include "HexagonISelDAGToDAG.h"
+#include "HexagonISelLowering.h"
+#include "HexagonTargetMachine.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+#include <deque>
+#include <map>
+#include <set>
+#include <utility>
+#include <vector>
+
+#define DEBUG_TYPE "hexagon-isel"
+
+using namespace llvm;
+
+// --------------------------------------------------------------------
+// Implementation of permutation networks.
+
+// Implementation of the node routing through butterfly networks:
+// - Forward delta.
+// - Reverse delta.
+// - Benes.
+//
+//
+// Forward delta network consists of log(N) steps, where N is the number
+// of inputs. In each step, an input can stay in place, or it can get
+// routed to another position[1]. The step after that consists of two
+// networks, each half in size in terms of the number of nodes. In those
+// terms, in the given step, an input can go to either the upper or the
+// lower network in the next step.
+//
+// [1] Hexagon's vdelta/vrdelta allow an element to be routed to both
+// positions as long as there is no conflict.
+
+// Here's a delta network for 8 inputs, only the switching routes are
+// shown:
+//
+//         Steps:
+//         |- 1 ---------------|- 2 -----|- 3 -|
+//
+// Inp[0] ***                 ***       ***   *** Out[0]
+//           \               /   \     /   \ /
+//            \             /     \   /     X
+//             \           /       \ /     / \
+// Inp[1] ***   \         /   ***   X   ***   *** Out[1]
+//           \   \       /   /   \ / \ /
+//            \   \     /   /     X   X
+//             \   \   /   /     / \ / \
+// Inp[2] ***   \   \ /   /   ***   X   ***   *** Out[2]
+//           \   \   X   /   /     / \     \ /
+//            \   \ / \ /   /     /   \     X
+//             \   X   X   /     /     \   / \
+// Inp[3] ***   \ / \ / \ /   ***       ***   *** Out[3]
+//           \   X   X   X   /
+//            \ / \ / \ / \ /
+//             X   X   X   X
+//            / \ / \ / \ / \
+//           /   X   X   X   \
+// Inp[4] ***   / \ / \ / \   ***       ***   *** Out[4]
+//             /   X   X   \     \     /   \ /
+//            /   / \ / \   \     \   /     X
+//           /   /   X   \   \     \ /     / \
+// Inp[5] ***   /   / \   \   ***   X   ***   *** Out[5]
+//             /   /   \   \     \ / \ /
+//            /   /     \   \     X   X
+//           /   /       \   \   / \ / \
+// Inp[6] ***   /         \   ***   X   ***   *** Out[6]
+//             /           \       / \     \ /
+//            /             \     /   \     X
+//           /               \   /     \   / \
+// Inp[7] ***                 ***       ***   *** Out[7]
+//
+//
+// Reverse delta network is same as delta network, with the steps in
+// the opposite order.
+//
+//
+// Benes network is a forward delta network immediately followed by
+// a reverse delta network.
+
+
+// Graph coloring utility used to partition nodes into two groups:
+// they will correspond to nodes routed to the upper and lower networks.
+struct Coloring {
+  enum : uint8_t {
+    None = 0,
+    Red,
+    Black
+  };
+
+  using Node = int;
+  using MapType = std::map<Node,uint8_t>;
+  static constexpr Node Ignore = Node(-1);
+
+  Coloring(ArrayRef<Node> Ord) : Order(Ord) {
+    build();
+    if (!color())
+      Colors.clear();
+  }
+
+  const MapType &colors() const {
+    return Colors;
+  }
+
+  uint8_t other(uint8_t Color) {
+    if (Color == None)
+      return Red;
+    return Color == Red ? Black : Red;
+  }
+
+  void dump() const;
+
+private:
+  ArrayRef<Node> Order;
+  MapType Colors;
+  std::set<Node> Needed;
+
+  using NodeSet = std::set<Node>;
+  std::map<Node,NodeSet> Edges;
+
+  Node conj(Node Pos) {
+    Node Num = Order.size();
+    return (Pos < Num/2) ? Pos + Num/2 : Pos - Num/2;
+  }
+
+  uint8_t getColor(Node N) {
+    auto F = Colors.find(N);
+    return F != Colors.end() ? F->second : None;
+  }
+
+  std::pair<bool,uint8_t> getUniqueColor(const NodeSet &Nodes);
+
+  void build();
+  bool color();
+};
+
+std::pair<bool,uint8_t> Coloring::getUniqueColor(const NodeSet &Nodes) {
+  uint8_t Color = None;
+  for (Node N : Nodes) {
+    uint8_t ColorN = getColor(N);
+    if (ColorN == None)
+      continue;
+    if (Color == None)
+      Color = ColorN;
+    else if (Color != None && Color != ColorN)
+      return { false, None };
+  }
+  return { true, Color };
+}
+
+void Coloring::build() {
+  // Add Order[P] and Order[conj(P)] to Edges.
+  for (unsigned P = 0; P != Order.size(); ++P) {
+    Node I = Order[P];
+    if (I != Ignore) {
+      Needed.insert(I);
+      Node PC = Order[conj(P)];
+      if (PC != Ignore && PC != I)
+        Edges[I].insert(PC);
+    }
+  }
+  // Add I and conj(I) to Edges.
+  for (unsigned I = 0; I != Order.size(); ++I) {
+    if (!Needed.count(I))
+      continue;
+    Node C = conj(I);
+    // This will create an entry in the edge table, even if I is not
+    // connected to any other node. This is necessary, because it still
+    // needs to be colored.
+    NodeSet &Is = Edges[I];
+    if (Needed.count(C))
+      Is.insert(C);
+  }
+}
+
+bool Coloring::color() {
+  SetVector<Node> FirstQ;
+  auto Enqueue = [this,&FirstQ] (Node N) {
+    SetVector<Node> Q;
+    Q.insert(N);
+    for (unsigned I = 0; I != Q.size(); ++I) {
+      NodeSet &Ns = Edges[Q[I]];
+      Q.insert(Ns.begin(), Ns.end());
+    }
+    FirstQ.insert(Q.begin(), Q.end());
+  };
+  for (Node N : Needed)
+    Enqueue(N);
+
+  for (Node N : FirstQ) {
+    if (Colors.count(N))
+      continue;
+    NodeSet &Ns = Edges[N];
+    auto P = getUniqueColor(Ns);
+    if (!P.first)
+      return false;
+    Colors[N] = other(P.second);
+  }
+
+  // First, color nodes that don't have any dups.
+  for (auto E : Edges) {
+    Node N = E.first;
+    if (!Needed.count(conj(N)) || Colors.count(N))
+      continue;
+    auto P = getUniqueColor(E.second);
+    if (!P.first)
+      return false;
+    Colors[N] = other(P.second);
+  }
+
+  // Now, nodes that are still uncolored. Since the graph can be modified
+  // in this step, create a work queue.
+  std::vector<Node> WorkQ;
+  for (auto E : Edges) {
+    Node N = E.first;
+    if (!Colors.count(N))
+      WorkQ.push_back(N);
+  }
+
+  for (unsigned I = 0; I < WorkQ.size(); ++I) {
+    Node N = WorkQ[I];
+    NodeSet &Ns = Edges[N];
+    auto P = getUniqueColor(Ns);
+    if (P.first) {
+      Colors[N] = other(P.second);
+      continue;
+    }
+
+    // Coloring failed. Split this node.
+    Node C = conj(N);
+    uint8_t ColorN = other(None);
+    uint8_t ColorC = other(ColorN);
+    NodeSet &Cs = Edges[C];
+    NodeSet CopyNs = Ns;
+    for (Node M : CopyNs) {
+      uint8_t ColorM = getColor(M);
+      if (ColorM == ColorC) {
+        // Connect M with C, disconnect M from N.
+        Cs.insert(M);
+        Edges[M].insert(C);
+        Ns.erase(M);
+        Edges[M].erase(N);
+      }
+    }
+    Colors[N] = ColorN;
+    Colors[C] = ColorC;
+  }
+
+  // Explicitly assign "None" all all uncolored nodes.
+  for (unsigned I = 0; I != Order.size(); ++I)
+    if (Colors.count(I) == 0)
+      Colors[I] = None;
+
+  return true;
+}
+
+LLVM_DUMP_METHOD
+void Coloring::dump() const {
+  dbgs() << "{ Order:   {";
+  for (unsigned I = 0; I != Order.size(); ++I) {
+    Node P = Order[I];
+    if (P != Ignore)
+      dbgs() << ' ' << P;
+    else
+      dbgs() << " -";
+  }
+  dbgs() << " }\n";
+  dbgs() << "  Needed: {";
+  for (Node N : Needed)
+    dbgs() << ' ' << N;
+  dbgs() << " }\n";
+
+  dbgs() << "  Edges: {\n";
+  for (auto E : Edges) {
+    dbgs() << "    " << E.first << " -> {";
+    for (auto N : E.second)
+      dbgs() << ' ' << N;
+    dbgs() << " }\n";
+  }
+  dbgs() << "  }\n";
+
+  static const char *const Names[] = { "None", "Red", "Black" };
+  dbgs() << "  Colors: {\n";
+  for (auto C : Colors)
+    dbgs() << "    " << C.first << " -> " << Names[C.second] << "\n";
+  dbgs() << "  }\n}\n";
+}
+
+// Base class of for reordering networks. They don't strictly need to be
+// permutations, as outputs with repeated occurrences of an input element
+// are allowed.
+struct PermNetwork {
+  using Controls = std::vector<uint8_t>;
+  using ElemType = int;
+  static constexpr ElemType Ignore = ElemType(-1);
+
+  enum : uint8_t {
+    None,
+    Pass,
+    Switch
+  };
+  enum : uint8_t {
+    Forward,
+    Reverse
+  };
+
+  PermNetwork(ArrayRef<ElemType> Ord, unsigned Mult = 1) {
+    Order.assign(Ord.data(), Ord.data()+Ord.size());
+    Log = 0;
+
+    unsigned S = Order.size();
+    while (S >>= 1)
+      ++Log;
+
+    Table.resize(Order.size());
+    for (RowType &Row : Table)
+      Row.resize(Mult*Log, None);
+  }
+
+  void getControls(Controls &V, unsigned StartAt, uint8_t Dir) const {
+    unsigned Size = Order.size();
+    V.resize(Size);
+    for (unsigned I = 0; I != Size; ++I) {
+      unsigned W = 0;
+      for (unsigned L = 0; L != Log; ++L) {
+        unsigned C = ctl(I, StartAt+L) == Switch;
+        if (Dir == Forward)
+          W |= C << (Log-1-L);
+        else
+          W |= C << L;
+      }
+      assert(isUInt<8>(W));
+      V[I] = uint8_t(W);
+    }
+  }
+
+  uint8_t ctl(ElemType Pos, unsigned Step) const {
+    return Table[Pos][Step];
+  }
+  unsigned size() const {
+    return Order.size();
+  }
+  unsigned steps() const {
+    return Log;
+  }
+
+protected:
+  unsigned Log;
+  std::vector<ElemType> Order;
+  using RowType = std::vector<uint8_t>;
+  std::vector<RowType> Table;
+};
+
+struct ForwardDeltaNetwork : public PermNetwork {
+  ForwardDeltaNetwork(ArrayRef<ElemType> Ord) : PermNetwork(Ord) {}
+
+  bool run(Controls &V) {
+    if (!route(Order.data(), Table.data(), size(), 0))
+      return false;
+    getControls(V, 0, Forward);
+    return true;
+  }
+
+private:
+  bool route(ElemType *P, RowType *T, unsigned Size, unsigned Step);
+};
+
+struct ReverseDeltaNetwork : public PermNetwork {
+  ReverseDeltaNetwork(ArrayRef<ElemType> Ord) : PermNetwork(Ord) {}
+
+  bool run(Controls &V) {
+    if (!route(Order.data(), Table.data(), size(), 0))
+      return false;
+    getControls(V, 0, Reverse);
+    return true;
+  }
+
+private:
+  bool route(ElemType *P, RowType *T, unsigned Size, unsigned Step);
+};
+
+struct BenesNetwork : public PermNetwork {
+  BenesNetwork(ArrayRef<ElemType> Ord) : PermNetwork(Ord, 2) {}
+
+  bool run(Controls &F, Controls &R) {
+    if (!route(Order.data(), Table.data(), size(), 0))
+      return false;
+
+    getControls(F, 0, Forward);
+    getControls(R, Log, Reverse);
+    return true;
+  }
+
+private:
+  bool route(ElemType *P, RowType *T, unsigned Size, unsigned Step);
+};
+
+
+bool ForwardDeltaNetwork::route(ElemType *P, RowType *T, unsigned Size,
+                                unsigned Step) {
+  bool UseUp = false, UseDown = false;
+  ElemType Num = Size;
+
+  // Cannot use coloring here, because coloring is used to determine
+  // the "big" switch, i.e. the one that changes halves, and in a forward
+  // network, a color can be simultaneously routed to both halves in the
+  // step we're working on.
+  for (ElemType J = 0; J != Num; ++J) {
+    ElemType I = P[J];
+    // I is the position in the input,
+    // J is the position in the output.
+    if (I == Ignore)
+      continue;
+    uint8_t S;
+    if (I < Num/2)
+      S = (J < Num/2) ? Pass : Switch;
+    else
+      S = (J < Num/2) ? Switch : Pass;
+
+    // U is the element in the table that needs to be updated.
+    ElemType U = (S == Pass) ? I : (I < Num/2 ? I+Num/2 : I-Num/2);
+    if (U < Num/2)
+      UseUp = true;
+    else
+      UseDown = true;
+    if (T[U][Step] != S && T[U][Step] != None)
+      return false;
+    T[U][Step] = S;
+  }
+
+  for (ElemType J = 0; J != Num; ++J)
+    if (P[J] != Ignore && P[J] >= Num/2)
+      P[J] -= Num/2;
+
+  if (Step+1 < Log) {
+    if (UseUp   && !route(P,        T,        Size/2, Step+1))
+      return false;
+    if (UseDown && !route(P+Size/2, T+Size/2, Size/2, Step+1))
+      return false;
+  }
+  return true;
+}
+
+bool ReverseDeltaNetwork::route(ElemType *P, RowType *T, unsigned Size,
+                                unsigned Step) {
+  unsigned Pets = Log-1 - Step;
+  bool UseUp = false, UseDown = false;
+  ElemType Num = Size;
+
+  // In this step half-switching occurs, so coloring can be used.
+  Coloring G({P,Size});
+  const Coloring::MapType &M = G.colors();
+  if (M.empty())
+    return false;
+
+  uint8_t ColorUp = Coloring::None;
+  for (ElemType J = 0; J != Num; ++J) {
+    ElemType I = P[J];
+    // I is the position in the input,
+    // J is the position in the output.
+    if (I == Ignore)
+      continue;
+    uint8_t C = M.at(I);
+    if (C == Coloring::None)
+      continue;
+    // During "Step", inputs cannot switch halves, so if the "up" color
+    // is still unknown, make sure that it is selected in such a way that
+    // "I" will stay in the same half.
+    bool InpUp = I < Num/2;
+    if (ColorUp == Coloring::None)
+      ColorUp = InpUp ? C : G.other(C);
+    if ((C == ColorUp) != InpUp) {
+      // If I should go to a different half than where is it now, give up.
+      return false;
+    }
+
+    uint8_t S;
+    if (InpUp) {
+      S = (J < Num/2) ? Pass : Switch;
+      UseUp = true;
+    } else {
+      S = (J < Num/2) ? Switch : Pass;
+      UseDown = true;
+    }
+    T[J][Pets] = S;
+  }
+
+  // Reorder the working permutation according to the computed switch table
+  // for the last step (i.e. Pets).
+  for (ElemType J = 0; J != Size/2; ++J) {
+    ElemType PJ = P[J];         // Current values of P[J]
+    ElemType PC = P[J+Size/2];  // and P[conj(J)]
+    ElemType QJ = PJ;           // New values of P[J]
+    ElemType QC = PC;           // and P[conj(J)]
+    if (T[J][Pets] == Switch)
+      QC = PJ;
+    if (T[J+Size/2][Pets] == Switch)
+      QJ = PC;
+    P[J] = QJ;
+    P[J+Size/2] = QC;
+  }
+
+  for (ElemType J = 0; J != Num; ++J)
+    if (P[J] != Ignore && P[J] >= Num/2)
+      P[J] -= Num/2;
+
+  if (Step+1 < Log) {
+    if (UseUp && !route(P, T, Size/2, Step+1))
+      return false;
+    if (UseDown && !route(P+Size/2, T+Size/2, Size/2, Step+1))
+      return false;
+  }
+  return true;
+}
+
+bool BenesNetwork::route(ElemType *P, RowType *T, unsigned Size,
+                         unsigned Step) {
+  Coloring G({P,Size});
+  const Coloring::MapType &M = G.colors();
+  if (M.empty())
+    return false;
+  ElemType Num = Size;
+
+  unsigned Pets = 2*Log-1 - Step;
+  bool UseUp = false, UseDown = false;
+
+  // Both assignments, i.e. Red->Up and Red->Down are valid, but they will
+  // result in different controls. Let's pick the one where the first
+  // control will be "Pass".
+  uint8_t ColorUp = Coloring::None;
+  for (ElemType J = 0; J != Num; ++J) {
+    ElemType I = P[J];
+    if (I == Ignore)
+      continue;
+    uint8_t C = M.at(I);
+    if (C == Coloring::None)
+      continue;
+    if (ColorUp == Coloring::None) {
+      ColorUp = (I < Num/2) ? Coloring::Red : Coloring::Black;
+    }
+    unsigned CI = (I < Num/2) ? I+Num/2 : I-Num/2;
+    if (C == ColorUp) {
+      if (I < Num/2)
+        T[I][Step] = Pass;
+      else
+        T[CI][Step] = Switch;
+      T[J][Pets] = (J < Num/2) ? Pass : Switch;
+      UseUp = true;
+    } else { // Down
+      if (I < Num/2)
+        T[CI][Step] = Switch;
+      else
+        T[I][Step] = Pass;
+      T[J][Pets] = (J < Num/2) ? Switch : Pass;
+      UseDown = true;
+    }
+  }
+
+  // Reorder the working permutation according to the computed switch table
+  // for the last step (i.e. Pets).
+  for (ElemType J = 0; J != Num/2; ++J) {
+    ElemType PJ = P[J];         // Current values of P[J]
+    ElemType PC = P[J+Num/2];   // and P[conj(J)]
+    ElemType QJ = PJ;           // New values of P[J]
+    ElemType QC = PC;           // and P[conj(J)]
+    if (T[J][Pets] == Switch)
+      QC = PJ;
+    if (T[J+Num/2][Pets] == Switch)
+      QJ = PC;
+    P[J] = QJ;
+    P[J+Num/2] = QC;
+  }
+
+  for (ElemType J = 0; J != Num; ++J)
+    if (P[J] != Ignore && P[J] >= Num/2)
+      P[J] -= Num/2;
+
+  if (Step+1 < Log) {
+    if (UseUp && !route(P, T, Size/2, Step+1))
+      return false;
+    if (UseDown && !route(P+Size/2, T+Size/2, Size/2, Step+1))
+      return false;
+  }
+  return true;
+}
+
+// --------------------------------------------------------------------
+// Support for building selection results (output instructions that are
+// parts of the final selection).
+
+struct OpRef {
+  OpRef(SDValue V) : OpV(V) {}
+  bool isValue() const { return OpV.getNode() != nullptr; }
+  bool isValid() const { return isValue() || !(OpN & Invalid); }
+  static OpRef res(int N) { return OpRef(Whole | (N & Index)); }
+  static OpRef fail() { return OpRef(Invalid); }
+
+  static OpRef lo(const OpRef &R) {
+    assert(!R.isValue());
+    return OpRef(R.OpN & (Undef | Index | LoHalf));
+  }
+  static OpRef hi(const OpRef &R) {
+    assert(!R.isValue());
+    return OpRef(R.OpN & (Undef | Index | HiHalf));
+  }
+  static OpRef undef(MVT Ty) { return OpRef(Undef | Ty.SimpleTy); }
+
+  // Direct value.
+  SDValue OpV = SDValue();
+
+  // Reference to the operand of the input node:
+  // If the 31st bit is 1, it's undef, otherwise, bits 28..0 are the
+  // operand index:
+  // If bit 30 is set, it's the high half of the operand.
+  // If bit 29 is set, it's the low half of the operand.
+  unsigned OpN = 0;
+
+  enum : unsigned {
+    Invalid = 0x10000000,
+    LoHalf  = 0x20000000,
+    HiHalf  = 0x40000000,
+    Whole   = LoHalf | HiHalf,
+    Undef   = 0x80000000,
+    Index   = 0x0FFFFFFF,  // Mask of the index value.
+    IndexBits = 28,
+  };
+
+  void print(raw_ostream &OS, const SelectionDAG &G) const;
+
+private:
+  OpRef(unsigned N) : OpN(N) {}
+};
+
+struct NodeTemplate {
+  NodeTemplate() = default;
+  unsigned Opc = 0;
+  MVT Ty = MVT::Other;
+  std::vector<OpRef> Ops;
+
+  void print(raw_ostream &OS, const SelectionDAG &G) const;
+};
+
+struct ResultStack {
+  ResultStack(SDNode *Inp)
+    : InpNode(Inp), InpTy(Inp->getValueType(0).getSimpleVT()) {}
+  SDNode *InpNode;
+  MVT InpTy;
+  unsigned push(const NodeTemplate &Res) {
+    List.push_back(Res);
+    return List.size()-1;
+  }
+  unsigned push(unsigned Opc, MVT Ty, std::vector<OpRef> &&Ops) {
+    NodeTemplate Res;
+    Res.Opc = Opc;
+    Res.Ty = Ty;
+    Res.Ops = Ops;
+    return push(Res);
+  }
+  bool empty() const { return List.empty(); }
+  unsigned size() const { return List.size(); }
+  unsigned top() const { return size()-1; }
+  const NodeTemplate &operator[](unsigned I) const { return List[I]; }
+  unsigned reset(unsigned NewTop) {
+    List.resize(NewTop+1);
+    return NewTop;
+  }
+
+  using BaseType = std::vector<NodeTemplate>;
+  BaseType::iterator begin() { return List.begin(); }
+  BaseType::iterator end()   { return List.end(); }
+  BaseType::const_iterator begin() const { return List.begin(); }
+  BaseType::const_iterator end() const   { return List.end(); }
+
+  BaseType List;
+
+  void print(raw_ostream &OS, const SelectionDAG &G) const;
+};
+
+void OpRef::print(raw_ostream &OS, const SelectionDAG &G) const {
+  if (isValue()) {
+    OpV.getNode()->print(OS, &G);
+    return;
+  }
+  if (OpN & Invalid) {
+    OS << "invalid";
+    return;
+  }
+  if (OpN & Undef) {
+    OS << "undef";
+    return;
+  }
+  if ((OpN & Whole) != Whole) {
+    assert((OpN & Whole) == LoHalf || (OpN & Whole) == HiHalf);
+    if (OpN & LoHalf)
+      OS << "lo ";
+    else
+      OS << "hi ";
+  }
+  OS << '#' << SignExtend32(OpN & Index, IndexBits);
+}
+
+void NodeTemplate::print(raw_ostream &OS, const SelectionDAG &G) const {
+  const TargetInstrInfo &TII = *G.getSubtarget().getInstrInfo();
+  OS << format("%8s", EVT(Ty).getEVTString().c_str()) << "  "
+     << TII.getName(Opc);
+  bool Comma = false;
+  for (const auto &R : Ops) {
+    if (Comma)
+      OS << ',';
+    Comma = true;
+    OS << ' ';
+    R.print(OS, G);
+  }
+}
+
+void ResultStack::print(raw_ostream &OS, const SelectionDAG &G) const {
+  OS << "Input node:\n";
+  InpNode->dumpr(&G);
+  OS << "Result templates:\n";
+  for (unsigned I = 0, E = List.size(); I != E; ++I) {
+    OS << '[' << I << "] ";
+    List[I].print(OS, G);
+    OS << '\n';
+  }
+}
+
+struct ShuffleMask {
+  ShuffleMask(ArrayRef<int> M) : Mask(M) {
+    for (unsigned I = 0, E = Mask.size(); I != E; ++I) {
+      int M = Mask[I];
+      if (M == -1)
+        continue;
+      MinSrc = (MinSrc == -1) ? M : std::min(MinSrc, M);
+      MaxSrc = (MaxSrc == -1) ? M : std::max(MaxSrc, M);
+    }
+  }
+
+  ArrayRef<int> Mask;
+  int MinSrc = -1, MaxSrc = -1;
+
+  ShuffleMask lo() const {
+    size_t H = Mask.size()/2;
+    return ShuffleMask({Mask.data(), H});
+  }
+  ShuffleMask hi() const {
+    size_t H = Mask.size()/2;
+    return ShuffleMask({Mask.data()+H, H});
+  }
+};
+
+// --------------------------------------------------------------------
+// The HvxSelector class.
+
+static const HexagonTargetLowering &getHexagonLowering(SelectionDAG &G) {
+  return static_cast<const HexagonTargetLowering&>(G.getTargetLoweringInfo());
+}
+static const HexagonSubtarget &getHexagonSubtarget(SelectionDAG &G) {
+  return static_cast<const HexagonSubtarget&>(G.getSubtarget());
+}
+
+namespace llvm {
+  struct HvxSelector {
+    const HexagonTargetLowering &Lower;
+    HexagonDAGToDAGISel &ISel;
+    SelectionDAG &DAG;
+    const HexagonSubtarget &HST;
+    const unsigned HwLen;
+
+    HvxSelector(HexagonDAGToDAGISel &HS, SelectionDAG &G)
+      : Lower(getHexagonLowering(G)),  ISel(HS), DAG(G),
+        HST(getHexagonSubtarget(G)), HwLen(HST.getVectorLength()) {}
+
+    MVT getSingleVT(MVT ElemTy) const {
+      unsigned NumElems = HwLen / (ElemTy.getSizeInBits()/8);
+      return MVT::getVectorVT(ElemTy, NumElems);
+    }
+
+    MVT getPairVT(MVT ElemTy) const {
+      unsigned NumElems = (2*HwLen) / (ElemTy.getSizeInBits()/8);
+      return MVT::getVectorVT(ElemTy, NumElems);
+    }
+
+    void selectShuffle(SDNode *N);
+    void selectRor(SDNode *N);
+
+  private:
+    void materialize(const ResultStack &Results);
+
+    SDValue getVectorConstant(ArrayRef<uint8_t> Data, const SDLoc &dl);
+
+    enum : unsigned {
+      None,
+      PackMux,
+    };
+    OpRef concat(OpRef Va, OpRef Vb, ResultStack &Results);
+    OpRef packs(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results,
+                MutableArrayRef<int> NewMask, unsigned Options = None);
+    OpRef packp(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results,
+                MutableArrayRef<int> NewMask);
+    OpRef zerous(ShuffleMask SM, OpRef Va, ResultStack &Results);
+    OpRef vmuxs(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
+                ResultStack &Results);
+    OpRef vmuxp(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
+                ResultStack &Results);
+
+    OpRef shuffs1(ShuffleMask SM, OpRef Va, ResultStack &Results);
+    OpRef shuffs2(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results);
+    OpRef shuffp1(ShuffleMask SM, OpRef Va, ResultStack &Results);
+    OpRef shuffp2(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results);
+
+    OpRef butterfly(ShuffleMask SM, OpRef Va, ResultStack &Results);
+    OpRef contracting(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results);
+    OpRef expanding(ShuffleMask SM, OpRef Va, ResultStack &Results);
+    OpRef perfect(ShuffleMask SM, OpRef Va, ResultStack &Results);
+
+    bool selectVectorConstants(SDNode *N);
+    bool scalarizeShuffle(ArrayRef<int> Mask, const SDLoc &dl, MVT ResTy,
+                          SDValue Va, SDValue Vb, SDNode *N);
+
+  };
+}
+
+// Return a submask of A that is shorter than A by |C| elements:
+// - if C > 0, return a submask of A that starts at position C,
+// - if C <= 0, return a submask of A that starts at 0 (reduce A by |C|).
+static ArrayRef<int> subm(ArrayRef<int> A, int C) {
+  if (C > 0)
+    return { A.data()+C, A.size()-C };
+  return { A.data(), A.size()+C };
+}
+
+static void splitMask(ArrayRef<int> Mask, MutableArrayRef<int> MaskL,
+                      MutableArrayRef<int> MaskR) {
+  unsigned VecLen = Mask.size();
+  assert(MaskL.size() == VecLen && MaskR.size() == VecLen);
+  for (unsigned I = 0; I != VecLen; ++I) {
+    int M = Mask[I];
+    if (M < 0) {
+      MaskL[I] = MaskR[I] = -1;
+    } else if (unsigned(M) < VecLen) {
+      MaskL[I] = M;
+      MaskR[I] = -1;
+    } else {
+      MaskL[I] = -1;
+      MaskR[I] = M-VecLen;
+    }
+  }
+}
+
+static std::pair<int,unsigned> findStrip(ArrayRef<int> A, int Inc,
+                                         unsigned MaxLen) {
+  assert(A.size() > 0 && A.size() >= MaxLen);
+  int F = A[0];
+  int E = F;
+  for (unsigned I = 1; I != MaxLen; ++I) {
+    if (A[I] - E != Inc)
+      return { F, I };
+    E = A[I];
+  }
+  return { F, MaxLen };
+}
+
+static bool isUndef(ArrayRef<int> Mask) {
+  for (int Idx : Mask)
+    if (Idx != -1)
+      return false;
+  return true;
+}
+
+static bool isIdentity(ArrayRef<int> Mask) {
+  unsigned Size = Mask.size();
+  return findStrip(Mask, 1, Size) == std::make_pair(0, Size);
+}
+
+static bool isPermutation(ArrayRef<int> Mask) {
+  // Check by adding all numbers only works if there is no overflow.
+  assert(Mask.size() < 0x00007FFF && "Sanity failure");
+  int Sum = 0;
+  for (int Idx : Mask) {
+    if (Idx == -1)
+      return false;
+    Sum += Idx;
+  }
+  int N = Mask.size();
+  return 2*Sum == N*(N-1);
+}
+
+bool HvxSelector::selectVectorConstants(SDNode *N) {
+  // Constant vectors are generated as loads from constant pools.
+  // Since they are generated during the selection process, the main
+  // selection algorithm is not aware of them. Select them directly
+  // here.
+  if (!N->isMachineOpcode() && N->getOpcode() == ISD::LOAD) {
+    SDValue Addr = cast<LoadSDNode>(N)->getBasePtr();
+    unsigned AddrOpc = Addr.getOpcode();
+    if (AddrOpc == HexagonISD::AT_PCREL || AddrOpc == HexagonISD::CP) {
+      if (Addr.getOperand(0).getOpcode() == ISD::TargetConstantPool) {
+        ISel.Select(N);
+        return true;
+      }
+    }
+  }
+
+  bool Selected = false;
+  for (unsigned I = 0, E = N->getNumOperands(); I != E; ++I)
+    Selected = selectVectorConstants(N->getOperand(I).getNode()) || Selected;
+  return Selected;
+}
+
+void HvxSelector::materialize(const ResultStack &Results) {
+  DEBUG_WITH_TYPE("isel", {
+    dbgs() << "Materializing\n";
+    Results.print(dbgs(), DAG);
+  });
+  if (Results.empty())
+    return;
+  const SDLoc &dl(Results.InpNode);
+  std::vector<SDValue> Output;
+
+  for (unsigned I = 0, E = Results.size(); I != E; ++I) {
+    const NodeTemplate &Node = Results[I];
+    std::vector<SDValue> Ops;
+    for (const OpRef &R : Node.Ops) {
+      assert(R.isValid());
+      if (R.isValue()) {
+        Ops.push_back(R.OpV);
+        continue;
+      }
+      if (R.OpN & OpRef::Undef) {
+        MVT::SimpleValueType SVT = MVT::SimpleValueType(R.OpN & OpRef::Index);
+        Ops.push_back(ISel.selectUndef(dl, MVT(SVT)));
+        continue;
+      }
+      // R is an index of a result.
+      unsigned Part = R.OpN & OpRef::Whole;
+      int Idx = SignExtend32(R.OpN & OpRef::Index, OpRef::IndexBits);
+      if (Idx < 0)
+        Idx += I;
+      assert(Idx >= 0 && unsigned(Idx) < Output.size());
+      SDValue Op = Output[Idx];
+      MVT OpTy = Op.getValueType().getSimpleVT();
+      if (Part != OpRef::Whole) {
+        assert(Part == OpRef::LoHalf || Part == OpRef::HiHalf);
+        if (Op.getOpcode() == HexagonISD::VCOMBINE) {
+          Op = (Part == OpRef::HiHalf) ? Op.getOperand(0) : Op.getOperand(1);
+        } else {
+          MVT HalfTy = MVT::getVectorVT(OpTy.getVectorElementType(),
+                                        OpTy.getVectorNumElements()/2);
+          unsigned Sub = (Part == OpRef::LoHalf) ? Hexagon::vsub_lo
+                                                 : Hexagon::vsub_hi;
+          Op = DAG.getTargetExtractSubreg(Sub, dl, HalfTy, Op);
+        }
+      }
+      Ops.push_back(Op);
+    } // for (Node : Results)
+
+    assert(Node.Ty != MVT::Other);
+    SDNode *ResN = (Node.Opc == TargetOpcode::COPY)
+                      ? Ops.front().getNode()
+                      : DAG.getMachineNode(Node.Opc, dl, Node.Ty, Ops);
+    Output.push_back(SDValue(ResN, 0));
+  }
+
+  SDNode *OutN = Output.back().getNode();
+  SDNode *InpN = Results.InpNode;
+  DEBUG_WITH_TYPE("isel", {
+    dbgs() << "Generated node:\n";
+    OutN->dumpr(&DAG);
+  });
+
+  ISel.ReplaceNode(InpN, OutN);
+  selectVectorConstants(OutN);
+  DAG.RemoveDeadNodes();
+}
+
+OpRef HvxSelector::concat(OpRef Lo, OpRef Hi, ResultStack &Results) {
+  DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
+  const SDLoc &dl(Results.InpNode);
+  Results.push(TargetOpcode::REG_SEQUENCE, getPairVT(MVT::i8), {
+    DAG.getTargetConstant(Hexagon::HvxWRRegClassID, dl, MVT::i32),
+    Lo, DAG.getTargetConstant(Hexagon::vsub_lo, dl, MVT::i32),
+    Hi, DAG.getTargetConstant(Hexagon::vsub_hi, dl, MVT::i32),
+  });
+  return OpRef::res(Results.top());
+}
+
+// Va, Vb are single vectors, SM can be arbitrarily long.
+OpRef HvxSelector::packs(ShuffleMask SM, OpRef Va, OpRef Vb,
+                         ResultStack &Results, MutableArrayRef<int> NewMask,
+                         unsigned Options) {
+  DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
+  if (!Va.isValid() || !Vb.isValid())
+    return OpRef::fail();
+
+  int VecLen = SM.Mask.size();
+  MVT Ty = getSingleVT(MVT::i8);
+
+  if (SM.MaxSrc - SM.MinSrc < int(HwLen)) {
+    if (SM.MaxSrc < int(HwLen)) {
+      memcpy(NewMask.data(), SM.Mask.data(), sizeof(int)*VecLen);
+      return Va;
+    }
+    if (SM.MinSrc >= int(HwLen)) {
+      for (int I = 0; I != VecLen; ++I) {
+        int M = SM.Mask[I];
+        if (M != -1)
+          M -= HwLen;
+        NewMask[I] = M;
+      }
+      return Vb;
+    }
+    const SDLoc &dl(Results.InpNode);
+    SDValue S = DAG.getTargetConstant(SM.MinSrc, dl, MVT::i32);
+    if (isUInt<3>(SM.MinSrc)) {
+      Results.push(Hexagon::V6_valignbi, Ty, {Vb, Va, S});
+    } else {
+      Results.push(Hexagon::A2_tfrsi, MVT::i32, {S});
+      unsigned Top = Results.top();
+      Results.push(Hexagon::V6_valignb, Ty, {Vb, Va, OpRef::res(Top)});
+    }
+    for (int I = 0; I != VecLen; ++I) {
+      int M = SM.Mask[I];
+      if (M != -1)
+        M -= SM.MinSrc;
+      NewMask[I] = M;
+    }
+    return OpRef::res(Results.top());
+  }
+
+  if (Options & PackMux) {
+    // If elements picked from Va and Vb have all different (source) indexes
+    // (relative to the start of the argument), do a mux, and update the mask.
+    BitVector Picked(HwLen);
+    SmallVector<uint8_t,128> MuxBytes(HwLen);
+    bool CanMux = true;
+    for (int I = 0; I != VecLen; ++I) {
+      int M = SM.Mask[I];
+      if (M == -1)
+        continue;
+      if (M >= int(HwLen))
+        M -= HwLen;
+      else
+        MuxBytes[M] = 0xFF;
+      if (Picked[M]) {
+        CanMux = false;
+        break;
+      }
+      NewMask[I] = M;
+    }
+    if (CanMux)
+      return vmuxs(MuxBytes, Va, Vb, Results);
+  }
+
+  return OpRef::fail();
+}
+
+OpRef HvxSelector::packp(ShuffleMask SM, OpRef Va, OpRef Vb,
+                         ResultStack &Results, MutableArrayRef<int> NewMask) {
+  DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
+  unsigned HalfMask = 0;
+  unsigned LogHw = Log2_32(HwLen);
+  for (int M : SM.Mask) {
+    if (M == -1)
+      continue;
+    HalfMask |= (1u << (M >> LogHw));
+  }
+
+  if (HalfMask == 0)
+    return OpRef::undef(getPairVT(MVT::i8));
+
+  // If more than two halves are used, bail.
+  // TODO: be more aggressive here?
+  if (countPopulation(HalfMask) > 2)
+    return OpRef::fail();
+
+  MVT HalfTy = getSingleVT(MVT::i8);
+
+  OpRef Inp[2] = { Va, Vb };
+  OpRef Out[2] = { OpRef::undef(HalfTy), OpRef::undef(HalfTy) };
+
+  uint8_t HalfIdx[4] = { 0xFF, 0xFF, 0xFF, 0xFF };
+  unsigned Idx = 0;
+  for (unsigned I = 0; I != 4; ++I) {
+    if ((HalfMask & (1u << I)) == 0)
+      continue;
+    assert(Idx < 2);
+    OpRef Op = Inp[I/2];
+    Out[Idx] = (I & 1) ? OpRef::hi(Op) : OpRef::lo(Op);
+    HalfIdx[I] = Idx++;
+  }
+
+  int VecLen = SM.Mask.size();
+  for (int I = 0; I != VecLen; ++I) {
+    int M = SM.Mask[I];
+    if (M >= 0) {
+      uint8_t Idx = HalfIdx[M >> LogHw];
+      assert(Idx == 0 || Idx == 1);
+      M = (M & (HwLen-1)) + HwLen*Idx;
+    }
+    NewMask[I] = M;
+  }
+
+  return concat(Out[0], Out[1], Results);
+}
+
+OpRef HvxSelector::zerous(ShuffleMask SM, OpRef Va, ResultStack &Results) {
+  DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
+
+  int VecLen = SM.Mask.size();
+  SmallVector<uint8_t,128> UsedBytes(VecLen);
+  bool HasUnused = false;
+  for (int I = 0; I != VecLen; ++I) {
+    if (SM.Mask[I] != -1)
+      UsedBytes[I] = 0xFF;
+    else
+      HasUnused = true;
+  }
+  if (!HasUnused)
+    return Va;
+  SDValue B = getVectorConstant(UsedBytes, SDLoc(Results.InpNode));
+  Results.push(Hexagon::V6_vand, getSingleVT(MVT::i8), {Va, OpRef(B)});
+  return OpRef::res(Results.top());
+}
+
+OpRef HvxSelector::vmuxs(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
+                         ResultStack &Results) {
+  DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
+  MVT ByteTy = getSingleVT(MVT::i8);
+  MVT BoolTy = MVT::getVectorVT(MVT::i1, 8*HwLen); // XXX
+  const SDLoc &dl(Results.InpNode);
+  SDValue B = getVectorConstant(Bytes, dl);
+  Results.push(Hexagon::V6_vd0, ByteTy, {});
+  Results.push(Hexagon::V6_veqb, BoolTy, {OpRef(B), OpRef::res(-1)});
+  Results.push(Hexagon::V6_vmux, ByteTy, {OpRef::res(-1), Va, Vb});
+  return OpRef::res(Results.top());
+}
+
+OpRef HvxSelector::vmuxp(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
+                         ResultStack &Results) {
+  DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
+  size_t S = Bytes.size() / 2;
+  OpRef L = vmuxs({Bytes.data(),   S}, OpRef::lo(Va), OpRef::lo(Vb), Results);
+  OpRef H = vmuxs({Bytes.data()+S, S}, OpRef::hi(Va), OpRef::hi(Vb), Results);
+  return concat(L, H, Results);
+}
+
+OpRef HvxSelector::shuffs1(ShuffleMask SM, OpRef Va, ResultStack &Results) {
+  DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
+  unsigned VecLen = SM.Mask.size();
+  assert(HwLen == VecLen);
+  assert(all_of(SM.Mask, [this](int M) { return M == -1 || M < int(HwLen); }));
+
+  if (isIdentity(SM.Mask))
+    return Va;
+  if (isUndef(SM.Mask))
+    return OpRef::undef(getSingleVT(MVT::i8));
+
+  return butterfly(SM, Va, Results);
+}
+
+OpRef HvxSelector::shuffs2(ShuffleMask SM, OpRef Va, OpRef Vb,
+                           ResultStack &Results) {
+  DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
+  OpRef C = contracting(SM, Va, Vb, Results);
+  if (C.isValid())
+    return C;
+
+  int VecLen = SM.Mask.size();
+  SmallVector<int,128> NewMask(VecLen);
+  OpRef P = packs(SM, Va, Vb, Results, NewMask);
+  if (P.isValid())
+    return shuffs1(ShuffleMask(NewMask), P, Results);
+
+  SmallVector<int,128> MaskL(VecLen), MaskR(VecLen);
+  splitMask(SM.Mask, MaskL, MaskR);
+
+  OpRef L = shuffs1(ShuffleMask(MaskL), Va, Results);
+  OpRef R = shuffs1(ShuffleMask(MaskR), Vb, Results);
+  if (!L.isValid() || !R.isValid())
+    return OpRef::fail();
+
+  SmallVector<uint8_t,128> Bytes(VecLen);
+  for (int I = 0; I != VecLen; ++I) {
+    if (MaskL[I] != -1)
+      Bytes[I] = 0xFF;
+  }
+  return vmuxs(Bytes, L, R, Results);
+}
+
+OpRef HvxSelector::shuffp1(ShuffleMask SM, OpRef Va, ResultStack &Results) {
+  DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
+  int VecLen = SM.Mask.size();
+
+  SmallVector<int,128> PackedMask(VecLen);
+  OpRef P = packs(SM, OpRef::lo(Va), OpRef::hi(Va), Results, PackedMask);
+  if (P.isValid()) {
+    ShuffleMask PM(PackedMask);
+    OpRef E = expanding(PM, P, Results);
+    if (E.isValid())
+      return E;
+
+    OpRef L = shuffs1(PM.lo(), P, Results);
+    OpRef H = shuffs1(PM.hi(), P, Results);
+    if (L.isValid() && H.isValid())
+      return concat(L, H, Results);
+  }
+
+  OpRef R = perfect(SM, Va, Results);
+  if (R.isValid())
+    return R;
+  // TODO commute the mask and try the opposite order of the halves.
+
+  OpRef L = shuffs2(SM.lo(), OpRef::lo(Va), OpRef::hi(Va), Results);
+  OpRef H = shuffs2(SM.hi(), OpRef::lo(Va), OpRef::hi(Va), Results);
+  if (L.isValid() && H.isValid())
+    return concat(L, H, Results);
+
+  return OpRef::fail();
+}
+
+OpRef HvxSelector::shuffp2(ShuffleMask SM, OpRef Va, OpRef Vb,
+                           ResultStack &Results) {
+  DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
+  int VecLen = SM.Mask.size();
+
+  SmallVector<int,256> PackedMask(VecLen);
+  OpRef P = packp(SM, Va, Vb, Results, PackedMask);
+  if (P.isValid())
+    return shuffp1(ShuffleMask(PackedMask), P, Results);
+
+  SmallVector<int,256> MaskL(VecLen), MaskR(VecLen);
+  OpRef L = shuffp1(ShuffleMask(MaskL), Va, Results);
+  OpRef R = shuffp1(ShuffleMask(MaskR), Vb, Results);
+  if (!L.isValid() || !R.isValid())
+    return OpRef::fail();
+
+  // Mux the results.
+  SmallVector<uint8_t,256> Bytes(VecLen);
+  for (int I = 0; I != VecLen; ++I) {
+    if (MaskL[I] != -1)
+      Bytes[I] = 0xFF;
+  }
+  return vmuxp(Bytes, L, R, Results);
+}
+
+bool HvxSelector::scalarizeShuffle(ArrayRef<int> Mask, const SDLoc &dl,
+                                   MVT ResTy, SDValue Va, SDValue Vb,
+                                   SDNode *N) {
+  DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
+  MVT ElemTy = ResTy.getVectorElementType();
+  assert(ElemTy == MVT::i8);
+  unsigned VecLen = Mask.size();
+  bool HavePairs = (2*HwLen == VecLen);
+  MVT SingleTy = getSingleVT(MVT::i8);
+
+  SmallVector<SDValue,128> Ops;
+  for (int I : Mask) {
+    if (I < 0) {
+      Ops.push_back(ISel.selectUndef(dl, ElemTy));
+      continue;
+    }
+    SDValue Vec;
+    unsigned M = I;
+    if (M < VecLen) {
+      Vec = Va;
+    } else {
+      Vec = Vb;
+      M -= VecLen;
+    }
+    if (HavePairs) {
+      if (M < HwLen) {
+        Vec = DAG.getTargetExtractSubreg(Hexagon::vsub_lo, dl, SingleTy, Vec);
+      } else {
+        Vec = DAG.getTargetExtractSubreg(Hexagon::vsub_hi, dl, SingleTy, Vec);
+        M -= HwLen;
+      }
+    }
+    SDValue Idx = DAG.getConstant(M, dl, MVT::i32);
+    SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ElemTy, {Vec, Idx});
+    SDValue L = Lower.LowerOperation(Ex, DAG);
+    assert(L.getNode());
+    Ops.push_back(L);
+  }
+
+  SDValue LV;
+  if (2*HwLen == VecLen) {
+    SDValue B0 = DAG.getBuildVector(SingleTy, dl, {Ops.data(), HwLen});
+    SDValue L0 = Lower.LowerOperation(B0, DAG);
+    SDValue B1 = DAG.getBuildVector(SingleTy, dl, {Ops.data()+HwLen, HwLen});
+    SDValue L1 = Lower.LowerOperation(B1, DAG);
+    // XXX CONCAT_VECTORS is legal for HVX vectors. Legalizing (lowering)
+    // functions may expect to be called only for illegal operations, so
+    // make sure that they are not called for legal ones. Develop a better
+    // mechanism for dealing with this.
+    LV = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResTy, {L0, L1});
+  } else {
+    SDValue BV = DAG.getBuildVector(ResTy, dl, Ops);
+    LV = Lower.LowerOperation(BV, DAG);
+  }
+
+  assert(!N->use_empty());
+  ISel.ReplaceNode(N, LV.getNode());
+  DAG.RemoveDeadNodes();
+
+  std::deque<SDNode*> SubNodes;
+  SubNodes.push_back(LV.getNode());
+  for (unsigned I = 0; I != SubNodes.size(); ++I) {
+    for (SDValue Op : SubNodes[I]->ops())
+      SubNodes.push_back(Op.getNode());
+  }
+  while (!SubNodes.empty()) {
+    SDNode *S = SubNodes.front();
+    SubNodes.pop_front();
+    if (S->use_empty())
+      continue;
+    // This isn't great, but users need to be selected before any nodes that
+    // they use. (The reason is to match larger patterns, and avoid nodes that
+    // cannot be matched on their own, e.g. ValueType, TokenFactor, etc.).
+    bool PendingUser = llvm::any_of(S->uses(), [&SubNodes](const SDNode *U) {
+                         return llvm::any_of(SubNodes, [U](const SDNode *T) {
+                           return T == U;
+                         });
+                       });
+    if (PendingUser)
+      SubNodes.push_back(S);
+    else
+      ISel.Select(S);
+  }
+
+  DAG.RemoveDeadNodes();
+  return true;
+}
+
+OpRef HvxSelector::contracting(ShuffleMask SM, OpRef Va, OpRef Vb,
+                               ResultStack &Results) {
+  DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
+  if (!Va.isValid() || !Vb.isValid())
+    return OpRef::fail();
+
+  // Contracting shuffles, i.e. instructions that always discard some bytes
+  // from the operand vectors.
+  //
+  // V6_vshuff{e,o}b
+  // V6_vdealb4w
+  // V6_vpack{e,o}{b,h}
+
+  int VecLen = SM.Mask.size();
+  std::pair<int,unsigned> Strip = findStrip(SM.Mask, 1, VecLen);
+  MVT ResTy = getSingleVT(MVT::i8);
+
+  // The following shuffles only work for bytes and halfwords. This requires
+  // the strip length to be 1 or 2.
+  if (Strip.second != 1 && Strip.second != 2)
+    return OpRef::fail();
+
+  // The patterns for the shuffles, in terms of the starting offsets of the
+  // consecutive strips (L = length of the strip, N = VecLen):
+  //
+  // vpacke:    0, 2L, 4L ... N+0, N+2L, N+4L ...      L = 1 or 2
+  // vpacko:    L, 3L, 5L ... N+L, N+3L, N+5L ...      L = 1 or 2
+  //
+  // vshuffe:   0, N+0, 2L, N+2L, 4L ...               L = 1 or 2
+  // vshuffo:   L, N+L, 3L, N+3L, 5L ...               L = 1 or 2
+  //
+  // vdealb4w:  0, 4, 8 ... 2, 6, 10 ... N+0, N+4, N+8 ... N+2, N+6, N+10 ...
+
+  // The value of the element in the mask following the strip will decide
+  // what kind of a shuffle this can be.
+  int NextInMask = SM.Mask[Strip.second];
+
+  // Check if NextInMask could be 2L, 3L or 4, i.e. if it could be a mask
+  // for vpack or vdealb4w. VecLen > 4, so NextInMask for vdealb4w would
+  // satisfy this.
+  if (NextInMask < VecLen) {
+    // vpack{e,o} or vdealb4w
+    if (Strip.first == 0 && Strip.second == 1 && NextInMask == 4) {
+      int N = VecLen;
+      // Check if this is vdealb4w (L=1).
+      for (int I = 0; I != N/4; ++I)
+        if (SM.Mask[I] != 4*I)
+          return OpRef::fail();
+      for (int I = 0; I != N/4; ++I)
+        if (SM.Mask[I+N/4] != 2 + 4*I)
+          return OpRef::fail();
+      for (int I = 0; I != N/4; ++I)
+        if (SM.Mask[I+N/2] != N + 4*I)
+          return OpRef::fail();
+      for (int I = 0; I != N/4; ++I)
+        if (SM.Mask[I+3*N/4] != N+2 + 4*I)
+          return OpRef::fail();
+      // Matched mask for vdealb4w.
+      Results.push(Hexagon::V6_vdealb4w, ResTy, {Vb, Va});
+      return OpRef::res(Results.top());
+    }
+
+    // Check if this is vpack{e,o}.
+    int N = VecLen;
+    int L = Strip.second;
+    // Check if the first strip starts at 0 or at L.
+    if (Strip.first != 0 && Strip.first != L)
+      return OpRef::fail();
+    // Examine the rest of the mask.
+    for (int I = L; I < N/2; I += L) {
+      auto S = findStrip(subm(SM.Mask,I), 1, N-I);
+      // Check whether the mask element at the beginning of each strip
+      // increases by 2L each time.
+      if (S.first - Strip.first != 2*I)
+        return OpRef::fail();
+      // Check whether each strip is of the same length.
+      if (S.second != unsigned(L))
+        return OpRef::fail();
+    }
+
+    // Strip.first == 0  =>  vpacke
+    // Strip.first == L  =>  vpacko
+    assert(Strip.first == 0 || Strip.first == L);
+    using namespace Hexagon;
+    NodeTemplate Res;
+    Res.Opc = Strip.second == 1 // Number of bytes.
+                  ? (Strip.first == 0 ? V6_vpackeb : V6_vpackob)
+                  : (Strip.first == 0 ? V6_vpackeh : V6_vpackoh);
+    Res.Ty = ResTy;
+    Res.Ops = { Vb, Va };
+    Results.push(Res);
+    return OpRef::res(Results.top());
+  }
+
+  // Check if this is vshuff{e,o}.
+  int N = VecLen;
+  int L = Strip.second;
+  std::pair<int,unsigned> PrevS = Strip;
+  bool Flip = false;
+  for (int I = L; I < N; I += L) {
+    auto S = findStrip(subm(SM.Mask,I), 1, N-I);
+    if (S.second != PrevS.second)
+      return OpRef::fail();
+    int Diff = Flip ? PrevS.first - S.first + 2*L
+                    : S.first - PrevS.first;
+    if (Diff != N)
+      return OpRef::fail();
+    Flip ^= true;
+    PrevS = S;
+  }
+  // Strip.first == 0  =>  vshuffe
+  // Strip.first == L  =>  vshuffo
+  assert(Strip.first == 0 || Strip.first == L);
+  using namespace Hexagon;
+  NodeTemplate Res;
+  Res.Opc = Strip.second == 1 // Number of bytes.
+                ? (Strip.first == 0 ? V6_vshuffeb : V6_vshuffob)
+                : (Strip.first == 0 ?  V6_vshufeh :  V6_vshufoh);
+  Res.Ty = ResTy;
+  Res.Ops = { Vb, Va };
+  Results.push(Res);
+  return OpRef::res(Results.top());
+}
+
+OpRef HvxSelector::expanding(ShuffleMask SM, OpRef Va, ResultStack &Results) {
+  DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
+  // Expanding shuffles (using all elements and inserting into larger vector):
+  //
+  // V6_vunpacku{b,h} [*]
+  //
+  // [*] Only if the upper elements (filled with 0s) are "don't care" in Mask.
+  //
+  // Note: V6_vunpacko{b,h} are or-ing the high byte/half in the result, so
+  // they are not shuffles.
+  //
+  // The argument is a single vector.
+
+  int VecLen = SM.Mask.size();
+  assert(2*HwLen == unsigned(VecLen) && "Expecting vector-pair type");
+
+  std::pair<int,unsigned> Strip = findStrip(SM.Mask, 1, VecLen);
+
+  // The patterns for the unpacks, in terms of the starting offsets of the
+  // consecutive strips (L = length of the strip, N = VecLen):
+  //
+  // vunpacku:  0, -1, L, -1, 2L, -1 ...
+
+  if (Strip.first != 0)
+    return OpRef::fail();
+
+  // The vunpackus only handle byte and half-word.
+  if (Strip.second != 1 && Strip.second != 2)
+    return OpRef::fail();
+
+  int N = VecLen;
+  int L = Strip.second;
+
+  // First, check the non-ignored strips.
+  for (int I = 2*L; I < 2*N; I += 2*L) {
+    auto S = findStrip(subm(SM.Mask,I), 1, N-I);
+    if (S.second != unsigned(L))
+      return OpRef::fail();
+    if (2*S.first != I)
+      return OpRef::fail();
+  }
+  // Check the -1s.
+  for (int I = L; I < 2*N; I += 2*L) {
+    auto S = findStrip(subm(SM.Mask,I), 0, N-I);
+    if (S.first != -1 || S.second != unsigned(L))
+      return OpRef::fail();
+  }
+
+  unsigned Opc = Strip.second == 1 ? Hexagon::V6_vunpackub
+                                   : Hexagon::V6_vunpackuh;
+  Results.push(Opc, getPairVT(MVT::i8), {Va});
+  return OpRef::res(Results.top());
+}
+
+OpRef HvxSelector::perfect(ShuffleMask SM, OpRef Va, ResultStack &Results) {
+  DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
+  // V6_vdeal{b,h}
+  // V6_vshuff{b,h}
+
+  // V6_vshufoe{b,h}  those are quivalent to vshuffvdd(..,{1,2})
+  // V6_vshuffvdd (V6_vshuff)
+  // V6_dealvdd (V6_vdeal)
+
+  // TODO Recognize patterns for V6_vdeal{b,h} and V6_vshuff{b,h}.
+
+  int VecLen = SM.Mask.size();
+  assert(isPowerOf2_32(VecLen) && Log2_32(VecLen) <= 8);
+  unsigned LogLen = Log2_32(VecLen);
+
+  if (!isPermutation(SM.Mask))
+    return OpRef::fail();
+
+  SmallVector<unsigned,8> Perm(LogLen);
+
+  // Check if this could be a perfect shuffle, or a combination of perfect
+  // shuffles.
+  //
+  // Consider this permutation (using hex digits to make the ASCII diagrams
+  // easier to read):
+  //   { 0, 8, 1, 9, 2, A, 3, B, 4, C, 5, D, 6, E, 7, F }.
+  // This is a "deal" operation: divide the input into two halves, and
+  // create the output by picking elements by alternating between these two
+  // halves:
+  //   0 1 2 3 4 5 6 7    -->    0 8 1 9 2 A 3 B 4 C 5 D 6 E 7 F  [*]
+  //   8 9 A B C D E F
+  //
+  // Aside from a few special explicit cases (V6_vdealb, etc.), HVX provides
+  // a somwehat different mechanism that could be used to perform shuffle/
+  // deal operations: a 2x2 transpose.
+  // Consider the halves of inputs again, they can be interpreted as a 2x8
+  // matrix. A 2x8 matrix can be looked at four 2x2 matrices concatenated
+  // together. Now, when considering 2 elements at a time, it will be a 2x4
+  // matrix (with elements 01, 23, 45, etc.), or two 2x2 matrices:
+  //   01 23  45 67
+  //   89 AB  CD EF
+  // With groups of 4, this will become a single 2x2 matrix, and so on.
+  //
+  // The 2x2 transpose instruction works by transposing each of the 2x2
+  // matrices (or "sub-matrices"), given a specific group size. For example,
+  // if the group size is 1 (i.e. each element is its own group), there
+  // will be four transposes of the four 2x2 matrices that form the 2x8.
+  // For example, with the inputs as above, the result will be:
+  //   0 8  2 A  4 C  6 E
+  //   1 9  3 B  5 D  7 F
+  // Now, this result can be tranposed again, but with the group size of 2:
+  //   08 19  4C 5D
+  //   2A 3B  6E 7F
+  // If we then transpose that result, but with the group size of 4, we get:
+  //   0819 2A3B
+  //   4C5D 6E7F
+  // If we concatenate these two rows, it will be
+  //   0 8 1 9 2 A 3 B 4 C 5 D 6 E 7 F
+  // which is the same as the "deal" [*] above.
+  //
+  // In general, a "deal" of individual elements is a series of 2x2 transposes,
+  // with changing group size. HVX has two instructions:
+  //   Vdd = V6_vdealvdd Vu, Vv, Rt
+  //   Vdd = V6_shufvdd  Vu, Vv, Rt
+  // that perform exactly that. The register Rt controls which transposes are
+  // going to happen: a bit at position n (counting from 0) indicates that a
+  // transpose with a group size of 2^n will take place. If multiple bits are
+  // set, multiple transposes will happen: vdealvdd will perform them starting
+  // with the largest group size, vshuffvdd will do them in the reverse order.
+  //
+  // The main observation is that each 2x2 transpose corresponds to swapping
+  // columns of bits in the binary representation of the values.
+  //
+  // The numbers {3,2,1,0} and the log2 of the number of contiguous 1 bits
+  // in a given column. The * denote the columns that will be swapped.
+  // The transpose with the group size 2^n corresponds to swapping columns
+  // 3 (the highest log) and log2(n):
+  //
+  //     3 2 1 0         0 2 1 3         0 2 3 1
+  //     *     *             * *           * *
+  //  0  0 0 0 0      0  0 0 0 0      0  0 0 0 0      0  0 0 0 0
+  //  1  0 0 0 1      8  1 0 0 0      8  1 0 0 0      8  1 0 0 0
+  //  2  0 0 1 0      2  0 0 1 0      1  0 0 0 1      1  0 0 0 1
+  //  3  0 0 1 1      A  1 0 1 0      9  1 0 0 1      9  1 0 0 1
+  //  4  0 1 0 0      4  0 1 0 0      4  0 1 0 0      2  0 0 1 0
+  //  5  0 1 0 1      C  1 1 0 0      C  1 1 0 0      A  1 0 1 0
+  //  6  0 1 1 0      6  0 1 1 0      5  0 1 0 1      3  0 0 1 1
+  //  7  0 1 1 1      E  1 1 1 0      D  1 1 0 1      B  1 0 1 1
+  //  8  1 0 0 0      1  0 0 0 1      2  0 0 1 0      4  0 1 0 0
+  //  9  1 0 0 1      9  1 0 0 1      A  1 0 1 0      C  1 1 0 0
+  //  A  1 0 1 0      3  0 0 1 1      3  0 0 1 1      5  0 1 0 1
+  //  B  1 0 1 1      B  1 0 1 1      B  1 0 1 1      D  1 1 0 1
+  //  C  1 1 0 0      5  0 1 0 1      6  0 1 1 0      6  0 1 1 0
+  //  D  1 1 0 1      D  1 1 0 1      E  1 1 1 0      E  1 1 1 0
+  //  E  1 1 1 0      7  0 1 1 1      7  0 1 1 1      7  0 1 1 1
+  //  F  1 1 1 1      F  1 1 1 1      F  1 1 1 1      F  1 1 1 1
+
+  auto XorPow2 = [] (ArrayRef<int> Mask, unsigned Num) {
+    unsigned X = Mask[0] ^ Mask[Num/2];
+    // Check that the first half has the X's bits clear.
+    if ((Mask[0] & X) != 0)
+      return 0u;
+    for (unsigned I = 1; I != Num/2; ++I) {
+      if (unsigned(Mask[I] ^ Mask[I+Num/2]) != X)
+        return 0u;
+      if ((Mask[I] & X) != 0)
+        return 0u;
+    }
+    return X;
+  };
+
+  // Create a vector of log2's for each column: Perm[i] corresponds to
+  // the i-th bit (lsb is 0).
+  assert(VecLen > 2);
+  for (unsigned I = VecLen; I >= 2; I >>= 1) {
+    // Examine the initial segment of Mask of size I.
+    unsigned X = XorPow2(SM.Mask, I);
+    if (!isPowerOf2_32(X))
+      return OpRef::fail();
+    // Check the other segments of Mask.
+    for (int J = 0; J < VecLen; J += I) {
+      if (XorPow2(subm(SM.Mask, -J), I) != X)
+        return OpRef::fail();
+    }
+    Perm[Log2_32(X)] = Log2_32(I)-1;
+  }
+
+  // Once we have Perm, represent it as cycles. Denote the maximum log2
+  // (equal to log2(VecLen)-1) as M. The cycle containing M can then be
+  // written as (M a1 a2 a3 ... an). That cycle can be broken up into
+  // simple swaps as (M a1)(M a2)(M a3)...(M an), with the composition
+  // order being from left to right. Any (contiguous) segment where the
+  // values ai, ai+1...aj are either all increasing or all decreasing,
+  // can be implemented via a single vshuffvdd/vdealvdd respectively.
+  //
+  // If there is a cycle (a1 a2 ... an) that does not involve M, it can
+  // be written as (M an)(a1 a2 ... an)(M a1). The first two cycles can
+  // then be folded to get (M a1 a2 ... an)(M a1), and the above procedure
+  // can be used to generate a sequence of vshuffvdd/vdealvdd.
+  //
+  // Example:
+  // Assume M = 4 and consider a permutation (0 1)(2 3). It can be written
+  // as (4 0 1)(4 0) composed with (4 2 3)(4 2), or simply
+  //   (4 0 1)(4 0)(4 2 3)(4 2).
+  // It can then be expanded into swaps as
+  //   (4 0)(4 1)(4 0)(4 2)(4 3)(4 2),
+  // and broken up into "increasing" segments as
+  //   [(4 0)(4 1)] [(4 0)(4 2)(4 3)] [(4 2)].
+  // This is equivalent to
+  //   (4 0 1)(4 0 2 3)(4 2),
+  // which can be implemented as 3 vshufvdd instructions.
+
+  using CycleType = SmallVector<unsigned,8>;
+  std::set<CycleType> Cycles;
+  std::set<unsigned> All;
+
+  for (unsigned I : Perm)
+    All.insert(I);
+
+  // If the cycle contains LogLen-1, move it to the front of the cycle.
+  // Otherwise, return the cycle unchanged.
+  auto canonicalize = [LogLen](const CycleType &C) -> CycleType {
+    unsigned LogPos, N = C.size();
+    for (LogPos = 0; LogPos != N; ++LogPos)
+      if (C[LogPos] == LogLen-1)
+        break;
+    if (LogPos == N)
+      return C;
+
+    CycleType NewC(C.begin()+LogPos, C.end());
+    NewC.append(C.begin(), C.begin()+LogPos);
+    return NewC;
+  };
+
+  while (!All.empty()) {
+    unsigned A = *All.begin();
+    All.erase(A);
+    CycleType C;
+    C.push_back(A);
+    for (unsigned B = Perm[A]; B != A; B = Perm[B]) {
+      C.push_back(B);
+      All.erase(B);
+    }
+    if (C.size() <= 1)
+      continue;
+    Cycles.insert(canonicalize(C));
+  }
+
+  SmallVector<unsigned,8> SwapElems;
+  if (HwLen == unsigned(VecLen))
+    SwapElems.push_back(LogLen-1);
+
+  for (const CycleType &C : Cycles) {
+    unsigned First = (C[0] == LogLen-1) ? 1 : 0;
+    SwapElems.append(C.begin()+First, C.end());
+    if (First == 0)
+      SwapElems.push_back(C[0]);
+  }
+
+  const SDLoc &dl(Results.InpNode);
+  OpRef Arg = Va;
+  MVT PairTy = getPairVT(MVT::i8);
+
+  for (unsigned I = 0, E = SwapElems.size(); I != E; ) {
+    bool IsInc = I == E-1 || SwapElems[I] < SwapElems[I+1];
+    unsigned S = (1u << SwapElems[I]);
+    if (I < E-1) {
+      while (++I < E-1 && IsInc == (SwapElems[I] < SwapElems[I+1]))
+        S |= 1u << SwapElems[I];
+      // The above loop will not add a bit for the final SwapElems[I+1],
+      // so add it here.
+      S |= 1u << SwapElems[I];
+    }
+    ++I;
+
+    NodeTemplate Res;
+    Results.push(Hexagon::A2_tfrsi, MVT::i32,
+                 { DAG.getTargetConstant(S, dl, MVT::i32) });
+    Res.Opc = IsInc ? Hexagon::V6_vshuffvdd : Hexagon::V6_vdealvdd;
+    Res.Ty = PairTy;
+    Res.Ops = { OpRef::hi(Arg), OpRef::lo(Arg), OpRef::res(-1) };
+    Results.push(Res);
+    Arg = OpRef::res(Results.top());
+  }
+
+  return Arg;
+}
+
+OpRef HvxSelector::butterfly(ShuffleMask SM, OpRef Va, ResultStack &Results) {
+  DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
+  // Butterfly shuffles.
+  //
+  // V6_vdelta
+  // V6_vrdelta
+  // V6_vror
+
+  // The assumption here is that all elements picked by Mask are in the
+  // first operand to the vector_shuffle. This assumption is enforced
+  // by the caller.
+
+  MVT ResTy = getSingleVT(MVT::i8);
+  PermNetwork::Controls FC, RC;
+  const SDLoc &dl(Results.InpNode);
+  int VecLen = SM.Mask.size();
+
+  for (int M : SM.Mask) {
+    if (M != -1 && M >= VecLen)
+      return OpRef::fail();
+  }
+
+  // Try the deltas/benes for both single vectors and vector pairs.
+  ForwardDeltaNetwork FN(SM.Mask);
+  if (FN.run(FC)) {
+    SDValue Ctl = getVectorConstant(FC, dl);
+    Results.push(Hexagon::V6_vdelta, ResTy, {Va, OpRef(Ctl)});
+    return OpRef::res(Results.top());
+  }
+
+  // Try reverse delta.
+  ReverseDeltaNetwork RN(SM.Mask);
+  if (RN.run(RC)) {
+    SDValue Ctl = getVectorConstant(RC, dl);
+    Results.push(Hexagon::V6_vrdelta, ResTy, {Va, OpRef(Ctl)});
+    return OpRef::res(Results.top());
+  }
+
+  // Do Benes.
+  BenesNetwork BN(SM.Mask);
+  if (BN.run(FC, RC)) {
+    SDValue CtlF = getVectorConstant(FC, dl);
+    SDValue CtlR = getVectorConstant(RC, dl);
+    Results.push(Hexagon::V6_vdelta, ResTy, {Va, OpRef(CtlF)});
+    Results.push(Hexagon::V6_vrdelta, ResTy,
+                 {OpRef::res(-1), OpRef(CtlR)});
+    return OpRef::res(Results.top());
+  }
+
+  return OpRef::fail();
+}
+
+SDValue HvxSelector::getVectorConstant(ArrayRef<uint8_t> Data,
+                                       const SDLoc &dl) {
+  SmallVector<SDValue, 128> Elems;
+  for (uint8_t C : Data)
+    Elems.push_back(DAG.getConstant(C, dl, MVT::i8));
+  MVT VecTy = MVT::getVectorVT(MVT::i8, Data.size());
+  SDValue BV = DAG.getBuildVector(VecTy, dl, Elems);
+  SDValue LV = Lower.LowerOperation(BV, DAG);
+  DAG.RemoveDeadNode(BV.getNode());
+  return LV;
+}
+
+void HvxSelector::selectShuffle(SDNode *N) {
+  DEBUG_WITH_TYPE("isel", {
+    dbgs() << "Starting " << __func__ << " on node:\n";
+    N->dump(&DAG);
+  });
+  MVT ResTy = N->getValueType(0).getSimpleVT();
+  // Assume that vector shuffles operate on vectors of bytes.
+  assert(ResTy.isVector() && ResTy.getVectorElementType() == MVT::i8);
+
+  auto *SN = cast<ShuffleVectorSDNode>(N);
+  std::vector<int> Mask(SN->getMask().begin(), SN->getMask().end());
+  // This shouldn't really be necessary. Is it?
+  for (int &Idx : Mask)
+    if (Idx != -1 && Idx < 0)
+      Idx = -1;
+
+  unsigned VecLen = Mask.size();
+  bool HavePairs = (2*HwLen == VecLen);
+  assert(ResTy.getSizeInBits() / 8 == VecLen);
+
+  // Vd = vector_shuffle Va, Vb, Mask
+  //
+
+  bool UseLeft = false, UseRight = false;
+  for (unsigned I = 0; I != VecLen; ++I) {
+    if (Mask[I] == -1)
+      continue;
+    unsigned Idx = Mask[I];
+    assert(Idx < 2*VecLen);
+    if (Idx < VecLen)
+      UseLeft = true;
+    else
+      UseRight = true;
+  }
+
+  DEBUG_WITH_TYPE("isel", {
+    dbgs() << "VecLen=" << VecLen << " HwLen=" << HwLen << " UseLeft="
+           << UseLeft << " UseRight=" << UseRight << " HavePairs="
+           << HavePairs << '\n';
+  });
+  // If the mask is all -1's, generate "undef".
+  if (!UseLeft && !UseRight) {
+    ISel.ReplaceNode(N, ISel.selectUndef(SDLoc(SN), ResTy).getNode());
+    DAG.RemoveDeadNode(N);
+    return;
+  }
+
+  SDValue Vec0 = N->getOperand(0);
+  SDValue Vec1 = N->getOperand(1);
+  ResultStack Results(SN);
+  Results.push(TargetOpcode::COPY, ResTy, {Vec0});
+  Results.push(TargetOpcode::COPY, ResTy, {Vec1});
+  OpRef Va = OpRef::res(Results.top()-1);
+  OpRef Vb = OpRef::res(Results.top());
+
+  OpRef Res = !HavePairs ? shuffs2(ShuffleMask(Mask), Va, Vb, Results)
+                         : shuffp2(ShuffleMask(Mask), Va, Vb, Results);
+
+  bool Done = Res.isValid();
+  if (Done)
+    materialize(Results);
+  else
+    Done = scalarizeShuffle(Mask, SDLoc(N), ResTy, Vec0, Vec1, N);
+
+  if (!Done) {
+#ifndef NDEBUG
+    dbgs() << "Unhandled shuffle:\n";
+    SN->dumpr(&DAG);
+#endif
+    llvm_unreachable("Failed to select vector shuffle");
+  }
+}
+
+void HvxSelector::selectRor(SDNode *N) {
+  // If this is a rotation by less than 8, use V6_valignbi.
+  MVT Ty = N->getValueType(0).getSimpleVT();
+  const SDLoc &dl(N);
+  SDValue VecV = N->getOperand(0);
+  SDValue RotV = N->getOperand(1);
+  SDNode *NewN = nullptr;
+
+  if (auto *CN = dyn_cast<ConstantSDNode>(RotV.getNode())) {
+    unsigned S = CN->getZExtValue();
+    if (S % HST.getVectorLength() == 0) {
+      NewN = VecV.getNode();
+    } else if (isUInt<3>(S)) {
+      SDValue C = DAG.getTargetConstant(S, dl, MVT::i32);
+      NewN = DAG.getMachineNode(Hexagon::V6_valignbi, dl, Ty,
+                                {VecV, VecV, C});
+    }
+  }
+
+  if (!NewN)
+    NewN = DAG.getMachineNode(Hexagon::V6_vror, dl, Ty, {VecV, RotV});
+
+  ISel.ReplaceNode(N, NewN);
+  DAG.RemoveDeadNode(N);
+}
+
+void HexagonDAGToDAGISel::SelectHvxShuffle(SDNode *N) {
+  HvxSelector(*this, *CurDAG).selectShuffle(N);
+}
+
+void HexagonDAGToDAGISel::SelectHvxRor(SDNode *N) {
+  HvxSelector(*this, *CurDAG).selectRor(N);
+}
+
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
index 22bbb3e..859f697 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -129,6 +129,19 @@
 
 // Implement calling convention for Hexagon.
 
+static const MVT LegalV64[] = {
+  MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64
+};
+static const MVT LegalW64[] = {
+  MVT::v128i8, MVT::v64i16, MVT::v32i32, MVT::v16i64
+};
+static const MVT LegalV128[] = {
+  MVT::v128i8, MVT::v64i16, MVT::v32i32, MVT::v16i64
+};
+static const MVT LegalW128[] = {
+  MVT::v256i8, MVT::v128i16, MVT::v64i32, MVT::v32i64
+};
+
 static bool
 CC_Hexagon(unsigned ValNo, MVT ValVT,
            MVT LocVT, CCValAssign::LocInfo LocInfo,
@@ -1978,36 +1991,52 @@
   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8,  Custom);
 
+  auto setPromoteTo = [this] (unsigned Opc, MVT FromTy, MVT ToTy) {
+    setOperationAction(Opc, FromTy, Promote);
+    AddPromotedToType(Opc, FromTy, ToTy);
+  };
+
   if (Subtarget.useHVXOps()) {
-    if (Subtarget.useHVX64BOps()) {
-      setOperationAction(ISD::CONCAT_VECTORS, MVT::v128i8,  Custom);
-      setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i16,  Custom);
-      setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i32,  Custom);
-      setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i64,  Custom);
-      // We try to generate the vpack{e/o} instructions. If we fail
-      // we fall back upon ExpandOp.
-      setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8,  Custom);
-      setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
-      setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v64i8, Custom);
-      setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i16, Custom);
-      setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom);
-    } else if (Subtarget.useHVX128BOps()) {
-      setOperationAction(ISD::CONCAT_VECTORS, MVT::v256i8,  Custom);
-      setOperationAction(ISD::CONCAT_VECTORS, MVT::v128i16, Custom);
-      setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i32,  Custom);
-      setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i64,  Custom);
-      // We try to generate the vpack{e/o} instructions. If we fail
-      // we fall back upon ExpandOp.
-      setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v128i8,  Custom);
-      setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i16,  Custom);
-      setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
-      setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v128i8, Custom);
-      setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v64i16, Custom);
-      setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom);
-    } else {
-      llvm_unreachable("Unrecognized HVX mode");
+    bool Use64b = Subtarget.useHVX64BOps();
+    ArrayRef<MVT> LegalV = Use64b ? LegalV64 : LegalV128;
+    ArrayRef<MVT> LegalW = Use64b ? LegalW64 : LegalW128;
+    MVT ByteV  = Use64b ?  MVT::v64i8 : MVT::v128i8;
+    MVT ByteW  = Use64b ? MVT::v128i8 : MVT::v256i8;
+
+    setOperationAction(ISD::VECTOR_SHUFFLE, ByteV, Legal);
+    setOperationAction(ISD::VECTOR_SHUFFLE, ByteW, Legal);
+    setOperationAction(ISD::CONCAT_VECTORS, ByteW, Legal);
+    setOperationAction(ISD::OR,             ByteV, Legal);
+
+    for (MVT T : LegalV) {
+      setIndexedLoadAction(ISD::POST_INC,  T, Legal);
+      setIndexedStoreAction(ISD::POST_INC, T, Legal);
+
+      setOperationAction(ISD::BUILD_VECTOR,       T, Custom);
+      setOperationAction(ISD::INSERT_SUBVECTOR,   T, Custom);
+      setOperationAction(ISD::INSERT_VECTOR_ELT,  T, Custom);
+      setOperationAction(ISD::EXTRACT_SUBVECTOR,  T, Custom);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, T, Custom);
+    }
+
+    for (MVT T : LegalV) {
+      if (T == ByteV)
+        continue;
+      // Promote all shuffles and concats to operate on vectors of bytes.
+      setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteV);
+      setPromoteTo(ISD::CONCAT_VECTORS, T, ByteV);
+      setPromoteTo(ISD::OR,             T, ByteV);
+    }
+
+    for (MVT T : LegalW) {
+      if (T == ByteW)
+        continue;
+      // Promote all shuffles and concats to operate on vectors of bytes.
+      setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteW);
+      setPromoteTo(ISD::CONCAT_VECTORS, T, ByteW);
     }
   }
+
   // Subtarget-specific operation actions.
   //
   if (Subtarget.hasV5TOps()) {
@@ -2069,20 +2098,6 @@
     setIndexedStoreAction(ISD::POST_INC, VT, Legal);
   }
 
-  if (Subtarget.useHVX64BOps()) {
-    for (MVT VT : {MVT::v64i8,  MVT::v32i16, MVT::v16i32, MVT::v8i64,
-                   MVT::v128i8, MVT::v64i16, MVT::v32i32, MVT::v16i64}) {
-      setIndexedLoadAction(ISD::POST_INC, VT, Legal);
-      setIndexedStoreAction(ISD::POST_INC, VT, Legal);
-    }
-  } else if (Subtarget.useHVX128BOps()) {
-    for (MVT VT : {MVT::v128i8, MVT::v64i16,  MVT::v32i32, MVT::v16i64,
-                   MVT::v256i8, MVT::v128i16, MVT::v64i32, MVT::v32i64}) {
-      setIndexedLoadAction(ISD::POST_INC, VT, Legal);
-      setIndexedStoreAction(ISD::POST_INC, VT, Legal);
-    }
-  }
-
   computeRegisterProperties(&HRI);
 
   //
@@ -2225,6 +2240,9 @@
   case HexagonISD::VASR:          return "HexagonISD::VASR";
   case HexagonISD::VLSR:          return "HexagonISD::VLSR";
   case HexagonISD::VSPLAT:        return "HexagonISD::VSPLAT";
+  case HexagonISD::VEXTRACTW:     return "HexagonISD::VEXTRACTW";
+  case HexagonISD::VINSERTW0:     return "HexagonISD::VINSERTW0";
+  case HexagonISD::VROR:          return "HexagonISD::VROR";
   case HexagonISD::READCYCLE:     return "HexagonISD::READCYCLE";
   case HexagonISD::OP_END:        break;
   }
@@ -2252,43 +2270,11 @@
 // Should we expand the build vector with shuffles?
 bool HexagonTargetLowering::shouldExpandBuildVectorWithShuffles(EVT VT,
       unsigned DefinedValues) const {
-  // Hexagon vector shuffle operates on element sizes of bytes or halfwords
-  EVT EltVT = VT.getVectorElementType();
-  int EltBits = EltVT.getSizeInBits();
-  if ((EltBits != 8) && (EltBits != 16))
-    return false;
-
-  return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
-}
-
-static StridedLoadKind isStridedLoad(const ArrayRef<int> &Mask) {
-  int even_start = -2;
-  int odd_start = -1;
-  size_t mask_len = Mask.size();
-  for (auto idx : Mask) {
-    if ((idx - even_start) == 2)
-      even_start = idx;
-    else
-      break;
-  }
-  if (even_start == (int)(mask_len * 2) - 2)
-    return StridedLoadKind::Even;
-  for (auto idx : Mask) {
-    if ((idx - odd_start) == 2)
-      odd_start = idx;
-    else
-      break;
-  }
-  if (odd_start == (int)(mask_len * 2) - 1)
-    return StridedLoadKind::Odd;
-
-  return StridedLoadKind::NoPattern;
+  return false;
 }
 
 bool HexagonTargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask,
                                                EVT VT) const {
-  if (Subtarget.useHVXOps())
-    return isStridedLoad(Mask) != StridedLoadKind::NoPattern;
   return true;
 }
 
@@ -2302,7 +2288,6 @@
   SDValue V2 = Op.getOperand(1);
   SDLoc dl(Op);
   EVT VT = Op.getValueType();
-  bool UseHVX = Subtarget.useHVXOps();
 
   if (V2.isUndef())
     V2 = V1;
@@ -2334,27 +2319,6 @@
                        DAG.getConstant(Lane, dl, MVT::i32));
   }
 
-  if (UseHVX) {
-    ArrayRef<int> Mask = SVN->getMask();
-    size_t MaskLen = Mask.size();
-    unsigned SizeInBits = VT.getScalarSizeInBits() * MaskLen;
-
-    if ((Subtarget.useHVX64BOps() && SizeInBits == 64 * 8) ||
-        (Subtarget.useHVX128BOps() && SizeInBits == 128 * 8)) {
-      StridedLoadKind Pattern = isStridedLoad(Mask);
-      if (Pattern == StridedLoadKind::NoPattern)
-        return SDValue();
-
-      unsigned Opc = Pattern == StridedLoadKind::Even ? HexagonISD::VPACKE
-                                                      : HexagonISD::VPACKO;
-      return DAG.getNode(Opc, dl, VT, {Op.getOperand(1), Op.getOperand(0)});
-    }
-    // We used to assert in the "else" part here, but that is bad for Halide
-    // Halide creates intermediate double registers by interleaving two
-    // concatenated vector registers. The interleaving requires vector_shuffle
-    // nodes and we shouldn't barf on a double register result of a
-    // vector_shuffle because it is most likely an intermediate result.
-  }
   // FIXME: We need to support more general vector shuffles.  See
   // below the comment from the ARM backend that deals in the general
   // case with the vector shuffles.  For now, let expand handle these.
@@ -2445,7 +2409,7 @@
   SmallVector<ConstantSDNode*,4> Consts;
   bool AllConst = true;
   for (SDValue V : Elem) {
-    if (V.getOpcode() == ISD::UNDEF)
+    if (isUndef(V))
       V = DAG.getConstant(0, dl, ElemTy);
     auto *C = dyn_cast<ConstantSDNode>(V.getNode());
     Consts.push_back(C);
@@ -2454,7 +2418,7 @@
 
   unsigned First, Num = Elem.size();
   for (First = 0; First != Num; ++First)
-    if (Elem[First].getOpcode() != ISD::UNDEF)
+    if (!isUndef(Elem[First]))
       break;
   if (First == Num)
     return DAG.getUNDEF(VecTy);
@@ -2466,9 +2430,9 @@
                    Consts[1]->getZExtValue() << 16;
       return DAG.getBitcast(MVT::v2i16, DAG.getConstant(V, dl, MVT::i32));
     }
-    SDNode *N = DAG.getMachineNode(Hexagon::A2_combine_ll, dl, MVT::i32,
-                                   { Elem[1], Elem[0] });
-    return DAG.getBitcast(MVT::v2i16, SDValue(N,0));
+    SDValue N = getNode(Hexagon::A2_combine_ll, dl, MVT::i32,
+                        {Elem[1], Elem[0]}, DAG);
+    return DAG.getBitcast(MVT::v2i16, N);
   }
 
   // First try generating a constant.
@@ -2486,7 +2450,7 @@
   for (unsigned i = 0; i != Num; ++i) {
     if (i == First)
       continue;
-    if (Elem[i] == Elem[First] || Elem[i].getOpcode() == ISD::UNDEF)
+    if (Elem[i] == Elem[First] || isUndef(Elem[i]))
       continue;
     IsSplat = false;
     break;
@@ -2507,9 +2471,9 @@
   SDValue V5 = DAG.getNode(ISD::SHL, dl, MVT::i32, {V3, S8});
   SDValue V6 = DAG.getNode(ISD::OR, dl, MVT::i32, {V0, V4});
   SDValue V7 = DAG.getNode(ISD::OR, dl, MVT::i32, {V2, V5});
-  SDNode *T0 = DAG.getMachineNode(Hexagon::A2_combine_ll, dl, MVT::i32,
-                                  {V7, V6});
-  return DAG.getBitcast(MVT::v4i8, SDValue(T0,0));
+
+  SDValue T0 = getNode(Hexagon::A2_combine_ll, dl, MVT::i32, {V7, V6}, DAG);
+  return DAG.getBitcast(MVT::v4i8, T0);
 }
 
 SDValue
@@ -2521,7 +2485,7 @@
   SmallVector<ConstantSDNode*,8> Consts;
   bool AllConst = true;
   for (SDValue V : Elem) {
-    if (V.getOpcode() == ISD::UNDEF)
+    if (isUndef(V))
       V = DAG.getConstant(0, dl, ElemTy);
     auto *C = dyn_cast<ConstantSDNode>(V.getNode());
     Consts.push_back(C);
@@ -2530,7 +2494,7 @@
 
   unsigned First, Num = Elem.size();
   for (First = 0; First != Num; ++First)
-    if (Elem[First].getOpcode() != ISD::UNDEF)
+    if (!isUndef(Elem[First]))
       break;
   if (First == Num)
     return DAG.getUNDEF(VecTy);
@@ -2541,7 +2505,7 @@
     for (unsigned i = 0; i != Num; ++i) {
       if (i == First)
         continue;
-      if (Elem[i] == Elem[First] || Elem[i].getOpcode() == ISD::UNDEF)
+      if (Elem[i] == Elem[First] || isUndef(Elem[i]))
         continue;
       IsSplat = false;
       break;
@@ -2570,12 +2534,7 @@
   SDValue H = (ElemTy == MVT::i32)
                 ? Elem[1]
                 : buildVector32({Elem.data()+Num/2, Num/2}, dl, HalfTy, DAG);
-  unsigned Id = Hexagon::DoubleRegsRegClassID;
-  SDNode *N = DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, VecTy,
-                { DAG.getTargetConstant(Id, dl, MVT::i32),
-                  L, DAG.getTargetConstant(Hexagon::isub_lo, dl, MVT::i32),
-                  H, DAG.getTargetConstant(Hexagon::isub_hi, dl, MVT::i32) });
-  return SDValue(N, 0);
+  return DAG.getNode(HexagonISD::COMBINE, dl, VecTy, {H, L});
 }
 
 SDValue
@@ -2675,120 +2634,33 @@
 
 SDValue
 HexagonTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
-  MVT VT = Op.getValueType().getSimpleVT();
-  unsigned BW = VT.getSizeInBits();
+  MVT VecTy = ty(Op);
+  unsigned BW = VecTy.getSizeInBits();
   if (BW == 32 || BW == 64) {
     SmallVector<SDValue,8> Ops;
     for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i)
       Ops.push_back(Op.getOperand(i));
     if (BW == 32)
-      return buildVector32(Ops, SDLoc(Op), VT, DAG);
-    return buildVector64(Ops, SDLoc(Op), VT, DAG);
+      return buildVector32(Ops, SDLoc(Op), VecTy, DAG);
+    return buildVector64(Ops, SDLoc(Op), VecTy, DAG);
   }
 
+  if (Subtarget.useHVXOps() && Subtarget.isHVXVectorType(VecTy))
+    return LowerHvxBuildVector(Op, DAG);
+
   return SDValue();
 }
 
 SDValue
 HexagonTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                            SelectionDAG &DAG) const {
-  SDLoc dl(Op);
-  bool UseHVX = Subtarget.useHVXOps();
-  EVT VT = Op.getValueType();
-  unsigned NElts = Op.getNumOperands();
-  SDValue Vec0 = Op.getOperand(0);
-  EVT VecVT = Vec0.getValueType();
-  unsigned Width = VecVT.getSizeInBits();
+  MVT VecTy = ty(Op);
+  assert(!Subtarget.useHVXOps() || !Subtarget.isHVXVectorType(VecTy));
 
-  if (NElts == 2) {
-    MVT ST = VecVT.getSimpleVT();
-    // We are trying to concat two v2i16 to a single v4i16, or two v4i8
-    // into a single v8i8.
-    if (ST == MVT::v2i16 || ST == MVT::v4i8)
-      return DAG.getNode(HexagonISD::COMBINE, dl, VT, Op.getOperand(1), Vec0);
-
-    if (UseHVX) {
-      assert((Width == 64 * 8 && Subtarget.useHVX64BOps()) ||
-             (Width == 128 * 8 && Subtarget.useHVX128BOps()));
-      SDValue Vec1 = Op.getOperand(1);
-      MVT OpTy = Subtarget.useHVX64BOps() ? MVT::v16i32 : MVT::v32i32;
-      MVT ReTy = Subtarget.useHVX64BOps() ? MVT::v32i32 : MVT::v64i32;
-      SDValue B0 = DAG.getNode(ISD::BITCAST, dl, OpTy, Vec0);
-      SDValue B1 = DAG.getNode(ISD::BITCAST, dl, OpTy, Vec1);
-      SDValue VC = DAG.getNode(HexagonISD::VCOMBINE, dl, ReTy, B1, B0);
-      return DAG.getNode(ISD::BITCAST, dl, VT, VC);
-    }
-  }
-
-  if (VT.getSizeInBits() != 32 && VT.getSizeInBits() != 64)
-    return SDValue();
-
-  SDValue C0 = DAG.getConstant(0, dl, MVT::i64);
-  SDValue C32 = DAG.getConstant(32, dl, MVT::i64);
-  SDValue W = DAG.getConstant(Width, dl, MVT::i64);
-  // Create the "width" part of the argument to insert_rp/insertp_rp.
-  SDValue S = DAG.getNode(ISD::SHL, dl, MVT::i64, W, C32);
-  SDValue V = C0;
-
-  for (unsigned i = 0, e = NElts; i != e; ++i) {
-    unsigned N = NElts-i-1;
-    SDValue OpN = Op.getOperand(N);
-
-    if (VT.getSizeInBits() == 64 && OpN.getValueSizeInBits() == 32) {
-      SDValue C = DAG.getConstant(0, dl, MVT::i32);
-      OpN = DAG.getNode(HexagonISD::COMBINE, dl, VT, C, OpN);
-    }
-    SDValue Idx = DAG.getConstant(N, dl, MVT::i64);
-    SDValue Offset = DAG.getNode(ISD::MUL, dl, MVT::i64, Idx, W);
-    SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i64, S, Offset);
-    if (VT.getSizeInBits() == 32)
-      V = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i32, {V, OpN, Or});
-    else if (VT.getSizeInBits() == 64)
-      V = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i64, {V, OpN, Or});
-    else
-      return SDValue();
-  }
-
-  return DAG.getNode(ISD::BITCAST, dl, VT, V);
-}
-
-SDValue
-HexagonTargetLowering::LowerEXTRACT_SUBVECTOR_HVX(SDValue Op,
-                                                  SelectionDAG &DAG) const {
-  EVT VT = Op.getOperand(0).getValueType();
-  SDLoc dl(Op);
-  bool UseHVX = Subtarget.useHVXOps();
-  bool UseHVX64B = Subtarget.useHVX64BOps();
-  // Just in case...
-
-  if (!VT.isVector() || !UseHVX)
-    return SDValue();
-
-  EVT ResVT = Op.getValueType();
-  unsigned ResSize = ResVT.getSizeInBits();
-  unsigned VectorSizeInBits = UseHVX64B ? (64 * 8) : (128 * 8);
-  unsigned OpSize = VT.getSizeInBits();
-
-  // We deal only with cases where the result is the vector size
-  // and the vector operand is a double register.
-  if (!(ResVT.isByteSized() && ResSize == VectorSizeInBits) ||
-      !(VT.isByteSized() && OpSize == 2 * VectorSizeInBits))
-    return SDValue();
-
-  ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1));
-  if (!Cst)
-    return SDValue();
-  unsigned Val = Cst->getZExtValue();
-
-  // These two will get lowered to an appropriate EXTRACT_SUBREG in ISel.
-  if (Val == 0) {
-    SDValue Vec = Op.getOperand(0);
-    return DAG.getTargetExtractSubreg(Hexagon::vsub_lo, dl, ResVT, Vec);
-  }
-
-  if (ResVT.getVectorNumElements() == Val) {
-    SDValue Vec = Op.getOperand(0);
-    return DAG.getTargetExtractSubreg(Hexagon::vsub_hi, dl, ResVT, Vec);
+  if (VecTy.getSizeInBits() == 64) {
+    assert(Op.getNumOperands() == 2);
+    return DAG.getNode(HexagonISD::COMBINE, SDLoc(Op), VecTy, Op.getOperand(1),
+                       Op.getOperand(0));
   }
 
   return SDValue();
@@ -2798,6 +2670,10 @@
 HexagonTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                SelectionDAG &DAG) const {
   SDValue Vec = Op.getOperand(0);
+  MVT VecTy = ty(Vec);
+  if (Subtarget.useHVXOps() && Subtarget.isHVXVectorType(VecTy))
+    return LowerHvxExtractElement(Op, DAG);
+
   MVT ElemTy = ty(Vec).getVectorElementType();
   return extractVector(Vec, Op.getOperand(1), SDLoc(Op), ElemTy, ty(Op), DAG);
 }
@@ -2808,7 +2684,7 @@
   SDValue Vec = Op.getOperand(0);
   MVT VecTy = ty(Vec);
   if (Subtarget.useHVXOps() && Subtarget.isHVXVectorType(VecTy))
-    return LowerEXTRACT_SUBVECTOR_HVX(Op, DAG);
+    return LowerHvxExtractSubvector(Op, DAG);
 
   return extractVector(Vec, Op.getOperand(1), SDLoc(Op), ty(Op), ty(Op), DAG);
 }
@@ -2817,6 +2693,9 @@
 HexagonTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                               SelectionDAG &DAG) const {
   MVT VecTy = ty(Op);
+  if (Subtarget.useHVXOps() && Subtarget.isHVXVectorType(VecTy))
+    return LowerHvxInsertElement(Op, DAG);
+
   return insertVector(Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
                       SDLoc(Op), VecTy.getVectorElementType(), DAG);
 }
@@ -2824,6 +2703,9 @@
 SDValue
 HexagonTargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
                                              SelectionDAG &DAG) const {
+  if (Subtarget.useHVXOps() && Subtarget.isHVXVectorType(ty(Op)))
+    return LowerHvxInsertSubvector(Op, DAG);
+
   SDValue ValV = Op.getOperand(1);
   return insertVector(Op.getOperand(0), ValV, Op.getOperand(2),
                       SDLoc(Op), ty(ValV), DAG);
@@ -2911,6 +2793,7 @@
     case ISD::PREFETCH:             return LowerPREFETCH(Op, DAG);
     case ISD::READCYCLECOUNTER:     return LowerREADCYCLECOUNTER(Op, DAG);
   }
+  return SDValue();
 }
 
 /// Returns relocation base for the given PIC jumptable.
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h
index 9f7891e..1731091 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h
@@ -63,6 +63,9 @@
       VCOMBINE,
       VPACKE,
       VPACKO,
+      VEXTRACTW,
+      VINSERTW0,
+      VROR,
       TC_RETURN,
       EH_RETURN,
       DCFETCH,
@@ -88,6 +91,8 @@
     explicit HexagonTargetLowering(const TargetMachine &TM,
                                    const HexagonSubtarget &ST);
 
+    bool isHVXVectorType(MVT Ty) const;
+
     /// IsEligibleForTailCallOptimization - Check whether the call is eligible
     /// for tail call optimization. Targets which want to do tail call
     /// optimization should implement this function.
@@ -121,9 +126,8 @@
     SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerEXTRACT_SUBVECTOR_HVX(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerVECTOR_SHIFT(SDValue Op, SelectionDAG &DAG) const;
 
@@ -281,7 +285,24 @@
         return Ty;
       return MVT::getIntegerVT(Ty.getSizeInBits());
     }
+    MVT tyVector(MVT Ty, MVT ElemTy) const {
+      if (Ty.isVector() && Ty.getVectorElementType() == ElemTy)
+        return Ty;
+      unsigned TyWidth = Ty.getSizeInBits(), ElemWidth = ElemTy.getSizeInBits();
+      assert((TyWidth % ElemWidth) == 0);
+      return MVT::getVectorVT(ElemTy, TyWidth/ElemWidth);
+    }
 
+    bool isUndef(SDValue Op) const {
+      if (Op.isMachineOpcode())
+        return Op.getMachineOpcode() == TargetOpcode::IMPLICIT_DEF;
+      return Op.getOpcode() == ISD::UNDEF;
+    }
+    SDValue getNode(unsigned MachineOpc, const SDLoc &dl, MVT Ty,
+                    ArrayRef<SDValue> Ops, SelectionDAG &DAG) const {
+      SDNode *N = DAG.getMachineNode(MachineOpc, dl, Ty, Ops);
+      return SDValue(N, 0);
+    }
     SDValue buildVector32(ArrayRef<SDValue> Elem, const SDLoc &dl, MVT VecTy,
                           SelectionDAG &DAG) const;
     SDValue buildVector64(ArrayRef<SDValue> Elem, const SDLoc &dl, MVT VecTy,
@@ -291,6 +312,38 @@
     SDValue insertVector(SDValue VecV, SDValue ValV, SDValue IdxV,
                          const SDLoc &dl, MVT ValTy, SelectionDAG &DAG) const;
 
+    using VectorPair = std::pair<SDValue, SDValue>;
+    using TypePair = std::pair<MVT, MVT>;
+
+    SDValue getInt(unsigned IntId, MVT ResTy, ArrayRef<SDValue> Ops,
+                   const SDLoc &dl, SelectionDAG &DAG) const;
+
+    TypePair ty(const VectorPair &Ops) const {
+      return { Ops.first.getValueType().getSimpleVT(),
+               Ops.second.getValueType().getSimpleVT() };
+    }
+
+    MVT typeJoin(const TypePair &Tys) const;
+    TypePair typeSplit(MVT Ty) const;
+    MVT typeCastElem(MVT VecTy, MVT ElemTy) const;
+    MVT typeExtElem(MVT VecTy, unsigned Factor) const;
+    MVT typeTruncElem(MVT VecTy, unsigned Factor) const;
+
+    SDValue opJoin(const VectorPair &Ops, const SDLoc &dl,
+                   SelectionDAG &DAG) const;
+    VectorPair opSplit(SDValue Vec, const SDLoc &dl, SelectionDAG &DAG) const;
+    SDValue opCastElem(SDValue Vec, MVT ElemTy, SelectionDAG &DAG) const;
+
+    SDValue convertToByteIndex(SDValue ElemIdx, MVT ElemTy,
+                               SelectionDAG &DAG) const;
+    SDValue getIndexInWord32(SDValue Idx, MVT ElemTy, SelectionDAG &DAG) const;
+
+    SDValue LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerHvxExtractElement(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerHvxInsertElement(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerHvxExtractSubvector(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerHvxInsertSubvector(SDValue Op, SelectionDAG &DAG) const;
+
     std::pair<const TargetRegisterClass*, uint8_t>
     findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT)
         const override;
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
new file mode 100644
index 0000000..3a9e508
--- /dev/null
+++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
@@ -0,0 +1,299 @@
+//===-- HexagonISelLoweringHVX.cpp --- Lowering HVX operations ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "HexagonISelLowering.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+
+using namespace llvm;
+
+SDValue
+HexagonTargetLowering::getInt(unsigned IntId, MVT ResTy, ArrayRef<SDValue> Ops,
+                              const SDLoc &dl, SelectionDAG &DAG) const {
+  SmallVector<SDValue,4> IntOps;
+  IntOps.push_back(DAG.getConstant(IntId, dl, MVT::i32));
+  for (const SDValue &Op : Ops)
+    IntOps.push_back(Op);
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, ResTy, IntOps);
+}
+
+MVT
+HexagonTargetLowering::typeJoin(const TypePair &Tys) const {
+  assert(Tys.first.getVectorElementType() == Tys.second.getVectorElementType());
+
+  MVT ElemTy = Tys.first.getVectorElementType();
+  return MVT::getVectorVT(ElemTy, Tys.first.getVectorNumElements() +
+                                  Tys.second.getVectorNumElements());
+}
+
+HexagonTargetLowering::TypePair
+HexagonTargetLowering::typeSplit(MVT VecTy) const {
+  assert(VecTy.isVector());
+  unsigned NumElem = VecTy.getVectorNumElements();
+  assert((NumElem % 2) == 0 && "Expecting even-sized vector type");
+  MVT HalfTy = MVT::getVectorVT(VecTy.getVectorElementType(), NumElem/2);
+  return { HalfTy, HalfTy };
+}
+
+MVT
+HexagonTargetLowering::typeExtElem(MVT VecTy, unsigned Factor) const {
+  MVT ElemTy = VecTy.getVectorElementType();
+  MVT NewElemTy = MVT::getIntegerVT(ElemTy.getSizeInBits() * Factor);
+  return MVT::getVectorVT(NewElemTy, VecTy.getVectorNumElements());
+}
+
+MVT
+HexagonTargetLowering::typeTruncElem(MVT VecTy, unsigned Factor) const {
+  MVT ElemTy = VecTy.getVectorElementType();
+  MVT NewElemTy = MVT::getIntegerVT(ElemTy.getSizeInBits() / Factor);
+  return MVT::getVectorVT(NewElemTy, VecTy.getVectorNumElements());
+}
+
+SDValue
+HexagonTargetLowering::opCastElem(SDValue Vec, MVT ElemTy,
+                                  SelectionDAG &DAG) const {
+  if (ty(Vec).getVectorElementType() == ElemTy)
+    return Vec;
+  MVT CastTy = tyVector(Vec.getValueType().getSimpleVT(), ElemTy);
+  return DAG.getBitcast(CastTy, Vec);
+}
+
+SDValue
+HexagonTargetLowering::opJoin(const VectorPair &Ops, const SDLoc &dl,
+                              SelectionDAG &DAG) const {
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, typeJoin(ty(Ops)),
+                     Ops.second, Ops.first);
+}
+
+HexagonTargetLowering::VectorPair
+HexagonTargetLowering::opSplit(SDValue Vec, const SDLoc &dl,
+                               SelectionDAG &DAG) const {
+  TypePair Tys = typeSplit(ty(Vec));
+  return DAG.SplitVector(Vec, dl, Tys.first, Tys.second);
+}
+
+SDValue
+HexagonTargetLowering::convertToByteIndex(SDValue ElemIdx, MVT ElemTy,
+                                          SelectionDAG &DAG) const {
+  if (ElemIdx.getValueType().getSimpleVT() != MVT::i32)
+    ElemIdx = DAG.getBitcast(MVT::i32, ElemIdx);
+
+  unsigned ElemWidth = ElemTy.getSizeInBits();
+  if (ElemWidth == 8)
+    return ElemIdx;
+
+  unsigned L = Log2_32(ElemWidth/8);
+  const SDLoc &dl(ElemIdx);
+  return DAG.getNode(ISD::SHL, dl, MVT::i32,
+                     {ElemIdx, DAG.getConstant(L, dl, MVT::i32)});
+}
+
+SDValue
+HexagonTargetLowering::getIndexInWord32(SDValue Idx, MVT ElemTy,
+                                        SelectionDAG &DAG) const {
+  unsigned ElemWidth = ElemTy.getSizeInBits();
+  assert(ElemWidth >= 8 && ElemWidth <= 32);
+  if (ElemWidth == 32)
+    return Idx;
+
+  if (ty(Idx) != MVT::i32)
+    Idx = DAG.getBitcast(MVT::i32, Idx);
+  const SDLoc &dl(Idx);
+  SDValue Mask = DAG.getConstant(32/ElemWidth - 1, dl, MVT::i32);
+  SDValue SubIdx = DAG.getNode(ISD::AND, dl, MVT::i32, {Idx, Mask});
+  return SubIdx;
+}
+
+SDValue
+HexagonTargetLowering::LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG)
+      const {
+  const SDLoc &dl(Op);
+  BuildVectorSDNode *BN = cast<BuildVectorSDNode>(Op.getNode());
+  bool IsConst = BN->isConstant();
+  MachineFunction &MF = DAG.getMachineFunction();
+  MVT VecTy = ty(Op);
+
+  if (IsConst) {
+    SmallVector<Constant*, 128> Elems;
+    for (SDValue V : BN->op_values()) {
+      if (auto *C = dyn_cast<ConstantSDNode>(V.getNode()))
+        Elems.push_back(const_cast<ConstantInt*>(C->getConstantIntValue()));
+    }
+    Constant *CV = ConstantVector::get(Elems);
+    unsigned Align = VecTy.getSizeInBits() / 8;
+    SDValue CP = LowerConstantPool(DAG.getConstantPool(CV, VecTy, Align), DAG);
+    return DAG.getLoad(VecTy, dl, DAG.getEntryNode(), CP,
+                       MachinePointerInfo::getConstantPool(MF), Align);
+  }
+
+  unsigned NumOps = Op.getNumOperands();
+  unsigned HwLen = Subtarget.getVectorLength();
+  unsigned ElemSize = VecTy.getVectorElementType().getSizeInBits() / 8;
+  assert(ElemSize*NumOps == HwLen);
+
+  SmallVector<SDValue,32> Words;
+  SmallVector<SDValue,32> Ops;
+  for (unsigned i = 0; i != NumOps; ++i)
+    Ops.push_back(Op.getOperand(i));
+
+  if (VecTy.getVectorElementType() != MVT::i32) {
+    assert(ElemSize < 4 && "vNi64 should have been promoted to vNi32");
+    assert((ElemSize == 1 || ElemSize == 2) && "Invalid element size");
+    unsigned OpsPerWord = (ElemSize == 1) ? 4 : 2;
+    MVT PartVT = MVT::getVectorVT(VecTy.getVectorElementType(), OpsPerWord);
+    for (unsigned i = 0; i != NumOps; i += OpsPerWord) {
+      SDValue W = buildVector32({&Ops[i], OpsPerWord}, dl, PartVT, DAG);
+      Words.push_back(DAG.getBitcast(MVT::i32, W));
+    }
+  } else {
+    Words.assign(Ops.begin(), Ops.end());
+  }
+
+  // Construct two halves in parallel, then or them together.
+  assert(4*Words.size() == Subtarget.getVectorLength());
+  SDValue HalfV0 = getNode(Hexagon::V6_vd0, dl, VecTy, {}, DAG);
+  SDValue HalfV1 = getNode(Hexagon::V6_vd0, dl, VecTy, {}, DAG);
+  SDValue S = DAG.getConstant(4, dl, MVT::i32);
+  unsigned NumWords = Words.size();
+  for (unsigned i = 0; i != NumWords/2; ++i) {
+    SDValue N = DAG.getNode(HexagonISD::VINSERTW0, dl, VecTy,
+                            {HalfV0, Words[i]});
+    SDValue M = DAG.getNode(HexagonISD::VINSERTW0, dl, VecTy,
+                            {HalfV1, Words[i+NumWords/2]});
+    HalfV0 = DAG.getNode(HexagonISD::VROR, dl, VecTy, {N, S});
+    HalfV1 = DAG.getNode(HexagonISD::VROR, dl, VecTy, {M, S});
+  }
+
+  HalfV0 = DAG.getNode(HexagonISD::VROR, dl, VecTy,
+                       {HalfV0, DAG.getConstant(HwLen/2, dl, MVT::i32)});
+  SDValue DstV = DAG.getNode(ISD::OR, dl, VecTy, {HalfV0, HalfV1});
+  return DstV;
+}
+
+SDValue
+HexagonTargetLowering::LowerHvxExtractElement(SDValue Op, SelectionDAG &DAG)
+      const {
+  // Change the type of the extracted element to i32.
+  SDValue VecV = Op.getOperand(0);
+  MVT ElemTy = ty(VecV).getVectorElementType();
+  unsigned ElemWidth = ElemTy.getSizeInBits();
+  assert(ElemWidth >= 8 && ElemWidth <= 32); // TODO i64
+
+  const SDLoc &dl(Op);
+  SDValue IdxV = Op.getOperand(1);
+  if (ty(IdxV) != MVT::i32)
+    IdxV = DAG.getBitcast(MVT::i32, IdxV);
+
+  SDValue ByteIdx = convertToByteIndex(IdxV, ElemTy, DAG);
+  SDValue ExWord = DAG.getNode(HexagonISD::VEXTRACTW, dl, MVT::i32,
+                               {VecV, ByteIdx});
+  if (ElemTy == MVT::i32)
+    return ExWord;
+
+  // Have an extracted word, need to extract the smaller element out of it.
+  // 1. Extract the bits of (the original) IdxV that correspond to the index
+  //    of the desired element in the 32-bit word.
+  SDValue SubIdx = getIndexInWord32(IdxV, ElemTy, DAG);
+  // 2. Extract the element from the word.
+  SDValue ExVec = DAG.getBitcast(tyVector(ty(ExWord), ElemTy), ExWord);
+  return extractVector(ExVec, SubIdx, dl, ElemTy, MVT::i32, DAG);
+}
+
+SDValue
+HexagonTargetLowering::LowerHvxInsertElement(SDValue Op, SelectionDAG &DAG)
+      const {
+  const SDLoc &dl(Op);
+  SDValue VecV = Op.getOperand(0);
+  SDValue ValV = Op.getOperand(1);
+  SDValue IdxV = Op.getOperand(2);
+  MVT ElemTy = ty(VecV).getVectorElementType();
+  unsigned ElemWidth = ElemTy.getSizeInBits();
+  assert(ElemWidth >= 8 && ElemWidth <= 32); // TODO i64
+
+  auto InsertWord = [&DAG,&dl,this] (SDValue VecV, SDValue ValV,
+                                     SDValue ByteIdxV) {
+    MVT VecTy = ty(VecV);
+    unsigned HwLen = Subtarget.getVectorLength();
+    SDValue MaskV = DAG.getNode(ISD::AND, dl, MVT::i32,
+                                {ByteIdxV, DAG.getConstant(-4, dl, MVT::i32)});
+    SDValue RotV = DAG.getNode(HexagonISD::VROR, dl, VecTy, {VecV, MaskV});
+    SDValue InsV = DAG.getNode(HexagonISD::VINSERTW0, dl, VecTy, {RotV, ValV});
+    SDValue SubV = DAG.getNode(ISD::SUB, dl, MVT::i32,
+                               {DAG.getConstant(HwLen/4, dl, MVT::i32), MaskV});
+    SDValue TorV = DAG.getNode(HexagonISD::VROR, dl, VecTy, {InsV, SubV});
+    return TorV;
+  };
+
+  SDValue ByteIdx = convertToByteIndex(IdxV, ElemTy, DAG);
+  if (ElemTy == MVT::i32)
+    return InsertWord(VecV, ValV, ByteIdx);
+
+  // If this is not inserting a 32-bit word, convert it into such a thing.
+  // 1. Extract the existing word from the target vector.
+  SDValue WordIdx = DAG.getNode(ISD::SRL, dl, MVT::i32,
+                                {ByteIdx, DAG.getConstant(2, dl, MVT::i32)});
+  SDValue Ex0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+                            {opCastElem(VecV, MVT::i32, DAG), WordIdx});
+  SDValue Ext = LowerHvxExtractElement(Ex0, DAG);
+
+  // 2. Treating the extracted word as a 32-bit vector, insert the given
+  //    value into it.
+  SDValue SubIdx = getIndexInWord32(IdxV, ElemTy, DAG);
+  MVT SubVecTy = tyVector(ty(Ext), ElemTy);
+  SDValue Ins = insertVector(DAG.getBitcast(SubVecTy, Ext),
+                             ValV, SubIdx, dl, SubVecTy, DAG);
+
+  // 3. Insert the 32-bit word back into the original vector.
+  return InsertWord(VecV, Ins, ByteIdx);
+}
+
+SDValue
+HexagonTargetLowering::LowerHvxExtractSubvector(SDValue Op, SelectionDAG &DAG)
+      const {
+  SDValue SrcV = Op.getOperand(0);
+  MVT SrcTy = ty(SrcV);
+  unsigned SrcElems = SrcTy.getVectorNumElements();
+  SDValue IdxV = Op.getOperand(1);
+  unsigned Idx = cast<ConstantSDNode>(IdxV.getNode())->getZExtValue();
+  MVT DstTy = ty(Op);
+  assert(Idx == 0 || DstTy.getVectorNumElements() % Idx == 0);
+  const SDLoc &dl(Op);
+  if (Idx == 0)
+    return DAG.getTargetExtractSubreg(Hexagon::vsub_lo, dl, DstTy, SrcV);
+  if (Idx == SrcElems/2)
+    return DAG.getTargetExtractSubreg(Hexagon::vsub_hi, dl, DstTy, SrcV);
+  return SDValue();
+}
+
+SDValue
+HexagonTargetLowering::LowerHvxInsertSubvector(SDValue Op, SelectionDAG &DAG)
+      const {
+  // Idx may be variable
+  SDValue IdxV = Op.getOperand(2);
+  auto *IdxN = dyn_cast<ConstantSDNode>(IdxV.getNode());
+  if (!IdxN)
+    return SDValue();
+  unsigned Idx = IdxN->getZExtValue();
+
+  SDValue DstV = Op.getOperand(0);
+  SDValue SrcV = Op.getOperand(1);
+  MVT DstTy = ty(DstV);
+  MVT SrcTy = ty(SrcV);
+  unsigned DstElems = DstTy.getVectorNumElements();
+  unsigned SrcElems = SrcTy.getVectorNumElements();
+  if (2*SrcElems != DstElems)
+    return SDValue();
+
+  const SDLoc &dl(Op);
+  if (Idx == 0)
+    return DAG.getTargetInsertSubreg(Hexagon::vsub_lo, dl, DstTy, DstV, SrcV);
+  if (Idx == SrcElems)
+    return DAG.getTargetInsertSubreg(Hexagon::vsub_hi, dl, DstTy, DstV, SrcV);
+  return SDValue();
+}
diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td
index 270575a..f1d01b0 100644
--- a/llvm/lib/Target/Hexagon/HexagonPatterns.td
+++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td
@@ -19,10 +19,10 @@
 //     (8) Shift/permute
 //     (9) Arithmetic/bitwise
 //    (10) Bit
-//    (11) Load
-//    (12) Store
-//    (13) Memop
-//    (14) PIC
+//    (11) PIC
+//    (12) Load
+//    (13) Store
+//    (14) Memop
 //    (15) Call
 //    (16) Branch
 //    (17) Misc
@@ -340,6 +340,8 @@
 def: Pat<(HexagonCONST32_GP tglobaladdr:$A),    (A2_tfrsi imm:$A)>;
 def: Pat<(HexagonJT         tjumptable:$A),     (A2_tfrsi imm:$A)>;
 def: Pat<(HexagonCP         tconstpool:$A),     (A2_tfrsi imm:$A)>;
+// The HVX load patterns also match CP directly. Make sure that if
+// the selection of this opcode changes, it's updated in all places.
 
 def: Pat<(i1 0),        (PS_false)>;
 def: Pat<(i1 1),        (PS_true)>;
@@ -1630,7 +1632,31 @@
            (I1toI32 (S4_ntstbit_r IntRegs:$Rs, IntRegs:$Rt))>;
 }
 
-// --(11) Load -----------------------------------------------------------
+// --(11) PIC ------------------------------------------------------------
+//
+
+def SDT_HexagonAtGot
+  : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>;
+def SDT_HexagonAtPcrel
+  : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
+
+// AT_GOT address-of-GOT, address-of-global, offset-in-global
+def HexagonAtGot       : SDNode<"HexagonISD::AT_GOT", SDT_HexagonAtGot>;
+// AT_PCREL address-of-global
+def HexagonAtPcrel     : SDNode<"HexagonISD::AT_PCREL", SDT_HexagonAtPcrel>;
+
+def: Pat<(HexagonAtGot I32:$got, I32:$addr, (i32 0)),
+         (L2_loadri_io I32:$got, imm:$addr)>;
+def: Pat<(HexagonAtGot I32:$got, I32:$addr, s30_2ImmPred:$off),
+         (A2_addi (L2_loadri_io I32:$got, imm:$addr), imm:$off)>;
+def: Pat<(HexagonAtPcrel I32:$addr),
+         (C4_addipc imm:$addr)>;
+
+// The HVX load patterns also match AT_PCREL directly. Make sure that
+// if the selection of this opcode changes, it's updated in all places.
+
+
+// --(12) Load -----------------------------------------------------------
 //
 
 def extloadv2i8: PatFrag<(ops node:$ptr), (extload node:$ptr), [{
@@ -1971,6 +1997,12 @@
                      PatFrag ImmPred> {
   def: Pat<(VT (Load I32:$Rt)),                   (MI I32:$Rt, 0)>;
   def: Pat<(VT (Load (add I32:$Rt, ImmPred:$s))), (MI I32:$Rt, imm:$s)>;
+  // The HVX selection code for shuffles can generate vector constants.
+  // Calling "Select" on the resulting loads from CP fails without these
+  // patterns.
+  def: Pat<(VT (Load (HexagonCP tconstpool:$A))), (MI (A2_tfrsi imm:$A), 0)>;
+  def: Pat<(VT (Load (HexagonAtPcrel tconstpool:$A))),
+           (MI (C4_addipc imm:$A), 0)>;
 }
 
 
@@ -1997,7 +2029,7 @@
 }
 
 
-// --(12) Store ----------------------------------------------------------
+// --(13) Store ----------------------------------------------------------
 //
 
 
@@ -2466,7 +2498,7 @@
 }
 
 
-// --(13) Memop ----------------------------------------------------------
+// --(14) Memop ----------------------------------------------------------
 //
 
 def m5_0Imm8Pred : PatLeaf<(i32 imm), [{
@@ -2744,27 +2776,6 @@
 }
 
 
-// --(14) PIC ------------------------------------------------------------
-//
-
-def SDT_HexagonAtGot
-  : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>;
-def SDT_HexagonAtPcrel
-  : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
-
-// AT_GOT address-of-GOT, address-of-global, offset-in-global
-def HexagonAtGot       : SDNode<"HexagonISD::AT_GOT", SDT_HexagonAtGot>;
-// AT_PCREL address-of-global
-def HexagonAtPcrel     : SDNode<"HexagonISD::AT_PCREL", SDT_HexagonAtPcrel>;
-
-def: Pat<(HexagonAtGot I32:$got, I32:$addr, (i32 0)),
-         (L2_loadri_io I32:$got, imm:$addr)>;
-def: Pat<(HexagonAtGot I32:$got, I32:$addr, s30_2ImmPred:$off),
-         (A2_addi (L2_loadri_io I32:$got, imm:$addr), imm:$off)>;
-def: Pat<(HexagonAtPcrel I32:$addr),
-         (C4_addipc imm:$addr)>;
-
-
 // --(15) Call -----------------------------------------------------------
 //
 
@@ -2894,3 +2905,32 @@
   [SDNPHasChain]>;
 
 def: Pat<(HexagonREADCYCLE), (A4_tfrcpp UPCYCLE)>;
+
+def SDTHexagonVEXTRACTW: SDTypeProfile<1, 2,
+  [SDTCisVT<0, i32>, SDTCisVec<1>, SDTCisVT<2, i32>]>;
+def HexagonVEXTRACTW : SDNode<"HexagonISD::VEXTRACTW", SDTHexagonVEXTRACTW>;
+
+def SDTHexagonVINSERTW0: SDTypeProfile<1, 2,
+  [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisVT<2, i32>]>;
+def HexagonVINSERTW0 : SDNode<"HexagonISD::VINSERTW0", SDTHexagonVINSERTW0>;
+
+let Predicates = [UseHVX] in {
+  def: Pat<(concat_vectors HVI8:$Vs, HVI8:$Vt),
+           (V6_vcombine HvxVR:$Vt, HvxVR:$Vs)>;
+  def: Pat<(or HVI8:$Vs, HVI8:$Vt),
+           (V6_vor HvxVR:$Vt, HvxVR:$Vs)>;
+
+  def: Pat<(HexagonVEXTRACTW HVI8:$Vu, I32:$Rs),
+           (V6_extractw HvxVR:$Vu, I32:$Rs)>;
+  def: Pat<(HexagonVEXTRACTW HVI16:$Vu, I32:$Rs),
+           (V6_extractw HvxVR:$Vu, I32:$Rs)>;
+  def: Pat<(HexagonVEXTRACTW HVI32:$Vu, I32:$Rs),
+           (V6_extractw HvxVR:$Vu, I32:$Rs)>;
+
+  def: Pat<(HexagonVINSERTW0 HVI8:$Vu,  I32:$Rt),
+           (V6_vinsertwr HvxVR:$Vu, I32:$Rt)>;
+  def: Pat<(HexagonVINSERTW0 HVI16:$Vu, I32:$Rt),
+           (V6_vinsertwr HvxVR:$Vu, I32:$Rt)>;
+  def: Pat<(HexagonVINSERTW0 HVI32:$Vu, I32:$Rt),
+           (V6_vinsertwr HvxVR:$Vu, I32:$Rt)>;
+}