Subzero. Rematerializes shufflevector instructions.

This CL is a first step towards optimizing vector shuffles in
Subzero.

PNaCl bitcode does not support the shufflevector instruction,
so pnacl-clang emits a series of extractelement/insertelement.
pnacl-llc is then responsible for performing a pattern match
on the output bitcode and rematerialize the shufflevector.

With this CL, we enable shufflevector rematerialization in
Subzero. To keep this CL simple, we introduce no efficient
shufflevector lowering. Instead, we scalarize the
rematerialized instructions.

BUG= https://bugs.chromium.org/p/nativeclient/issues/detail?id=4136
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/1897243002 .
diff --git a/src/IceCfg.cpp b/src/IceCfg.cpp
index 23c6728..9a47dd6 100644
--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
@@ -802,6 +802,312 @@
     Node->doAddressOpt();
 }
 
+namespace {
+// ShuffleVectorUtils implements helper functions for rematerializing
+// shufflevector instructions from a sequence of extractelement/insertelement
+// instructions. It looks for the following pattern:
+//
+// %t0 = extractelement A, %n0
+// %t1 = extractelement B, %n1
+// %t2 = extractelement C, %n2
+// ...
+// %tN = extractelement N, %nN
+// %d0 = insertelement undef, %t0, 0
+// %d1 = insertelement %d0, %t1, 1
+// %d2 = insertelement %d1, %t2, 2
+// ...
+// %dest = insertelement %d_N-1, %tN, N
+//
+// where N is num_element(typeof(%dest)), and A, B, C, ... N are at most two
+// distinct variables.
+namespace ShuffleVectorUtils {
+// findAllInserts is used when searching for all the insertelements that are
+// used in a shufflevector operation. This function works recursively, when
+// invoked with I = i, the function assumes Insts[i] is the last found
+// insertelement in the chain. The next insertelement insertruction is saved in
+// Insts[i+1].
+bool findAllInserts(Cfg *Func, GlobalContext *Ctx, VariablesMetadata *VM,
+                    CfgVector<const Inst *> *Insts, SizeT I = 0) {
+  const bool Verbose = BuildDefs::dump() && Func->isVerbose(IceV_ShufMat);
+
+  if (I > Insts->size()) {
+    if (Verbose) {
+      Ctx->getStrDump() << "\tToo many inserts.\n";
+    }
+    return false;
+  }
+
+  const auto *LastInsert = Insts->at(I);
+  assert(llvm::isa<InstInsertElement>(LastInsert));
+
+  if (I == Insts->size() - 1) {
+    // Matching against undef is not really needed because the value in Src(0)
+    // will be totally overwritten. We still enforce it anyways because the
+    // PNaCl toolchain generates the bitcode with it.
+    if (!llvm::isa<ConstantUndef>(LastInsert->getSrc(0))) {
+      if (Verbose) {
+        Ctx->getStrDump() << "\tSrc0 is not undef: " << I << " "
+                          << Insts->size();
+        LastInsert->dump(Func);
+        Ctx->getStrDump() << "\n";
+      }
+      return false;
+    }
+
+    // The following loop ensures that the insertelements are sorted. In theory,
+    // we could relax this restriction and allow any order. As long as each
+    // index appears exactly once, this chain is still a candidate for becoming
+    // a shufflevector. The Insts vector is traversed backwards because the
+    // instructions are "enqueued" in reverse order.
+    int32_t ExpectedElement = 0;
+    for (const auto *I : reverse_range(*Insts)) {
+      if (llvm::cast<ConstantInteger32>(I->getSrc(2))->getValue() !=
+          ExpectedElement) {
+        return false;
+      }
+      ++ExpectedElement;
+    }
+    return true;
+  }
+
+  const auto *Src0V = llvm::cast<Variable>(LastInsert->getSrc(0));
+  const auto *Def = VM->getSingleDefinition(Src0V);
+
+  // Only optimize if the first operand in
+  //
+  //   Dest = insertelement A, B, 10
+  //
+  // is singly-def'ed.
+  if (Def == nullptr) {
+    if (Verbose) {
+      Ctx->getStrDump() << "\tmulti-def: ";
+      (*Insts)[I]->dump(Func);
+      Ctx->getStrDump() << "\n";
+    }
+    return false;
+  }
+
+  // We also require the (single) definition to come from an insertelement
+  // instruction.
+  if (!llvm::isa<InstInsertElement>(Def)) {
+    if (Verbose) {
+      Ctx->getStrDump() << "\tnot insert element: ";
+      Def->dump(Func);
+      Ctx->getStrDump() << "\n";
+    }
+    return false;
+  }
+
+  // Everything seems fine, so we save Def in Insts, and delegate the decision
+  // to findAllInserts.
+  (*Insts)[I + 1] = Def;
+
+  return findAllInserts(Func, Ctx, VM, Insts, I + 1);
+}
+
+// insertsLastElement returns true if Insert is inserting an element in the last
+// position of a vector.
+bool insertsLastElement(const Inst &Insert) {
+  const Type DestTy = Insert.getDest()->getType();
+  assert(isVectorType(DestTy));
+  const SizeT Elem =
+      llvm::cast<ConstantInteger32>(Insert.getSrc(2))->getValue();
+  return Elem == typeNumElements(DestTy) - 1;
+}
+
+// findAllExtracts goes over all the insertelement instructions that are
+// candidates to be replaced by a shufflevector, and searches for all the
+// definitions of the elements being inserted. If all of the elements are the
+// result of an extractelement instruction, and all of the extractelements
+// operate on at most two different sources, than the instructions can be
+// replaced by a shufflevector.
+bool findAllExtracts(Cfg *Func, GlobalContext *Ctx, VariablesMetadata *VM,
+                     const CfgVector<const Inst *> &Insts, Variable **Src0,
+                     Variable **Src1, CfgVector<const Inst *> *Extracts) {
+  const bool Verbose = BuildDefs::dump() && Func->isVerbose(IceV_ShufMat);
+
+  *Src0 = nullptr;
+  *Src1 = nullptr;
+  assert(Insts.size() > 0);
+  for (SizeT I = 0; I < Insts.size(); ++I) {
+    const auto *Insert = Insts.at(I);
+    const auto *Src1V = llvm::dyn_cast<Variable>(Insert->getSrc(1));
+    if (Src1V == nullptr) {
+      if (Verbose) {
+        Ctx->getStrDump() << "src(1) is not a variable: ";
+        Insert->dump(Func);
+        Ctx->getStrDump() << "\n";
+      }
+      return false;
+    }
+
+    const auto *Def = VM->getSingleDefinition(Src1V);
+    if (Def == nullptr) {
+      if (Verbose) {
+        Ctx->getStrDump() << "multi-def src(1): ";
+        Insert->dump(Func);
+        Ctx->getStrDump() << "\n";
+      }
+      return false;
+    }
+
+    if (!llvm::isa<InstExtractElement>(Def)) {
+      if (Verbose) {
+        Ctx->getStrDump() << "not extractelement: ";
+        Def->dump(Func);
+        Ctx->getStrDump() << "\n";
+      }
+      return false;
+    }
+
+    auto *Src = llvm::cast<Variable>(Def->getSrc(0));
+    if (*Src0 == nullptr) {
+      // No sources yet. Save Src to Src0.
+      *Src0 = Src;
+    } else if (*Src1 == nullptr) {
+      // We already have a source, so we might save Src in Src1 -- but only if
+      // Src0 is not Src.
+      if (*Src0 != Src) {
+        *Src1 = Src;
+      }
+    } else if (Src != *Src0 && Src != *Src1) {
+      // More than two sources, so we can't rematerialize the shufflevector
+      // instruction.
+      if (Verbose) {
+        Ctx->getStrDump() << "Can't shuffle more than two sources.\n";
+      }
+      return false;
+    }
+
+    (*Extracts)[I] = Def;
+  }
+
+  // We should have seen at least one source operand.
+  assert(*Src0 != nullptr);
+
+  // If a second source was not seen, then we just make Src1 = Src0 to simplify
+  // things down stream. This should not matter, as all of the indexes in the
+  // shufflevector instruction will point to Src0.
+  if (*Src1 == nullptr) {
+    *Src1 = *Src0;
+  }
+
+  return true;
+}
+
+} // end of namespace ShuffleVectorUtils
+} // end of anonymous namespace
+
+void Cfg::materializeVectorShuffles() {
+  const bool Verbose = BuildDefs::dump() && isVerbose(IceV_ShufMat);
+
+  std::unique_ptr<OstreamLocker> L;
+  if (Verbose) {
+    L.reset(new OstreamLocker(getContext()));
+    getContext()->getStrDump() << "\nShuffle materialization:\n";
+  }
+
+  // MaxVectorElements is the maximum number of elements in the vector types
+  // handled by Subzero. We use it to create the Inserts and Extracts vectors
+  // with the appropriate size, thus avoiding resize() calls.
+  const SizeT MaxVectorElements = typeNumElements(IceType_v16i8);
+  CfgVector<const Inst *> Inserts(MaxVectorElements);
+  CfgVector<const Inst *> Extracts(MaxVectorElements);
+
+  TimerMarker T(TimerStack::TT_materializeVectorShuffles, this);
+  for (CfgNode *Node : Nodes) {
+    for (auto &Instr : Node->getInsts()) {
+      if (!llvm::isa<InstInsertElement>(Instr)) {
+        continue;
+      }
+      if (!ShuffleVectorUtils::insertsLastElement(Instr)) {
+        // To avoid wasting time, we only start the pattern match at the last
+        // insertelement instruction -- and go backwards from there.
+        continue;
+      }
+      if (Verbose) {
+        getContext()->getStrDump() << "\tCandidate: ";
+        Instr.dump(this);
+        getContext()->getStrDump() << "\n";
+      }
+      Inserts.resize(typeNumElements(Instr.getDest()->getType()));
+      Inserts[0] = &Instr;
+      if (!ShuffleVectorUtils::findAllInserts(this, getContext(),
+                                              VMetadata.get(), &Inserts)) {
+        // If we fail to find a sequence of insertelements, we stop the
+        // optimization.
+        if (Verbose) {
+          getContext()->getStrDump() << "\tFalse alarm.\n";
+        }
+        continue;
+      }
+      if (Verbose) {
+        getContext()->getStrDump() << "\tFound the following insertelement: \n";
+        for (auto *I : reverse_range(Inserts)) {
+          getContext()->getStrDump() << "\t\t";
+          I->dump(this);
+          getContext()->getStrDump() << "\n";
+        }
+      }
+      Extracts.resize(Inserts.size());
+      Variable *Src0;
+      Variable *Src1;
+      if (!ShuffleVectorUtils::findAllExtracts(this, getContext(),
+                                               VMetadata.get(), Inserts, &Src0,
+                                               &Src1, &Extracts)) {
+        // If we fail to match the definitions of the insertelements' sources
+        // with extractelement instructions -- or if those instructions operate
+        // on more than two different variables -- we stop the optimization.
+        if (Verbose) {
+          getContext()->getStrDump() << "\tFailed to match extractelements.\n";
+        }
+        continue;
+      }
+      if (Verbose) {
+        getContext()->getStrDump()
+            << "\tFound the following insert/extract element pairs: \n";
+        for (SizeT I = 0; I < Inserts.size(); ++I) {
+          const SizeT Pos = Inserts.size() - I - 1;
+          getContext()->getStrDump() << "\t\tInsert : ";
+          Inserts[Pos]->dump(this);
+          getContext()->getStrDump() << "\n\t\tExtract: ";
+          Extracts[Pos]->dump(this);
+          getContext()->getStrDump() << "\n";
+        }
+      }
+
+      assert(Src0 != nullptr);
+      assert(Src1 != nullptr);
+
+      auto *ShuffleVector =
+          InstShuffleVector::create(this, Instr.getDest(), Src0, Src1);
+      assert(ShuffleVector->getSrc(0) == Src0);
+      assert(ShuffleVector->getSrc(1) == Src1);
+      for (SizeT I = 0; I < Extracts.size(); ++I) {
+        const SizeT Pos = Extracts.size() - I - 1;
+        auto *Index = llvm::cast<ConstantInteger32>(Extracts[Pos]->getSrc(1));
+        if (Src0 == Extracts[Pos]->getSrc(0)) {
+          ShuffleVector->addIndex(Index);
+        } else {
+          ShuffleVector->addIndex(llvm::cast<ConstantInteger32>(
+              Ctx->getConstantInt32(Index->getValue() + Extracts.size())));
+        }
+      }
+
+      if (Verbose) {
+        getContext()->getStrDump() << "Created: ";
+        ShuffleVector->dump(this);
+        getContext()->getStrDump() << "\n";
+      }
+
+      Instr.setDeleted();
+      auto &LoweringContext = getTarget()->getContext();
+      LoweringContext.setInsertPoint(Instr);
+      LoweringContext.insert(ShuffleVector);
+    }
+  }
+}
+
 void Cfg::doNopInsertion() {
   if (!getFlags().getShouldDoNopInsertion())
     return;
diff --git a/src/IceCfg.h b/src/IceCfg.h
index 7c53b1d..b85da28 100644
--- a/src/IceCfg.h
+++ b/src/IceCfg.h
@@ -206,6 +206,9 @@
   /// entry block and emit stack or frame pointer-relative addressing.
   void processAllocas(bool SortAndCombine);
   void doAddressOpt();
+  /// Find clusters of insertelement/extractelement instructions that can be
+  /// replaced by a shufflevector instruction.
+  void materializeVectorShuffles();
   void doArgLowering();
   void doNopInsertion();
   void genCode();
diff --git a/src/IceClFlags.def b/src/IceClFlags.def
index 0ae8e09..cfbf9aa 100644
--- a/src/IceClFlags.def
+++ b/src/IceClFlags.def
@@ -327,6 +327,8 @@
         clEnumValN(Ice::IceV_RMW, "rmw", "ReadModifyWrite optimization"),      \
         clEnumValN(Ice::IceV_Loop, "loop", "Loop nest depth analysis"),        \
         clEnumValN(Ice::IceV_Mem, "mem", "Memory usage details"),              \
+        clEnumValN(Ice::IceV_ShufMat, "shufvec",                               \
+	           "Shufflevector rematerialization"),                         \
         clEnumValN(Ice::IceV_Status, "status",                                 \
                    "Print the name of the function being translated"),         \
         clEnumValN(Ice::IceV_AvailableRegs, "registers",                       \
diff --git a/src/IceDefs.h b/src/IceDefs.h
index e555412..d7ef288 100644
--- a/src/IceDefs.h
+++ b/src/IceDefs.h
@@ -342,6 +342,7 @@
   IceV_GlobalInit = 1 << 22,
   IceV_ConstPoolStats = 1 << 23,
   IceV_Wasm = 1 << 24,
+  IceV_ShufMat = 1 << 25,
   IceV_All = ~IceV_None,
   IceV_Most =
       IceV_All & ~IceV_LinearScan & ~IceV_GlobalInit & ~IceV_ConstPoolStats
diff --git a/src/IceInst.cpp b/src/IceInst.cpp
index aac42c3..6eb8058 100644
--- a/src/IceInst.cpp
+++ b/src/IceInst.cpp
@@ -112,6 +112,7 @@
     X(FakeUse, "fakeuse");
     X(FakeKill, "fakekill");
     X(JumpTable, "jumptable");
+    X(ShuffleVector, "shufflevector");
 #undef X
   default:
     assert(Kind >= Target);
@@ -574,6 +575,15 @@
 InstFakeKill::InstFakeKill(Cfg *Func, const Inst *Linked)
     : InstHighLevel(Func, Inst::FakeKill, 0, nullptr), Linked(Linked) {}
 
+InstShuffleVector::InstShuffleVector(Cfg *Func, Variable *Dest, Variable *Src0,
+                                     Variable *Src1)
+    : InstHighLevel(Func, Inst::ShuffleVector, 2, Dest),
+      NumIndexes(typeNumElements(Dest->getType())) {
+  addSource(Src0);
+  addSource(Src1);
+  Indexes = Func->allocateArrayOf<ConstantInteger32 *>(NumIndexes);
+}
+
 namespace {
 GlobalString makeName(Cfg *Func, const SizeT Id) {
   const auto FuncName = Func->getFunctionName();
@@ -1032,6 +1042,21 @@
   Str << "kill.pseudo scratch_regs";
 }
 
+void InstShuffleVector::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "shufflevector ";
+  dumpDest(Func);
+  Str << " = ";
+  dumpSources(Func);
+  for (SizeT I = 0; I < NumIndexes; ++I) {
+    Str << ", ";
+    Indexes[I]->dump(Func);
+  }
+  Str << "\n";
+}
+
 void InstJumpTable::dump(const Cfg *Func) const {
   if (!BuildDefs::dump())
     return;
diff --git a/src/IceInst.h b/src/IceInst.h
index db273aa..ad62e09 100644
--- a/src/IceInst.h
+++ b/src/IceInst.h
@@ -22,6 +22,7 @@
 #include "IceDefs.h"
 #include "IceInst.def"
 #include "IceIntrinsics.h"
+#include "IceOperand.h"
 #include "IceSwitchLowering.h"
 #include "IceTypes.h"
 
@@ -61,14 +62,15 @@
     Select,
     Store,
     Switch,
-    Assign,       // not part of LLVM/PNaCl bitcode
-    Breakpoint,   // not part of LLVM/PNaCl bitcode
-    BundleLock,   // not part of LLVM/PNaCl bitcode
-    BundleUnlock, // not part of LLVM/PNaCl bitcode
-    FakeDef,      // not part of LLVM/PNaCl bitcode
-    FakeUse,      // not part of LLVM/PNaCl bitcode
-    FakeKill,     // not part of LLVM/PNaCl bitcode
-    JumpTable,    // not part of LLVM/PNaCl bitcode
+    Assign,        // not part of LLVM/PNaCl bitcode
+    Breakpoint,    // not part of LLVM/PNaCl bitcode
+    BundleLock,    // not part of LLVM/PNaCl bitcode
+    BundleUnlock,  // not part of LLVM/PNaCl bitcode
+    FakeDef,       // not part of LLVM/PNaCl bitcode
+    FakeUse,       // not part of LLVM/PNaCl bitcode
+    FakeKill,      // not part of LLVM/PNaCl bitcode
+    JumpTable,     // not part of LLVM/PNaCl bitcode
+    ShuffleVector, // not part of LLVM/PNaCl bitcode
     // Anything >= Target is an InstTarget subclass. Note that the value-spaces
     // are shared across targets. To avoid confusion over the definition of
     // shared values, an object specific to one target should never be passed
@@ -917,6 +919,52 @@
   const Inst *Linked;
 };
 
+/// ShuffleVector instruction. This represents a shuffle operation on vector
+/// types. This instruction is not part of the PNaCl bitcode: it is generated
+/// by Subzero when it matches the pattern used by pnacl-clang when compiling
+/// to bitcode.
+class InstShuffleVector : public InstHighLevel {
+  InstShuffleVector() = delete;
+  InstShuffleVector(const InstShuffleVector &) = delete;
+  InstShuffleVector &operator=(const InstShuffleVector &) = delete;
+
+public:
+  static InstShuffleVector *create(Cfg *Func, Variable *Dest, Variable *Src0,
+                                   Variable *Src1) {
+    return new (Func->allocate<InstShuffleVector>())
+        InstShuffleVector(Func, Dest, Src0, Src1);
+  }
+
+  SizeT getNumIndexes() const { return NumIndexes; }
+
+  void addIndex(ConstantInteger32 *Index) {
+    assert(CurrentIndex < NumIndexes);
+    Indexes[CurrentIndex++] = Index;
+  }
+
+  ConstantInteger32 *getIndex(SizeT Pos) const {
+    assert(Pos < NumIndexes);
+    return Indexes[Pos];
+  }
+
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) {
+    return Instr->getKind() == ShuffleVector;
+  }
+
+private:
+  InstShuffleVector(Cfg *Func, Variable *Dest, Variable *Src0, Variable *Src1);
+
+  void destroy(Cfg *Func) override {
+    Func->deallocateArrayOf<ConstantInteger32 *>(Indexes);
+    Inst::destroy(Func);
+  }
+
+  ConstantInteger32 **Indexes;
+  SizeT CurrentIndex = 0;
+  const SizeT NumIndexes;
+};
+
 /// JumpTable instruction. This represents a jump table that will be stored in
 /// the .rodata section. This is used to track and repoint the target CfgNodes
 /// which may change, for example due to splitting for phi lowering.
diff --git a/src/IceTargetLowering.cpp b/src/IceTargetLowering.cpp
index 496845c..275516e 100644
--- a/src/IceTargetLowering.cpp
+++ b/src/IceTargetLowering.cpp
@@ -440,6 +440,9 @@
     case Inst::Select:
       lowerSelect(llvm::cast<InstSelect>(Instr));
       break;
+    case Inst::ShuffleVector:
+      lowerShuffleVector(llvm::cast<InstShuffleVector>(Instr));
+      break;
     case Inst::Store:
       lowerStore(llvm::cast<InstStore>(Instr));
       break;
diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h
index 2c7adc8..88de3d5 100644
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -387,6 +387,7 @@
   virtual void lowerPhi(const InstPhi *Instr) = 0;
   virtual void lowerRet(const InstRet *Instr) = 0;
   virtual void lowerSelect(const InstSelect *Instr) = 0;
+  virtual void lowerShuffleVector(const InstShuffleVector *Instr) = 0;
   virtual void lowerStore(const InstStore *Instr) = 0;
   virtual void lowerSwitch(const InstSwitch *Instr) = 0;
   virtual void lowerUnreachable(const InstUnreachable *Instr) = 0;
diff --git a/src/IceTargetLoweringARM32.cpp b/src/IceTargetLoweringARM32.cpp
index fbfa914..22d4c0a 100644
--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -1020,6 +1020,7 @@
   // Address mode optimization.
   Func->getVMetadata()->init(VMK_SingleDefs);
   Func->doAddressOpt();
+  Func->materializeVectorShuffles();
 
   // Argument lowering
   Func->doArgLowering();
@@ -5812,6 +5813,44 @@
   Context.insert<InstFakeUse>(SP);
 }
 
+void TargetARM32::lowerShuffleVector(const InstShuffleVector *Instr) {
+  auto *Dest = Instr->getDest();
+  const Type DestTy = Dest->getType();
+
+  auto *T = makeReg(DestTy);
+
+  switch (DestTy) {
+  default:
+    break;
+    // TODO(jpp): figure out how to properly lower this without scalarization.
+  }
+
+  // Unoptimized shuffle. Perform a series of inserts and extracts.
+  Context.insert<InstFakeDef>(T);
+  auto *Src0 = llvm::cast<Variable>(Instr->getSrc(0));
+  auto *Src1 = llvm::cast<Variable>(Instr->getSrc(1));
+  const SizeT NumElements = typeNumElements(DestTy);
+  const Type ElementType = typeElementType(DestTy);
+  for (SizeT I = 0; I < Instr->getNumIndexes(); ++I) {
+    auto *Index = Instr->getIndex(I);
+    const SizeT Elem = Index->getValue();
+    auto *ExtElmt = makeReg(ElementType);
+    if (Elem < NumElements) {
+      lowerExtractElement(
+          InstExtractElement::create(Func, ExtElmt, Src0, Index));
+    } else {
+      lowerExtractElement(InstExtractElement::create(
+          Func, ExtElmt, Src1,
+          Ctx->getConstantInt32(Index->getValue() - NumElements)));
+    }
+    auto *NewT = makeReg(DestTy);
+    lowerInsertElement(InstInsertElement::create(Func, NewT, T, ExtElmt,
+                                                 Ctx->getConstantInt32(I)));
+    T = NewT;
+  }
+  _mov(Dest, T);
+}
+
 void TargetARM32::lowerSelect(const InstSelect *Instr) {
   Variable *Dest = Instr->getDest();
   Type DestTy = Dest->getType();
diff --git a/src/IceTargetLoweringARM32.h b/src/IceTargetLoweringARM32.h
index 18930be..9d31b9e 100644
--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -285,6 +285,7 @@
   void lowerPhi(const InstPhi *Instr) override;
   void lowerRet(const InstRet *Instr) override;
   void lowerSelect(const InstSelect *Instr) override;
+  void lowerShuffleVector(const InstShuffleVector *Instr) override;
   void lowerStore(const InstStore *Instr) override;
   void lowerSwitch(const InstSwitch *Instr) override;
   void lowerUnreachable(const InstUnreachable *Instr) override;
diff --git a/src/IceTargetLoweringMIPS32.cpp b/src/IceTargetLoweringMIPS32.cpp
index 1bc15e3..94216b8 100644
--- a/src/IceTargetLoweringMIPS32.cpp
+++ b/src/IceTargetLoweringMIPS32.cpp
@@ -1145,6 +1145,10 @@
   UnimplementedLoweringError(this, Instr);
 }
 
+void TargetMIPS32::lowerShuffleVector(const InstShuffleVector *Instr) {
+  UnimplementedLoweringError(this, Instr);
+}
+
 void TargetMIPS32::lowerStore(const InstStore *Instr) {
   UnimplementedLoweringError(this, Instr);
 }
diff --git a/src/IceTargetLoweringMIPS32.h b/src/IceTargetLoweringMIPS32.h
index ea062e7..eb2ff10 100644
--- a/src/IceTargetLoweringMIPS32.h
+++ b/src/IceTargetLoweringMIPS32.h
@@ -299,6 +299,7 @@
   void lowerPhi(const InstPhi *Instr) override;
   void lowerRet(const InstRet *Instr) override;
   void lowerSelect(const InstSelect *Instr) override;
+  void lowerShuffleVector(const InstShuffleVector *Instr) override;
   void lowerStore(const InstStore *Instr) override;
   void lowerSwitch(const InstSwitch *Instr) override;
   void lowerUnreachable(const InstUnreachable *Instr) override;
diff --git a/src/IceTargetLoweringX86Base.h b/src/IceTargetLoweringX86Base.h
index 7aeb933..f84c6df 100644
--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -260,6 +260,7 @@
   void lowerPhi(const InstPhi *Instr) override;
   void lowerRet(const InstRet *Instr) override;
   void lowerSelect(const InstSelect *Instr) override;
+  void lowerShuffleVector(const InstShuffleVector *Instr) override;
   void lowerStore(const InstStore *Instr) override;
   void lowerSwitch(const InstSwitch *Instr) override;
   void lowerUnreachable(const InstUnreachable *Instr) override;
diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h
index 4927e17..ebc5a2e 100644
--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -428,6 +428,7 @@
   // Address mode optimization.
   Func->getVMetadata()->init(VMK_SingleDefs);
   Func->doAddressOpt();
+  Func->materializeVectorShuffles();
 
   // Find read-modify-write opportunities. Do this after address mode
   // optimization so that doAddressOpt() doesn't need to be applied to RMW
@@ -5570,6 +5571,46 @@
 }
 
 template <typename TraitsType>
+void TargetX86Base<TraitsType>::lowerShuffleVector(
+    const InstShuffleVector *Instr) {
+  auto *Dest = Instr->getDest();
+  const Type DestTy = Dest->getType();
+
+  auto *T = makeReg(DestTy);
+
+  switch (DestTy) {
+  default:
+    break;
+    // TODO(jpp): figure out how to properly lower this without scalarization.
+  }
+
+  // Unoptimized shuffle. Perform a series of inserts and extracts.
+  Context.insert<InstFakeDef>(T);
+  auto *Src0 = llvm::cast<Variable>(Instr->getSrc(0));
+  auto *Src1 = llvm::cast<Variable>(Instr->getSrc(1));
+  const SizeT NumElements = typeNumElements(DestTy);
+  const Type ElementType = typeElementType(DestTy);
+  for (SizeT I = 0; I < Instr->getNumIndexes(); ++I) {
+    auto *Index = Instr->getIndex(I);
+    const SizeT Elem = Index->getValue();
+    auto *ExtElmt = makeReg(ElementType);
+    if (Elem < NumElements) {
+      lowerExtractElement(
+          InstExtractElement::create(Func, ExtElmt, Src0, Index));
+    } else {
+      lowerExtractElement(InstExtractElement::create(
+          Func, ExtElmt, Src1,
+          Ctx->getConstantInt32(Index->getValue() - NumElements)));
+    }
+    auto *NewT = makeReg(DestTy);
+    lowerInsertElement(InstInsertElement::create(Func, NewT, T, ExtElmt,
+                                                 Ctx->getConstantInt32(I)));
+    T = NewT;
+  }
+  _movp(Dest, T);
+}
+
+template <typename TraitsType>
 void TargetX86Base<TraitsType>::lowerSelect(const InstSelect *Select) {
   Variable *Dest = Select->getDest();
 
diff --git a/src/IceTimerTree.def b/src/IceTimerTree.def
index 340b5b8..a40bd60 100644
--- a/src/IceTimerTree.def
+++ b/src/IceTimerTree.def
@@ -15,55 +15,56 @@
 #ifndef SUBZERO_SRC_ICETIMERTREE_DEF
 #define SUBZERO_SRC_ICETIMERTREE_DEF
 
-#define TIMERTREE_TABLE        \
-  /* enum value */             \
-  X(O2)                        \
-  X(Om1)                       \
-  X(advancedPhiLowering)       \
-  X(alloca)                    \
-  X(computeLoopNestDepth)      \
-  X(convertToIce)              \
-  X(deletePhis)                \
-  X(doAddressOpt)              \
-  X(doArgLowering)             \
-  X(doBranchOpt)               \
-  X(doNopInsertion)            \
-  X(emitAsm)                   \
-  X(emitGlobalInitializers)    \
-  X(findRMW)                   \
-  X(genCode)                   \
-  X(genFrame)                  \
-  X(genHelpers)                \
-  X(initUnhandled)             \
-  X(linearScan)                \
-  X(liveRange)                 \
-  X(liveness)                  \
-  X(livenessLightweight)       \
-  X(llvmConvert)               \
-  X(loadOpt)                   \
-  X(lowerPhiAssignments)       \
-  X(parse)                     \
-  X(parseConstants)            \
-  X(parseFunctions)            \
-  X(parseFunctionValuesymtabs) \
-  X(parseGlobals)              \
-  X(parseModule)               \
-  X(parseModuleValuesymtabs)   \
-  X(parseTypes)                \
-  X(phiValidation)             \
-  X(placePhiLoads)             \
-  X(placePhiStores)            \
-  X(qEmitPop)                  \
-  X(qEmitPush)                 \
-  X(qTransPop)                 \
-  X(qTransPush)                \
-  X(regAlloc)                  \
-  X(renumberInstructions)      \
-  X(szmain)                    \
-  X(translate)                 \
-  X(translateFunctions)        \
-  X(validateLiveness)          \
-  X(vmetadata)                 \
+#define TIMERTREE_TABLE                                                        \
+  /* enum value */                                                             \
+  X(O2)                                                                        \
+  X(Om1)                                                                       \
+  X(advancedPhiLowering)                                                       \
+  X(alloca)                                                                    \
+  X(computeLoopNestDepth)                                                      \
+  X(convertToIce)                                                              \
+  X(deletePhis)                                                                \
+  X(doAddressOpt)                                                              \
+  X(doArgLowering)                                                             \
+  X(doBranchOpt)                                                               \
+  X(doNopInsertion)                                                            \
+  X(emitAsm)                                                                   \
+  X(emitGlobalInitializers)                                                    \
+  X(findRMW)                                                                   \
+  X(genCode)                                                                   \
+  X(genFrame)                                                                  \
+  X(genHelpers)                                                                \
+  X(initUnhandled)                                                             \
+  X(linearScan)                                                                \
+  X(liveRange)                                                                 \
+  X(liveness)                                                                  \
+  X(livenessLightweight)                                                       \
+  X(llvmConvert)                                                               \
+  X(loadOpt)                                                                   \
+  X(lowerPhiAssignments)                                                       \
+  X(materializeVectorShuffles)                                                 \
+  X(parse)                                                                     \
+  X(parseConstants)                                                            \
+  X(parseFunctions)                                                            \
+  X(parseFunctionValuesymtabs)                                                 \
+  X(parseGlobals)                                                              \
+  X(parseModule)                                                               \
+  X(parseModuleValuesymtabs)                                                   \
+  X(parseTypes)                                                                \
+  X(phiValidation)                                                             \
+  X(placePhiLoads)                                                             \
+  X(placePhiStores)                                                            \
+  X(qEmitPop)                                                                  \
+  X(qEmitPush)                                                                 \
+  X(qTransPop)                                                                 \
+  X(qTransPush)                                                                \
+  X(regAlloc)                                                                  \
+  X(renumberInstructions)                                                      \
+  X(szmain)                                                                    \
+  X(translate)                                                                 \
+  X(translateFunctions)                                                        \
+  X(validateLiveness)                                                          \
+  X(vmetadata)                                                                 \
   X(writeELF)
 //#define X(tag)