[WebAssembly] Initial SIMD128 support.

Kicks off the implementation of wasm SIMD128 support (spec:
https://github.com/stoklund/portable-simd/blob/master/portable-simd.md),
adding support for add, sub, mul for i8x16, i16x8, i32x4, and f32x4.

The spec is WIP, and might change in the near future.

Patch by João Porto

Differential Revision: https://reviews.llvm.org/D22686

llvm-svn: 277543
diff --git a/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp b/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
index 267d716d..aadbf33 100644
--- a/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
+++ b/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
@@ -210,6 +210,11 @@
     return "f32";
   case MVT::f64:
     return "f64";
+  case MVT::v16i8:
+  case MVT::v8i16:
+  case MVT::v4i32:
+  case MVT::v4f32:
+    return "v128";
   default:
     llvm_unreachable("unsupported type");
   }
diff --git a/llvm/lib/Target/WebAssembly/WebAssembly.td b/llvm/lib/Target/WebAssembly/WebAssembly.td
index 551ad93..f647349 100644
--- a/llvm/lib/Target/WebAssembly/WebAssembly.td
+++ b/llvm/lib/Target/WebAssembly/WebAssembly.td
@@ -23,7 +23,7 @@
 // WebAssembly Subtarget features.
 //===----------------------------------------------------------------------===//
 
-def FeatureSIMD128 : SubtargetFeature<"simd128", "HasSIMD128", "false",
+def FeatureSIMD128 : SubtargetFeature<"simd128", "HasSIMD128", "true",
                                       "Enable 128-bit SIMD">;
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp
index 5887f45..0ccc813 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp
@@ -26,9 +26,10 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#include "WebAssembly.h"
 #include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssembly.h"
 #include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
 #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
@@ -71,6 +72,10 @@
   case WebAssembly::ARGUMENT_I64:
   case WebAssembly::ARGUMENT_F32:
   case WebAssembly::ARGUMENT_F64:
+  case WebAssembly::ARGUMENT_v16i8:
+  case WebAssembly::ARGUMENT_v8i16:
+  case WebAssembly::ARGUMENT_v4i32:
+  case WebAssembly::ARGUMENT_v4f32:
     return true;
   default:
     return false;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index b95d77b..7894575 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -95,7 +95,8 @@
 
 MVT WebAssemblyAsmPrinter::getRegType(unsigned RegNo) const {
   const TargetRegisterClass *TRC = MRI->getRegClass(RegNo);
-  for (MVT T : {MVT::i32, MVT::i64, MVT::f32, MVT::f64})
+  for (MVT T : {MVT::i32, MVT::i64, MVT::f32, MVT::f64, MVT::v16i8, MVT::v8i16,
+                MVT::v4i32, MVT::v4f32})
     if (TRC->hasType(T))
       return T;
   DEBUG(errs() << "Unknown type for register number: " << RegNo);
@@ -234,13 +235,21 @@
   case WebAssembly::ARGUMENT_I64:
   case WebAssembly::ARGUMENT_F32:
   case WebAssembly::ARGUMENT_F64:
+  case WebAssembly::ARGUMENT_v16i8:
+  case WebAssembly::ARGUMENT_v8i16:
+  case WebAssembly::ARGUMENT_v4i32:
+  case WebAssembly::ARGUMENT_v4f32:
     // These represent values which are live into the function entry, so there's
     // no instruction to emit.
     break;
   case WebAssembly::FALLTHROUGH_RETURN_I32:
   case WebAssembly::FALLTHROUGH_RETURN_I64:
   case WebAssembly::FALLTHROUGH_RETURN_F32:
-  case WebAssembly::FALLTHROUGH_RETURN_F64: {
+  case WebAssembly::FALLTHROUGH_RETURN_F64:
+  case WebAssembly::FALLTHROUGH_RETURN_v16i8:
+  case WebAssembly::FALLTHROUGH_RETURN_v8i16:
+  case WebAssembly::FALLTHROUGH_RETURN_v4i32:
+  case WebAssembly::FALLTHROUGH_RETURN_v4f32: {
     // These instructions represent the implicit return at the end of a
     // function body. The operand is always a pop.
     assert(MFI->isVRegStackified(MI->getOperand(0).getReg()));
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
index 7bfa407..b4786e5 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -113,6 +113,13 @@
     case MVT::f32:
     case MVT::f64:
       return VT;
+    case MVT::v16i8:
+    case MVT::v8i16:
+    case MVT::v4i32:
+    case MVT::v4f32:
+      if (Subtarget->hasSIMD128())
+        return VT;
+      break;
     default:
       break;
     }
@@ -575,7 +582,9 @@
       return false;
 
     Type *ArgTy = Arg.getType();
-    if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy())
+    if (ArgTy->isStructTy() || ArgTy->isArrayTy())
+      return false;
+    if (!Subtarget->hasSIMD128() && ArgTy->isVectorTy())
       return false;
 
     unsigned Opc;
@@ -600,6 +609,22 @@
       Opc = WebAssembly::ARGUMENT_F64;
       RC = &WebAssembly::F64RegClass;
       break;
+    case MVT::v16i8:
+      Opc = WebAssembly::ARGUMENT_v16i8;
+      RC = &WebAssembly::V128RegClass;
+      break;
+    case MVT::v8i16:
+      Opc = WebAssembly::ARGUMENT_v8i16;
+      RC = &WebAssembly::V128RegClass;
+      break;
+    case MVT::v4i32:
+      Opc = WebAssembly::ARGUMENT_v4i32;
+      RC = &WebAssembly::V128RegClass;
+      break;
+    case MVT::v4f32:
+      Opc = WebAssembly::ARGUMENT_v4f32;
+      RC = &WebAssembly::V128RegClass;
+      break;
     default:
       return false;
     }
@@ -639,6 +664,9 @@
   if (IsVoid) {
     Opc = IsDirect ? WebAssembly::CALL_VOID : WebAssembly::CALL_INDIRECT_VOID;
   } else {
+    if (!Subtarget->hasSIMD128() && Call->getType()->isVectorTy())
+      return false;
+
     MVT::SimpleValueType RetTy = getSimpleType(Call->getType());
     switch (RetTy) {
     case MVT::i1:
@@ -660,6 +688,26 @@
       Opc = IsDirect ? WebAssembly::CALL_F64 : WebAssembly::CALL_INDIRECT_F64;
       ResultReg = createResultReg(&WebAssembly::F64RegClass);
       break;
+    case MVT::v16i8:
+      Opc =
+          IsDirect ? WebAssembly::CALL_v16i8 : WebAssembly::CALL_INDIRECT_v16i8;
+      ResultReg = createResultReg(&WebAssembly::V128RegClass);
+      break;
+    case MVT::v8i16:
+      Opc =
+          IsDirect ? WebAssembly::CALL_v8i16 : WebAssembly::CALL_INDIRECT_v8i16;
+      ResultReg = createResultReg(&WebAssembly::V128RegClass);
+      break;
+    case MVT::v4i32:
+      Opc =
+          IsDirect ? WebAssembly::CALL_v4i32 : WebAssembly::CALL_INDIRECT_v4i32;
+      ResultReg = createResultReg(&WebAssembly::V128RegClass);
+      break;
+    case MVT::v4f32:
+      Opc =
+          IsDirect ? WebAssembly::CALL_v4f32 : WebAssembly::CALL_INDIRECT_v4f32;
+      ResultReg = createResultReg(&WebAssembly::V128RegClass);
+      break;
     default:
       return false;
     }
@@ -972,6 +1020,8 @@
   const LoadInst *Load = cast<LoadInst>(I);
   if (Load->isAtomic())
     return false;
+  if (!Subtarget->hasSIMD128() && Load->getType()->isVectorTy())
+    return false;
 
   Address Addr;
   if (!computeAddress(Load->getPointerOperand(), Addr))
@@ -1027,6 +1077,9 @@
   const StoreInst *Store = cast<StoreInst>(I);
   if (Store->isAtomic())
     return false;
+  if (!Subtarget->hasSIMD128() &&
+      Store->getValueOperand()->getType()->isVectorTy())
+    return false;
 
   Address Addr;
   if (!computeAddress(Store->getPointerOperand(), Addr))
@@ -1102,7 +1155,7 @@
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
       .addMBB(TBB)
       .addReg(CondReg);
-  
+
   finishCondBranch(Br->getParent(), TBB, FBB);
   return true;
 }
@@ -1120,6 +1173,9 @@
   }
 
   Value *RV = Ret->getOperand(0);
+  if (!Subtarget->hasSIMD128() && RV->getType()->isVectorTy())
+    return false;
+
   unsigned Opc;
   switch (getSimpleType(RV->getType())) {
   case MVT::i1: case MVT::i8:
@@ -1129,8 +1185,24 @@
   case MVT::i64:
     Opc = WebAssembly::RETURN_I64;
     break;
-  case MVT::f32: Opc = WebAssembly::RETURN_F32; break;
-  case MVT::f64: Opc = WebAssembly::RETURN_F64; break;
+  case MVT::f32:
+    Opc = WebAssembly::RETURN_F32;
+    break;
+  case MVT::f64:
+    Opc = WebAssembly::RETURN_F64;
+    break;
+  case MVT::v16i8:
+    Opc = WebAssembly::RETURN_v16i8;
+    break;
+  case MVT::v8i16:
+    Opc = WebAssembly::RETURN_v8i16;
+    break;
+  case MVT::v4i32:
+    Opc = WebAssembly::RETURN_v4i32;
+    break;
+  case MVT::v4f32:
+    Opc = WebAssembly::RETURN_v4f32;
+    break;
   default: return false;
   }
 
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index cb9ab15..931e2ad 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -54,6 +54,12 @@
   addRegisterClass(MVT::i64, &WebAssembly::I64RegClass);
   addRegisterClass(MVT::f32, &WebAssembly::F32RegClass);
   addRegisterClass(MVT::f64, &WebAssembly::F64RegClass);
+  if (Subtarget->hasSIMD128()) {
+    addRegisterClass(MVT::v16i8, &WebAssembly::V128RegClass);
+    addRegisterClass(MVT::v8i16, &WebAssembly::V128RegClass);
+    addRegisterClass(MVT::v4i32, &WebAssembly::V128RegClass);
+    addRegisterClass(MVT::v4f32, &WebAssembly::V128RegClass);
+  }
   // Compute derived properties from the register classes.
   computeRegisterProperties(Subtarget->getRegisterInfo());
 
@@ -190,6 +196,10 @@
     switch (Constraint[0]) {
       case 'r':
         assert(VT != MVT::iPTR && "Pointer MVT not expected here");
+        if (Subtarget->hasSIMD128() && VT.isVector()) {
+          if (VT.getSizeInBits() == 128)
+            return std::make_pair(0U, &WebAssembly::V128RegClass);
+        }
         if (VT.isInteger() && !VT.isVector()) {
           if (VT.getSizeInBits() <= 32)
             return std::make_pair(0U, &WebAssembly::I32RegClass);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td
index cfa1519..4bbe2d7 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td
@@ -33,11 +33,29 @@
                             [(set vt:$dst, (WebAssemblycall1 I32:$callee))],
                             !strconcat(prefix, "call_indirect\t$dst, $callee")>;
 }
+
+multiclass SIMD_CALL<ValueType vt, string prefix> { // Calls yielding a vt.
+  def CALL_#vt : SIMD_I<(outs V128:$dst), (ins i32imm:$callee, variable_ops),
+                         [(set (vt V128:$dst), // Direct: immediate callee.
+                               (WebAssemblycall1 (i32 imm:$callee)))],
+                         !strconcat(prefix, "call\t$dst, $callee")>;
+  def CALL_INDIRECT_#vt : SIMD_I<(outs V128:$dst),
+                                  (ins I32:$callee, variable_ops),
+                                  [(set (vt V128:$dst), // Callee in an I32 reg.
+                                        (WebAssemblycall1 I32:$callee))],
+                                  !strconcat(prefix,
+                                             "call_indirect\t$dst, $callee")>;
+} // multiclass SIMD_CALL
+
 let Uses = [SP32, SP64], isCall = 1 in {
   defm : CALL<I32, "i32.">;
   defm : CALL<I64, "i64.">;
   defm : CALL<F32, "f32.">;
   defm : CALL<F64, "f64.">;
+  defm : SIMD_CALL<v16i8, "i8x16.">;
+  defm : SIMD_CALL<v8i16, "i16x8.">;
+  defm : SIMD_CALL<v4i32, "i32x4.">;
+  defm : SIMD_CALL<v4f32, "f32x4.">;
 
   def CALL_VOID : I<(outs), (ins i32imm:$callee, variable_ops),
                     [(WebAssemblycall0 (i32 imm:$callee))],
@@ -58,6 +76,14 @@
           (CALL_F32 tglobaladdr:$callee)>;
 def : Pat<(f64 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
           (CALL_F64 tglobaladdr:$callee)>;
+def : Pat<(v16i8 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
+          (CALL_v16i8 tglobaladdr:$callee)>, Requires<[HasSIMD128]>;
+def : Pat<(v8i16 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
+          (CALL_v8i16 tglobaladdr:$callee)>, Requires<[HasSIMD128]>;
+def : Pat<(v4i32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
+          (CALL_v4i32 tglobaladdr:$callee)>, Requires<[HasSIMD128]>;
+def : Pat<(v4f32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
+          (CALL_v4f32 tglobaladdr:$callee)>, Requires<[HasSIMD128]>;
 def : Pat<(WebAssemblycall0 (WebAssemblywrapper tglobaladdr:$callee)),
           (CALL_VOID tglobaladdr:$callee)>;
 
@@ -70,5 +96,13 @@
           (CALL_F32 texternalsym:$callee)>;
 def : Pat<(f64 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
           (CALL_F64 texternalsym:$callee)>;
+def : Pat<(v16i8 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
+          (CALL_v16i8 texternalsym:$callee)>, Requires<[HasSIMD128]>;
+def : Pat<(v8i16 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
+          (CALL_v8i16 texternalsym:$callee)>, Requires<[HasSIMD128]>;
+def : Pat<(v4i32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
+          (CALL_v4i32 texternalsym:$callee)>, Requires<[HasSIMD128]>;
+def : Pat<(v4f32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
+          (CALL_v4f32 texternalsym:$callee)>, Requires<[HasSIMD128]>;
 def : Pat<(WebAssemblycall0 (WebAssemblywrapper texternalsym:$callee)),
           (CALL_VOID texternalsym:$callee)>;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
index 444e275..06cb061 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
@@ -77,12 +77,27 @@
   def FALLTHROUGH_RETURN_#vt : I<(outs), (ins vt:$val), []>;
 }
 
+multiclass SIMD_RETURN<ValueType vt> { // Returns of a V128 value typed vt.
+  def RETURN_#vt : SIMD_I<(outs), (ins V128:$val),
+                          [(WebAssemblyreturn (vt V128:$val))],
+                          "return  \t$val">;
+  // Pattern-less counterpart of RETURN_#vt, for use at the end of a function
+  // when wasm semantics return by falling off the end of the block.
+  let isCodeGenOnly = 1 in
+  def FALLTHROUGH_RETURN_#vt : SIMD_I<(outs), (ins V128:$val), []>;
+} // multiclass SIMD_RETURN
+
 let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
 let isReturn = 1 in {
   defm : RETURN<I32>;
   defm : RETURN<I64>;
   defm : RETURN<F32>;
   defm : RETURN<F64>;
+  defm : SIMD_RETURN<v16i8>;
+  defm : SIMD_RETURN<v8i16>;
+  defm : SIMD_RETURN<v4i32>;
+  defm : SIMD_RETURN<v4f32>;
+
   def RETURN_VOID : I<(outs), (ins), [(WebAssemblyreturn)], "return">;
 
   // This is to RETURN_VOID what FALLTHROUGH_RETURN_#vt is to RETURN_#vt.
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
index 8008dd3..66145b0 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
@@ -28,6 +28,9 @@
   let Pattern        = pattern;
 }
 
+class SIMD_I<dag oops, dag iops, list<dag> pattern, string asmstr = "">
+    : I<oops, iops, pattern, asmstr>, Requires<[HasSIMD128]>; // SIMD-gated I.
+
 // Unary and binary instructions, for the local types that WebAssembly supports.
 multiclass UnaryInt<SDNode node, string name> {
   def _I32 : I<(outs I32:$dst), (ins I32:$src),
@@ -61,6 +64,21 @@
                [(set F64:$dst, (node F64:$lhs, F64:$rhs))],
                !strconcat("f64.", !strconcat(name, "\t$dst, $lhs, $rhs"))>;
 }
+multiclass SIMDBinary<SDNode node, SDNode fnode, string name> { // fnode: f32x4 only
+  def _I8x16 : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
+                      [(set (v16i8 V128:$dst), (node V128:$lhs, V128:$rhs))],
+                      !strconcat("i8x16.", !strconcat(name, "\t$dst, $lhs, $rhs"))>;
+  def _I16x8 : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
+                      [(set (v8i16 V128:$dst), (node V128:$lhs, V128:$rhs))],
+                      !strconcat("i16x8.", !strconcat(name, "\t$dst, $lhs, $rhs"))>;
+  def _I32x4 : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
+                      [(set (v4i32 V128:$dst), (node V128:$lhs, V128:$rhs))],
+                      !strconcat("i32x4.", !strconcat(name, "\t$dst, $lhs, $rhs"))>;
+  def _F32x4 : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs),
+                      [(set (v4f32 V128:$dst), (fnode V128:$lhs, V128:$rhs))],
+                      !strconcat("f32x4.", !strconcat(name, "\t$dst, $lhs, $rhs"))>;
+
+} // multiclass SIMDBinary
 multiclass ComparisonInt<CondCode cond, string name> {
   def _I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs),
                [(set I32:$dst, (setcc I32:$lhs, I32:$rhs, cond))],
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index 4b31987..922279d 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -100,10 +100,20 @@
   def ARGUMENT_#vt : I<(outs vt:$res), (ins i32imm:$argno),
                        [(set vt:$res, (WebAssemblyargument timm:$argno))]>;
 }
+multiclass SIMD_ARGUMENT<ValueType vt> { // ARGUMENT_#vt yielding a V128.
+  let hasSideEffects = 1, Uses = [ARGUMENTS], isCodeGenOnly = 1 in
+  def ARGUMENT_#vt : SIMD_I<(outs V128:$res), (ins i32imm:$argno),
+                            [(set (vt V128:$res),
+                                  (WebAssemblyargument timm:$argno))]>;
+} // multiclass SIMD_ARGUMENT
 defm : ARGUMENT<I32>;
 defm : ARGUMENT<I64>;
 defm : ARGUMENT<F32>;
 defm : ARGUMENT<F64>;
+defm : SIMD_ARGUMENT<v16i8>;
+defm : SIMD_ARGUMENT<v8i16>;
+defm : SIMD_ARGUMENT<v4i32>;
+defm : SIMD_ARGUMENT<v4f32>;
 
 let Defs = [ARGUMENTS] in {
 
@@ -131,6 +141,7 @@
 defm : LOCAL<I64>;
 defm : LOCAL<F32>;
 defm : LOCAL<F64>;
+defm : LOCAL<V128>, Requires<[HasSIMD128]>;
 
 let isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1 in {
 def CONST_I32 : I<(outs I32:$res), (ins i32imm:$imm),
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 3e29906..e403534 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -12,5 +12,8 @@
 ///
 //===----------------------------------------------------------------------===//
 
-// TODO: Implement SIMD instructions.
-// Note: use Requires<[HasSIMD128]>.
+let isCommutable = 1 in {
+defm ADD : SIMDBinary<add, fadd, "add ">;
+defm MUL : SIMDBinary<mul, fmul, "mul ">;
+} // isCommutable = 1
+defm SUB : SIMDBinary<sub, fsub, "sub ">; // Not commutable: kept outside.
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
index 56d44e6..1c3c104 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
@@ -108,7 +108,8 @@
 
   MachineRegisterInfo &MRI = MF.getRegInfo();
   WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
-  const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+  const auto &Subtarget = MF.getSubtarget<WebAssemblySubtarget>();
+  const auto &TII = *Subtarget.getInstrInfo();
   const WebAssemblyTargetLowering &TLI =
       *MF.getSubtarget<WebAssemblySubtarget>().getTargetLowering();
   auto &LibInfo = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
@@ -186,6 +187,34 @@
             MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_F64,
             WebAssembly::COPY_LOCAL_F64);
         break;
+      case WebAssembly::RETURN_v16i8:
+        Changed |=
+            Subtarget.hasSIMD128() &&
+            MaybeRewriteToFallthrough(MI, MBB, MF, MFI, MRI, TII,
+                                      WebAssembly::FALLTHROUGH_RETURN_v16i8,
+                                      WebAssembly::COPY_LOCAL_V128);
+        break;
+      case WebAssembly::RETURN_v8i16:
+        Changed |=
+            Subtarget.hasSIMD128() &&
+            MaybeRewriteToFallthrough(MI, MBB, MF, MFI, MRI, TII,
+                                      WebAssembly::FALLTHROUGH_RETURN_v8i16,
+                                      WebAssembly::COPY_LOCAL_V128);
+        break;
+      case WebAssembly::RETURN_v4i32:
+        Changed |=
+            Subtarget.hasSIMD128() &&
+            MaybeRewriteToFallthrough(MI, MBB, MF, MFI, MRI, TII,
+                                      WebAssembly::FALLTHROUGH_RETURN_v4i32,
+                                      WebAssembly::COPY_LOCAL_V128);
+        break;
+      case WebAssembly::RETURN_v4f32:
+        Changed |=
+            Subtarget.hasSIMD128() &&
+            MaybeRewriteToFallthrough(MI, MBB, MF, MFI, MRI, TII,
+                                      WebAssembly::FALLTHROUGH_RETURN_v4f32,
+                                      WebAssembly::COPY_LOCAL_V128);
+        break;
       case WebAssembly::RETURN_VOID:
         if (!DisableWebAssemblyFallthroughReturnOpt &&
             &MBB == &MF.back() && &MI == &MBB.back())
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
index 30444ac..7417cde 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
@@ -65,6 +65,10 @@
   case WebAssembly::ARGUMENT_I64:
   case WebAssembly::ARGUMENT_F32:
   case WebAssembly::ARGUMENT_F64:
+  case WebAssembly::ARGUMENT_v16i8:
+  case WebAssembly::ARGUMENT_v8i16:
+  case WebAssembly::ARGUMENT_v4i32:
+  case WebAssembly::ARGUMENT_v4f32:
     return true;
   default:
     return false;
@@ -73,7 +77,7 @@
 
 // Test whether the given register has an ARGUMENT def.
 static bool HasArgumentDef(unsigned Reg, const MachineRegisterInfo &MRI) {
-  for (auto &Def : MRI.def_instructions(Reg))
+  for (const auto &Def : MRI.def_instructions(Reg))
     if (IsArgument(&Def))
       return true;
   return false;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
index 4a8fd96..5e43804 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
@@ -72,7 +72,11 @@
     case WebAssembly::ARGUMENT_I32:
     case WebAssembly::ARGUMENT_I64:
     case WebAssembly::ARGUMENT_F32:
-    case WebAssembly::ARGUMENT_F64: {
+    case WebAssembly::ARGUMENT_F64:
+    case WebAssembly::ARGUMENT_v16i8:
+    case WebAssembly::ARGUMENT_v8i16:
+    case WebAssembly::ARGUMENT_v4i32:
+    case WebAssembly::ARGUMENT_v4f32: {
       int64_t Imm = MI.getOperand(1).getImm();
       DEBUG(dbgs() << "Arg VReg " << MI.getOperand(0).getReg() << " -> WAReg "
                    << Imm << "\n");
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
index 0aa3b62..5ff0085 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -418,6 +418,8 @@
     return WebAssembly::TEE_LOCAL_F32;
   if (RC == &WebAssembly::F64RegClass)
     return WebAssembly::TEE_LOCAL_F64;
+  if (RC == &WebAssembly::V128RegClass)
+    return WebAssembly::TEE_LOCAL_V128;
   llvm_unreachable("Unexpected register class");
 }
 
@@ -765,7 +767,11 @@
         if (Def->getOpcode() == WebAssembly::ARGUMENT_I32 ||
             Def->getOpcode() == WebAssembly::ARGUMENT_I64 ||
             Def->getOpcode() == WebAssembly::ARGUMENT_F32 ||
-            Def->getOpcode() == WebAssembly::ARGUMENT_F64)
+            Def->getOpcode() == WebAssembly::ARGUMENT_F64 ||
+            Def->getOpcode() == WebAssembly::ARGUMENT_v16i8 ||
+            Def->getOpcode() == WebAssembly::ARGUMENT_v8i16 ||
+            Def->getOpcode() == WebAssembly::ARGUMENT_v4i32 ||
+            Def->getOpcode() == WebAssembly::ARGUMENT_v4f32)
           continue;
 
         // Decide which strategy to take. Prefer to move a single-use value
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
index 80a83fa..52456aa 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td
@@ -39,6 +39,8 @@
 def F32_0 : WebAssemblyReg<"%f32.0">;
 def F64_0 : WebAssemblyReg<"%f64.0">;
 
+def V128_0 : WebAssemblyReg<"%v128">; // Sole member of the V128 class.
+
 // The expression stack "register". This is an opaque entity which serves to
 // order uses and defs that must remain in LIFO order.
 def EXPR_STACK : WebAssemblyReg<"STACK">;
@@ -56,3 +58,5 @@
 def I64 : WebAssemblyRegClass<[i64], 64, (add FP64, SP64)>;
 def F32 : WebAssemblyRegClass<[f32], 32, (add F32_0)>;
 def F64 : WebAssemblyRegClass<[f64], 64, (add F64_0)>;
+def V128 : WebAssemblyRegClass<[v4f32, v4i32, v16i8, v8i16], 128, (add V128_0)>;
+