X86-FMA3: Implemented commute transformation for EVEX/AVX512 FMA3 opcodes.
This helped to improve memory-folding and register coalescing optimizations.

Also, this patch fixed the tracker #17229.

Reviewer: Craig Topper.
Differential Revision: https://reviews.llvm.org/D23108

llvm-svn: 278431
diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt
index 894090f..8679278 100644
--- a/llvm/lib/Target/X86/CMakeLists.txt
+++ b/llvm/lib/Target/X86/CMakeLists.txt
@@ -24,6 +24,7 @@
   X86FrameLowering.cpp
   X86ISelDAGToDAG.cpp
   X86ISelLowering.cpp
+  X86InstrFMA3Info.cpp
   X86InstrInfo.cpp
   X86MCInstLower.cpp
   X86MachineFunctionInfo.cpp
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 44b5bea..50791e9 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -194,7 +194,8 @@
                                   list<dag> ZeroMaskingPattern,
                                   string MaskingConstraint = "",
                                   InstrItinClass itin = NoItinerary,
-                                  bit IsCommutable = 0> {
+                                  bit IsCommutable = 0,
+                                  bit IsKCommutable = 0> {
   let isCommutable = IsCommutable in
     def NAME: AVX512<O, F, Outs, Ins,
                        OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
@@ -202,7 +203,7 @@
                        Pattern, itin>;
 
   // Prefer over VMOV*rrk Pat<>
-  let AddedComplexity = 20 in
+  let AddedComplexity = 20, isCommutable = IsKCommutable in
     def NAME#k: AVX512<O, F, Outs, MaskingIns,
                        OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"#
                                      "$dst {${mask}}, "#IntelSrcAsm#"}",
@@ -210,8 +211,11 @@
               EVEX_K {
       // In case of the 3src subclass this is overridden with a let.
       string Constraints = MaskingConstraint;
-  }
-  let AddedComplexity = 30 in // Prefer over VMOV*rrkz Pat<>
+    }
+
+  // Zero mask does not add any restrictions to commute operands transformation.
+  // So, it is Ok to use IsCommutable instead of IsKCommutable.
+  let AddedComplexity = 30, isCommutable = IsCommutable in // Prefer over VMOV*rrkz Pat<>
     def NAME#kz: AVX512<O, F, Outs, ZeroMaskingIns,
                        OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}|"#
                                      "$dst {${mask}} {z}, "#IntelSrcAsm#"}",
@@ -231,14 +235,16 @@
                                   SDNode Select = vselect,
                                   string MaskingConstraint = "",
                                   InstrItinClass itin = NoItinerary,
-                                  bit IsCommutable = 0> :
+                                  bit IsCommutable = 0,
+                                  bit IsKCommutable = 0> :
   AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr,
                          AttSrcAsm, IntelSrcAsm,
                          [(set _.RC:$dst, RHS)],
                          [(set _.RC:$dst, MaskingRHS)],
                          [(set _.RC:$dst,
                                (Select _.KRCWM:$mask, RHS, _.ImmAllZerosV))],
-                         MaskingConstraint, NoItinerary, IsCommutable>;
+                         MaskingConstraint, NoItinerary, IsCommutable,
+                         IsKCommutable>;
 
 // This multiclass generates the unconditional/non-masking, the masking and
 // the zero-masking variant of the vector instruction.  In the masking case, the
@@ -248,13 +254,14 @@
                            string AttSrcAsm, string IntelSrcAsm,
                            dag RHS,
                            InstrItinClass itin = NoItinerary,
-                           bit IsCommutable = 0, SDNode Select = vselect> :
+                           bit IsCommutable = 0, bit IsKCommutable = 0,
+                           SDNode Select = vselect> :
    AVX512_maskable_common<O, F, _, Outs, Ins,
                           !con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
                           !con((ins _.KRCWM:$mask), Ins),
                           OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
                           (Select _.KRCWM:$mask, RHS, _.RC:$src0), Select,
-                          "$src0 = $dst", itin, IsCommutable>;
+                          "$src0 = $dst", itin, IsCommutable, IsKCommutable>;
 
 // This multiclass generates the unconditional/non-masking, the masking and
 // the zero-masking variant of the scalar instruction.
@@ -278,15 +285,17 @@
 multiclass AVX512_maskable_3src<bits<8> O, Format F, X86VectorVTInfo _,
                                 dag Outs, dag NonTiedIns, string OpcodeStr,
                                 string AttSrcAsm, string IntelSrcAsm,
-                                dag RHS> :
+                                dag RHS, bit IsCommutable = 0, 
+                                bit IsKCommutable = 0> :
    AVX512_maskable_common<O, F, _, Outs,
                           !con((ins _.RC:$src1), NonTiedIns),
                           !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
                           !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
                           OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
-                          (vselect _.KRCWM:$mask, RHS, _.RC:$src1)>;
+                          (vselect _.KRCWM:$mask, RHS, _.RC:$src1),
+                          vselect, "", NoItinerary, IsCommutable, IsKCommutable>;
 
-// Similar to AVX512_maskable_3rc but in this case the input VT for the tied
+// Similar to AVX512_maskable_3src but in this case the input VT for the tied
 // operand differs from the output VT. This requires a bitconvert on
 // the preserved vector going into the vselect.
 multiclass AVX512_maskable_3src_cast<bits<8> O, Format F, X86VectorVTInfo OutVT,
@@ -305,14 +314,16 @@
 multiclass AVX512_maskable_3src_scalar<bits<8> O, Format F, X86VectorVTInfo _,
                                      dag Outs, dag NonTiedIns, string OpcodeStr,
                                      string AttSrcAsm, string IntelSrcAsm,
-                                     dag RHS> :
+                                     dag RHS, bit IsCommutable = 0,
+                                     bit IsKCommutable = 0> :
    AVX512_maskable_common<O, F, _, Outs,
                           !con((ins _.RC:$src1), NonTiedIns),
                           !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
                           !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
                           OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
                           (X86selects _.KRCWM:$mask, RHS, _.RC:$src1),
-                          X86selects>;
+                          X86selects, "", NoItinerary, IsCommutable,
+                          IsKCommutable>;
 
 multiclass AVX512_maskable_in_asm<bits<8> O, Format F, X86VectorVTInfo _,
                                   dag Outs, dag Ins,
@@ -4842,13 +4853,13 @@
   defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
           (ins _.RC:$src2, _.RC:$src3),
           OpcodeStr, "$src3, $src2", "$src2, $src3",
-          (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3))>,
+          (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), 1, 1>,
          AVX512FMA3Base;
 
   defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
           (ins _.RC:$src2, _.MemOp:$src3),
           OpcodeStr, "$src3, $src2", "$src2, $src3",
-          (_.VT (OpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3)))>,
+          (_.VT (OpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))), 1, 0>,
           AVX512FMA3Base;
 
   defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -4856,7 +4867,7 @@
             OpcodeStr,   !strconcat("${src3}", _.BroadcastStr,", $src2"),
             !strconcat("$src2, ${src3}", _.BroadcastStr ),
             (OpNode _.RC:$src2,
-             _.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))>,
+             _.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))), 1, 0>,
             AVX512FMA3Base, EVEX_B;
   }
 
@@ -4875,7 +4886,7 @@
   defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
           (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
           OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
-          (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 imm:$rc)))>,
+          (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 imm:$rc))), 1, 1>,
           AVX512FMA3Base, EVEX_B, EVEX_RC;
 }
 
@@ -4917,13 +4928,13 @@
   defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
           (ins _.RC:$src2, _.RC:$src3),
           OpcodeStr, "$src3, $src2", "$src2, $src3",
-          (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1))>,
+          (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1>,
          AVX512FMA3Base;
 
   defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
           (ins _.RC:$src2, _.MemOp:$src3),
           OpcodeStr, "$src3, $src2", "$src2, $src3",
-          (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1))>,
+          (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)), 1, 0>,
          AVX512FMA3Base;
 
   defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -4932,7 +4943,7 @@
          "$src2, ${src3}"##_.BroadcastStr,
          (_.VT (OpNode _.RC:$src2,
                       (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
-                      _.RC:$src1))>, AVX512FMA3Base, EVEX_B;
+                      _.RC:$src1)), 1, 0>, AVX512FMA3Base, EVEX_B;
   }
 
   // Additional patterns for folding broadcast nodes in other orders.
@@ -4960,7 +4971,7 @@
   defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
           (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
           OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
-          (_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 imm:$rc)))>,
+          (_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 imm:$rc))), 1, 1>,
           AVX512FMA3Base, EVEX_B, EVEX_RC;
 }
 
@@ -6036,7 +6047,7 @@
                    (X86cvtps2ph (_src.VT _src.RC:$src1),
                                 (i32 imm:$src2),
                                 (i32 FROUND_CURRENT)),
-                   NoItinerary, 0, X86select>, AVX512AIi8Base;
+                   NoItinerary, 0, 0, X86select>, AVX512AIi8Base;
   def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
              (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2),
              "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -6056,7 +6067,7 @@
                    (X86cvtps2ph (_src.VT _src.RC:$src1),
                                 (i32 imm:$src2),
                                 (i32 FROUND_NO_EXC)),
-                   NoItinerary, 0, X86select>, EVEX_B, AVX512AIi8Base;
+                   NoItinerary, 0, 0, X86select>, EVEX_B, AVX512AIi8Base;
 }
 let Predicates = [HasAVX512] in {
   defm VCVTPS2PHZ : avx512_cvtps2ph<v16i16x_info, v16f32_info, f256mem>,
diff --git a/llvm/lib/Target/X86/X86InstrFMA3Info.cpp b/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
new file mode 100644
index 0000000..7bd8415
--- /dev/null
+++ b/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
@@ -0,0 +1,284 @@
+//===-- X86InstrFMA3Info.cpp - X86 FMA3 Instruction Information -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the classes providing information
+// about existing X86 FMA3 opcodes, classifying and grouping them.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrFMA3Info.h"
+#include "X86InstrInfo.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/Threading.h"
+
+/// This flag is used in the method llvm::call_once() used below to make the
+/// initialization of the map 'OpcodeToGroup' thread safe.
+LLVM_DEFINE_ONCE_FLAG(InitGroupsOnceFlag);
+
+static ManagedStatic<X86InstrFMA3Info> X86InstrFMA3InfoObj;
+X86InstrFMA3Info *X86InstrFMA3Info::getX86InstrFMA3Info() {
+  return &*X86InstrFMA3InfoObj;
+}
+
+void X86InstrFMA3Info::initRMGroup(const uint16_t *RegOpcodes,
+                                   const uint16_t *MemOpcodes, unsigned Attr) {
+  // Create a new instance of this class that would hold a group of FMA opcodes.
+  X86InstrFMA3Group *G = new X86InstrFMA3Group(RegOpcodes, MemOpcodes, Attr);
+
+  // Add the references from individual opcodes to the group holding them.
+  assert((!OpcodeToGroup[RegOpcodes[0]] && !OpcodeToGroup[RegOpcodes[1]] &&
+          !OpcodeToGroup[RegOpcodes[2]] && !OpcodeToGroup[MemOpcodes[0]] &&
+          !OpcodeToGroup[MemOpcodes[1]] && !OpcodeToGroup[MemOpcodes[2]]) &&
+         "Duplication or rewrite of elements in OpcodeToGroup.");
+  OpcodeToGroup[RegOpcodes[0]] = G;
+  OpcodeToGroup[RegOpcodes[1]] = G;
+  OpcodeToGroup[RegOpcodes[2]] = G;
+  OpcodeToGroup[MemOpcodes[0]] = G;
+  OpcodeToGroup[MemOpcodes[1]] = G;
+  OpcodeToGroup[MemOpcodes[2]] = G;
+}
+
+void X86InstrFMA3Info::initRGroup(const uint16_t *RegOpcodes, unsigned Attr) {
+  // Create a new instance of this class that would hold a group of FMA opcodes.
+  X86InstrFMA3Group *G = new X86InstrFMA3Group(RegOpcodes, nullptr, Attr);
+
+  // Add the references from individual opcodes to the group holding them.
+  assert((!OpcodeToGroup[RegOpcodes[0]] && !OpcodeToGroup[RegOpcodes[1]] &&
+          !OpcodeToGroup[RegOpcodes[2]]) &&
+         "Duplication or rewrite of elements in OpcodeToGroup.");
+  OpcodeToGroup[RegOpcodes[0]] = G;
+  OpcodeToGroup[RegOpcodes[1]] = G;
+  OpcodeToGroup[RegOpcodes[2]] = G;
+}
+
+void X86InstrFMA3Info::initMGroup(const uint16_t *MemOpcodes, unsigned Attr) {
+  // Create a new instance of this class that would hold a group of FMA opcodes.
+  X86InstrFMA3Group *G = new X86InstrFMA3Group(nullptr, MemOpcodes, Attr);
+
+  // Add the references from individual opcodes to the group holding them.
+  assert((!OpcodeToGroup[MemOpcodes[0]] && !OpcodeToGroup[MemOpcodes[1]] &&
+          !OpcodeToGroup[MemOpcodes[2]]) &&
+         "Duplication or rewrite of elements in OpcodeToGroup.");
+  OpcodeToGroup[MemOpcodes[0]] = G;
+  OpcodeToGroup[MemOpcodes[1]] = G;
+  OpcodeToGroup[MemOpcodes[2]] = G;
+}
+
+#define FMA3RM(R132, R213, R231, M132, M213, M231)                             \
+  static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231};      \
+  static const uint16_t Mem##R132[3] = {X86::M132, X86::M213, X86::M231};      \
+  initRMGroup(Reg##R132, Mem##R132);
+
+#define FMA3RMA(R132, R213, R231, M132, M213, M231, Attrs)                     \
+  static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231};      \
+  static const uint16_t Mem##R132[3] = {X86::M132, X86::M213, X86::M231};      \
+  initRMGroup(Reg##R132, Mem##R132, (Attrs));
+
+#define FMA3R(R132, R213, R231)                                                \
+  static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231};      \
+  initRGroup(Reg##R132);
+
+#define FMA3RA(R132, R213, R231, Attrs)                                        \
+  static const uint16_t Reg##R132[3] = {X86::R132, X86::R213, X86::R231};      \
+  initRGroup(Reg##R132, (Attrs));
+
+#define FMA3M(M132, M213, M231)                                                \
+  static const uint16_t Mem##M132[3] = {X86::M132, X86::M213, X86::M231};      \
+  initMGroup(Mem##M132);
+
+#define FMA3MA(M132, M213, M231, Attrs)                                        \
+  static const uint16_t Mem##M132[3] = {X86::M132, X86::M213, X86::M231};      \
+  initMGroup(Mem##M132, (Attrs));
+
+#define FMA3_AVX2_VECTOR_GROUP(Name)                                           \
+  FMA3RM(Name##132PSr, Name##213PSr, Name##231PSr,                             \
+         Name##132PSm, Name##213PSm, Name##231PSm);                            \
+  FMA3RM(Name##132PDr, Name##213PDr, Name##231PDr,                             \
+         Name##132PDm, Name##213PDm, Name##231PDm);                            \
+  FMA3RM(Name##132PSYr, Name##213PSYr, Name##231PSYr,                          \
+         Name##132PSYm, Name##213PSYm, Name##231PSYm);                         \
+  FMA3RM(Name##132PDYr, Name##213PDYr, Name##231PDYr,                          \
+         Name##132PDYm, Name##213PDYm, Name##231PDYm);
+
+#define FMA3_AVX2_SCALAR_GROUP(Name)                                           \
+  FMA3RM(Name##132SSr, Name##213SSr, Name##231SSr,                             \
+         Name##132SSm, Name##213SSm, Name##231SSm);                            \
+  FMA3RM(Name##132SDr, Name##213SDr, Name##231SDr,                             \
+         Name##132SDm, Name##213SDm, Name##231SDm);                            \
+  FMA3RMA(Name##132SSr_Int, Name##213SSr_Int, Name##231SSr_Int,                \
+          Name##132SSm_Int, Name##213SSm_Int, Name##231SSm_Int,                \
+          X86InstrFMA3Group::X86FMA3Intrinsic);                                \
+  FMA3RMA(Name##132SDr_Int, Name##213SDr_Int, Name##231SDr_Int,                \
+          Name##132SDm_Int, Name##213SDm_Int, Name##231SDm_Int,                \
+          X86InstrFMA3Group::X86FMA3Intrinsic);
+
+#define FMA3_AVX2_FULL_GROUP(Name)                                             \
+  FMA3_AVX2_VECTOR_GROUP(Name);                                                \
+  FMA3_AVX2_SCALAR_GROUP(Name);
+
+#define FMA3_AVX512_VECTOR_GROUP(Name)                                         \
+  FMA3RM(Name##132PSZ128r, Name##213PSZ128r, Name##231PSZ128r,                 \
+         Name##132PSZ128m, Name##213PSZ128m, Name##231PSZ128m);                \
+  FMA3RM(Name##132PDZ128r, Name##213PDZ128r, Name##231PDZ128r,                 \
+         Name##132PDZ128m, Name##213PDZ128m, Name##231PDZ128m);                \
+  FMA3RM(Name##132PSZ256r, Name##213PSZ256r, Name##231PSZ256r,                 \
+         Name##132PSZ256m, Name##213PSZ256m, Name##231PSZ256m);                \
+  FMA3RM(Name##132PDZ256r, Name##213PDZ256r, Name##231PDZ256r,                 \
+         Name##132PDZ256m, Name##213PDZ256m, Name##231PDZ256m);                \
+  FMA3RM(Name##132PSZr,    Name##213PSZr,    Name##231PSZr,                    \
+         Name##132PSZm,    Name##213PSZm,    Name##231PSZm);                   \
+  FMA3RM(Name##132PDZr,    Name##213PDZr,    Name##231PDZr,                    \
+         Name##132PDZm,    Name##213PDZm,    Name##231PDZm);                   \
+  FMA3RMA(Name##132PSZ128rk, Name##213PSZ128rk, Name##231PSZ128rk,             \
+          Name##132PSZ128mk, Name##213PSZ128mk, Name##231PSZ128mk,             \
+          X86InstrFMA3Group::X86FMA3KMergeMasked);                             \
+  FMA3RMA(Name##132PDZ128rk, Name##213PDZ128rk, Name##231PDZ128rk,             \
+          Name##132PDZ128mk, Name##213PDZ128mk, Name##231PDZ128mk,             \
+          X86InstrFMA3Group::X86FMA3KMergeMasked);                             \
+  FMA3RMA(Name##132PSZ256rk, Name##213PSZ256rk, Name##231PSZ256rk,             \
+          Name##132PSZ256mk, Name##213PSZ256mk, Name##231PSZ256mk,             \
+          X86InstrFMA3Group::X86FMA3KMergeMasked);                             \
+  FMA3RMA(Name##132PDZ256rk, Name##213PDZ256rk, Name##231PDZ256rk,             \
+          Name##132PDZ256mk, Name##213PDZ256mk, Name##231PDZ256mk,             \
+          X86InstrFMA3Group::X86FMA3KMergeMasked);                             \
+  FMA3RMA(Name##132PSZrk,    Name##213PSZrk,    Name##231PSZrk,                \
+          Name##132PSZmk,    Name##213PSZmk,    Name##231PSZmk,                \
+          X86InstrFMA3Group::X86FMA3KMergeMasked);                             \
+  FMA3RMA(Name##132PDZrk,    Name##213PDZrk,    Name##231PDZrk,                \
+          Name##132PDZmk,    Name##213PDZmk,    Name##231PDZmk,                \
+          X86InstrFMA3Group::X86FMA3KMergeMasked);                             \
+  FMA3RMA(Name##132PSZ128rkz, Name##213PSZ128rkz, Name##231PSZ128rkz,          \
+          Name##132PSZ128mkz, Name##213PSZ128mkz, Name##231PSZ128mkz,          \
+          X86InstrFMA3Group::X86FMA3KZeroMasked);                              \
+  FMA3RMA(Name##132PDZ128rkz, Name##213PDZ128rkz, Name##231PDZ128rkz,          \
+          Name##132PDZ128mkz, Name##213PDZ128mkz, Name##231PDZ128mkz,          \
+          X86InstrFMA3Group::X86FMA3KZeroMasked);                              \
+  FMA3RMA(Name##132PSZ256rkz, Name##213PSZ256rkz, Name##231PSZ256rkz,          \
+          Name##132PSZ256mkz, Name##213PSZ256mkz, Name##231PSZ256mkz,          \
+          X86InstrFMA3Group::X86FMA3KZeroMasked);                              \
+  FMA3RMA(Name##132PDZ256rkz, Name##213PDZ256rkz, Name##231PDZ256rkz,          \
+          Name##132PDZ256mkz, Name##213PDZ256mkz, Name##231PDZ256mkz,          \
+          X86InstrFMA3Group::X86FMA3KZeroMasked);                              \
+  FMA3RMA(Name##132PSZrkz,    Name##213PSZrkz,    Name##231PSZrkz,             \
+          Name##132PSZmkz,    Name##213PSZmkz,    Name##231PSZmkz,             \
+          X86InstrFMA3Group::X86FMA3KZeroMasked);                              \
+  FMA3RMA(Name##132PDZrkz,    Name##213PDZrkz,    Name##231PDZrkz,             \
+          Name##132PDZmkz,    Name##213PDZmkz,    Name##231PDZmkz,             \
+          X86InstrFMA3Group::X86FMA3KZeroMasked);                              \
+  FMA3R(Name##132PSZrb, Name##213PSZrb, Name##231PSZrb);                       \
+  FMA3R(Name##132PDZrb, Name##213PDZrb, Name##231PDZrb);                       \
+  FMA3RA(Name##132PSZrbk, Name##213PSZrbk, Name##231PSZrbk,                    \
+         X86InstrFMA3Group::X86FMA3KMergeMasked);                              \
+  FMA3RA(Name##132PDZrbk, Name##213PDZrbk, Name##231PDZrbk,                    \
+         X86InstrFMA3Group::X86FMA3KMergeMasked);                              \
+  FMA3RA(Name##132PSZrbkz, Name##213PSZrbkz, Name##231PSZrbkz,                 \
+         X86InstrFMA3Group::X86FMA3KZeroMasked);                               \
+  FMA3RA(Name##132PDZrbkz, Name##213PDZrbkz, Name##231PDZrbkz,                 \
+         X86InstrFMA3Group::X86FMA3KZeroMasked);                               \
+  FMA3M(Name##132PSZ128mb, Name##213PSZ128mb, Name##231PSZ128mb);              \
+  FMA3M(Name##132PDZ128mb, Name##213PDZ128mb, Name##231PDZ128mb);              \
+  FMA3M(Name##132PSZ256mb, Name##213PSZ256mb, Name##231PSZ256mb);              \
+  FMA3M(Name##132PDZ256mb, Name##213PDZ256mb, Name##231PDZ256mb);              \
+  FMA3M(Name##132PSZmb, Name##213PSZmb, Name##231PSZmb);                       \
+  FMA3M(Name##132PDZmb, Name##213PDZmb, Name##231PDZmb);                       \
+  FMA3MA(Name##132PSZ128mbk, Name##213PSZ128mbk, Name##231PSZ128mbk,           \
+         X86InstrFMA3Group::X86FMA3KMergeMasked);                              \
+  FMA3MA(Name##132PDZ128mbk, Name##213PDZ128mbk, Name##231PDZ128mbk,           \
+         X86InstrFMA3Group::X86FMA3KMergeMasked);                              \
+  FMA3MA(Name##132PSZ256mbk, Name##213PSZ256mbk, Name##231PSZ256mbk,           \
+         X86InstrFMA3Group::X86FMA3KMergeMasked);                              \
+  FMA3MA(Name##132PDZ256mbk, Name##213PDZ256mbk, Name##231PDZ256mbk,           \
+         X86InstrFMA3Group::X86FMA3KMergeMasked);                              \
+  FMA3MA(Name##132PSZmbk,    Name##213PSZmbk,    Name##231PSZmbk,              \
+         X86InstrFMA3Group::X86FMA3KMergeMasked);                              \
+  FMA3MA(Name##132PDZmbk,    Name##213PDZmbk,    Name##231PDZmbk,              \
+         X86InstrFMA3Group::X86FMA3KMergeMasked);                              \
+  FMA3MA(Name##132PSZ128mbkz, Name##213PSZ128mbkz, Name##231PSZ128mbkz,        \
+         X86InstrFMA3Group::X86FMA3KZeroMasked);                               \
+  FMA3MA(Name##132PDZ128mbkz, Name##213PDZ128mbkz, Name##231PDZ128mbkz,        \
+         X86InstrFMA3Group::X86FMA3KZeroMasked);                               \
+  FMA3MA(Name##132PSZ256mbkz, Name##213PSZ256mbkz, Name##231PSZ256mbkz,        \
+         X86InstrFMA3Group::X86FMA3KZeroMasked);                               \
+  FMA3MA(Name##132PDZ256mbkz, Name##213PDZ256mbkz, Name##231PDZ256mbkz,        \
+         X86InstrFMA3Group::X86FMA3KZeroMasked);                               \
+  FMA3MA(Name##132PSZmbkz, Name##213PSZmbkz, Name##231PSZmbkz,                 \
+         X86InstrFMA3Group::X86FMA3KZeroMasked);                               \
+  FMA3MA(Name##132PDZmbkz, Name##213PDZmbkz, Name##231PDZmbkz,                 \
+         X86InstrFMA3Group::X86FMA3KZeroMasked);
+
+#define FMA3_AVX512_SCALAR_GROUP(Name)                                         \
+  FMA3RM(Name##132SSZr,      Name##213SSZr,     Name##231SSZr,                 \
+         Name##132SSZm,      Name##213SSZm,     Name##231SSZm);                \
+  FMA3RM(Name##132SDZr,      Name##213SDZr,     Name##231SDZr,                 \
+         Name##132SDZm,      Name##213SDZm,     Name##231SDZm);                \
+  FMA3RMA(Name##132SSZr_Int, Name##213SSZr_Int, Name##231SSZr_Int,             \
+          Name##132SSZm_Int, Name##213SSZm_Int, Name##231SSZm_Int,             \
+          X86InstrFMA3Group::X86FMA3Intrinsic);                                \
+  FMA3RMA(Name##132SDZr_Int, Name##213SDZr_Int, Name##231SDZr_Int,             \
+          Name##132SDZm_Int, Name##213SDZm_Int, Name##231SDZm_Int,             \
+          X86InstrFMA3Group::X86FMA3Intrinsic);                                \
+  FMA3RMA(Name##132SSZr_Intk, Name##213SSZr_Intk, Name##231SSZr_Intk,          \
+          Name##132SSZm_Intk, Name##213SSZm_Intk, Name##231SSZm_Intk,          \
+          X86InstrFMA3Group::X86FMA3Intrinsic |                                \
+              X86InstrFMA3Group::X86FMA3KMergeMasked);                         \
+  FMA3RMA(Name##132SDZr_Intk, Name##213SDZr_Intk, Name##231SDZr_Intk,          \
+          Name##132SDZm_Intk, Name##213SDZm_Intk, Name##231SDZm_Intk,          \
+          X86InstrFMA3Group::X86FMA3Intrinsic |                                \
+              X86InstrFMA3Group::X86FMA3KMergeMasked);                         \
+  FMA3RMA(Name##132SSZr_Intkz, Name##213SSZr_Intkz, Name##231SSZr_Intkz,       \
+          Name##132SSZm_Intkz, Name##213SSZm_Intkz, Name##231SSZm_Intkz,       \
+          X86InstrFMA3Group::X86FMA3Intrinsic |                                \
+              X86InstrFMA3Group::X86FMA3KZeroMasked);                          \
+  FMA3RMA(Name##132SDZr_Intkz, Name##213SDZr_Intkz, Name##231SDZr_Intkz,       \
+          Name##132SDZm_Intkz, Name##213SDZm_Intkz, Name##231SDZm_Intkz,       \
+          X86InstrFMA3Group::X86FMA3Intrinsic |                                \
+              X86InstrFMA3Group::X86FMA3KZeroMasked);                          \
+  FMA3RA(Name##132SSZrb_Int, Name##213SSZrb_Int, Name##231SSZrb_Int,           \
+         X86InstrFMA3Group::X86FMA3Intrinsic);                                 \
+  FMA3RA(Name##132SDZrb_Int, Name##213SDZrb_Int, Name##231SDZrb_Int,           \
+         X86InstrFMA3Group::X86FMA3Intrinsic);                                 \
+  FMA3RA(Name##132SSZrb_Intk, Name##213SSZrb_Intk, Name##231SSZrb_Intk,        \
+         X86InstrFMA3Group::X86FMA3Intrinsic |                                 \
+             X86InstrFMA3Group::X86FMA3KMergeMasked);                          \
+  FMA3RA(Name##132SDZrb_Intk, Name##213SDZrb_Intk, Name##231SDZrb_Intk,        \
+         X86InstrFMA3Group::X86FMA3Intrinsic |                                 \
+             X86InstrFMA3Group::X86FMA3KMergeMasked);                          \
+  FMA3RA(Name##132SSZrb_Intkz, Name##213SSZrb_Intkz, Name##231SSZrb_Intkz,     \
+         X86InstrFMA3Group::X86FMA3Intrinsic |                                 \
+             X86InstrFMA3Group::X86FMA3KZeroMasked);                           \
+  FMA3RA(Name##132SDZrb_Intkz, Name##213SDZrb_Intkz, Name##231SDZrb_Intkz,     \
+         X86InstrFMA3Group::X86FMA3Intrinsic |                                 \
+             X86InstrFMA3Group::X86FMA3KZeroMasked);
+
+#define FMA3_AVX512_FULL_GROUP(Name)                                           \
+  FMA3_AVX512_VECTOR_GROUP(Name);                                              \
+  FMA3_AVX512_SCALAR_GROUP(Name);
+
+void X86InstrFMA3Info::initGroupsOnceImpl() {
+  FMA3_AVX2_FULL_GROUP(VFMADD);
+  FMA3_AVX2_FULL_GROUP(VFMSUB);
+  FMA3_AVX2_FULL_GROUP(VFNMADD);
+  FMA3_AVX2_FULL_GROUP(VFNMSUB);
+
+  FMA3_AVX2_VECTOR_GROUP(VFMADDSUB);
+  FMA3_AVX2_VECTOR_GROUP(VFMSUBADD);
+
+  FMA3_AVX512_FULL_GROUP(VFMADD);
+  FMA3_AVX512_FULL_GROUP(VFMSUB);
+  FMA3_AVX512_FULL_GROUP(VFNMADD);
+  FMA3_AVX512_FULL_GROUP(VFNMSUB);
+
+  FMA3_AVX512_VECTOR_GROUP(VFMADDSUB);
+  FMA3_AVX512_VECTOR_GROUP(VFMSUBADD);
+}
+
+void X86InstrFMA3Info::initGroupsOnce() {
+  llvm::call_once(InitGroupsOnceFlag,
+                  []() { getX86InstrFMA3Info()->initGroupsOnceImpl(); });
+}
diff --git a/llvm/lib/Target/X86/X86InstrFMA3Info.h b/llvm/lib/Target/X86/X86InstrFMA3Info.h
new file mode 100644
index 0000000..987ff9e
--- /dev/null
+++ b/llvm/lib/Target/X86/X86InstrFMA3Info.h
@@ -0,0 +1,315 @@
+//===-- X86InstrFMA3Info.h - X86 FMA3 Instruction Information -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the classes providing information
+// about existing X86 FMA3 opcodes, classifying and grouping them.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_UTILS_X86INSTRFMA3INFO_H
+#define LLVM_LIB_TARGET_X86_UTILS_X86INSTRFMA3INFO_H
+
+#include "X86.h"
+#include "llvm/ADT/DenseMap.h"
+#include <cassert>
+#include <set>
+
+using namespace llvm;
+
+/// This class is used to group {132, 213, 231} forms of FMA opcodes together.
+/// Each of the groups has either 3 register opcodes, 3 memory opcodes,
+/// or 6 register and memory opcodes. Also, each group has an attributes field
+/// describing it.
+class X86InstrFMA3Group {
+private:
+  /// Reference to an array holding 3 forms of register FMA opcodes.
+  /// It may be set to nullptr if the group of FMA opcodes does not have
+  /// any register form opcodes.
+  const uint16_t *RegOpcodes;
+
+  /// Reference to an array holding 3 forms of memory FMA opcodes.
+  /// It may be set to nullptr if the group of FMA opcodes does not have
+  /// any memory form opcodes.
+  const uint16_t *MemOpcodes;
+
+  /// This bitfield specifies the attributes associated with the created
+  /// FMA groups of opcodes.
+  unsigned Attributes;
+
+  static const unsigned Form132 = 0;
+  static const unsigned Form213 = 1;
+  static const unsigned Form231 = 2;
+
+public:
+  /// This bit must be set in the 'Attributes' field of FMA group if such
+  /// group of FMA opcodes consists of FMA intrinsic opcodes.
+  static const unsigned X86FMA3Intrinsic = 0x1;
+
+  /// This bit must be set in the 'Attributes' field of FMA group if such
+  /// group of FMA opcodes consists of AVX512 opcodes accepting a k-mask and
+  /// passing the elements from the 1st operand to the result of the operation
+/// when the corresponding bits in the k-mask are unset.
+  static const unsigned X86FMA3KMergeMasked = 0x2;
+
+  /// This bit must be set in the 'Attributes' field of FMA group if such
+  /// group of FMA opcodes consists of AVX512 opcodes accepting a k-zeromask.
+  static const unsigned X86FMA3KZeroMasked = 0x4;
+
+  /// Constructor. Creates a new group of FMA opcodes with three register form
+  /// FMA opcodes \p RegOpcodes and three memory form FMA opcodes \p MemOpcodes.
+  /// The parameters \p RegOpcodes and \p MemOpcodes may be set to nullptr,
+  /// which means that the created group of FMA opcodes does not have the
+  /// corresponding (register or memory) opcodes.
+  /// The parameter \p Attr specifies the attributes describing the created
+  /// group.
+  X86InstrFMA3Group(const uint16_t *RegOpcodes, const uint16_t *MemOpcodes,
+                    unsigned Attr)
+      : RegOpcodes(RegOpcodes), MemOpcodes(MemOpcodes), Attributes(Attr) {
+    assert((RegOpcodes || MemOpcodes) &&
+           "Cannot create a group not having any opcodes.");
+  }
+
+  /// Returns a memory form opcode that is the equivalent of the given register
+  /// form opcode \p RegOpcode. 0 is returned if the group does not have
+/// either register or memory opcodes.
+  unsigned getMemOpcode(unsigned RegOpcode) const {
+    if (!RegOpcodes || !MemOpcodes)
+      return 0;
+    for (unsigned Form = 0; Form < 3; Form++)
+      if (RegOpcodes[Form] == RegOpcode)
+        return MemOpcodes[Form];
+    return 0;
+  }
+
+  /// Returns the 132 form of FMA register opcode.
+  unsigned getReg132Opcode() const {
+    assert(RegOpcodes && "The group does not have register opcodes.");
+    return RegOpcodes[Form132];
+  }
+
+  /// Returns the 213 form of FMA register opcode.
+  unsigned getReg213Opcode() const {
+    assert(RegOpcodes && "The group does not have register opcodes.");
+    return RegOpcodes[Form213];
+  }
+
+  /// Returns the 231 form of FMA register opcode.
+  unsigned getReg231Opcode() const {
+    assert(RegOpcodes && "The group does not have register opcodes.");
+    return RegOpcodes[Form231];
+  }
+
+  /// Returns the 132 form of FMA memory opcode.
+  unsigned getMem132Opcode() const {
+    assert(MemOpcodes && "The group does not have memory opcodes.");
+    return MemOpcodes[Form132];
+  }
+
+  /// Returns the 213 form of FMA memory opcode.
+  unsigned getMem213Opcode() const {
+    assert(MemOpcodes && "The group does not have memory opcodes.");
+    return MemOpcodes[Form213];
+  }
+
+  /// Returns the 231 form of FMA memory opcode.
+  unsigned getMem231Opcode() const {
+    assert(MemOpcodes && "The group does not have memory opcodes.");
+    return MemOpcodes[Form231];
+  }
+
+  /// Returns true iff the group of FMA opcodes holds intrinsic opcodes.
+  bool isIntrinsic() const { return (Attributes & X86FMA3Intrinsic) != 0; }
+
+  /// Returns true iff the group of FMA opcodes holds k-merge-masked opcodes.
+  bool isKMergeMasked() const {
+    return (Attributes & X86FMA3KMergeMasked) != 0;
+  }
+
+  /// Returns true iff the group of FMA opcodes holds k-zero-masked opcodes.
+  bool isKZeroMasked() const { return (Attributes & X86FMA3KZeroMasked) != 0; }
+
+  /// Returns true iff the group of FMA opcodes holds any of k-masked opcodes.
+  bool isKMasked() const {
+    return (Attributes & (X86FMA3KMergeMasked | X86FMA3KZeroMasked)) != 0;
+  }
+
+  /// Returns true iff the given \p Opcode is a register opcode from the
+  /// groups of FMA opcodes.
+  bool isRegOpcodeFromGroup(unsigned Opcode) const {
+    if (!RegOpcodes)
+      return false;
+    for (unsigned Form = 0; Form < 3; Form++)
+      if (Opcode == RegOpcodes[Form])
+        return true;
+    return false;
+  }
+
+  /// Returns true iff the given \p Opcode is a memory opcode from the
+  /// groups of FMA opcodes.
+  bool isMemOpcodeFromGroup(unsigned Opcode) const {
+    if (!MemOpcodes)
+      return false;
+    for (unsigned Form = 0; Form < 3; Form++)
+      if (Opcode == MemOpcodes[Form])
+        return true;
+    return false;
+  }
+};
+
+/// This class provides information about all existing FMA3 opcodes
+///
+class X86InstrFMA3Info {
+private:
+  /// A map that is used to find the group of FMA opcodes using any FMA opcode
+  /// from the group.
+  DenseMap<unsigned, const X86InstrFMA3Group *> OpcodeToGroup;
+
+  /// Creates groups of FMA opcodes and initializes Opcode-to-Group map.
+  /// This method can be called many times, but the actual initialization is
+  /// called only once.
+  static void initGroupsOnce();
+
+  /// Creates groups of FMA opcodes and initializes Opcode-to-Group map.
+  /// This method must be called ONLY from initGroupsOnce(). Otherwise, such
+  /// call is not thread safe.
+  void initGroupsOnceImpl();
+
+  /// Creates one group of FMA opcodes having the register opcodes
+  /// \p RegOpcodes and memory opcodes \p MemOpcodes. The parameter \p Attr
+  /// specifies the attributes describing the created group.
+  void initRMGroup(const uint16_t *RegOpcodes,
+                   const uint16_t *MemOpcodes, unsigned Attr = 0);
+
+  /// Creates one group of FMA opcodes having only the register opcodes
+  /// \p RegOpcodes. The parameter \p Attr specifies the attributes describing
+  /// the created group.
+  void initRGroup(const uint16_t *RegOpcodes, unsigned Attr = 0);
+
+  /// Creates one group of FMA opcodes having only the memory opcodes
+  /// \p MemOpcodes. The parameter \p Attr specifies the attributes describing
+  /// the created group.
+  void initMGroup(const uint16_t *MemOpcodes, unsigned Attr = 0);
+
+public:
+  /// Returns the reference to an object of this class. It is assumed that
+  /// only one object may exist.
+  static X86InstrFMA3Info *getX86InstrFMA3Info();
+
+  /// Constructor. Just creates an object of the class.
+  X86InstrFMA3Info() {}
+
+  /// Destructor. Deallocates the memory used for FMA3 Groups.
+  ~X86InstrFMA3Info() {
+    std::set<const X86InstrFMA3Group *> DeletedGroups;
+    auto E = OpcodeToGroup.end();
+    for (auto I = OpcodeToGroup.begin(); I != E; I++) {
+      const X86InstrFMA3Group *G = I->second;
+      if (DeletedGroups.find(G) == DeletedGroups.end()) {
+        DeletedGroups.insert(G);
+        delete G;
+      }
+    }
+  }
+
+  /// Returns a reference to a group of FMA3 opcodes to where the given
+  /// \p Opcode is included. If the given \p Opcode is not recognized as FMA3
+  /// and not included into any FMA3 group, then nullptr is returned.
+  static const X86InstrFMA3Group *getFMA3Group(unsigned Opcode) {
+    // Ensure that the groups of opcodes are initialized.
+    initGroupsOnce();
+
+    // Find the group including the given opcode.
+    const X86InstrFMA3Info *FMA3Info = getX86InstrFMA3Info();
+    auto I = FMA3Info->OpcodeToGroup.find(Opcode);
+    if (I == FMA3Info->OpcodeToGroup.end())
+      return nullptr;
+
+    return I->second;
+  }
+
+  /// Returns true iff the given \p Opcode is recognized as FMA3 by this class.
+  static bool isFMA3(unsigned Opcode) {
+    return getFMA3Group(Opcode) != nullptr;
+  }
+
+  /// Iterator that is used to walk on FMA register opcodes having memory
+  /// form equivalents.
+  class rm_iterator {
+  private:
+    /// Iterator associated with the OpcodeToGroup map. It must always be
+    /// initialized with an entry from OpcodeToGroup for which I->first
+    /// points to a register FMA opcode and I->second points to a group of
+    /// FMA opcodes having memory form equivalent of I->first.
+    DenseMap<unsigned, const X86InstrFMA3Group *>::const_iterator I;
+
+  public:
+    /// Constructor. Creates rm_iterator. The parameter \p I must be an
+    /// iterator to OpcodeToGroup map entry having I->first pointing to
+    /// register form FMA opcode and I->second pointing to a group of FMA
+    /// opcodes holding memory form equivalent for I->first.
+    rm_iterator(DenseMap<unsigned, const X86InstrFMA3Group *>::const_iterator I)
+        : I(I) {}
+
+    /// Returns the register form FMA opcode.
+    unsigned getRegOpcode() const { return I->first; };
+
+    /// Returns the memory form equivalent opcode for FMA register opcode
+    /// referenced by I->first.
+    unsigned getMemOpcode() const {
+      unsigned Opcode = I->first;
+      const X86InstrFMA3Group *Group = I->second;
+      return Group->getMemOpcode(Opcode);
+    }
+
+    /// Returns a reference to a group of FMA opcodes.
+    const X86InstrFMA3Group *getGroup() const { return I->second; }
+
+    bool operator==(const rm_iterator &OtherIt) const { return I == OtherIt.I; }
+    bool operator!=(const rm_iterator &OtherIt) const { return I != OtherIt.I; }
+
+    /// Increment. Advances the 'I' iterator to the next OpcodeToGroup entry
+    /// having I->first pointing to register form FMA and I->second pointing
+    /// to a group of FMA opcodes holding memory form equivalent for I->first.
+    rm_iterator &operator++() {
+      auto E = getX86InstrFMA3Info()->OpcodeToGroup.end();
+      for (++I; I != E; ++I) {
+        unsigned RegOpcode = I->first;
+        const X86InstrFMA3Group *Group = I->second;
+        if (Group->getMemOpcode(RegOpcode) != 0)
+          break;
+      }
+      return *this;
+    }
+  };
+
+  /// Returns rm_iterator pointing to the first entry of OpcodeToGroup map
+  /// with a register FMA opcode having memory form opcode equivalent.
+  static rm_iterator rm_begin() {
+    initGroupsOnce();
+    const X86InstrFMA3Info *FMA3Info = getX86InstrFMA3Info();
+    auto I = FMA3Info->OpcodeToGroup.begin();
+    auto E = FMA3Info->OpcodeToGroup.end();
+    while (I != E) {
+      unsigned Opcode = I->first;
+      const X86InstrFMA3Group *G = I->second;
+      if (G->getMemOpcode(Opcode) != 0)
+        break;
+      I++;
+    }
+    return rm_iterator(I);
+  }
+
+  /// Returns the last rm_iterator.
+  static rm_iterator rm_end() {
+    initGroupsOnce();
+    return rm_iterator(getX86InstrFMA3Info()->OpcodeToGroup.end());
+  }
+};
+
+#endif
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 9a83c09..9df179d 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -1855,281 +1855,6 @@
   }
 
   static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
-    // FMA foldable instructions
-    { X86::VFMADD231SSr,          X86::VFMADD231SSm,          TB_ALIGN_NONE },
-    { X86::VFMADD231SSr_Int,      X86::VFMADD231SSm_Int,      TB_ALIGN_NONE },
-    { X86::VFMADD231SDr,          X86::VFMADD231SDm,          TB_ALIGN_NONE },
-    { X86::VFMADD231SDr_Int,      X86::VFMADD231SDm_Int,      TB_ALIGN_NONE },
-    { X86::VFMADD132SSr,          X86::VFMADD132SSm,          TB_ALIGN_NONE },
-    { X86::VFMADD132SSr_Int,      X86::VFMADD132SSm_Int,      TB_ALIGN_NONE },
-    { X86::VFMADD132SDr,          X86::VFMADD132SDm,          TB_ALIGN_NONE },
-    { X86::VFMADD132SDr_Int,      X86::VFMADD132SDm_Int,      TB_ALIGN_NONE },
-    { X86::VFMADD213SSr,          X86::VFMADD213SSm,          TB_ALIGN_NONE },
-    { X86::VFMADD213SSr_Int,      X86::VFMADD213SSm_Int,      TB_ALIGN_NONE },
-    { X86::VFMADD213SDr,          X86::VFMADD213SDm,          TB_ALIGN_NONE },
-    { X86::VFMADD213SDr_Int,      X86::VFMADD213SDm_Int,      TB_ALIGN_NONE },
-    { X86::VFMADD231SSZr,         X86::VFMADD231SSZm,         TB_ALIGN_NONE },
-    { X86::VFMADD231SSZr_Int,     X86::VFMADD231SSZm_Int,     TB_ALIGN_NONE },
-    { X86::VFMADD231SDZr,         X86::VFMADD231SDZm,         TB_ALIGN_NONE },
-    { X86::VFMADD231SDZr_Int,     X86::VFMADD231SDZm_Int,     TB_ALIGN_NONE },
-    { X86::VFMADD132SSZr,         X86::VFMADD132SSZm,         TB_ALIGN_NONE },
-    { X86::VFMADD132SSZr_Int,     X86::VFMADD132SSZm_Int,     TB_ALIGN_NONE },
-    { X86::VFMADD132SDZr,         X86::VFMADD132SDZm,         TB_ALIGN_NONE },
-    { X86::VFMADD132SDZr_Int,     X86::VFMADD132SDZm_Int,     TB_ALIGN_NONE },
-    { X86::VFMADD213SSZr,         X86::VFMADD213SSZm,         TB_ALIGN_NONE },
-    { X86::VFMADD213SSZr_Int,     X86::VFMADD213SSZm_Int,     TB_ALIGN_NONE },
-    { X86::VFMADD213SDZr,         X86::VFMADD213SDZm,         TB_ALIGN_NONE },
-    { X86::VFMADD213SDZr_Int,     X86::VFMADD213SDZm_Int,     TB_ALIGN_NONE },
-
-    { X86::VFMADD231PSr,          X86::VFMADD231PSm,          TB_ALIGN_NONE },
-    { X86::VFMADD231PDr,          X86::VFMADD231PDm,          TB_ALIGN_NONE },
-    { X86::VFMADD132PSr,          X86::VFMADD132PSm,          TB_ALIGN_NONE },
-    { X86::VFMADD132PDr,          X86::VFMADD132PDm,          TB_ALIGN_NONE },
-    { X86::VFMADD213PSr,          X86::VFMADD213PSm,          TB_ALIGN_NONE },
-    { X86::VFMADD213PDr,          X86::VFMADD213PDm,          TB_ALIGN_NONE },
-    { X86::VFMADD231PSYr,         X86::VFMADD231PSYm,         TB_ALIGN_NONE },
-    { X86::VFMADD231PDYr,         X86::VFMADD231PDYm,         TB_ALIGN_NONE },
-    { X86::VFMADD132PSYr,         X86::VFMADD132PSYm,         TB_ALIGN_NONE },
-    { X86::VFMADD132PDYr,         X86::VFMADD132PDYm,         TB_ALIGN_NONE },
-    { X86::VFMADD213PSYr,         X86::VFMADD213PSYm,         TB_ALIGN_NONE },
-    { X86::VFMADD213PDYr,         X86::VFMADD213PDYm,         TB_ALIGN_NONE },
-    { X86::VFMADD231PSZr,         X86::VFMADD231PSZm,         TB_ALIGN_NONE },
-    { X86::VFMADD231PDZr,         X86::VFMADD231PDZm,         TB_ALIGN_NONE },
-    { X86::VFMADD132PSZr,         X86::VFMADD132PSZm,         TB_ALIGN_NONE },
-    { X86::VFMADD132PDZr,         X86::VFMADD132PDZm,         TB_ALIGN_NONE },
-    { X86::VFMADD213PSZr,         X86::VFMADD213PSZm,         TB_ALIGN_NONE },
-    { X86::VFMADD213PDZr,         X86::VFMADD213PDZm,         TB_ALIGN_NONE },
-    { X86::VFMADD231PSZ128r,      X86::VFMADD231PSZ128m,      TB_ALIGN_NONE },
-    { X86::VFMADD231PDZ128r,      X86::VFMADD231PDZ128m,      TB_ALIGN_NONE },
-    { X86::VFMADD132PSZ128r,      X86::VFMADD132PSZ128m,      TB_ALIGN_NONE },
-    { X86::VFMADD132PDZ128r,      X86::VFMADD132PDZ128m,      TB_ALIGN_NONE },
-    { X86::VFMADD213PSZ128r,      X86::VFMADD213PSZ128m,      TB_ALIGN_NONE },
-    { X86::VFMADD213PDZ128r,      X86::VFMADD213PDZ128m,      TB_ALIGN_NONE },
-    { X86::VFMADD231PSZ256r,      X86::VFMADD231PSZ256m,      TB_ALIGN_NONE },
-    { X86::VFMADD231PDZ256r,      X86::VFMADD231PDZ256m,      TB_ALIGN_NONE },
-    { X86::VFMADD132PSZ256r,      X86::VFMADD132PSZ256m,      TB_ALIGN_NONE },
-    { X86::VFMADD132PDZ256r,      X86::VFMADD132PDZ256m,      TB_ALIGN_NONE },
-    { X86::VFMADD213PSZ256r,      X86::VFMADD213PSZ256m,      TB_ALIGN_NONE },
-    { X86::VFMADD213PDZ256r,      X86::VFMADD213PDZ256m,      TB_ALIGN_NONE },
-
-    { X86::VFNMADD231SSr,         X86::VFNMADD231SSm,         TB_ALIGN_NONE },
-    { X86::VFNMADD231SSr_Int,     X86::VFNMADD231SSm_Int,     TB_ALIGN_NONE },
-    { X86::VFNMADD231SDr,         X86::VFNMADD231SDm,         TB_ALIGN_NONE },
-    { X86::VFNMADD231SDr_Int,     X86::VFNMADD231SDm_Int,     TB_ALIGN_NONE },
-    { X86::VFNMADD132SSr,         X86::VFNMADD132SSm,         TB_ALIGN_NONE },
-    { X86::VFNMADD132SSr_Int,     X86::VFNMADD132SSm_Int,     TB_ALIGN_NONE },
-    { X86::VFNMADD132SDr,         X86::VFNMADD132SDm,         TB_ALIGN_NONE },
-    { X86::VFNMADD132SDr_Int,     X86::VFNMADD132SDm_Int,     TB_ALIGN_NONE },
-    { X86::VFNMADD213SSr,         X86::VFNMADD213SSm,         TB_ALIGN_NONE },
-    { X86::VFNMADD213SSr_Int,     X86::VFNMADD213SSm_Int,     TB_ALIGN_NONE },
-    { X86::VFNMADD213SDr,         X86::VFNMADD213SDm,         TB_ALIGN_NONE },
-    { X86::VFNMADD213SDr_Int,     X86::VFNMADD213SDm_Int,     TB_ALIGN_NONE },
-    { X86::VFNMADD231SSZr,        X86::VFNMADD231SSZm,        TB_ALIGN_NONE },
-    { X86::VFNMADD231SSZr_Int,    X86::VFNMADD231SSZm_Int,    TB_ALIGN_NONE },
-    { X86::VFNMADD231SDZr,        X86::VFNMADD231SDZm,        TB_ALIGN_NONE },
-    { X86::VFNMADD231SDZr_Int,    X86::VFNMADD231SDZm_Int,    TB_ALIGN_NONE },
-    { X86::VFNMADD132SSZr,        X86::VFNMADD132SSZm,        TB_ALIGN_NONE },
-    { X86::VFNMADD132SSZr_Int,    X86::VFNMADD132SSZm_Int,    TB_ALIGN_NONE },
-    { X86::VFNMADD132SDZr,        X86::VFNMADD132SDZm,        TB_ALIGN_NONE },
-    { X86::VFNMADD132SDZr_Int,    X86::VFNMADD132SDZm_Int,    TB_ALIGN_NONE },
-    { X86::VFNMADD213SSZr,        X86::VFNMADD213SSZm,        TB_ALIGN_NONE },
-    { X86::VFNMADD213SSZr_Int,    X86::VFNMADD213SSZm_Int,    TB_ALIGN_NONE },
-    { X86::VFNMADD213SDZr,        X86::VFNMADD213SDZm,        TB_ALIGN_NONE },
-    { X86::VFNMADD213SDZr_Int,    X86::VFNMADD213SDZm_Int,    TB_ALIGN_NONE },
-
-    { X86::VFNMADD231PSr,         X86::VFNMADD231PSm,         TB_ALIGN_NONE },
-    { X86::VFNMADD231PDr,         X86::VFNMADD231PDm,         TB_ALIGN_NONE },
-    { X86::VFNMADD132PSr,         X86::VFNMADD132PSm,         TB_ALIGN_NONE },
-    { X86::VFNMADD132PDr,         X86::VFNMADD132PDm,         TB_ALIGN_NONE },
-    { X86::VFNMADD213PSr,         X86::VFNMADD213PSm,         TB_ALIGN_NONE },
-    { X86::VFNMADD213PDr,         X86::VFNMADD213PDm,         TB_ALIGN_NONE },
-    { X86::VFNMADD231PSYr,        X86::VFNMADD231PSYm,        TB_ALIGN_NONE },
-    { X86::VFNMADD231PDYr,        X86::VFNMADD231PDYm,        TB_ALIGN_NONE },
-    { X86::VFNMADD132PSYr,        X86::VFNMADD132PSYm,        TB_ALIGN_NONE },
-    { X86::VFNMADD132PDYr,        X86::VFNMADD132PDYm,        TB_ALIGN_NONE },
-    { X86::VFNMADD213PSYr,        X86::VFNMADD213PSYm,        TB_ALIGN_NONE },
-    { X86::VFNMADD213PDYr,        X86::VFNMADD213PDYm,        TB_ALIGN_NONE },
-    { X86::VFNMADD231PSZr,        X86::VFNMADD231PSZm,        TB_ALIGN_NONE },
-    { X86::VFNMADD231PDZr,        X86::VFNMADD231PDZm,        TB_ALIGN_NONE },
-    { X86::VFNMADD132PSZr,        X86::VFNMADD132PSZm,        TB_ALIGN_NONE },
-    { X86::VFNMADD132PDZr,        X86::VFNMADD132PDZm,        TB_ALIGN_NONE },
-    { X86::VFNMADD213PSZr,        X86::VFNMADD213PSZm,        TB_ALIGN_NONE },
-    { X86::VFNMADD213PDZr,        X86::VFNMADD213PDZm,        TB_ALIGN_NONE },
-    { X86::VFNMADD231PSZ128r,     X86::VFNMADD231PSZ128m,     TB_ALIGN_NONE },
-    { X86::VFNMADD231PDZ128r,     X86::VFNMADD231PDZ128m,     TB_ALIGN_NONE },
-    { X86::VFNMADD132PSZ128r,     X86::VFNMADD132PSZ128m,     TB_ALIGN_NONE },
-    { X86::VFNMADD132PDZ128r,     X86::VFNMADD132PDZ128m,     TB_ALIGN_NONE },
-    { X86::VFNMADD213PSZ128r,     X86::VFNMADD213PSZ128m,     TB_ALIGN_NONE },
-    { X86::VFNMADD213PDZ128r,     X86::VFNMADD213PDZ128m,     TB_ALIGN_NONE },
-    { X86::VFNMADD231PSZ256r,     X86::VFNMADD231PSZ256m,     TB_ALIGN_NONE },
-    { X86::VFNMADD231PDZ256r,     X86::VFNMADD231PDZ256m,     TB_ALIGN_NONE },
-    { X86::VFNMADD132PSZ256r,     X86::VFNMADD132PSZ256m,     TB_ALIGN_NONE },
-    { X86::VFNMADD132PDZ256r,     X86::VFNMADD132PDZ256m,     TB_ALIGN_NONE },
-    { X86::VFNMADD213PSZ256r,     X86::VFNMADD213PSZ256m,     TB_ALIGN_NONE },
-    { X86::VFNMADD213PDZ256r,     X86::VFNMADD213PDZ256m,     TB_ALIGN_NONE },
-
-    { X86::VFMSUB231SSr,          X86::VFMSUB231SSm,          TB_ALIGN_NONE },
-    { X86::VFMSUB231SSr_Int,      X86::VFMSUB231SSm_Int,      TB_ALIGN_NONE },
-    { X86::VFMSUB231SDr,          X86::VFMSUB231SDm,          TB_ALIGN_NONE },
-    { X86::VFMSUB231SDr_Int,      X86::VFMSUB231SDm_Int,      TB_ALIGN_NONE },
-    { X86::VFMSUB132SSr,          X86::VFMSUB132SSm,          TB_ALIGN_NONE },
-    { X86::VFMSUB132SSr_Int,      X86::VFMSUB132SSm_Int,      TB_ALIGN_NONE },
-    { X86::VFMSUB132SDr,          X86::VFMSUB132SDm,          TB_ALIGN_NONE },
-    { X86::VFMSUB132SDr_Int,      X86::VFMSUB132SDm_Int,      TB_ALIGN_NONE },
-    { X86::VFMSUB213SSr,          X86::VFMSUB213SSm,          TB_ALIGN_NONE },
-    { X86::VFMSUB213SSr_Int,      X86::VFMSUB213SSm_Int,      TB_ALIGN_NONE },
-    { X86::VFMSUB213SDr,          X86::VFMSUB213SDm,          TB_ALIGN_NONE },
-    { X86::VFMSUB213SDr_Int,      X86::VFMSUB213SDm_Int,      TB_ALIGN_NONE },
-    { X86::VFMSUB231SSZr,         X86::VFMSUB231SSZm,         TB_ALIGN_NONE },
-    { X86::VFMSUB231SSZr_Int,     X86::VFMSUB231SSZm_Int,     TB_ALIGN_NONE },
-    { X86::VFMSUB231SDZr,         X86::VFMSUB231SDZm,         TB_ALIGN_NONE },
-    { X86::VFMSUB231SDZr_Int,     X86::VFMSUB231SDZm_Int,     TB_ALIGN_NONE },
-    { X86::VFMSUB132SSZr,         X86::VFMSUB132SSZm,         TB_ALIGN_NONE },
-    { X86::VFMSUB132SSZr_Int,     X86::VFMSUB132SSZm_Int,     TB_ALIGN_NONE },
-    { X86::VFMSUB132SDZr,         X86::VFMSUB132SDZm,         TB_ALIGN_NONE },
-    { X86::VFMSUB132SDZr_Int,     X86::VFMSUB132SDZm_Int,     TB_ALIGN_NONE },
-    { X86::VFMSUB213SSZr,         X86::VFMSUB213SSZm,         TB_ALIGN_NONE },
-    { X86::VFMSUB213SSZr_Int,     X86::VFMSUB213SSZm_Int,     TB_ALIGN_NONE },
-    { X86::VFMSUB213SDZr,         X86::VFMSUB213SDZm,         TB_ALIGN_NONE },
-    { X86::VFMSUB213SDZr_Int,     X86::VFMSUB213SDZm_Int,     TB_ALIGN_NONE },
-
-    { X86::VFMSUB231PSr,          X86::VFMSUB231PSm,          TB_ALIGN_NONE },
-    { X86::VFMSUB231PDr,          X86::VFMSUB231PDm,          TB_ALIGN_NONE },
-    { X86::VFMSUB132PSr,          X86::VFMSUB132PSm,          TB_ALIGN_NONE },
-    { X86::VFMSUB132PDr,          X86::VFMSUB132PDm,          TB_ALIGN_NONE },
-    { X86::VFMSUB213PSr,          X86::VFMSUB213PSm,          TB_ALIGN_NONE },
-    { X86::VFMSUB213PDr,          X86::VFMSUB213PDm,          TB_ALIGN_NONE },
-    { X86::VFMSUB231PSYr,         X86::VFMSUB231PSYm,         TB_ALIGN_NONE },
-    { X86::VFMSUB231PDYr,         X86::VFMSUB231PDYm,         TB_ALIGN_NONE },
-    { X86::VFMSUB132PSYr,         X86::VFMSUB132PSYm,         TB_ALIGN_NONE },
-    { X86::VFMSUB132PDYr,         X86::VFMSUB132PDYm,         TB_ALIGN_NONE },
-    { X86::VFMSUB213PSYr,         X86::VFMSUB213PSYm,         TB_ALIGN_NONE },
-    { X86::VFMSUB213PDYr,         X86::VFMSUB213PDYm,         TB_ALIGN_NONE },
-    { X86::VFMSUB231PSZr,         X86::VFMSUB231PSZm,         TB_ALIGN_NONE },
-    { X86::VFMSUB231PDZr,         X86::VFMSUB231PDZm,         TB_ALIGN_NONE },
-    { X86::VFMSUB132PSZr,         X86::VFMSUB132PSZm,         TB_ALIGN_NONE },
-    { X86::VFMSUB132PDZr,         X86::VFMSUB132PDZm,         TB_ALIGN_NONE },
-    { X86::VFMSUB213PSZr,         X86::VFMSUB213PSZm,         TB_ALIGN_NONE },
-    { X86::VFMSUB213PDZr,         X86::VFMSUB213PDZm,         TB_ALIGN_NONE },
-    { X86::VFMSUB231PSZ128r,      X86::VFMSUB231PSZ128m,      TB_ALIGN_NONE },
-    { X86::VFMSUB231PDZ128r,      X86::VFMSUB231PDZ128m,      TB_ALIGN_NONE },
-    { X86::VFMSUB132PSZ128r,      X86::VFMSUB132PSZ128m,      TB_ALIGN_NONE },
-    { X86::VFMSUB132PDZ128r,      X86::VFMSUB132PDZ128m,      TB_ALIGN_NONE },
-    { X86::VFMSUB213PSZ128r,      X86::VFMSUB213PSZ128m,      TB_ALIGN_NONE },
-    { X86::VFMSUB213PDZ128r,      X86::VFMSUB213PDZ128m,      TB_ALIGN_NONE },
-    { X86::VFMSUB231PSZ256r,      X86::VFMSUB231PSZ256m,      TB_ALIGN_NONE },
-    { X86::VFMSUB231PDZ256r,      X86::VFMSUB231PDZ256m,      TB_ALIGN_NONE },
-    { X86::VFMSUB132PSZ256r,      X86::VFMSUB132PSZ256m,      TB_ALIGN_NONE },
-    { X86::VFMSUB132PDZ256r,      X86::VFMSUB132PDZ256m,      TB_ALIGN_NONE },
-    { X86::VFMSUB213PSZ256r,      X86::VFMSUB213PSZ256m,      TB_ALIGN_NONE },
-    { X86::VFMSUB213PDZ256r,      X86::VFMSUB213PDZ256m,      TB_ALIGN_NONE },
-
-    { X86::VFNMSUB231SSr,         X86::VFNMSUB231SSm,         TB_ALIGN_NONE },
-    { X86::VFNMSUB231SSr_Int,     X86::VFNMSUB231SSm_Int,     TB_ALIGN_NONE },
-    { X86::VFNMSUB231SDr,         X86::VFNMSUB231SDm,         TB_ALIGN_NONE },
-    { X86::VFNMSUB231SDr_Int,     X86::VFNMSUB231SDm_Int,     TB_ALIGN_NONE },
-    { X86::VFNMSUB132SSr,         X86::VFNMSUB132SSm,         TB_ALIGN_NONE },
-    { X86::VFNMSUB132SSr_Int,     X86::VFNMSUB132SSm_Int,     TB_ALIGN_NONE },
-    { X86::VFNMSUB132SDr,         X86::VFNMSUB132SDm,         TB_ALIGN_NONE },
-    { X86::VFNMSUB132SDr_Int,     X86::VFNMSUB132SDm_Int,     TB_ALIGN_NONE },
-    { X86::VFNMSUB213SSr,         X86::VFNMSUB213SSm,         TB_ALIGN_NONE },
-    { X86::VFNMSUB213SSr_Int,     X86::VFNMSUB213SSm_Int,     TB_ALIGN_NONE },
-    { X86::VFNMSUB213SDr,         X86::VFNMSUB213SDm,         TB_ALIGN_NONE },
-    { X86::VFNMSUB213SDr_Int,     X86::VFNMSUB213SDm_Int,     TB_ALIGN_NONE },
-
-    { X86::VFNMSUB231PSr,         X86::VFNMSUB231PSm,         TB_ALIGN_NONE },
-    { X86::VFNMSUB231PDr,         X86::VFNMSUB231PDm,         TB_ALIGN_NONE },
-    { X86::VFNMSUB132PSr,         X86::VFNMSUB132PSm,         TB_ALIGN_NONE },
-    { X86::VFNMSUB132PDr,         X86::VFNMSUB132PDm,         TB_ALIGN_NONE },
-    { X86::VFNMSUB213PSr,         X86::VFNMSUB213PSm,         TB_ALIGN_NONE },
-    { X86::VFNMSUB213PDr,         X86::VFNMSUB213PDm,         TB_ALIGN_NONE },
-    { X86::VFNMSUB231PSYr,        X86::VFNMSUB231PSYm,        TB_ALIGN_NONE },
-    { X86::VFNMSUB231PDYr,        X86::VFNMSUB231PDYm,        TB_ALIGN_NONE },
-    { X86::VFNMSUB132PSYr,        X86::VFNMSUB132PSYm,        TB_ALIGN_NONE },
-    { X86::VFNMSUB132PDYr,        X86::VFNMSUB132PDYm,        TB_ALIGN_NONE },
-    { X86::VFNMSUB213PSYr,        X86::VFNMSUB213PSYm,        TB_ALIGN_NONE },
-    { X86::VFNMSUB213PDYr,        X86::VFNMSUB213PDYm,        TB_ALIGN_NONE },
-    { X86::VFNMSUB231PSZr,        X86::VFNMSUB231PSZm,        TB_ALIGN_NONE },
-    { X86::VFNMSUB231PDZr,        X86::VFNMSUB231PDZm,        TB_ALIGN_NONE },
-    { X86::VFNMSUB132PSZr,        X86::VFNMSUB132PSZm,        TB_ALIGN_NONE },
-    { X86::VFNMSUB132PDZr,        X86::VFNMSUB132PDZm,        TB_ALIGN_NONE },
-    { X86::VFNMSUB213PSZr,        X86::VFNMSUB213PSZm,        TB_ALIGN_NONE },
-    { X86::VFNMSUB213PDZr,        X86::VFNMSUB213PDZm,        TB_ALIGN_NONE },
-    { X86::VFNMSUB231PSZ128r,     X86::VFNMSUB231PSZ128m,     TB_ALIGN_NONE },
-    { X86::VFNMSUB231PDZ128r,     X86::VFNMSUB231PDZ128m,     TB_ALIGN_NONE },
-    { X86::VFNMSUB132PSZ128r,     X86::VFNMSUB132PSZ128m,     TB_ALIGN_NONE },
-    { X86::VFNMSUB132PDZ128r,     X86::VFNMSUB132PDZ128m,     TB_ALIGN_NONE },
-    { X86::VFNMSUB213PSZ128r,     X86::VFNMSUB213PSZ128m,     TB_ALIGN_NONE },
-    { X86::VFNMSUB213PDZ128r,     X86::VFNMSUB213PDZ128m,     TB_ALIGN_NONE },
-    { X86::VFNMSUB231PSZ256r,     X86::VFNMSUB231PSZ256m,     TB_ALIGN_NONE },
-    { X86::VFNMSUB231PDZ256r,     X86::VFNMSUB231PDZ256m,     TB_ALIGN_NONE },
-    { X86::VFNMSUB132PSZ256r,     X86::VFNMSUB132PSZ256m,     TB_ALIGN_NONE },
-    { X86::VFNMSUB132PDZ256r,     X86::VFNMSUB132PDZ256m,     TB_ALIGN_NONE },
-    { X86::VFNMSUB213PSZ256r,     X86::VFNMSUB213PSZ256m,     TB_ALIGN_NONE },
-    { X86::VFNMSUB213PDZ256r,     X86::VFNMSUB213PDZ256m,     TB_ALIGN_NONE },
-
-    { X86::VFMADDSUB231PSr,       X86::VFMADDSUB231PSm,       TB_ALIGN_NONE },
-    { X86::VFMADDSUB231PDr,       X86::VFMADDSUB231PDm,       TB_ALIGN_NONE },
-    { X86::VFMADDSUB132PSr,       X86::VFMADDSUB132PSm,       TB_ALIGN_NONE },
-    { X86::VFMADDSUB132PDr,       X86::VFMADDSUB132PDm,       TB_ALIGN_NONE },
-    { X86::VFMADDSUB213PSr,       X86::VFMADDSUB213PSm,       TB_ALIGN_NONE },
-    { X86::VFMADDSUB213PDr,       X86::VFMADDSUB213PDm,       TB_ALIGN_NONE },
-    { X86::VFMADDSUB231PSYr,      X86::VFMADDSUB231PSYm,      TB_ALIGN_NONE },
-    { X86::VFMADDSUB231PDYr,      X86::VFMADDSUB231PDYm,      TB_ALIGN_NONE },
-    { X86::VFMADDSUB132PSYr,      X86::VFMADDSUB132PSYm,      TB_ALIGN_NONE },
-    { X86::VFMADDSUB132PDYr,      X86::VFMADDSUB132PDYm,      TB_ALIGN_NONE },
-    { X86::VFMADDSUB213PSYr,      X86::VFMADDSUB213PSYm,      TB_ALIGN_NONE },
-    { X86::VFMADDSUB213PDYr,      X86::VFMADDSUB213PDYm,      TB_ALIGN_NONE },
-    { X86::VFMADDSUB231PSZr,      X86::VFMADDSUB231PSZm,      TB_ALIGN_NONE },
-    { X86::VFMADDSUB231PDZr,      X86::VFMADDSUB231PDZm,      TB_ALIGN_NONE },
-    { X86::VFMADDSUB132PSZr,      X86::VFMADDSUB132PSZm,      TB_ALIGN_NONE },
-    { X86::VFMADDSUB132PDZr,      X86::VFMADDSUB132PDZm,      TB_ALIGN_NONE },
-    { X86::VFMADDSUB213PSZr,      X86::VFMADDSUB213PSZm,      TB_ALIGN_NONE },
-    { X86::VFMADDSUB213PDZr,      X86::VFMADDSUB213PDZm,      TB_ALIGN_NONE },
-    { X86::VFMADDSUB231PSZ128r,   X86::VFMADDSUB231PSZ128m,   TB_ALIGN_NONE },
-    { X86::VFMADDSUB231PDZ128r,   X86::VFMADDSUB231PDZ128m,   TB_ALIGN_NONE },
-    { X86::VFMADDSUB132PSZ128r,   X86::VFMADDSUB132PSZ128m,   TB_ALIGN_NONE },
-    { X86::VFMADDSUB132PDZ128r,   X86::VFMADDSUB132PDZ128m,   TB_ALIGN_NONE },
-    { X86::VFMADDSUB213PSZ128r,   X86::VFMADDSUB213PSZ128m,   TB_ALIGN_NONE },
-    { X86::VFMADDSUB213PDZ128r,   X86::VFMADDSUB213PDZ128m,   TB_ALIGN_NONE },
-    { X86::VFMADDSUB231PSZ256r,   X86::VFMADDSUB231PSZ256m,   TB_ALIGN_NONE },
-    { X86::VFMADDSUB231PDZ256r,   X86::VFMADDSUB231PDZ256m,   TB_ALIGN_NONE },
-    { X86::VFMADDSUB132PSZ256r,   X86::VFMADDSUB132PSZ256m,   TB_ALIGN_NONE },
-    { X86::VFMADDSUB132PDZ256r,   X86::VFMADDSUB132PDZ256m,   TB_ALIGN_NONE },
-    { X86::VFMADDSUB213PSZ256r,   X86::VFMADDSUB213PSZ256m,   TB_ALIGN_NONE },
-    { X86::VFMADDSUB213PDZ256r,   X86::VFMADDSUB213PDZ256m,   TB_ALIGN_NONE },
-
-    { X86::VFMSUBADD231PSr,       X86::VFMSUBADD231PSm,       TB_ALIGN_NONE },
-    { X86::VFMSUBADD231PDr,       X86::VFMSUBADD231PDm,       TB_ALIGN_NONE },
-    { X86::VFMSUBADD132PSr,       X86::VFMSUBADD132PSm,       TB_ALIGN_NONE },
-    { X86::VFMSUBADD132PDr,       X86::VFMSUBADD132PDm,       TB_ALIGN_NONE },
-    { X86::VFMSUBADD213PSr,       X86::VFMSUBADD213PSm,       TB_ALIGN_NONE },
-    { X86::VFMSUBADD213PDr,       X86::VFMSUBADD213PDm,       TB_ALIGN_NONE },
-    { X86::VFMSUBADD231PSYr,      X86::VFMSUBADD231PSYm,      TB_ALIGN_NONE },
-    { X86::VFMSUBADD231PDYr,      X86::VFMSUBADD231PDYm,      TB_ALIGN_NONE },
-    { X86::VFMSUBADD132PSYr,      X86::VFMSUBADD132PSYm,      TB_ALIGN_NONE },
-    { X86::VFMSUBADD132PDYr,      X86::VFMSUBADD132PDYm,      TB_ALIGN_NONE },
-    { X86::VFMSUBADD213PSYr,      X86::VFMSUBADD213PSYm,      TB_ALIGN_NONE },
-    { X86::VFMSUBADD213PDYr,      X86::VFMSUBADD213PDYm,      TB_ALIGN_NONE },
-    { X86::VFMSUBADD231PSZr,      X86::VFMSUBADD231PSZm,      TB_ALIGN_NONE },
-    { X86::VFMSUBADD231PDZr,      X86::VFMSUBADD231PDZm,      TB_ALIGN_NONE },
-    { X86::VFMSUBADD132PSZr,      X86::VFMSUBADD132PSZm,      TB_ALIGN_NONE },
-    { X86::VFMSUBADD132PDZr,      X86::VFMSUBADD132PDZm,      TB_ALIGN_NONE },
-    { X86::VFMSUBADD213PSZr,      X86::VFMSUBADD213PSZm,      TB_ALIGN_NONE },
-    { X86::VFMSUBADD213PDZr,      X86::VFMSUBADD213PDZm,      TB_ALIGN_NONE },
-    { X86::VFMSUBADD231PSZ128r,   X86::VFMSUBADD231PSZ128m,   TB_ALIGN_NONE },
-    { X86::VFMSUBADD231PDZ128r,   X86::VFMSUBADD231PDZ128m,   TB_ALIGN_NONE },
-    { X86::VFMSUBADD132PSZ128r,   X86::VFMSUBADD132PSZ128m,   TB_ALIGN_NONE },
-    { X86::VFMSUBADD132PDZ128r,   X86::VFMSUBADD132PDZ128m,   TB_ALIGN_NONE },
-    { X86::VFMSUBADD213PSZ128r,   X86::VFMSUBADD213PSZ128m,   TB_ALIGN_NONE },
-    { X86::VFMSUBADD213PDZ128r,   X86::VFMSUBADD213PDZ128m,   TB_ALIGN_NONE },
-    { X86::VFMSUBADD231PSZ256r,   X86::VFMSUBADD231PSZ256m,   TB_ALIGN_NONE },
-    { X86::VFMSUBADD231PDZ256r,   X86::VFMSUBADD231PDZ256m,   TB_ALIGN_NONE },
-    { X86::VFMSUBADD132PSZ256r,   X86::VFMSUBADD132PSZ256m,   TB_ALIGN_NONE },
-    { X86::VFMSUBADD132PDZ256r,   X86::VFMSUBADD132PDZ256m,   TB_ALIGN_NONE },
-    { X86::VFMSUBADD213PSZ256r,   X86::VFMSUBADD213PSZ256m,   TB_ALIGN_NONE },
-    { X86::VFMSUBADD213PDZ256r,   X86::VFMSUBADD213PDZ256m,   TB_ALIGN_NONE },
-
     // FMA4 foldable patterns
     { X86::VFMADDSS4rr,           X86::VFMADDSS4rm,           TB_ALIGN_NONE },
     { X86::VFMADDSD4rr,           X86::VFMADDSD4rm,           TB_ALIGN_NONE },
@@ -2234,6 +1959,13 @@
                   // Index 3, folded load
                   Entry.Flags | TB_INDEX_3 | TB_FOLDED_LOAD);
   }
+  auto I = X86InstrFMA3Info::rm_begin();
+  auto E = X86InstrFMA3Info::rm_end();
+  for (; I != E; ++I)
+    if (!I.getGroup()->isKMasked())
+      AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
+                    I.getRegOpcode(), I.getMemOpcode(),
+                    TB_ALIGN_NONE | TB_INDEX_3 | TB_FOLDED_LOAD);
 
   static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
      // AVX-512 foldable instructions
@@ -2283,6 +2015,11 @@
                   // Index 4, folded load
                   Entry.Flags | TB_INDEX_4 | TB_FOLDED_LOAD);
   }
+  for (I = X86InstrFMA3Info::rm_begin(); I != E; ++I)
+    if (I.getGroup()->isKMasked())
+      AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
+                    I.getRegOpcode(), I.getMemOpcode(),
+                    TB_ALIGN_NONE | TB_INDEX_4 | TB_FOLDED_LOAD);
 }
 
 void
@@ -3345,241 +3082,11 @@
   return NewMI;
 }
 
-/// Returns true if the given instruction opcode is FMA3.
-/// Otherwise, returns false.
-/// The second parameter is optional and is used as the second return from
-/// the function. It is set to true if the given instruction has FMA3 opcode
-/// that is used for lowering of scalar FMA intrinsics, and it is set to false
-/// otherwise.
-static bool isFMA3(unsigned Opcode, bool &IsIntrinsic) {
-  IsIntrinsic = false;
+unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
+    const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2,
+    const X86InstrFMA3Group &FMA3Group) const {
 
-#define FMA3_CASE(Name, Modifier) \
-case X86::Name##r##Modifier: case X86::Name##m##Modifier:
-
-#define FMA3_SCALAR_PAIR(Name, Size, Modifier) \
-  FMA3_CASE(Name##SD##Size, Modifier) \
-  FMA3_CASE(Name##SS##Size, Modifier)
-
-#define FMA3_PACKED_PAIR(Name, Size) \
-  FMA3_CASE(Name##PD##Size, ) \
-  FMA3_CASE(Name##PS##Size, )
-
-#define FMA3_PACKED_SET(Form, Size) \
-  FMA3_PACKED_PAIR(VFMADD##Form, Size) \
-  FMA3_PACKED_PAIR(VFMSUB##Form, Size) \
-  FMA3_PACKED_PAIR(VFNMADD##Form, Size) \
-  FMA3_PACKED_PAIR(VFNMSUB##Form, Size) \
-  FMA3_PACKED_PAIR(VFMADDSUB##Form, Size) \
-  FMA3_PACKED_PAIR(VFMSUBADD##Form, Size)
-
-#define FMA3_CASES(Form) \
-  FMA3_SCALAR_PAIR(VFMADD##Form, ,) \
-  FMA3_SCALAR_PAIR(VFMSUB##Form, ,) \
-  FMA3_SCALAR_PAIR(VFNMADD##Form, ,) \
-  FMA3_SCALAR_PAIR(VFNMSUB##Form, ,) \
-  FMA3_PACKED_SET(Form, ) \
-  FMA3_PACKED_SET(Form, Y) \
-
-#define FMA3_CASES_AVX512(Form) \
-  FMA3_SCALAR_PAIR(VFMADD##Form, Z, ) \
-  FMA3_SCALAR_PAIR(VFMSUB##Form, Z, ) \
-  FMA3_SCALAR_PAIR(VFNMADD##Form, Z, ) \
-  FMA3_SCALAR_PAIR(VFNMSUB##Form, Z, ) \
-  FMA3_PACKED_SET(Form, Z128) \
-  FMA3_PACKED_SET(Form, Z256) \
-  FMA3_PACKED_SET(Form, Z)
-
-#define FMA3_CASES_SCALAR_INT(Form) \
-  FMA3_SCALAR_PAIR(VFMADD##Form, , _Int) \
-  FMA3_SCALAR_PAIR(VFMSUB##Form, , _Int) \
-  FMA3_SCALAR_PAIR(VFNMADD##Form, , _Int) \
-  FMA3_SCALAR_PAIR(VFNMSUB##Form, , _Int)
-
-#define FMA3_CASES_SCALAR_INT_AVX512(Form) \
-  FMA3_SCALAR_PAIR(VFMADD##Form, Z, _Int) \
-  FMA3_SCALAR_PAIR(VFMSUB##Form, Z, _Int) \
-  FMA3_SCALAR_PAIR(VFNMADD##Form, Z, _Int) \
-  FMA3_SCALAR_PAIR(VFNMSUB##Form, Z, _Int)
-
-  switch (Opcode) {
-  FMA3_CASES(132)
-  FMA3_CASES(213)
-  FMA3_CASES(231)
-
-  // AVX-512 instructions
-  FMA3_CASES_AVX512(132)
-  FMA3_CASES_AVX512(213)
-  FMA3_CASES_AVX512(231)
-    return true;
-
-  FMA3_CASES_SCALAR_INT(132)
-  FMA3_CASES_SCALAR_INT(213)
-  FMA3_CASES_SCALAR_INT(231)
-
-  // AVX-512 instructions
-  FMA3_CASES_SCALAR_INT_AVX512(132)
-  FMA3_CASES_SCALAR_INT_AVX512(213)
-  FMA3_CASES_SCALAR_INT_AVX512(231)
-    IsIntrinsic = true;
-    return true;
-  default:
-    return false;
-  }
-  llvm_unreachable("Opcode not handled by the switch");
-
-#undef FMA3_CASE
-#undef FMA3_SCALAR_PAIR
-#undef FMA3_PACKED_PAIR
-#undef FMA3_PACKED_SET
-#undef FMA3_CASES
-#undef FMA3_CASES_AVX512
-#undef FMA3_CASES_SCALAR_INT
-#undef FMA3_CASES_SCALAR_INT_AVX512
-}
-
-/// Returns an adjusted FMA opcode that must be used in FMA instruction that
-/// performs the same computations as the given MI but which has the operands
-/// \p SrcOpIdx1 and \p SrcOpIdx2 commuted.
-/// It may return 0 if it is unsafe to commute the operands.
-///
-/// The returned FMA opcode may differ from the opcode in the given \p MI.
-/// For example, commuting the operands #1 and #3 in the following FMA
-///     FMA213 #1, #2, #3
-/// results into instruction with adjusted opcode:
-///     FMA231 #3, #2, #1
-static unsigned getFMA3OpcodeToCommuteOperands(unsigned Opc,
-                                               bool IsIntrinOpcode,
-                                               unsigned SrcOpIdx1,
-                                               unsigned SrcOpIdx2) {
-#define FMA3_ENTRY(Name, Suffix) \
-  { X86::Name##132##Suffix, X86::Name##213##Suffix, X86::Name##231##Suffix },
-
-#define FMA3_SCALAR_PAIR(Name, Suffix) \
-  FMA3_ENTRY(Name, SS##Suffix) \
-  FMA3_ENTRY(Name, SD##Suffix)
-
-#define FMA3_PACKED_PAIR(Name, Suffix) \
-  FMA3_ENTRY(Name, PS##Suffix) \
-  FMA3_ENTRY(Name, PD##Suffix)
-
-#define FMA3_PACKED_SIZES(Name, Suffix) \
-  FMA3_PACKED_PAIR(Name, Suffix) \
-  FMA3_PACKED_PAIR(Name, Y##Suffix)
-
-#define FMA3_TABLE_ALL(Name) \
-  FMA3_SCALAR_PAIR(Name, r) \
-  FMA3_PACKED_SIZES(Name, r) \
-  FMA3_SCALAR_PAIR(Name, m) \
-  FMA3_PACKED_SIZES(Name, m)
-
-#define FMA3_TABLE_PACKED(Name) \
-  FMA3_PACKED_SIZES(Name, r) \
-  FMA3_PACKED_SIZES(Name, m)
-
-#define FMA3_TABLE_SCALAR_INT(Name) \
-  FMA3_SCALAR_PAIR(Name, r_Int) \
-  FMA3_SCALAR_PAIR(Name, m_Int)
-
-#define FMA3_PACKED_SIZES_AVX512(Name, Suffix) \
-  FMA3_PACKED_PAIR(Name, Z128##Suffix) \
-  FMA3_PACKED_PAIR(Name, Z256##Suffix) \
-  FMA3_PACKED_PAIR(Name, Z##Suffix)
-
-#define FMA3_TABLE_ALL_AVX512(Name) \
-  FMA3_SCALAR_PAIR(Name, Zr) \
-  FMA3_PACKED_SIZES_AVX512(Name, r) \
-  FMA3_SCALAR_PAIR(Name, Zm) \
-  FMA3_PACKED_SIZES_AVX512(Name, m)
-
-#define FMA3_TABLE_PACKED_AVX512(Name) \
-  FMA3_PACKED_SIZES_AVX512(Name, r) \
-  FMA3_PACKED_SIZES_AVX512(Name, m)
-
-#define FMA3_TABLE_SCALAR_INT_AVX512(Name) \
-  FMA3_SCALAR_PAIR(Name, Zr_Int) \
-  FMA3_SCALAR_PAIR(Name, Zm_Int)
-
-  // Define the array that holds FMA opcodes in groups
-  // of 3 opcodes(132, 213, 231) in each group.
-  static const uint16_t RegularOpcodeGroups[][3] = {
-    FMA3_TABLE_ALL(VFMADD)
-    FMA3_TABLE_ALL(VFMSUB)
-    FMA3_TABLE_ALL(VFNMADD)
-    FMA3_TABLE_ALL(VFNMSUB)
-    FMA3_TABLE_PACKED(VFMADDSUB)
-    FMA3_TABLE_PACKED(VFMSUBADD)
-
-    // AVX-512 instructions
-    FMA3_TABLE_ALL_AVX512(VFMADD)
-    FMA3_TABLE_ALL_AVX512(VFMSUB)
-    FMA3_TABLE_ALL_AVX512(VFNMADD)
-    FMA3_TABLE_ALL_AVX512(VFNMSUB)
-    FMA3_TABLE_PACKED_AVX512(VFMADDSUB)
-    FMA3_TABLE_PACKED_AVX512(VFMSUBADD)
-  };
-
-  // Define the array that holds FMA*_Int opcodes in groups
-  // of 3 opcodes(132, 213, 231) in each group.
-  static const uint16_t IntrinOpcodeGroups[][3] = {
-    FMA3_TABLE_SCALAR_INT(VFMADD)
-    FMA3_TABLE_SCALAR_INT(VFMSUB)
-    FMA3_TABLE_SCALAR_INT(VFNMADD)
-    FMA3_TABLE_SCALAR_INT(VFNMSUB)
-
-    // AVX-512 instructions
-    FMA3_TABLE_SCALAR_INT_AVX512(VFMADD)
-    FMA3_TABLE_SCALAR_INT_AVX512(VFMSUB)
-    FMA3_TABLE_SCALAR_INT_AVX512(VFNMADD)
-    FMA3_TABLE_SCALAR_INT_AVX512(VFNMSUB)
-  };
-
-#undef FMA3_ENTRY
-#undef FMA3_SCALAR_PAIR
-#undef FMA3_PACKED_PAIR
-#undef FMA3_PACKED_SIZES
-#undef FMA3_TABLE_ALL
-#undef FMA3_TABLE_PACKED
-#undef FMA3_TABLE_SCALAR_INT
-#undef FMA3_SCALAR_PAIR_AVX512
-#undef FMA3_PACKED_SIZES_AVX512
-#undef FMA3_TABLE_ALL_AVX512
-#undef FMA3_TABLE_PACKED_AVX512
-#undef FMA3_TABLE_SCALAR_INT_AVX512
-
-  const unsigned Form132Index = 0;
-  const unsigned Form213Index = 1;
-  const unsigned Form231Index = 2;
-  const unsigned FormsNum = 3;
-
-  size_t GroupsNum;
-  const uint16_t (*OpcodeGroups)[3];
-  if (IsIntrinOpcode) {
-    GroupsNum = array_lengthof(IntrinOpcodeGroups);
-    OpcodeGroups = IntrinOpcodeGroups;
-  } else {
-    GroupsNum = array_lengthof(RegularOpcodeGroups);
-    OpcodeGroups = RegularOpcodeGroups;
-  }
-
-  const uint16_t *FoundOpcodesGroup = nullptr;
-  size_t FormIndex;
-
-  // Look for the input opcode in the corresponding opcodes table.
-  for (size_t GroupIndex = 0; GroupIndex < GroupsNum && !FoundOpcodesGroup;
-         ++GroupIndex) {
-    for (FormIndex = 0; FormIndex < FormsNum; ++FormIndex) {
-      if (OpcodeGroups[GroupIndex][FormIndex] == Opc) {
-        FoundOpcodesGroup = OpcodeGroups[GroupIndex];
-        break;
-      }
-    }
-  }
-
-  // The input opcode does not match with any of the opcodes from the tables.
-  // The unsupported FMA opcode must be added to one of the two opcode groups
-  // defined above.
-  assert(FoundOpcodesGroup != nullptr && "Unexpected FMA3 opcode");
+  unsigned Opc = MI.getOpcode();
 
   // Put the lowest index to SrcOpIdx1 to simplify the checks below.
   if (SrcOpIdx1 > SrcOpIdx2)
@@ -3591,15 +3098,40 @@
   // not implemented yet. So, just return 0 in that case.
-  // When such analysis are available this place will be the right place for
-  // calling it.
+  // When such analyses are available, this will be the right place to
+  // call them.
-  if (IsIntrinOpcode && SrcOpIdx1 == 1)
+  if (FMA3Group.isIntrinsic() && SrcOpIdx1 == 1)
     return 0;
 
+  unsigned FMAOp1 = 1, FMAOp2 = 2, FMAOp3 = 3;
+  if (FMA3Group.isKMasked()) {
+    // The k-mask operand cannot be commuted.
+    if (SrcOpIdx1 == 2)
+      return 0;
+
+    // For k-zero-masked operations it is Ok to commute the first vector
+    // operand.
+    // For regular k-masked operations a conservative choice is done as the
+    // elements of the first vector operand, for which the corresponding bit
+    // in the k-mask operand is set to 0, are copied to the result of FMA.
+    // TODO/FIXME: The commute still may be legal if it is known that the
+    // k-mask operand is set to either all ones or all zeroes.
+    // It is also Ok to commute the 1st operand if all users of MI use only
+    // the elements enabled by the k-mask operand. For example,
+    //   v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i]
+    //                                                     : v1[i];
+    //   VMOVAPSZmrk <mem_addr>, k, v4; // this is the ONLY user of v4 ->
+    //                                  // Ok, to commute v1 in FMADD213PSZrk.
+    if (FMA3Group.isKMergeMasked() && SrcOpIdx1 == FMAOp1)
+      return 0;
+    FMAOp2++;
+    FMAOp3++;
+  }
+
   unsigned Case;
-  if (SrcOpIdx1 == 1 && SrcOpIdx2 == 2)
+  if (SrcOpIdx1 == FMAOp1 && SrcOpIdx2 == FMAOp2)
     Case = 0;
-  else if (SrcOpIdx1 == 1 && SrcOpIdx2 == 3)
+  else if (SrcOpIdx1 == FMAOp1 && SrcOpIdx2 == FMAOp3)
     Case = 1;
-  else if (SrcOpIdx1 == 2 && SrcOpIdx2 == 3)
+  else if (SrcOpIdx1 == FMAOp2 && SrcOpIdx2 == FMAOp3)
     Case = 2;
   else
     return 0;
@@ -3607,6 +3139,9 @@
   // Define the FMA forms mapping array that helps to map input FMA form
   // to output FMA form to preserve the operation semantics after
   // commuting the operands.
+  const unsigned Form132Index = 0;
+  const unsigned Form213Index = 1;
+  const unsigned Form231Index = 2;
   static const unsigned FormMapping[][3] = {
     // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2;
     // FMA132 A, C, b; ==> FMA231 C, A, b;
@@ -3625,9 +3160,24 @@
     { Form213Index, Form132Index, Form231Index }
   };
 
+  unsigned FMAForms[3];
+  if (FMA3Group.isRegOpcodeFromGroup(Opc)) {
+    FMAForms[0] = FMA3Group.getReg132Opcode();
+    FMAForms[1] = FMA3Group.getReg213Opcode();
+    FMAForms[2] = FMA3Group.getReg231Opcode();
+  } else {
+    FMAForms[0] = FMA3Group.getMem132Opcode();
+    FMAForms[1] = FMA3Group.getMem213Opcode();
+    FMAForms[2] = FMA3Group.getMem231Opcode();
+  }
+  unsigned FormIndex;
+  for (FormIndex = 0; FormIndex < 3; FormIndex++)
+    if (Opc == FMAForms[FormIndex])
+      break;
+
   // Everything is ready, just adjust the FMA opcode and return it.
   FormIndex = FormMapping[Case][FormIndex];
-  return FoundOpcodesGroup[FormIndex];
+  return FMAForms[FormIndex];
 }
 
 MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
@@ -3852,11 +3402,11 @@
                                                    OpIdx1, OpIdx2);
   }
   default:
-    bool IsIntrinOpcode;
-    if (isFMA3(MI.getOpcode(), IsIntrinOpcode)) {
-      unsigned Opc = getFMA3OpcodeToCommuteOperands(MI.getOpcode(),
-                                                    IsIntrinOpcode,
-                                                    OpIdx1, OpIdx2);
+    const X86InstrFMA3Group *FMA3Group =
+        X86InstrFMA3Info::getFMA3Group(MI.getOpcode());
+    if (FMA3Group) {
+      unsigned Opc =
+        getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group);
       if (Opc == 0)
         return nullptr;
       auto &WorkingMI = cloneIfNew(MI);
@@ -3869,21 +3419,37 @@
   }
 }
 
-bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr &MI,
-                                             bool IsIntrinOpcode,
-                                             unsigned &SrcOpIdx1,
-                                             unsigned &SrcOpIdx2) const {
+bool X86InstrInfo::findFMA3CommutedOpIndices(
+    const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2,
+    const X86InstrFMA3Group &FMA3Group) const {
+  unsigned FirstCommutableVecOp = 1;
+  unsigned LastCommutableVecOp = 3;
+  unsigned KMaskOp = 0; // 0 = no k-mask (operand 0 is the def, not a source).
+  if (FMA3Group.isKMasked()) {
+    // The k-mask operand has index = 2 for masked and zero-masked operations.
+    KMaskOp = 2;
 
-  unsigned RegOpsNum = isMem(MI, 3) ? 2 : 3;
+    // The operand with index = 1 is used as a source for those elements for
+    // which the corresponding bit in the k-mask is set to 0.
+    if (FMA3Group.isKMergeMasked())
+      FirstCommutableVecOp = 3;
+
+    LastCommutableVecOp++;
+  }
+
+  if (isMem(MI, LastCommutableVecOp))
+    LastCommutableVecOp--;
 
-  // Only the first RegOpsNum operands are commutable.
+  // Only operands in [FirstCommutableVecOp, LastCommutableVecOp] commute.
   // Also, the value 'CommuteAnyOperandIndex' is valid here as it means
   // that the operand is not specified/fixed.
   if (SrcOpIdx1 != CommuteAnyOperandIndex &&
-      (SrcOpIdx1 < 1 || SrcOpIdx1 > RegOpsNum))
+      (SrcOpIdx1 < FirstCommutableVecOp || SrcOpIdx1 > LastCommutableVecOp ||
+       SrcOpIdx1 == KMaskOp))
     return false;
   if (SrcOpIdx2 != CommuteAnyOperandIndex &&
-      (SrcOpIdx2 < 1 || SrcOpIdx2 > RegOpsNum))
+      (SrcOpIdx2 < FirstCommutableVecOp || SrcOpIdx2 > LastCommutableVecOp ||
+       SrcOpIdx2 == KMaskOp))
     return false;
 
   // Look for two different register operands assumed to be commutable
@@ -3898,7 +3464,7 @@
     if (SrcOpIdx1 == SrcOpIdx2)
       // Both of operands are not fixed. By default set one of commutable
       // operands to the last register operand of the instruction.
-      CommutableOpIdx2 = RegOpsNum;
+      CommutableOpIdx2 = LastCommutableVecOp;
     else if (SrcOpIdx2 == CommuteAnyOperandIndex)
       // Only one of operands is not fixed.
       CommutableOpIdx2 = SrcOpIdx1;
@@ -3906,7 +3472,12 @@
     // CommutableOpIdx2 is well defined now. Let's choose another commutable
     // operand and assign its index to CommutableOpIdx1.
     unsigned Op2Reg = MI.getOperand(CommutableOpIdx2).getReg();
-    for (CommutableOpIdx1 = RegOpsNum; CommutableOpIdx1 > 0; CommutableOpIdx1--) {
+    for (CommutableOpIdx1 = LastCommutableVecOp;
+         CommutableOpIdx1 >= FirstCommutableVecOp; CommutableOpIdx1--) {
+      // Just ignore and skip the k-mask operand.
+      if (CommutableOpIdx1 == KMaskOp)
+        continue;
+
       // The commuted operands must have different registers.
       // Otherwise, the commute transformation does not change anything and
       // is useless then.
@@ -3915,7 +3486,7 @@
     }
 
     // No appropriate commutable operands were found.
-    if (CommutableOpIdx1 == 0)
+    if (CommutableOpIdx1 < FirstCommutableVecOp)
       return false;
 
     // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpidx2
@@ -3927,8 +3498,8 @@
 
   // Check if we can adjust the opcode to preserve the semantics when
   // commute the register operands.
-  return getFMA3OpcodeToCommuteOperands(MI.getOpcode(), IsIntrinOpcode,
-                                        SrcOpIdx1, SrcOpIdx2) != 0;
+  return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2,
+                                        FMA3Group) != 0;
 }
 
 bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
@@ -3955,10 +3525,10 @@
     return false;
   }
   default:
-    bool IsIntrinOpcode;
-    if (isFMA3(MI.getOpcode(), IsIntrinOpcode))
-      return findFMA3CommutedOpIndices(MI, IsIntrinOpcode,
-                                       SrcOpIdx1, SrcOpIdx2);
+    const X86InstrFMA3Group *FMA3Group =
+        X86InstrFMA3Info::getFMA3Group(MI.getOpcode());
+    if (FMA3Group)
+      return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2, *FMA3Group);
     return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
   }
   return false;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index 7251aec..5c8de0f 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -15,6 +15,7 @@
 #define LLVM_LIB_TARGET_X86_X86INSTRINFO_H
 
 #include "MCTargetDesc/X86BaseInfo.h"
+#include "X86InstrFMA3Info.h"
 #include "X86RegisterInfo.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Target/TargetInstrInfo.h"
@@ -265,7 +266,7 @@
                              unsigned &SrcOpIdx2) const override;
 
   /// Returns true if the routine could find two commutable operands
-  /// in the given FMA instruction. Otherwise, returns false.
+  /// in the given FMA instruction \p MI. Otherwise, returns false.
   ///
   /// \p SrcOpIdx1 and \p SrcOpIdx2 are INPUT and OUTPUT arguments.
   /// The output indices of the commuted operands are returned in these
@@ -274,10 +275,12 @@
   /// value 'CommuteAnyOperandIndex' which means that the corresponding
   /// operand index is not set and this method is free to pick any of
   /// available commutable operands.
+  /// The parameter \p FMA3Group keeps the reference to the group of relative
+  /// FMA3 opcodes including register/memory forms of 132/213/231 opcodes.
   ///
   /// For example, calling this method this way:
   ///     unsigned Idx1 = 1, Idx2 = CommuteAnyOperandIndex;
-  ///     findFMA3CommutedOpIndices(MI, Idx1, Idx2);
+  ///     findFMA3CommutedOpIndices(MI, Idx1, Idx2, FMA3Group);
   /// can be interpreted as a query asking if the operand #1 can be swapped
   /// with any other available operand (e.g. operand #2, operand #3, etc.).
   ///
@@ -286,9 +289,30 @@
   ///     FMA213 #1, #2, #3
   /// results into instruction with adjusted opcode:
   ///     FMA231 #3, #2, #1
-  bool findFMA3CommutedOpIndices(MachineInstr &MI, bool IsIntrinOpcode,
+  bool findFMA3CommutedOpIndices(const MachineInstr &MI,
                                  unsigned &SrcOpIdx1,
-                                 unsigned &SrcOpIdx2) const;
+                                 unsigned &SrcOpIdx2,
+                                 const X86InstrFMA3Group &FMA3Group) const;
+
+  /// Returns an adjusted FMA opcode that must be used in FMA instruction that
+  /// performs the same computations as the given \p MI but which has the
+  /// operands \p SrcOpIdx1 and \p SrcOpIdx2 commuted.
+  /// It may return 0 if it is unsafe to commute the operands.
+  /// Note that a machine instruction (instead of its opcode) is passed as the
+  /// first parameter to make it possible to analyze the instruction's uses and
+  /// commute the first operand of FMA even when it seems unsafe when you look
+  /// at the opcode. For example, it is Ok to commute the first operand of
+  /// VFMADD*SD_Int, if ONLY the lowest 64-bit element of the result is used.
+  ///
+  /// The returned FMA opcode may differ from the opcode in the given \p MI.
+  /// For example, commuting the operands #1 and #3 in the following FMA
+  ///     FMA213 #1, #2, #3
+  /// results into instruction with adjusted opcode:
+  ///     FMA231 #3, #2, #1
+  unsigned getFMA3OpcodeToCommuteOperands(const MachineInstr &MI,
+                                          unsigned SrcOpIdx1,
+                                          unsigned SrcOpIdx2,
+                                          const X86InstrFMA3Group &FMA3Group) const;
 
   // Branch analysis.
   bool isUnpredicatedTerminator(const MachineInstr &MI) const override;