Created new X86 FMA3 opcodes (FMA*_Int) that are used now for lowering of scalar FMA intrinsics.
Patch by Slava Klochkov
The key difference between FMA* and FMA*_Int opcodes is that FMA*_Int opcodes are handled more conservatively. It is illegal to commute the 1st operand of FMA*_Int instructions, as the upper bits of the scalar FMA intrinsic result must be taken from the 1st operand; such a commute transformation would change those upper bits and invalidate the intrinsic's result.
Reviewers: Quentin Colombet, Elena Demikhovsky
Differential Revision: http://reviews.llvm.org/D13710
llvm-svn: 252060
diff --git a/llvm/lib/Target/X86/X86InstrFMA.td b/llvm/lib/Target/X86/X86InstrFMA.td
index 7cc3b59..96c067b 100644
--- a/llvm/lib/Target/X86/X86InstrFMA.td
+++ b/llvm/lib/Target/X86/X86InstrFMA.td
@@ -126,9 +126,22 @@
v4f64>, VEX_W;
}
-let Constraints = "$src1 = $dst" in {
-multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop,
- RegisterClass RC, ValueType OpVT, PatFrag mem_frag,
+// All source register operands of FMA instructions can be commuted.
+// In many cases such a commute transformation requires an opcode adjustment,
+// for example, commuting the operands 1 and 2 in FMA*132 form would require
+// an opcode change to FMA*231:
+// FMA*132* reg1, reg2, reg3; // reg1 * reg3 + reg2;
+// -->
+// FMA*231* reg2, reg1, reg3; // reg1 * reg3 + reg2;
+// Currently, the commute transformation is supported for only a few FMA forms.
+// That is the reason why \p IsRVariantCommutable and \p IsMVariantCommutable
+// parameters are used here.
+// A general operand-commuting optimization that works for all forms is going
+// to be implemented soon. (Please see http://reviews.llvm.org/D13269
+// for details.)
+let Constraints = "$src1 = $dst", hasSideEffects = 0 in {
+multiclass fma3s_rm<bits<8> opc, string OpcodeStr,
+ X86MemOperand x86memop, RegisterClass RC,
bit IsRVariantCommutable = 0, bit IsMVariantCommutable = 0,
SDPatternOperator OpNode = null_frag> {
let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in
@@ -136,8 +149,7 @@
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set RC:$dst,
- (OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>;
+ [(set RC:$dst, (OpNode RC:$src2, RC:$src1, RC:$src3))]>;
let mayLoad = 1, isCommutable = IsMVariantCommutable in
def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
@@ -145,52 +157,96 @@
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set RC:$dst,
- (OpVT (OpNode RC:$src2, RC:$src1,
- (mem_frag addr:$src3))))]>;
+ (OpNode RC:$src2, RC:$src1, (load addr:$src3)))]>;
}
-} // Constraints = "$src1 = $dst"
+} // Constraints = "$src1 = $dst", hasSideEffects = 0
+
+// These FMA*_Int instructions are defined specially for being used when
+// the scalar FMA intrinsics are lowered to machine instructions, and in that
+// sense they are similar to the existing ADD*_Int, SUB*_Int, MUL*_Int, etc.
+// instructions.
+//
+// The FMA*_Int instructions are _TEMPORARILY_ defined as NOT commutable.
+// The upper bits of the result of scalar FMA intrinsics must be copied from
+// the upper bits of the 1st operand. So, commuting the 1st operand would
+// invalidate the upper bits of the intrinsic result.
+// The corresponding optimization, which allows commuting the 2nd and 3rd
+// operands of FMA*_Int instructions, has been developed and is waiting for
+// code-review approval and check-in (please see http://reviews.llvm.org/D13269).
+let Constraints = "$src1 = $dst", isCommutable = 0, isCodeGenOnly =1,
+ hasSideEffects = 0 in {
+multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr,
+ Operand memopr, RegisterClass RC> {
+ def r_Int : FMA3<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ []>;
+
+ let mayLoad = 1 in
+ def m_Int : FMA3<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, memopr:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ []>;
+}
+} // Constraints = "$src1 = $dst", isCommutable = 0, isCodeGenOnly =1,
+ // hasSideEffects = 0
multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
- string OpStr, string PackTy, string PT2, Intrinsic Int,
- SDNode OpNode, RegisterClass RC, ValueType OpVT,
- X86MemOperand x86memop, Operand memop, PatFrag mem_frag,
- ComplexPattern mem_cpat> {
-let hasSideEffects = 0 in {
- defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy),
- x86memop, RC, OpVT, mem_frag>;
- // See the other defm of r231 for the explanation regarding the
- // commutable flags.
- defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy),
- x86memop, RC, OpVT, mem_frag,
+ string OpStr, string PackTy,
+ SDNode OpNode, RegisterClass RC,
+ X86MemOperand x86memop> {
+ defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy), x86memop, RC>;
+ defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy), x86memop, RC,
/* IsRVariantCommutable */ 1,
- /* IsMVariantCommutable */ 0>;
+ /* IsMVariantCommutable */ 1,
+ OpNode>;
+ defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy), x86memop, RC,
+ /* IsRVariantCommutable */ 1,
+ /* IsMVariantCommutable */ 0,
+ null_frag>;
}
-// See the other defm of r213 for the explanation regarding the
-// commutable flags.
-defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy),
- x86memop, RC, OpVT, mem_frag,
- /* IsRVariantCommutable */ 1,
- /* IsMVariantCommutable */ 1,
- OpNode>;
+// The FMA 213 form is created for lowering of scalar FMA intrinsics
+// to machine instructions.
+// The FMA 132 form can trivially be obtained by commuting the 2nd and 3rd operands
+// of FMA 213 form.
+// The FMA 231 form can only be obtained by commuting the 1st operand of 213 or 132
+// forms and is possible only after special analysis of all uses of the initial
+// instruction. Such analysis does not exist yet, and thus introducing the 231
+// form of FMA*_Int instructions is done using an optimistic assumption that
+// such analysis will be implemented eventually.
+multiclass fma3s_int_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+ string OpStr, string PackTy,
+ RegisterClass RC, Operand memop> {
+ defm r132 : fma3s_rm_int<opc132, !strconcat(OpStr, "132", PackTy),
+ memop, RC>;
+ defm r213 : fma3s_rm_int<opc213, !strconcat(OpStr, "213", PackTy),
+ memop, RC>;
+ defm r231 : fma3s_rm_int<opc231, !strconcat(OpStr, "231", PackTy),
+ memop, RC>;
}
multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
string OpStr, Intrinsic IntF32, Intrinsic IntF64,
SDNode OpNode> {
- defm SS : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", "SS", IntF32, OpNode,
- FR32, f32, f32mem, ssmem, loadf32, sse_load_f32>;
- defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", "PD", IntF64, OpNode,
- FR64, f64, f64mem, sdmem, loadf64, sse_load_f64>, VEX_W;
+ defm SS : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", OpNode,
+ FR32, f32mem>,
+ fma3s_int_forms<opc132, opc213, opc231, OpStr, "ss", VR128, ssmem>;
+ defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", OpNode,
+ FR64, f64mem>,
+ fma3s_int_forms<opc132, opc213, opc231, OpStr, "sd", VR128, sdmem>,
+ VEX_W;
-// These patterns use the 123 ordering, instead of 213, even though
-// they match the intrinsic to the 213 version of the instruction.
-// This is because src1 is tied to dest, and the scalar intrinsics
-// require the pass-through values to come from the first source
-// operand, not the second.
+ // These patterns use the 123 ordering, instead of 213, even though
+ // they match the intrinsic to the 213 version of the instruction.
+ // This is because src1 is tied to dest, and the scalar intrinsics
+ // require the pass-through values to come from the first source
+ // operand, not the second.
def : Pat<(IntF32 VR128:$src1, VR128:$src2, VR128:$src3),
(COPY_TO_REGCLASS
- (!cast<Instruction>(NAME#"SSr213r")
+ (!cast<Instruction>(NAME#"SSr213r_Int")
(COPY_TO_REGCLASS $src1, FR32),
(COPY_TO_REGCLASS $src2, FR32),
(COPY_TO_REGCLASS $src3, FR32)),
@@ -198,7 +254,7 @@
def : Pat<(IntF64 VR128:$src1, VR128:$src2, VR128:$src3),
(COPY_TO_REGCLASS
- (!cast<Instruction>(NAME#"SDr213r")
+ (!cast<Instruction>(NAME#"SDr213r_Int")
(COPY_TO_REGCLASS $src1, FR64),
(COPY_TO_REGCLASS $src2, FR64),
(COPY_TO_REGCLASS $src3, FR64)),
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 029edd4..81c61d9 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -1734,11 +1734,17 @@
static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
// FMA foldable instructions
{ X86::VFMADDSSr231r, X86::VFMADDSSr231m, TB_ALIGN_NONE },
+ { X86::VFMADDSSr231r_Int, X86::VFMADDSSr231m_Int, TB_ALIGN_NONE },
{ X86::VFMADDSDr231r, X86::VFMADDSDr231m, TB_ALIGN_NONE },
+ { X86::VFMADDSDr231r_Int, X86::VFMADDSDr231m_Int, TB_ALIGN_NONE },
{ X86::VFMADDSSr132r, X86::VFMADDSSr132m, TB_ALIGN_NONE },
+ { X86::VFMADDSSr132r_Int, X86::VFMADDSSr132m_Int, TB_ALIGN_NONE },
{ X86::VFMADDSDr132r, X86::VFMADDSDr132m, TB_ALIGN_NONE },
+ { X86::VFMADDSDr132r_Int, X86::VFMADDSDr132m_Int, TB_ALIGN_NONE },
{ X86::VFMADDSSr213r, X86::VFMADDSSr213m, TB_ALIGN_NONE },
+ { X86::VFMADDSSr213r_Int, X86::VFMADDSSr213m_Int, TB_ALIGN_NONE },
{ X86::VFMADDSDr213r, X86::VFMADDSDr213m, TB_ALIGN_NONE },
+ { X86::VFMADDSDr213r_Int, X86::VFMADDSDr213m_Int, TB_ALIGN_NONE },
{ X86::VFMADDPSr231r, X86::VFMADDPSr231m, TB_ALIGN_NONE },
{ X86::VFMADDPDr231r, X86::VFMADDPDr231m, TB_ALIGN_NONE },
@@ -1754,11 +1760,17 @@
{ X86::VFMADDPDr213rY, X86::VFMADDPDr213mY, TB_ALIGN_NONE },
{ X86::VFNMADDSSr231r, X86::VFNMADDSSr231m, TB_ALIGN_NONE },
+ { X86::VFNMADDSSr231r_Int, X86::VFNMADDSSr231m_Int, TB_ALIGN_NONE },
{ X86::VFNMADDSDr231r, X86::VFNMADDSDr231m, TB_ALIGN_NONE },
+ { X86::VFNMADDSDr231r_Int, X86::VFNMADDSDr231m_Int, TB_ALIGN_NONE },
{ X86::VFNMADDSSr132r, X86::VFNMADDSSr132m, TB_ALIGN_NONE },
+ { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr132m_Int, TB_ALIGN_NONE },
{ X86::VFNMADDSDr132r, X86::VFNMADDSDr132m, TB_ALIGN_NONE },
+ { X86::VFNMADDSDr132r_Int, X86::VFNMADDSDr132m_Int, TB_ALIGN_NONE },
{ X86::VFNMADDSSr213r, X86::VFNMADDSSr213m, TB_ALIGN_NONE },
+ { X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr213m_Int, TB_ALIGN_NONE },
{ X86::VFNMADDSDr213r, X86::VFNMADDSDr213m, TB_ALIGN_NONE },
+ { X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr213m_Int, TB_ALIGN_NONE },
{ X86::VFNMADDPSr231r, X86::VFNMADDPSr231m, TB_ALIGN_NONE },
{ X86::VFNMADDPDr231r, X86::VFNMADDPDr231m, TB_ALIGN_NONE },
@@ -1774,11 +1786,17 @@
{ X86::VFNMADDPDr213rY, X86::VFNMADDPDr213mY, TB_ALIGN_NONE },
{ X86::VFMSUBSSr231r, X86::VFMSUBSSr231m, TB_ALIGN_NONE },
+ { X86::VFMSUBSSr231r_Int, X86::VFMSUBSSr231m_Int, TB_ALIGN_NONE },
{ X86::VFMSUBSDr231r, X86::VFMSUBSDr231m, TB_ALIGN_NONE },
+ { X86::VFMSUBSDr231r_Int, X86::VFMSUBSDr231m_Int, TB_ALIGN_NONE },
{ X86::VFMSUBSSr132r, X86::VFMSUBSSr132m, TB_ALIGN_NONE },
+ { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr132m_Int, TB_ALIGN_NONE },
{ X86::VFMSUBSDr132r, X86::VFMSUBSDr132m, TB_ALIGN_NONE },
+ { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr132m_Int, TB_ALIGN_NONE },
{ X86::VFMSUBSSr213r, X86::VFMSUBSSr213m, TB_ALIGN_NONE },
+ { X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr213m_Int, TB_ALIGN_NONE },
{ X86::VFMSUBSDr213r, X86::VFMSUBSDr213m, TB_ALIGN_NONE },
+ { X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr213m_Int, TB_ALIGN_NONE },
{ X86::VFMSUBPSr231r, X86::VFMSUBPSr231m, TB_ALIGN_NONE },
{ X86::VFMSUBPDr231r, X86::VFMSUBPDr231m, TB_ALIGN_NONE },
@@ -1794,11 +1812,17 @@
{ X86::VFMSUBPDr213rY, X86::VFMSUBPDr213mY, TB_ALIGN_NONE },
{ X86::VFNMSUBSSr231r, X86::VFNMSUBSSr231m, TB_ALIGN_NONE },
+ { X86::VFNMSUBSSr231r_Int, X86::VFNMSUBSSr231m_Int, TB_ALIGN_NONE },
{ X86::VFNMSUBSDr231r, X86::VFNMSUBSDr231m, TB_ALIGN_NONE },
+ { X86::VFNMSUBSDr231r_Int, X86::VFNMSUBSDr231m_Int, TB_ALIGN_NONE },
{ X86::VFNMSUBSSr132r, X86::VFNMSUBSSr132m, TB_ALIGN_NONE },
+ { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr132m_Int, TB_ALIGN_NONE },
{ X86::VFNMSUBSDr132r, X86::VFNMSUBSDr132m, TB_ALIGN_NONE },
+ { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr132m_Int, TB_ALIGN_NONE },
{ X86::VFNMSUBSSr213r, X86::VFNMSUBSSr213m, TB_ALIGN_NONE },
+ { X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr213m_Int, TB_ALIGN_NONE },
{ X86::VFNMSUBSDr213r, X86::VFNMSUBSDr213m, TB_ALIGN_NONE },
+ { X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr213m_Int, TB_ALIGN_NONE },
{ X86::VFNMSUBPSr231r, X86::VFNMSUBPSr231m, TB_ALIGN_NONE },
{ X86::VFNMSUBPDr231r, X86::VFNMSUBPDr231m, TB_ALIGN_NONE },