Implement "punpckldq  %xmm0, $xmm0" as "pshufd  $0x50, %xmm0, %xmm" unless optimizing for code size.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@56711 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 56bffab..b5c0c35 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -32,6 +32,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/MathExtras.h"
@@ -130,12 +131,17 @@
     ///
     MachineBasicBlock *CurBB;
 
+    /// OptForSize - If true, selector should try to optimize for code size
+    /// instead of performance.
+    bool OptForSize;
+
   public:
     X86DAGToDAGISel(X86TargetMachine &tm, bool fast)
       : SelectionDAGISel(X86Lowering, fast),
         ContainsFPCode(false), TM(tm),
         X86Lowering(*TM.getTargetLowering()),
-        Subtarget(&TM.getSubtarget<X86Subtarget>()) {}
+        Subtarget(&TM.getSubtarget<X86Subtarget>()),
+        OptForSize(OptimizeForSize) {}
 
     virtual bool runOnFunction(Function &Fn) {
       // Make sure we re-emit a set of the global base reg if necessary
@@ -650,6 +656,10 @@
 /// when it has created a SelectionDAG for us to codegen.
 void X86DAGToDAGISel::InstructionSelect() {
   CurBB = BB;  // BB can change as result of isel.
+  if (!OptForSize) {
+    const Function *F = CurDAG->getMachineFunction().getFunction();
+    OptForSize = !F->isDeclaration() && F->hasNote(Attribute::OptimizeForSize);
+  }
 
   DEBUG(BB->dump());
   if (!Fast)
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index bd61c3a..11ea1c0 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -186,6 +186,7 @@
 def SmallCode    : Predicate<"TM.getCodeModel() == CodeModel::Small">;
 def NotSmallCode : Predicate<"TM.getCodeModel() != CodeModel::Small">;
 def IsStatic     : Predicate<"TM.getRelocationModel() == Reloc::Static">;
+def OptForSpeed  : Predicate<"!OptForSize">;
 
 //===----------------------------------------------------------------------===//
 // X86 Instruction Format Definitions.
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index af7694a..8b45795 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -744,7 +744,7 @@
                      addr:$dst)]>;
 
 let Constraints = "$src1 = $dst" in {
-let AddedComplexity = 15 in {
+let AddedComplexity = 20 in {
 def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                     "movlhps\t{$src2, $dst|$dst, $src2}",
                     [(set VR128:$dst,
@@ -759,7 +759,7 @@
 } // AddedComplexity
 } // Constraints = "$src1 = $dst"
 
-let AddedComplexity = 15 in
+let AddedComplexity = 20 in
 def : Pat<(v4f32 (vector_shuffle VR128:$src, (undef), MOVDDUP_shuffle_mask)),
           (MOVLHPSrr VR128:$src, VR128:$src)>, Requires<[HasSSE1]>;
 
@@ -2921,6 +2921,7 @@
            SHUFP_unary_shuffle_mask:$sm),
           (PSHUFDmi addr:$src1, SHUFP_unary_shuffle_mask:$sm)>,
       Requires<[HasSSE2]>;
+
 // Special binary v4i32 shuffle cases with SHUFPS.
 def : Pat<(v4i32 (vector_shuffle VR128:$src1, (v4i32 VR128:$src2),
            PSHUFD_binary_shuffle_mask:$sm)),
@@ -2937,11 +2938,21 @@
           Requires<[HasSSE2]>;
 // Special unary SHUFPDrri case.
 def : Pat<(v2i64 (vector_shuffle VR128:$src1, (undef),
-           SHUFP_unary_shuffle_mask:$sm)),
+                  SHUFP_unary_shuffle_mask:$sm)),
           (SHUFPDrri VR128:$src1, VR128:$src1, SHUFP_unary_shuffle_mask:$sm)>,
       Requires<[HasSSE2]>;
 
 // vector_shuffle v1, <undef>, <0, 0, 1, 1, ...>
+let AddedComplexity = 15 in {
+def : Pat<(v4i32 (vector_shuffle VR128:$src, (undef),
+                                 UNPCKL_v_undef_shuffle_mask:$sm)),
+          (PSHUFDri VR128:$src, PSHUFD_shuffle_mask:$sm)>,
+          Requires<[OptForSpeed, HasSSE2]>;
+def : Pat<(v4f32 (vector_shuffle VR128:$src, (undef),
+                                 UNPCKL_v_undef_shuffle_mask:$sm)),
+          (PSHUFDri VR128:$src, PSHUFD_shuffle_mask:$sm)>,
+          Requires<[OptForSpeed, HasSSE2]>;
+}
 let AddedComplexity = 10 in {
 def : Pat<(v4f32 (vector_shuffle VR128:$src, (undef),
                   UNPCKL_v_undef_shuffle_mask)),
@@ -2958,6 +2969,16 @@
 }
 
 // vector_shuffle v1, <undef>, <2, 2, 3, 3, ...>
+let AddedComplexity = 15 in {
+def : Pat<(v4i32 (vector_shuffle VR128:$src, (undef),
+                 UNPCKH_v_undef_shuffle_mask:$sm)),
+          (PSHUFDri VR128:$src, PSHUFD_shuffle_mask:$sm)>,
+          Requires<[OptForSpeed, HasSSE2]>;
+def : Pat<(v4f32 (vector_shuffle VR128:$src, (undef),
+                 UNPCKH_v_undef_shuffle_mask:$sm)),
+          (PSHUFDri VR128:$src, PSHUFD_shuffle_mask:$sm)>,
+          Requires<[OptForSpeed, HasSSE2]>;
+}
 let AddedComplexity = 10 in {
 def : Pat<(v4f32 (vector_shuffle VR128:$src, (undef),
                   UNPCKH_v_undef_shuffle_mask)),
@@ -2973,7 +2994,7 @@
           (PUNPCKHDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
 }
 
-let AddedComplexity = 15 in {
+let AddedComplexity = 20 in {
 // vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS
 def : Pat<(v4i32 (vector_shuffle VR128:$src1, VR128:$src2,
                   MOVHP_shuffle_mask)),