Add preliminary v2i32 support to the SPU backend. As there are no
such registers in SPU, this support boils down to "emulating"
them by duplicating instructions on the general-purpose registers.
This adds the most basic operations on v2i32: parameter passing,
addition, subtraction, multiplication, and a few others.
llvm-svn: 110035
diff --git a/llvm/lib/Target/CellSPU/SPUCallingConv.td b/llvm/lib/Target/CellSPU/SPUCallingConv.td
index ec2f663..047eeb4 100644
--- a/llvm/lib/Target/CellSPU/SPUCallingConv.td
+++ b/llvm/lib/Target/CellSPU/SPUCallingConv.td
@@ -37,7 +37,7 @@
 //===----------------------------------------------------------------------===//
 def CCC_SPU : CallingConv<[
   CCIfType<[i8, i16, i32, i64, i128, f32, f64, 
-            v16i8, v8i16, v4i32, v4f32, v2i64, v2f64],
+            v16i8, v8i16, v4i32, v4f32, v2i64, v2f64, v2i32],
             CCAssignToReg<[R3,   R4,  R5,  R6,  R7,  R8,  R9, R10, R11,
                            R12, R13, R14, R15, R16, R17, R18, R19, R20,
                            R21, R22, R23, R24, R25, R26, R27, R28, R29,
diff --git a/llvm/lib/Target/CellSPU/SPUISelLowering.cpp b/llvm/lib/Target/CellSPU/SPUISelLowering.cpp
index ece19b9..bcde579 100644
--- a/llvm/lib/Target/CellSPU/SPUISelLowering.cpp
+++ b/llvm/lib/Target/CellSPU/SPUISelLowering.cpp
@@ -1067,6 +1067,7 @@
       case MVT::v4i32:
       case MVT::v8i16:
       case MVT::v16i8:
+      case MVT::v2i32:
         ArgRegClass = &SPU::VECREGRegClass;
         break;
       }
@@ -1622,8 +1623,7 @@
     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, T, T, T, T);
   }
   case MVT::v2i32: {
-    SDValue T = DAG.getConstant(unsigned(SplatBits), VT.getVectorElementType());
-    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, T, T);
+    return SDValue();
   }
   case MVT::v2i64: {
     return SPU::LowerV2I64Splat(VT, DAG, SplatBits, dl);
@@ -1768,6 +1768,9 @@
   } else if (EltVT == MVT::i16) {
     V2EltIdx0 = 8;
     maskVT = MVT::v8i16;
+  } else if (VecVT == MVT::v2i32 || VecVT == MVT::v2f32 ) {
+    V2EltIdx0 = 2;
+    maskVT = MVT::v4i32;
   } else if (EltVT == MVT::i32 || EltVT == MVT::f32) {
     V2EltIdx0 = 4;
     maskVT = MVT::v4i32;
@@ -1847,6 +1850,15 @@
       for (unsigned j = 0; j < BytesPerElement; ++j)
         ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j,MVT::i8));
     }
+    // For half vectors, pad the mask with zeros for the second half.
+    // This is needed because the mask is assumed to be a full vector elsewhere
+    // in the SPU backend.
+    if(VecVT == MVT::v2i32 || VecVT == MVT::v2f32)
+    for( unsigned i = 0; i < 2; ++i )
+    {
+      for (unsigned j = 0; j < BytesPerElement; ++j)
+        ResultMask.push_back(DAG.getConstant(0,MVT::i8));
+    }
 
     SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8,
                                     &ResultMask[0], ResultMask.size());
@@ -1877,6 +1889,7 @@
     case MVT::v4f32: n_copies = 4; VT = MVT::f32; break;
     case MVT::v2i64: n_copies = 2; VT = MVT::i64; break;
     case MVT::v2f64: n_copies = 2; VT = MVT::f64; break;
+    case MVT::v2i32: n_copies = 2; VT = MVT::i32; break;
     }
 
     SDValue CValue = DAG.getConstant(CN->getZExtValue(), VT);
@@ -1997,7 +2010,7 @@
     // Variable index: Rotate the requested element into slot 0, then replicate
     // slot 0 across the vector
     EVT VecVT = N.getValueType();
-    if (!VecVT.isSimple() || !VecVT.isVector() || !VecVT.is128BitVector()) {
+    if (!VecVT.isSimple() || !VecVT.isVector()) {
       report_fatal_error("LowerEXTRACT_VECTOR_ELT: Must have a simple, 128-bit"
                         "vector type!");
     }
diff --git a/llvm/lib/Target/CellSPU/SPUInstrInfo.td b/llvm/lib/Target/CellSPU/SPUInstrInfo.td
index a7fb14c..bc9668a 100644
--- a/llvm/lib/Target/CellSPU/SPUInstrInfo.td
+++ b/llvm/lib/Target/CellSPU/SPUInstrInfo.td
@@ -607,7 +607,7 @@
 multiclass AddInstruction {
   def v4i32: AVecInst<v4i32>;
   def v16i8: AVecInst<v16i8>;
-  
+  def v2i32: AVecInst<v2i32>;
   def r32:   ARegInst<R32C>;
 }
 
@@ -672,6 +672,12 @@
   "sf\t$rT, $rA, $rB", IntegerOp,
   [(set (v4i32 VECREG:$rT), (sub (v4i32 VECREG:$rB), (v4i32 VECREG:$rA)))]>;
 
+def SF2vec : RRForm<0b00000010000, (outs VECREG:$rT),
+                                  (ins VECREG:$rA, VECREG:$rB),
+  "sf\t$rT, $rA, $rB", IntegerOp,
+  [(set (v2i32 VECREG:$rT), (sub (v2i32 VECREG:$rB), (v2i32 VECREG:$rA)))]>;
+
+
 def SFr32 : RRForm<0b00000010000, (outs R32C:$rT), (ins R32C:$rA, R32C:$rB),
   "sf\t$rT, $rA, $rB", IntegerOp,
   [(set R32C:$rT, (sub R32C:$rB, R32C:$rA))]>;
@@ -829,6 +835,10 @@
   MPYUInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
            [/* no pattern */]>;
 
+def MPYUv2i32:
+  MPYUInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+           [/* no pattern */]>;
+
 def MPYUr16:
   MPYUInst<(outs R32C:$rT), (ins R16C:$rA, R16C:$rB),
            [(set R32C:$rT, (mul (zext R16C:$rA), (zext R16C:$rB)))]>;
@@ -908,6 +918,10 @@
     MPYHInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
              [/* no pattern */]>;
 
+def MPYHv2i32:
+    MPYHInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+             [/* no pattern */]>;
+
 def MPYHr32:
     MPYHInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB),
              [/* no pattern */]>;
@@ -1561,6 +1575,9 @@
 def : Pat<(v4i32 (SPUprefslot2vec R32C:$rA)),
           (ORv4i32_i32 R32C:$rA)>;
 
+def : Pat<(v2i32 (SPUprefslot2vec R32C:$rA)),
+          (ORv4i32_i32 R32C:$rA)>;
+
 def : Pat<(v2i64 (SPUprefslot2vec R64C:$rA)),
           (ORv2i64_i64 R64C:$rA)>;
 
@@ -1582,6 +1599,9 @@
 def : Pat<(SPUvec2prefslot (v4i32 VECREG:$rA)),
           (ORi32_v4i32 VECREG:$rA)>;
 
+def : Pat<(SPUvec2prefslot (v2i32 VECREG:$rA)),
+          (ORi32_v4i32 VECREG:$rA)>;
+
 def : Pat<(SPUvec2prefslot (v2i64 VECREG:$rA)),
           (ORi64_v2i64 VECREG:$rA)>;
 
@@ -2123,6 +2143,8 @@
   def v8i16_m32 : SHUFBVecInst<v8i16, v4i32>;
   def v4i32     : SHUFBVecInst<v4i32, v16i8>;
   def v4i32_m32 : SHUFBVecInst<v4i32, v4i32>;
+  def v2i32     : SHUFBVecInst<v2i32, v16i8>;
+  def v2i32_m32 : SHUFBVecInst<v2i32, v4i32>;
   def v2i64     : SHUFBVecInst<v2i64, v16i8>;
   def v2i64_m32 : SHUFBVecInst<v2i64, v4i32>;
 
diff --git a/llvm/lib/Target/CellSPU/SPUMathInstr.td b/llvm/lib/Target/CellSPU/SPUMathInstr.td
index ed7129e..7205593 100644
--- a/llvm/lib/Target/CellSPU/SPUMathInstr.td
+++ b/llvm/lib/Target/CellSPU/SPUMathInstr.td
@@ -39,7 +39,7 @@
                      (FSMBIv8i16 0xcccc))>;
                  
 //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
-// v4i32, i32 multiply instruction sequence:
+// v4i32, v2i32, i32 multiply instruction sequence:
 //-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
 
 def MPYv4i32:
@@ -49,6 +49,14 @@
                        (v4i32 (MPYHv4i32 VECREG:$rB, VECREG:$rA)))),
         (v4i32 (MPYUv4i32 VECREG:$rA, VECREG:$rB)))>;
 
+def MPYv2i32:
+  Pat<(mul (v2i32 VECREG:$rA), (v2i32 VECREG:$rB)),
+      (Av2i32
+        (v2i32 (Av2i32 (v2i32 (MPYHv2i32 VECREG:$rA, VECREG:$rB)),
+                       (v2i32 (MPYHv2i32 VECREG:$rB, VECREG:$rA)))),
+        (v2i32 (MPYUv2i32 VECREG:$rA, VECREG:$rB)))>;
+
+
 def MPYi32:
   Pat<(mul R32C:$rA, R32C:$rB),
       (Ar32
diff --git a/llvm/test/CodeGen/CellSPU/v2i32.ll b/llvm/test/CodeGen/CellSPU/v2i32.ll
new file mode 100644
index 0000000..be3822a
--- /dev/null
+++ b/llvm/test/CodeGen/CellSPU/v2i32.ll
@@ -0,0 +1,57 @@
+;RUN: llc --march=cellspu %s -o - | FileCheck %s
+%vec = type <2 x i32>
+
+define %vec @test_ret(%vec %param)
+{
+;CHECK:	bi	$lr
+  ret %vec %param
+}
+
+define %vec @test_add(%vec %param)
+{
+;CHECK: a $3, $3, $3
+  %1 = add %vec %param, %param
+;CHECK: bi $lr
+  ret %vec %1
+}
+
+define %vec @test_sub(%vec %param)
+{
+;CHECK: sf $3, $4, $3
+  %1 = sub %vec %param, <i32 1, i32 1>
+
+;CHECK: bi $lr
+  ret %vec %1
+}
+
+define %vec @test_mul(%vec %param)
+{
+;CHECK: mpyu
+;CHECK: mpyh
+;CHECK: a
+;CHECK: a $3
+  %1 = mul %vec %param, %param
+
+;CHECK: bi $lr
+  ret %vec %1
+}
+
+define <2 x i32> @test_splat(i32 %param ) {
+;TODO insertelement transforms to a PREFSLOT2VEC, which transforms to the
+;     somewhat redundant:
+;CHECK-NOT or $3, $3, $3
+;CHECK: lqa
+;CHECK: shufb
+  %sv = insertelement <1 x i32> undef, i32 %param, i32 0 
+  %rv = shufflevector <1 x i32> %sv, <1 x i32> undef, <2 x i32> zeroinitializer 
+;CHECK: bi $lr
+  ret <2 x i32> %rv
+}
+
+define i32 @test_extract() {
+;CHECK: shufb $3
+  %rv = extractelement <2 x i32> zeroinitializer, i32 undef ; <i32> [#uses=1]
+;CHECK: bi $lr
+  ret i32 %rv
+}
+