CellSPU:
- Ensure that operation legalization emits the proper FDIV libcall when needed.
- Fix various bugs encountered during llvm-spu-gcc build, along with various
  cleanups.
- Start supporting double precision comparisons needed for the remainder of the
  libgcc2 build. Discovered an interesting DAGCombiner behavior, which is
  currently worked around via custom lowering (64-bit constants are not legal on
  CellSPU, but DAGCombiner insists on inserting one anyway).
- Update README.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@62664 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp
index 92bd928..124f1a7 100644
--- a/lib/Target/CellSPU/SPUISelLowering.cpp
+++ b/lib/Target/CellSPU/SPUISelLowering.cpp
@@ -92,6 +92,9 @@
   setUseUnderscoreSetJmp(true);
   setUseUnderscoreLongJmp(true);
 
+  // Set RTLIB libcall names as used by SPU:
+  setLibcallName(RTLIB::DIV_F64, "__fast_divdf3");
+
   // Set up the SPU's register classes:
   addRegisterClass(MVT::i8,   SPU::R8CRegisterClass);
   addRegisterClass(MVT::i16,  SPU::R16CRegisterClass);
@@ -183,6 +186,9 @@
   setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
   setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
 
+  // Make sure that DAGCombine doesn't insert illegal 64-bit constants
+  setOperationAction(ISD::FABS,  MVT::f64, Custom);
+
   // SPU can do rotate right and left, so legalize it... but customize for i8
   // because instructions don't exist.
 
@@ -243,6 +249,7 @@
   setOperationAction(ISD::SETCC, MVT::i16,   Legal);
   setOperationAction(ISD::SETCC, MVT::i32,   Legal);
   setOperationAction(ISD::SETCC, MVT::i64,   Legal);
+  setOperationAction(ISD::SETCC, MVT::f64,   Custom);
 
   // Custom lower i128 -> i64 truncates
   setOperationAction(ISD::TRUNCATE, MVT::i64, Custom);
@@ -410,6 +417,9 @@
     node_names[(unsigned) SPUISD::VEC_SRA] = "SPUISD::VEC_SRA";
     node_names[(unsigned) SPUISD::VEC_ROTL] = "SPUISD::VEC_ROTL";
     node_names[(unsigned) SPUISD::VEC_ROTR] = "SPUISD::VEC_ROTR";
+    node_names[(unsigned) SPUISD::ROTBYTES_LEFT] = "SPUISD::ROTBYTES_LEFT";
+    node_names[(unsigned) SPUISD::ROTBYTES_LEFT_BITS] =
+            "SPUISD::ROTBYTES_LEFT_BITS";
     node_names[(unsigned) SPUISD::SELECT_MASK] = "SPUISD::SELECT_MASK";
     node_names[(unsigned) SPUISD::SELB] = "SPUISD::SELB";
     node_names[(unsigned) SPUISD::ADD64_MARKER] = "SPUISD::ADD64_MARKER";
@@ -1552,12 +1562,9 @@
   return false;  // Can't be a splat if two pieces don't match.
 }
 
-// If this is a case we can't handle, return null and let the default
-// expansion code take care of it.  If we CAN select this case, and if it
-// selects to a single instruction, return Op.  Otherwise, if we can codegen
-// this case more efficiently than a constant pool load, lower it to the
-// sequence of ops that should be used.
-static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
+//! Lower a BUILD_VECTOR instruction creatively:
+SDValue
+SPU::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
   MVT VT = Op.getValueType();
   // If this is a vector of constants or undefs, get the bits.  A bit in
   // UndefBits is set if the corresponding element of the vector is an
@@ -1575,6 +1582,11 @@
 
   switch (VT.getSimpleVT()) {
   default:
+    cerr << "CellSPU: Unhandled VT in LowerBUILD_VECTOR, VT = "
+         << VT.getMVTString()
+         << "\n";
+    abort();
+    /*NOTREACHED*/
   case MVT::v4f32: {
     uint32_t Value32 = SplatBits;
     assert(SplatSize == 4
@@ -2188,32 +2200,32 @@
 
 //! Generate the carry-generate shuffle mask.
 SDValue SPU::getCarryGenerateShufMask(SelectionDAG &DAG) {
-SmallVector<SDValue, 16> ShufBytes;
+  SmallVector<SDValue, 16 > ShufBytes;
 
-// Create the shuffle mask for "rotating" the borrow up one register slot
-// once the borrow is generated.
-ShufBytes.push_back(DAG.getConstant(0x04050607, MVT::i32));
-ShufBytes.push_back(DAG.getConstant(0x80808080, MVT::i32));
-ShufBytes.push_back(DAG.getConstant(0x0c0d0e0f, MVT::i32));
-ShufBytes.push_back(DAG.getConstant(0x80808080, MVT::i32));
+  // Create the shuffle mask for "rotating" the borrow up one register slot
+  // once the borrow is generated.
+  ShufBytes.push_back(DAG.getConstant(0x04050607, MVT::i32));
+  ShufBytes.push_back(DAG.getConstant(0x80808080, MVT::i32));
+  ShufBytes.push_back(DAG.getConstant(0x0c0d0e0f, MVT::i32));
+  ShufBytes.push_back(DAG.getConstant(0x80808080, MVT::i32));
 
-return DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
-                   &ShufBytes[0], ShufBytes.size());
+  return DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
+                     &ShufBytes[0], ShufBytes.size());
 }
 
 //! Generate the borrow-generate shuffle mask
 SDValue SPU::getBorrowGenerateShufMask(SelectionDAG &DAG) {
-SmallVector<SDValue, 16> ShufBytes;
+  SmallVector<SDValue, 16 > ShufBytes;
 
-// Create the shuffle mask for "rotating" the borrow up one register slot
-// once the borrow is generated.
-ShufBytes.push_back(DAG.getConstant(0x04050607, MVT::i32));
-ShufBytes.push_back(DAG.getConstant(0xc0c0c0c0, MVT::i32));
-ShufBytes.push_back(DAG.getConstant(0x0c0d0e0f, MVT::i32));
-ShufBytes.push_back(DAG.getConstant(0xc0c0c0c0, MVT::i32));
+  // Create the shuffle mask for "rotating" the borrow up one register slot
+  // once the borrow is generated.
+  ShufBytes.push_back(DAG.getConstant(0x04050607, MVT::i32));
+  ShufBytes.push_back(DAG.getConstant(0xc0c0c0c0, MVT::i32));
+  ShufBytes.push_back(DAG.getConstant(0x0c0d0e0f, MVT::i32));
+  ShufBytes.push_back(DAG.getConstant(0xc0c0c0c0, MVT::i32));
 
-return DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
-                   &ShufBytes[0], ShufBytes.size());
+  return DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32,
+                     &ShufBytes[0], ShufBytes.size());
 }
 
 //! Lower byte immediate operations for v16i8 vectors:
@@ -2372,6 +2384,83 @@
   return SDValue();
 }
 
+//! Lower ISD::FABS
+/*!
+ DAGCombine does the same basic reduction: convert the double to i64 and mask
+ off the sign bit. Unfortunately, DAGCombine inserts the i64 constant, which
+ CellSPU has to legalize. Hence, the custom lowering.
+ */
+
+static SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) {
+  MVT OpVT = Op.getValueType();
+  MVT IntVT(MVT::i64);
+  SDValue Op0 = Op.getOperand(0);
+
+  assert(OpVT == MVT::f64 && "LowerFABS: expecting MVT::f64!\n");
+
+  SDValue iABS =
+          DAG.getNode(ISD::AND, IntVT,
+                      DAG.getNode(ISD::BIT_CONVERT, IntVT, Op0),
+                      DAG.getConstant(~IntVT.getIntegerVTSignBit(), IntVT));
+
+  return DAG.getNode(ISD::BIT_CONVERT, MVT::f64, iABS);
+}
+
+//! Lower ISD::SETCC
+/*!
+ This handles MVT::f64 (double floating point) condition lowering
+ */
+
+static SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG,
+                          const TargetLowering &TLI) {
+  SDValue lhs = Op.getOperand(0);
+  SDValue rhs = Op.getOperand(1);
+  CondCodeSDNode *CC = dyn_cast<CondCodeSDNode > (Op.getOperand(2));
+  MVT lhsVT = lhs.getValueType();
+  SDValue posNaN = DAG.getConstant(0x7ff0000000000001ULL, MVT::i64);
+
+  assert(CC != 0 && "LowerSETCC: CondCodeSDNode should not be null here!\n");
+  assert(lhsVT == MVT::f64 && "LowerSETCC: type other than MVT::64\n");
+
+  switch (CC->get()) {
+  case ISD::SETOEQ:
+  case ISD::SETOGT:
+  case ISD::SETOGE:
+  case ISD::SETOLT:
+  case ISD::SETOLE:
+  case ISD::SETONE:
+    cerr << "CellSPU ISel Select: unimplemented f64 condition\n";
+    abort();
+    break;
+  case ISD::SETO: {
+    SDValue lhsfabs = DAG.getNode(ISD::FABS, MVT::f64, lhs);
+    SDValue i64lhs =
+            DAG.getNode(ISD::BIT_CONVERT, MVT::i64, lhsfabs);
+
+    return DAG.getSetCC(MVT::i32, i64lhs, posNaN, ISD::SETLT);
+  }
+  case ISD::SETUO: {
+    SDValue lhsfabs = DAG.getNode(ISD::FABS, MVT::f64, lhs);
+    SDValue i64lhs =
+            DAG.getNode(ISD::BIT_CONVERT, MVT::i64, lhsfabs);
+
+    return DAG.getSetCC(MVT::i32, i64lhs, posNaN, ISD::SETGE);
+  }
+  case ISD::SETUEQ:
+  case ISD::SETUGT:
+  case ISD::SETUGE:
+  case ISD::SETULT:
+  case ISD::SETULE:
+  case ISD::SETUNE:
+  default:
+    cerr << "CellSPU ISel Select: unimplemented f64 condition\n";
+    abort();
+    break;
+  }
+
+  return SDValue();
+}
+
 //! Lower ISD::SELECT_CC
 /*!
   ISD::SELECT_CC can (generally) be implemented directly on the SPU using the
@@ -2501,9 +2590,12 @@
     break;
   }
 
+  case ISD::FABS:
+    return LowerFABS(Op, DAG);
+
   // Vector-related lowering.
   case ISD::BUILD_VECTOR:
-    return LowerBUILD_VECTOR(Op, DAG);
+    return SPU::LowerBUILD_VECTOR(Op, DAG);
   case ISD::SCALAR_TO_VECTOR:
     return LowerSCALAR_TO_VECTOR(Op, DAG);
   case ISD::VECTOR_SHUFFLE:
@@ -2530,6 +2622,9 @@
   case ISD::SELECT_CC:
     return LowerSELECT_CC(Op, DAG, *this);
 
+  case ISD::SETCC:
+    return LowerSETCC(Op, DAG, *this);
+
   case ISD::TRUNCATE:
     return LowerTRUNCATE(Op, DAG);
   }
@@ -2656,8 +2751,8 @@
   }
   case SPUISD::IndirectAddr: {
     if (!ST->usingLargeMem() && Op0.getOpcode() == SPUISD::AFormAddr) {
-      ConstantSDNode *CN = cast<ConstantSDNode>(N->getOperand(1));
-      if (CN->getZExtValue() == 0) {
+      ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1));
+      if (CN != 0 && CN->getZExtValue() == 0) {
         // (SPUindirect (SPUaform <addr>, 0), 0) ->
         // (SPUaform <addr>, 0)
 
@@ -2736,7 +2831,7 @@
     break;
   }
   }
-  
+
   // Otherwise, return unchanged.
 #ifndef NDEBUG
   if (Result.getNode()) {
@@ -2809,41 +2904,18 @@
                                                   unsigned Depth ) const {
 #if 0
   const uint64_t uint64_sizebits = sizeof(uint64_t) * 8;
-#endif
 
   switch (Op.getOpcode()) {
   default:
     // KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);
     break;
-
-#if 0
   case CALL:
   case SHUFB:
   case SHUFFLE_MASK:
   case CNTB:
-#endif
-
-  case SPUISD::PREFSLOT2VEC: {
-    SDValue Op0 = Op.getOperand(0);
-    MVT Op0VT = Op0.getValueType();
-    unsigned Op0VTBits = Op0VT.getSizeInBits();
-    uint64_t InMask = Op0VT.getIntegerVTBitMask();
-    KnownZero |= APInt(Op0VTBits, ~InMask, false);
-    KnownOne |= APInt(Op0VTBits, InMask, false);
-    break;
-  }
-
+  case SPUISD::PREFSLOT2VEC:
   case SPUISD::LDRESULT:
-  case SPUISD::VEC2PREFSLOT: {
-    MVT OpVT = Op.getValueType();
-    unsigned OpVTBits = OpVT.getSizeInBits();
-    uint64_t InMask = OpVT.getIntegerVTBitMask();
-    KnownZero |= APInt(OpVTBits, ~InMask, false);
-    KnownOne |= APInt(OpVTBits, InMask, false);
-    break;
-  }
-
-#if 0
+  case SPUISD::VEC2PREFSLOT:
   case SPUISD::SHLQUAD_L_BITS:
   case SPUISD::SHLQUAD_L_BYTES:
   case SPUISD::VEC_SHL:
@@ -2854,8 +2926,8 @@
   case SPUISD::ROTBYTES_LEFT:
   case SPUISD::SELECT_MASK:
   case SPUISD::SELB:
-#endif
   }
+#endif
 }
 
 unsigned