- Remove Tilmann's custom truncate lowering: it completely defeated
  DAGcombine's ability to find reasons to remove truncates when they were not
  needed. Consequently, the CellSPU backend would produce correct, but _really
  slow and horrible_, code.

  Replaced with instruction sequences that do the equivalent truncation in
  SPUInstrInfo.td.

- Re-examine how unaligned loads and stores work. Generated unaligned
  load code has been tested on the CellSPU hardware; see i32operations.c
  and i64operations.c in test/CodeGen/CellSPU/useful-harnesses.  (While these
  may be toy test programs, they do show that some real-world code compiles
  correctly.)

- Fix truncating stores in bug 3193 (note: unpack_df.ll will still make llc
  fault because i64 ult is not yet implemented.)

- Added i64 eq and ne for setcc and select/setcc; started new instruction
  information file for them in SPU64InstrInfo.td. Additional i64 operations
  should be added to this file and not to SPUInstrInfo.td.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@61447 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/test/CodeGen/CellSPU/call_indirect.ll b/test/CodeGen/CellSPU/call_indirect.ll
index 4b0a957..9be714e 100644
--- a/test/CodeGen/CellSPU/call_indirect.ll
+++ b/test/CodeGen/CellSPU/call_indirect.ll
@@ -2,7 +2,7 @@
 ; RUN: llvm-as -o - %s | llc -march=cellspu -mattr=large_mem > %t2.s
 ; RUN: grep bisl    %t1.s | count 7
 ; RUN: grep ila     %t1.s | count 1
-; RUN: grep rotqbyi %t1.s | count 4
+; RUN: grep rotqby  %t1.s | count 6
 ; RUN: grep lqa     %t1.s | count 1
 ; RUN: grep lqd     %t1.s | count 12
 ; RUN: grep dispatch_tab %t1.s | count 5
diff --git a/test/CodeGen/CellSPU/icmp64.ll b/test/CodeGen/CellSPU/icmp64.ll
new file mode 100644
index 0000000..d2b4fc0
--- /dev/null
+++ b/test/CodeGen/CellSPU/icmp64.ll
@@ -0,0 +1,144 @@
+; RUN: llvm-as -o - %s | llc -march=cellspu > %t1.s
+; RUN: grep ceq                                %t1.s | count 4
+; RUN: grep cgti                               %t1.s | count 4
+; RUN: grep gb                                 %t1.s | count 4
+; RUN: grep fsm                                %t1.s | count 2
+; RUN: grep xori                               %t1.s | count 1
+; RUN: grep selb                               %t1.s | count 2
+
+target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
+target triple = "spu"
+
+; $3 = %arg1, $4 = %arg2, $5 = %val1, $6 = %val2   (four-argument functions)
+; $3 = %arg1, $4 = %val1, $5 = %val2                (three-argument form; none in this file)
+;
+; i64 integer comparisons:
+define i64 @icmp_eq_select_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind {
+entry:
+       %A = icmp eq i64 %arg1, %arg2
+       %B = select i1 %A, i64 %val1, i64 %val2
+       ret i64 %B
+}
+
+define i1 @icmp_eq_setcc_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind {
+entry:
+       %A = icmp eq i64 %arg1, %arg2
+       ret i1 %A
+}
+
+define i64 @icmp_ne_select_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind {
+entry:
+       %A = icmp ne i64 %arg1, %arg2
+       %B = select i1 %A, i64 %val1, i64 %val2
+       ret i64 %B
+}
+
+define i1 @icmp_ne_setcc_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind {
+entry:
+       %A = icmp ne i64 %arg1, %arg2
+       ret i1 %A
+}
+
+;; define i64 @icmp_ugt_select_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind {
+;; entry:
+;;        %A = icmp ugt i64 %arg1, %arg2
+;;        %B = select i1 %A, i64 %val1, i64 %val2
+;;        ret i64 %B
+;; }
+;; 
+;; define i1 @icmp_ugt_setcc_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind {
+;; entry:
+;;        %A = icmp ugt i64 %arg1, %arg2
+;;        ret i1 %A
+;; }
+;; 
+;; define i64 @icmp_uge_select_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind {
+;; entry:
+;;        %A = icmp uge i64 %arg1, %arg2
+;;        %B = select i1 %A, i64 %val1, i64 %val2
+;;        ret i64 %B
+;; }
+;; 
+;; define i1 @icmp_uge_setcc_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind {
+;; entry:
+;;        %A = icmp uge i64 %arg1, %arg2
+;;        ret i1 %A
+;; }
+;; 
+;; define i64 @icmp_ult_select_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind {
+;; entry:
+;;        %A = icmp ult i64 %arg1, %arg2
+;;        %B = select i1 %A, i64 %val1, i64 %val2
+;;        ret i64 %B
+;; }
+;; 
+;; define i1 @icmp_ult_setcc_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind {
+;; entry:
+;;        %A = icmp ult i64 %arg1, %arg2
+;;        ret i1 %A
+;; }
+;; 
+;; define i64 @icmp_ule_select_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind {
+;; entry:
+;;        %A = icmp ule i64 %arg1, %arg2
+;;        %B = select i1 %A, i64 %val1, i64 %val2
+;;        ret i64 %B
+;; }
+;; 
+;; define i1 @icmp_ule_setcc_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind {
+;; entry:
+;;        %A = icmp ule i64 %arg1, %arg2
+;;        ret i1 %A
+;; }
+;; 
+;; define i64 @icmp_sgt_select_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind {
+;; entry:
+;;        %A = icmp sgt i64 %arg1, %arg2
+;;        %B = select i1 %A, i64 %val1, i64 %val2
+;;        ret i64 %B
+;; }
+;; 
+;; define i1 @icmp_sgt_setcc_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind {
+;; entry:
+;;        %A = icmp sgt i64 %arg1, %arg2
+;;        ret i1 %A
+;; }
+;; 
+;; define i64 @icmp_sge_select_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind {
+;; entry:
+;;        %A = icmp sge i64 %arg1, %arg2
+;;        %B = select i1 %A, i64 %val1, i64 %val2
+;;        ret i64 %B
+;; }
+;; 
+;; define i1 @icmp_sge_setcc_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind {
+;; entry:
+;;        %A = icmp sge i64 %arg1, %arg2
+;;        ret i1 %A
+;; }
+;; 
+;; define i64 @icmp_slt_select_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind {
+;; entry:
+;;        %A = icmp slt i64 %arg1, %arg2
+;;        %B = select i1 %A, i64 %val1, i64 %val2
+;;        ret i64 %B
+;; }
+;; 
+;; define i1 @icmp_slt_setcc_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind {
+;; entry:
+;;        %A = icmp slt i64 %arg1, %arg2
+;;        ret i1 %A
+;; }
+;; 
+;; define i64 @icmp_sle_select_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind {
+;; entry:
+;;        %A = icmp sle i64 %arg1, %arg2
+;;        %B = select i1 %A, i64 %val1, i64 %val2
+;;        ret i64 %B
+;; }
+;; 
+;; define i1 @icmp_sle_setcc_i64(i64 %arg1, i64 %arg2, i64 %val1, i64 %val2) nounwind {
+;; entry:
+;;        %A = icmp sle i64 %arg1, %arg2
+;;        ret i1 %A
+;; }
diff --git a/test/CodeGen/CellSPU/stores.ll b/test/CodeGen/CellSPU/stores.ll
index 28d2e5b..f2f35ef 100644
--- a/test/CodeGen/CellSPU/stores.ll
+++ b/test/CodeGen/CellSPU/stores.ll
@@ -3,8 +3,17 @@
 ; RUN: grep {stqd.*16(\$3)}     %t1.s | count 4
 ; RUN: grep 16256               %t1.s | count 2
 ; RUN: grep 16384               %t1.s | count 1
+; RUN: grep 771                 %t1.s | count 4
+; RUN: grep 515                 %t1.s | count 2
+; RUN: grep 1799                %t1.s | count 2
+; RUN: grep 1543                %t1.s | count 5
+; RUN: grep 1029                %t1.s | count 3
 ; RUN: grep {shli.*, 4}         %t1.s | count 4
 ; RUN: grep stqx                %t1.s | count 4
+; RUN: grep ilhu                %t1.s | count 11
+; RUN: grep iohl                %t1.s | count 8
+; RUN: grep shufb               %t1.s | count 15
+; RUN: grep frds                %t1.s | count 1
 
 ; ModuleID = 'stores.bc'
 target datalayout = "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128-i16:16:128-i8:8:128-i1:8:128-a0:0:128-v128:128:128-s0:128:128"
@@ -89,3 +98,54 @@
         store <4 x float> < float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 >, <4 x float>* %arrayidx
         ret void
 }
+
+; Test truncating stores:
+
+define zeroext i8 @tstore_i16_i8(i16 signext %val, i8* %dest) nounwind {
+entry:
+	%conv = trunc i16 %val to i8
+	store i8 %conv, i8* %dest
+	ret i8 %conv
+}
+
+define zeroext i8 @tstore_i32_i8(i32 %val, i8* %dest) nounwind {
+entry:
+	%conv = trunc i32 %val to i8
+	store i8 %conv, i8* %dest
+	ret i8 %conv
+}
+
+define signext i16 @tstore_i32_i16(i32 %val, i16* %dest) nounwind {
+entry:
+	%conv = trunc i32 %val to i16
+	store i16 %conv, i16* %dest
+	ret i16 %conv
+}
+
+define zeroext i8 @tstore_i64_i8(i64 %val, i8* %dest) nounwind {
+entry:
+	%conv = trunc i64 %val to i8
+	store i8 %conv, i8* %dest
+	ret i8 %conv
+}
+
+define signext i16 @tstore_i64_i16(i64 %val, i16* %dest) nounwind {
+entry:
+	%conv = trunc i64 %val to i16
+	store i16 %conv, i16* %dest
+	ret i16 %conv
+}
+
+define i32 @tstore_i64_i32(i64 %val, i32* %dest) nounwind {
+entry:
+	%conv = trunc i64 %val to i32
+	store i32 %conv, i32* %dest
+	ret i32 %conv
+}
+
+define float @tstore_f64_f32(double %val, float* %dest) nounwind {
+entry:
+	%conv = fptrunc double %val to float
+	store float %conv, float* %dest
+	ret float %conv
+}
diff --git a/test/CodeGen/CellSPU/struct_1.ll b/test/CodeGen/CellSPU/struct_1.ll
index 3df7267..82d319d 100644
--- a/test/CodeGen/CellSPU/struct_1.ll
+++ b/test/CodeGen/CellSPU/struct_1.ll
@@ -35,7 +35,7 @@
 ;   int           i2;   // offset 12 [ignored]
 ;   unsigned char c4;   // offset 16 [ignored]
 ;   unsigned char c5;   // offset 17 [ignored]
-;   unsigned char c6;   // offset 18 [ignored]
+;   unsigned char c6;   // offset 18 (rotate left by 14 bytes to byte 3)
 ;   unsigned char c7;   // offset 19 (no rotate, in preferred slot)
 ;   int           i3;   // offset 20 [ignored]
 ;   int           i4;   // offset 24 [ignored]
diff --git a/test/CodeGen/CellSPU/trunc.ll b/test/CodeGen/CellSPU/trunc.ll
index 845feed..1c6e1f6 100644
--- a/test/CodeGen/CellSPU/trunc.ll
+++ b/test/CodeGen/CellSPU/trunc.ll
@@ -1,16 +1,12 @@
 ; RUN: llvm-as -o - %s | llc -march=cellspu > %t1.s
-; RUN: grep shufb   %t1.s | count 9
+; RUN: grep shufb   %t1.s | count 10
 ; RUN: grep {ilhu.*1799}  %t1.s | count 1
-; RUN: grep {ilhu.*771}  %t1.s | count 3
+; RUN: grep {ilhu.*771}  %t1.s | count 1
 ; RUN: grep {ilhu.*1543}  %t1.s | count 1
 ; RUN: grep {ilhu.*1029}  %t1.s | count 1
-; RUN: grep {ilhu.*515}  %t1.s | count 1
-; RUN: grep {iohl.*1799}  %t1.s | count 1
-; RUN: grep {iohl.*771}  %t1.s | count 3
-; RUN: grep {iohl.*1543}  %t1.s | count 2
-; RUN: grep {iohl.*515}  %t1.s | count 1
-; RUN: grep xsbh  %t1.s | count 6
-; RUN: grep sfh  %t1.s | count 5
+; RUN: grep {ilhu.*515}  %t1.s | count 2
+; RUN: grep xsbh  %t1.s | count 2
+; RUN: grep sfh  %t1.s | count 1
 
 ; ModuleID = 'trunc.bc'
 target datalayout = "E-p:32:32:128-i1:8:128-i8:8:128-i16:16:128-i32:32:128-i64:32:128-f32:32:128-f64:64:128-v64:64:64-v128:128:128-a0:0:128-s0:128:128"
@@ -41,23 +37,22 @@
 ;	ret i64 %0
 ;}
 
-define i8 @trunc_i64_i8(i64 %u, i8 %v) nounwind readnone {
+define <16 x i8> @trunc_i64_i8(i64 %u, <16 x i8> %v) nounwind readnone {
 entry:
 	%0 = trunc i64 %u to i8
-	%1 = sub i8 %0, %v
-	ret i8 %1
+        %tmp1 = insertelement <16 x i8> %v, i8 %0, i32 10
+	ret <16 x i8> %tmp1
 }
-define i16 @trunc_i64_i16(i64 %u, i16 %v) nounwind readnone {
+define <8 x i16> @trunc_i64_i16(i64 %u, <8 x i16> %v) nounwind readnone {
 entry:
 	%0 = trunc i64 %u to i16
-        %1 = sub i16 %0, %v
-	ret i16 %1
+        %tmp1 = insertelement <8 x i16> %v, i16 %0, i32 6
+	ret <8 x i16> %tmp1
 }
 define i32 @trunc_i64_i32(i64 %u, i32 %v) nounwind readnone {
 entry:
 	%0 = trunc i64 %u to i32
-	%1 = sub i32 %0, %v
-	ret i32 %1
+	ret i32 %0
 }
 
 define i8 @trunc_i32_i8(i32 %u, i8 %v) nounwind readnone {
@@ -66,16 +61,16 @@
 	%1 = sub i8 %0, %v
 	ret i8 %1
 }
-define i16 @trunc_i32_i16(i32 %u, i16 %v) nounwind readnone {
+define <8 x i16> @trunc_i32_i16(i32 %u, <8 x i16> %v) nounwind readnone {
 entry:
 	%0 = trunc i32 %u to i16
-	%1 = sub i16 %0, %v
-	ret i16 %1
+        %tmp1 = insertelement <8 x i16> %v, i16 %0, i32 3
+	ret <8 x i16> %tmp1
 }
 
-define i8 @trunc_i16_i8(i16 %u, i8 %v) nounwind readnone {
+define <16 x i8> @trunc_i16_i8(i16 %u, <16 x i8> %v) nounwind readnone {
 entry:
 	%0 = trunc i16 %u to i8
-	%1 = sub i8 %0, %v
-	ret i8 %1
+        %tmp1 = insertelement <16 x i8> %v, i8 %0, i32 5
+	ret <16 x i8> %tmp1
 }
diff --git a/test/CodeGen/CellSPU/useful-harnesses/i32operations.c b/test/CodeGen/CellSPU/useful-harnesses/i32operations.c
new file mode 100644
index 0000000..12fc30b
--- /dev/null
+++ b/test/CodeGen/CellSPU/useful-harnesses/i32operations.c
@@ -0,0 +1,69 @@
+#include <stdio.h>
+
+typedef unsigned int  		uint32_t;
+typedef int           		int32_t;
+
+const char *boolstring(int val) {
+  return val ? "true" : "false";
+}
+
+int i32_eq(int32_t a, int32_t b) {
+  return (a == b);
+}
+
+int i32_neq(int32_t a, int32_t b) {
+  return (a != b);
+}
+
+int32_t i32_eq_select(int32_t a, int32_t b, int32_t c, int32_t d) {
+  return ((a == b) ? c : d);
+}
+
+int32_t i32_neq_select(int32_t a, int32_t b, int32_t c, int32_t d) {
+  return ((a != b) ? c : d);
+}
+
+struct pred_s {
+  const char *name;
+  int (*predfunc)(int32_t, int32_t);
+  int (*selfunc)(int32_t, int32_t, int32_t, int32_t);
+};
+
+struct pred_s preds[] = {
+  { "eq",  i32_eq,  i32_eq_select },
+  { "neq", i32_neq, i32_neq_select }
+};
+
+int main(void) {
+  int i;
+  int32_t a = 1234567890;
+  int32_t b =  345678901;
+  int32_t c = 1234500000;
+  int32_t d =      10001;
+  int32_t e =      10000;
+
+  printf("a = %12d (0x%08x)\n", a, a);
+  printf("b = %12d (0x%08x)\n", b, b);
+  printf("c = %12d (0x%08x)\n", c, c);
+  printf("d = %12d (0x%08x)\n", d, d);
+  printf("e = %12d (0x%08x)\n", e, e);
+  printf("----------------------------------------\n");
+
+  for (i = 0; i < sizeof(preds)/sizeof(preds[0]); ++i) {
+    /* One line per operand pair for the predicate, then the select results. */
+    printf("a %s a = %s\n", preds[i].name, boolstring((*preds[i].predfunc)(a, a)));
+    printf("a %s b = %s\n", preds[i].name, boolstring((*preds[i].predfunc)(a, b)));
+    printf("a %s c = %s\n", preds[i].name, boolstring((*preds[i].predfunc)(a, c)));
+    printf("d %s e = %s\n", preds[i].name, boolstring((*preds[i].predfunc)(d, e)));
+    printf("e %s e = %s\n", preds[i].name, boolstring((*preds[i].predfunc)(e, e)));
+
+    printf("a %s a ? c : d = %d\n", preds[i].name, (*preds[i].selfunc)(a, a, c, d));
+    printf("a %s a ? c : d == c (%s)\n", preds[i].name, boolstring((*preds[i].selfunc)(a, a, c, d) == c));
+    printf("a %s b ? c : d = %d\n", preds[i].name, (*preds[i].selfunc)(a, b, c, d));
+    printf("a %s b ? c : d == d (%s)\n", preds[i].name, boolstring((*preds[i].selfunc)(a, b, c, d) == d));
+
+    printf("----------------------------------------\n");
+  }
+
+  return 0;
+}
diff --git a/test/CodeGen/CellSPU/useful-harnesses/i64operations.c b/test/CodeGen/CellSPU/useful-harnesses/i64operations.c
new file mode 100644
index 0000000..7b86070
--- /dev/null
+++ b/test/CodeGen/CellSPU/useful-harnesses/i64operations.c
@@ -0,0 +1,68 @@
+#include <stdio.h>
+
+typedef unsigned long long int	uint64_t;
+typedef long long int   	int64_t;
+
+const char *boolstring(int val) {
+  return val ? "true" : "false";
+}
+
+int i64_eq(int64_t a, int64_t b) {
+  return (a == b);
+}
+
+int i64_neq(int64_t a, int64_t b) {
+  return (a != b);
+}
+
+int64_t i64_eq_select(int64_t a, int64_t b, int64_t c, int64_t d) {
+  return ((a == b) ? c : d);
+}
+
+int64_t i64_neq_select(int64_t a, int64_t b, int64_t c, int64_t d) {
+  return ((a != b) ? c : d);
+}
+
+struct pred_s {
+  const char   *name;
+  int 		(*predfunc)(int64_t, int64_t);
+  int64_t       (*selfunc)(int64_t, int64_t, int64_t, int64_t);
+};
+
+struct pred_s preds[] = {
+  { "eq",  i64_eq,  i64_eq_select },
+  { "neq", i64_neq, i64_neq_select }
+};
+
+int main(void) {
+  int i;
+  int64_t a = 1234567890000LL;
+  int64_t b = 2345678901234LL;
+  int64_t c = 1234567890001LL;
+  int64_t d =         10001LL;
+  int64_t e =         10000LL;
+
+  printf("a = %16lld (0x%016llx)\n", a, a);
+  printf("b = %16lld (0x%016llx)\n", b, b);
+  printf("c = %16lld (0x%016llx)\n", c, c);
+  printf("d = %16lld (0x%016llx)\n", d, d);
+  printf("e = %16lld (0x%016llx)\n", e, e);
+  printf("----------------------------------------\n");
+
+  for (i = 0; i < sizeof(preds)/sizeof(preds[0]); ++i) {
+    printf("a %s a = %s\n", preds[i].name, boolstring((*preds[i].predfunc)(a, a)));
+    printf("a %s b = %s\n", preds[i].name, boolstring((*preds[i].predfunc)(a, b)));
+    printf("a %s c = %s\n", preds[i].name, boolstring((*preds[i].predfunc)(a, c)));
+    printf("d %s e = %s\n", preds[i].name, boolstring((*preds[i].predfunc)(d, e)));
+    printf("e %s e = %s\n", preds[i].name, boolstring((*preds[i].predfunc)(e, e)));
+
+    printf("a %s a ? c : d = %lld\n", preds[i].name, (*preds[i].selfunc)(a, a, c, d));
+    printf("a %s a ? c : d == c (%s)\n", preds[i].name, boolstring((*preds[i].selfunc)(a, a, c, d) == c));
+    printf("a %s b ? c : d = %lld\n", preds[i].name, (*preds[i].selfunc)(a, b, c, d));
+    printf("a %s b ? c : d == d (%s)\n", preds[i].name, boolstring((*preds[i].selfunc)(a, b, c, d) == d));
+
+    printf("----------------------------------------\n");
+  }
+
+  return 0;
+}