[MachineScheduler]Add support for store clustering

Perform store clustering just like load clustering. This change add
StoreClusterMutation in machine-scheduler. To control StoreClusterMutation,
added enableClusterStores() in TargetInstrInfo.h. This is enabled only on
AArch64 for now.

This change also add support for unscaled stores which were not handled in
getMemOpBaseRegImmOfs().

llvm-svn: 266437
diff --git a/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll b/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
index f760708..0cfbe59 100644
--- a/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
@@ -5,12 +5,12 @@
 ; Test ldr clustering.
 ; CHECK: ********** MI Scheduling **********
 ; CHECK-LABEL: ldr_int:BB#0
-; CHECK: Cluster loads SU(1) - SU(2)
+; CHECK: Cluster ld/st SU(1) - SU(2)
 ; CHECK: SU(1):   %vreg{{[0-9]+}}<def> = LDRWui
 ; CHECK: SU(2):   %vreg{{[0-9]+}}<def> = LDRWui
 ; EXYNOS: ********** MI Scheduling **********
 ; EXYNOS-LABEL: ldr_int:BB#0
-; EXYNOS: Cluster loads SU(1) - SU(2)
+; EXYNOS: Cluster ld/st SU(1) - SU(2)
 ; EXYNOS: SU(1):   %vreg{{[0-9]+}}<def> = LDRWui
 ; EXYNOS: SU(2):   %vreg{{[0-9]+}}<def> = LDRWui
 define i32 @ldr_int(i32* %a) nounwind {
@@ -25,12 +25,12 @@
 ; Test ldpsw clustering
 ; CHECK: ********** MI Scheduling **********
 ; CHECK-LABEL: ldp_sext_int:BB#0
-; CHECK: Cluster loads SU(1) - SU(2)
+; CHECK: Cluster ld/st SU(1) - SU(2)
 ; CHECK: SU(1):   %vreg{{[0-9]+}}<def> = LDRSWui
 ; CHECK: SU(2):   %vreg{{[0-9]+}}<def> = LDRSWui
 ; EXYNOS: ********** MI Scheduling **********
 ; EXYNOS-LABEL: ldp_sext_int:BB#0
-; EXYNOS: Cluster loads SU(1) - SU(2)
+; EXYNOS: Cluster ld/st SU(1) - SU(2)
 ; EXYNOS: SU(1):   %vreg{{[0-9]+}}<def> = LDRSWui
 ; EXYNOS: SU(2):   %vreg{{[0-9]+}}<def> = LDRSWui
 define i64 @ldp_sext_int(i32* %p) nounwind {
@@ -46,12 +46,12 @@
 ; Test ldur clustering.
 ; CHECK: ********** MI Scheduling **********
 ; CHECK-LABEL: ldur_int:BB#0
-; CHECK: Cluster loads SU(2) - SU(1)
+; CHECK: Cluster ld/st SU(2) - SU(1)
 ; CHECK: SU(1):   %vreg{{[0-9]+}}<def> = LDURWi
 ; CHECK: SU(2):   %vreg{{[0-9]+}}<def> = LDURWi
 ; EXYNOS: ********** MI Scheduling **********
 ; EXYNOS-LABEL: ldur_int:BB#0
-; EXYNOS: Cluster loads SU(2) - SU(1)
+; EXYNOS: Cluster ld/st SU(2) - SU(1)
 ; EXYNOS: SU(1):   %vreg{{[0-9]+}}<def> = LDURWi
 ; EXYNOS: SU(2):   %vreg{{[0-9]+}}<def> = LDURWi
 define i32 @ldur_int(i32* %a) nounwind {
@@ -66,12 +66,12 @@
 ; Test sext + zext clustering.
 ; CHECK: ********** MI Scheduling **********
 ; CHECK-LABEL: ldp_half_sext_zext_int:BB#0
-; CHECK: Cluster loads SU(3) - SU(4)
+; CHECK: Cluster ld/st SU(3) - SU(4)
 ; CHECK: SU(3):   %vreg{{[0-9]+}}<def> = LDRSWui
 ; CHECK: SU(4):   %vreg{{[0-9]+}}:sub_32<def,read-undef> = LDRWui
 ; EXYNOS: ********** MI Scheduling **********
 ; EXYNOS-LABEL: ldp_half_sext_zext_int:BB#0
-; EXYNOS: Cluster loads SU(3) - SU(4)
+; EXYNOS: Cluster ld/st SU(3) - SU(4)
 ; EXYNOS: SU(3):   %vreg{{[0-9]+}}<def> = LDRSWui
 ; EXYNOS: SU(4):   %vreg{{[0-9]+}}:sub_32<def,read-undef> = LDRWui
 define i64 @ldp_half_sext_zext_int(i64* %q, i32* %p) nounwind {
@@ -89,12 +89,12 @@
 ; Test zext + sext clustering.
 ; CHECK: ********** MI Scheduling **********
 ; CHECK-LABEL: ldp_half_zext_sext_int:BB#0
-; CHECK: Cluster loads SU(3) - SU(4)
+; CHECK: Cluster ld/st SU(3) - SU(4)
 ; CHECK: SU(3):   %vreg{{[0-9]+}}:sub_32<def,read-undef> = LDRWui
 ; CHECK: SU(4):   %vreg{{[0-9]+}}<def> = LDRSWui
 ; EXYNOS: ********** MI Scheduling **********
 ; EXYNOS-LABEL: ldp_half_zext_sext_int:BB#0
-; EXYNOS: Cluster loads SU(3) - SU(4)
+; EXYNOS: Cluster ld/st SU(3) - SU(4)
 ; EXYNOS: SU(3):   %vreg{{[0-9]+}}:sub_32<def,read-undef> = LDRWui
 ; EXYNOS: SU(4):   %vreg{{[0-9]+}}<def> = LDRSWui
 define i64 @ldp_half_zext_sext_int(i64* %q, i32* %p) nounwind {
@@ -112,12 +112,12 @@
 ; Verify we don't cluster volatile loads.
 ; CHECK: ********** MI Scheduling **********
 ; CHECK-LABEL: ldr_int_volatile:BB#0
-; CHECK-NOT: Cluster loads
+; CHECK-NOT: Cluster ld/st
 ; CHECK: SU(1):   %vreg{{[0-9]+}}<def> = LDRWui
 ; CHECK: SU(2):   %vreg{{[0-9]+}}<def> = LDRWui
 ; EXYNOS: ********** MI Scheduling **********
 ; EXYNOS-LABEL: ldr_int_volatile:BB#0
-; EXYNOS-NOT: Cluster loads
+; EXYNOS-NOT: Cluster ld/st
 ; EXYNOS: SU(1):   %vreg{{[0-9]+}}<def> = LDRWui
 ; EXYNOS: SU(2):   %vreg{{[0-9]+}}<def> = LDRWui
 define i32 @ldr_int_volatile(i32* %a) nounwind {
@@ -132,12 +132,12 @@
 ; Test ldq clustering (no clustering for Exynos).
 ; CHECK: ********** MI Scheduling **********
 ; CHECK-LABEL: ldq_cluster:BB#0
-; CHECK: Cluster loads SU(1) - SU(3)
+; CHECK: Cluster ld/st SU(1) - SU(3)
 ; CHECK: SU(1):   %vreg{{[0-9]+}}<def> = LDRQui
 ; CHECK: SU(3):   %vreg{{[0-9]+}}<def> = LDRQui
 ; EXYNOS: ********** MI Scheduling **********
 ; EXYNOS-LABEL: ldq_cluster:BB#0
-; EXYNOS-NOT: Cluster loads
+; EXYNOS-NOT: Cluster ld/st
 define <2 x i64> @ldq_cluster(i64* %p) {
   %a1 = bitcast i64* %p to <2 x i64>*
   %tmp1 = load <2 x i64>, < 2 x i64>* %a1, align 8