[MachineScheduler] Add support for store clustering

Perform store clustering just like load clustering. This change adds a
StoreClusterMutation to the MachineScheduler. To control StoreClusterMutation,
it also adds enableClusterStores() to TargetInstrInfo.h. For now, this is
enabled only on AArch64.
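
As a rough sketch (not the verbatim patch), the new hook mirrors the
existing enableClusterLoads() hook in TargetInstrInfo.h, and the scheduler
only adds the mutation when the target opts in; the exact wiring in
MachineScheduler.cpp may differ from this:

  // TargetInstrInfo.h: target hooks, off by default.
  // AArch64InstrInfo overrides these to return true.
  virtual bool enableClusterLoads() const { return false; }
  virtual bool enableClusterStores() const { return false; }

  // MachineScheduler.cpp, createGenericSchedLive() (sketch): register
  // the store-clustering DAG mutation only for targets that opt in.
  if (DAG->TII->enableClusterStores())
    DAG->addMutation(
        llvm::make_unique<StoreClusterMutation>(DAG->TII, DAG->TRI));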

This change also adds support for unscaled stores, which were not handled
in getMemOpBaseRegImmOfs().
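
For illustration, a hedged sketch of how getMemOpBaseRegImmOfs() in
AArch64InstrInfo.cpp can report unscaled stores (opcode names and operand
layout as in this era's AArch64 backend; the scaled cases already
existed, and the real function handles many more opcodes):

  bool AArch64InstrInfo::getMemOpBaseRegImmOfs(
      MachineInstr *LdSt, unsigned &BaseReg, int64_t &Offset,
      const TargetRegisterInfo *TRI) const {
    switch (LdSt->getOpcode()) {
    default:
      return false;
    // Scaled store: the immediate counts units of the access size.
    case AArch64::STRXui:
      BaseReg = LdSt->getOperand(1).getReg();
      Offset = LdSt->getOperand(2).getImm() * 8; // 8-byte accesses
      return true;
    // Unscaled stores (new here): the immediate is already a byte
    // offset, so no scaling is needed.
    case AArch64::STURXi: // 8-byte access
    case AArch64::STURWi: // 4-byte access
      BaseReg = LdSt->getOperand(1).getReg();
      Offset = LdSt->getOperand(2).getImm();
      return true;
    }
  }

With the base register and byte offset available, the cluster mutation can
sort the stores by offset and add weak edges between adjacent ones, so the
scheduler keeps them together and the AArch64 load/store optimizer can
later pair them into stp, as the tests below check.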

llvm-svn: 266437
diff --git a/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll b/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
new file mode 100644
index 0000000..5cab38e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
@@ -0,0 +1,149 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=misched -aarch64-stp-suppress=false -o - 2>&1 > /dev/null | FileCheck %s
+
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: stp_i64_scale:BB#0
+; CHECK:Cluster ld/st SU(4) - SU(3)
+; CHECK:Cluster ld/st SU(2) - SU(5)
+; CHECK:SU(4):   STRXui %vreg1, %vreg0, 1
+; CHECK:SU(3):   STRXui %vreg1, %vreg0, 2
+; CHECK:SU(2):   STRXui %vreg1, %vreg0, 3
+; CHECK:SU(5):   STRXui %vreg1, %vreg0, 4
+define i64 @stp_i64_scale(i64* nocapture %P, i64 %v) {
+entry:
+  %arrayidx = getelementptr inbounds i64, i64* %P, i64 3
+  store i64 %v, i64* %arrayidx
+  %arrayidx1 = getelementptr inbounds i64, i64* %P, i64 2
+  store i64 %v, i64* %arrayidx1
+  %arrayidx2 = getelementptr inbounds i64, i64* %P, i64 1
+  store i64 %v, i64* %arrayidx2
+  %arrayidx3 = getelementptr inbounds i64, i64* %P, i64 4
+  store i64 %v, i64* %arrayidx3
+  ret i64 %v
+}
+
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: stp_i32_scale:BB#0
+; CHECK:Cluster ld/st SU(4) - SU(3)
+; CHECK:Cluster ld/st SU(2) - SU(5)
+; CHECK:SU(4):   STRWui %vreg1, %vreg0, 1
+; CHECK:SU(3):   STRWui %vreg1, %vreg0, 2
+; CHECK:SU(2):   STRWui %vreg1, %vreg0, 3
+; CHECK:SU(5):   STRWui %vreg1, %vreg0, 4
+define i32 @stp_i32_scale(i32* nocapture %P, i32 %v) {
+entry:
+  %arrayidx = getelementptr inbounds i32, i32* %P, i32 3
+  store i32 %v, i32* %arrayidx
+  %arrayidx1 = getelementptr inbounds i32, i32* %P, i32 2
+  store i32 %v, i32* %arrayidx1
+  %arrayidx2 = getelementptr inbounds i32, i32* %P, i32 1
+  store i32 %v, i32* %arrayidx2
+  %arrayidx3 = getelementptr inbounds i32, i32* %P, i32 4
+  store i32 %v, i32* %arrayidx3
+  ret i32 %v
+}
+
+; CHECK:********** MI Scheduling **********
+; CHECK-LABEL:stp_i64_unscale:BB#0 entry
+; CHECK:Cluster ld/st SU(5) - SU(2)
+; CHECK:Cluster ld/st SU(4) - SU(3)
+; CHECK:SU(5):   STURXi %vreg1, %vreg0, -32
+; CHECK:SU(2):   STURXi %vreg1, %vreg0, -24
+; CHECK:SU(4):   STURXi %vreg1, %vreg0, -16
+; CHECK:SU(3):   STURXi %vreg1, %vreg0, -8
+define void @stp_i64_unscale(i64* nocapture %P, i64 %v) #0 {
+entry:
+  %arrayidx = getelementptr inbounds i64, i64* %P, i64 -3
+  store i64 %v, i64* %arrayidx
+  %arrayidx1 = getelementptr inbounds i64, i64* %P, i64 -1
+  store i64 %v, i64* %arrayidx1
+  %arrayidx2 = getelementptr inbounds i64, i64* %P, i64 -2
+  store i64 %v, i64* %arrayidx2
+  %arrayidx3 = getelementptr inbounds i64, i64* %P, i64 -4
+  store i64 %v, i64* %arrayidx3
+  ret void
+}
+
+; CHECK:********** MI Scheduling **********
+; CHECK-LABEL:stp_i32_unscale:BB#0 entry
+; CHECK:Cluster ld/st SU(5) - SU(2)
+; CHECK:Cluster ld/st SU(4) - SU(3)
+; CHECK:SU(5):   STURWi %vreg1, %vreg0, -16
+; CHECK:SU(2):   STURWi %vreg1, %vreg0, -12
+; CHECK:SU(4):   STURWi %vreg1, %vreg0, -8
+; CHECK:SU(3):   STURWi %vreg1, %vreg0, -4
+define void @stp_i32_unscale(i32* nocapture %P, i32 %v) #0 {
+entry:
+  %arrayidx = getelementptr inbounds i32, i32* %P, i32 -3
+  store i32 %v, i32* %arrayidx
+  %arrayidx1 = getelementptr inbounds i32, i32* %P, i32 -1
+  store i32 %v, i32* %arrayidx1
+  %arrayidx2 = getelementptr inbounds i32, i32* %P, i32 -2
+  store i32 %v, i32* %arrayidx2
+  %arrayidx3 = getelementptr inbounds i32, i32* %P, i32 -4
+  store i32 %v, i32* %arrayidx3
+  ret void
+}
+
+; CHECK:********** MI Scheduling **********
+; CHECK-LABEL:stp_double:BB#0
+; CHECK:Cluster ld/st SU(3) - SU(4)
+; CHECK:Cluster ld/st SU(2) - SU(5)
+; CHECK:SU(3):   STRDui %vreg1, %vreg0, 1
+; CHECK:SU(4):   STRDui %vreg1, %vreg0, 2
+; CHECK:SU(2):   STRDui %vreg1, %vreg0, 3
+; CHECK:SU(5):   STRDui %vreg1, %vreg0, 4
+define void @stp_double(double* nocapture %P, double %v)  {
+entry:
+  %arrayidx = getelementptr inbounds double, double* %P, i64 3
+  store double %v, double* %arrayidx
+  %arrayidx1 = getelementptr inbounds double, double* %P, i64 1
+  store double %v, double* %arrayidx1
+  %arrayidx2 = getelementptr inbounds double, double* %P, i64 2
+  store double %v, double* %arrayidx2
+  %arrayidx3 = getelementptr inbounds double, double* %P, i64 4
+  store double %v, double* %arrayidx3
+  ret void
+}
+
+; CHECK:********** MI Scheduling **********
+; CHECK-LABEL:stp_float:BB#0
+; CHECK:Cluster ld/st SU(3) - SU(4)
+; CHECK:Cluster ld/st SU(2) - SU(5)
+; CHECK:SU(3):   STRSui %vreg1, %vreg0, 1
+; CHECK:SU(4):   STRSui %vreg1, %vreg0, 2
+; CHECK:SU(2):   STRSui %vreg1, %vreg0, 3
+; CHECK:SU(5):   STRSui %vreg1, %vreg0, 4
+define void @stp_float(float* nocapture %P, float %v)  {
+entry:
+  %arrayidx = getelementptr inbounds float, float* %P, i64 3
+  store float %v, float* %arrayidx
+  %arrayidx1 = getelementptr inbounds float, float* %P, i64 1
+  store float %v, float* %arrayidx1
+  %arrayidx2 = getelementptr inbounds float, float* %P, i64 2
+  store float %v, float* %arrayidx2
+  %arrayidx3 = getelementptr inbounds float, float* %P, i64 4
+  store float %v, float* %arrayidx3
+  ret void
+}
+
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: stp_volatile:BB#0
+; CHECK-NOT: Cluster ld/st
+; CHECK:SU(2):   STRXui %vreg1, %vreg0, 3; mem:Volatile
+; CHECK:SU(3):   STRXui %vreg1, %vreg0, 2; mem:Volatile
+; CHECK:SU(4):   STRXui %vreg1, %vreg0, 1; mem:Volatile
+; CHECK:SU(5):   STRXui %vreg1, %vreg0, 4; mem:Volatile
+define i64 @stp_volatile(i64* nocapture %P, i64 %v) {
+entry:
+  %arrayidx = getelementptr inbounds i64, i64* %P, i64 3
+  store volatile i64 %v, i64* %arrayidx
+  %arrayidx1 = getelementptr inbounds i64, i64* %P, i64 2
+  store volatile i64 %v, i64* %arrayidx1
+  %arrayidx2 = getelementptr inbounds i64, i64* %P, i64 1
+  store volatile i64 %v, i64* %arrayidx2
+  %arrayidx3 = getelementptr inbounds i64, i64* %P, i64 4
+  store volatile i64 %v, i64* %arrayidx3
+  ret i64 %v
+}
+
diff --git a/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll b/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
index f760708..0cfbe59 100644
--- a/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
@@ -5,12 +5,12 @@
 ; Test ldr clustering.
 ; CHECK: ********** MI Scheduling **********
 ; CHECK-LABEL: ldr_int:BB#0
-; CHECK: Cluster loads SU(1) - SU(2)
+; CHECK: Cluster ld/st SU(1) - SU(2)
 ; CHECK: SU(1):   %vreg{{[0-9]+}}<def> = LDRWui
 ; CHECK: SU(2):   %vreg{{[0-9]+}}<def> = LDRWui
 ; EXYNOS: ********** MI Scheduling **********
 ; EXYNOS-LABEL: ldr_int:BB#0
-; EXYNOS: Cluster loads SU(1) - SU(2)
+; EXYNOS: Cluster ld/st SU(1) - SU(2)
 ; EXYNOS: SU(1):   %vreg{{[0-9]+}}<def> = LDRWui
 ; EXYNOS: SU(2):   %vreg{{[0-9]+}}<def> = LDRWui
 define i32 @ldr_int(i32* %a) nounwind {
@@ -25,12 +25,12 @@
 ; Test ldpsw clustering
 ; CHECK: ********** MI Scheduling **********
 ; CHECK-LABEL: ldp_sext_int:BB#0
-; CHECK: Cluster loads SU(1) - SU(2)
+; CHECK: Cluster ld/st SU(1) - SU(2)
 ; CHECK: SU(1):   %vreg{{[0-9]+}}<def> = LDRSWui
 ; CHECK: SU(2):   %vreg{{[0-9]+}}<def> = LDRSWui
 ; EXYNOS: ********** MI Scheduling **********
 ; EXYNOS-LABEL: ldp_sext_int:BB#0
-; EXYNOS: Cluster loads SU(1) - SU(2)
+; EXYNOS: Cluster ld/st SU(1) - SU(2)
 ; EXYNOS: SU(1):   %vreg{{[0-9]+}}<def> = LDRSWui
 ; EXYNOS: SU(2):   %vreg{{[0-9]+}}<def> = LDRSWui
 define i64 @ldp_sext_int(i32* %p) nounwind {
@@ -46,12 +46,12 @@
 ; Test ldur clustering.
 ; CHECK: ********** MI Scheduling **********
 ; CHECK-LABEL: ldur_int:BB#0
-; CHECK: Cluster loads SU(2) - SU(1)
+; CHECK: Cluster ld/st SU(2) - SU(1)
 ; CHECK: SU(1):   %vreg{{[0-9]+}}<def> = LDURWi
 ; CHECK: SU(2):   %vreg{{[0-9]+}}<def> = LDURWi
 ; EXYNOS: ********** MI Scheduling **********
 ; EXYNOS-LABEL: ldur_int:BB#0
-; EXYNOS: Cluster loads SU(2) - SU(1)
+; EXYNOS: Cluster ld/st SU(2) - SU(1)
 ; EXYNOS: SU(1):   %vreg{{[0-9]+}}<def> = LDURWi
 ; EXYNOS: SU(2):   %vreg{{[0-9]+}}<def> = LDURWi
 define i32 @ldur_int(i32* %a) nounwind {
@@ -66,12 +66,12 @@
 ; Test sext + zext clustering.
 ; CHECK: ********** MI Scheduling **********
 ; CHECK-LABEL: ldp_half_sext_zext_int:BB#0
-; CHECK: Cluster loads SU(3) - SU(4)
+; CHECK: Cluster ld/st SU(3) - SU(4)
 ; CHECK: SU(3):   %vreg{{[0-9]+}}<def> = LDRSWui
 ; CHECK: SU(4):   %vreg{{[0-9]+}}:sub_32<def,read-undef> = LDRWui
 ; EXYNOS: ********** MI Scheduling **********
 ; EXYNOS-LABEL: ldp_half_sext_zext_int:BB#0
-; EXYNOS: Cluster loads SU(3) - SU(4)
+; EXYNOS: Cluster ld/st SU(3) - SU(4)
 ; EXYNOS: SU(3):   %vreg{{[0-9]+}}<def> = LDRSWui
 ; EXYNOS: SU(4):   %vreg{{[0-9]+}}:sub_32<def,read-undef> = LDRWui
 define i64 @ldp_half_sext_zext_int(i64* %q, i32* %p) nounwind {
@@ -89,12 +89,12 @@
 ; Test zext + sext clustering.
 ; CHECK: ********** MI Scheduling **********
 ; CHECK-LABEL: ldp_half_zext_sext_int:BB#0
-; CHECK: Cluster loads SU(3) - SU(4)
+; CHECK: Cluster ld/st SU(3) - SU(4)
 ; CHECK: SU(3):   %vreg{{[0-9]+}}:sub_32<def,read-undef> = LDRWui
 ; CHECK: SU(4):   %vreg{{[0-9]+}}<def> = LDRSWui
 ; EXYNOS: ********** MI Scheduling **********
 ; EXYNOS-LABEL: ldp_half_zext_sext_int:BB#0
-; EXYNOS: Cluster loads SU(3) - SU(4)
+; EXYNOS: Cluster ld/st SU(3) - SU(4)
 ; EXYNOS: SU(3):   %vreg{{[0-9]+}}:sub_32<def,read-undef> = LDRWui
 ; EXYNOS: SU(4):   %vreg{{[0-9]+}}<def> = LDRSWui
 define i64 @ldp_half_zext_sext_int(i64* %q, i32* %p) nounwind {
@@ -112,12 +112,12 @@
 ; Verify we don't cluster volatile loads.
 ; CHECK: ********** MI Scheduling **********
 ; CHECK-LABEL: ldr_int_volatile:BB#0
-; CHECK-NOT: Cluster loads
+; CHECK-NOT: Cluster ld/st
 ; CHECK: SU(1):   %vreg{{[0-9]+}}<def> = LDRWui
 ; CHECK: SU(2):   %vreg{{[0-9]+}}<def> = LDRWui
 ; EXYNOS: ********** MI Scheduling **********
 ; EXYNOS-LABEL: ldr_int_volatile:BB#0
-; EXYNOS-NOT: Cluster loads
+; EXYNOS-NOT: Cluster ld/st
 ; EXYNOS: SU(1):   %vreg{{[0-9]+}}<def> = LDRWui
 ; EXYNOS: SU(2):   %vreg{{[0-9]+}}<def> = LDRWui
 define i32 @ldr_int_volatile(i32* %a) nounwind {
@@ -132,12 +132,12 @@
 ; Test ldq clustering (no clustering for Exynos).
 ; CHECK: ********** MI Scheduling **********
 ; CHECK-LABEL: ldq_cluster:BB#0
-; CHECK: Cluster loads SU(1) - SU(3)
+; CHECK: Cluster ld/st SU(1) - SU(3)
 ; CHECK: SU(1):   %vreg{{[0-9]+}}<def> = LDRQui
 ; CHECK: SU(3):   %vreg{{[0-9]+}}<def> = LDRQui
 ; EXYNOS: ********** MI Scheduling **********
 ; EXYNOS-LABEL: ldq_cluster:BB#0
-; EXYNOS-NOT: Cluster loads
+; EXYNOS-NOT: Cluster ld/st
 define <2 x i64> @ldq_cluster(i64* %p) {
   %a1 = bitcast i64* %p to <2 x i64>*
   %tmp1 = load <2 x i64>, < 2 x i64>* %a1, align 8
diff --git a/llvm/test/CodeGen/AArch64/arm64-stp.ll b/llvm/test/CodeGen/AArch64/arm64-stp.ll
index 98242d0..5664c7d 100644
--- a/llvm/test/CodeGen/AArch64/arm64-stp.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-stp.ll
@@ -100,9 +100,9 @@
 
 ; Read of %b to compute %tmp2 shouldn't prevent formation of stp
 ; CHECK-LABEL: stp_int_rar_hazard
-; CHECK: stp w0, w1, [x2]
 ; CHECK: ldr [[REG:w[0-9]+]], [x2, #8]
-; CHECK: add w0, [[REG]], w1
+; CHECK: add w8, [[REG]], w1
+; CHECK: stp w0, w1, [x2]
 ; CHECK: ret
 define i32 @stp_int_rar_hazard(i32 %a, i32 %b, i32* nocapture %p) nounwind {
   store i32 %a, i32* %p, align 4
diff --git a/llvm/test/CodeGen/AArch64/global-merge-group-by-use.ll b/llvm/test/CodeGen/AArch64/global-merge-group-by-use.ll
index 8b3fc97..434c787 100644
--- a/llvm/test/CodeGen/AArch64/global-merge-group-by-use.ll
+++ b/llvm/test/CodeGen/AArch64/global-merge-group-by-use.ll
@@ -64,8 +64,8 @@
 define void @f4(i32 %a1, i32 %a2, i32 %a3) #0 {
 ; CHECK-NEXT: adrp x8, [[SET3]]@PAGE
 ; CHECK-NEXT: add x8, x8, [[SET3]]@PAGEOFF
-; CHECK-NEXT: stp w0, w1, [x8, #4]
-; CHECK-NEXT: str w2, [x8]
+; CHECK-NEXT: stp w2, w0, [x8]
+; CHECK-NEXT: str w1, [x8, #8]
 ; CHECK-NEXT: ret
   store i32 %a1, i32* @m4, align 4
   store i32 %a2, i32* @n4, align 4