AMDGPU: Switch barrier intrinsics to using convergent noduplicate prevents unrolling of small loops that happen to have barriers in them. If a loop has a barrier in it, it is OK to duplicate it for the unroll. llvm-svn: 256075

commit: 2aed6ca1d33468f7e6f8509213a1b9dcd8026ec1 [log] [tgz]
author: Matt Arsenault <Matthew.Arsenault@amd.com> Sat Dec 19 01:46:41 2015 +0000
committer: Matt Arsenault <Matthew.Arsenault@amd.com> Sat Dec 19 01:46:41 2015 +0000
tree: a4575401ecb619e067cc2b4fed2a83056b80163c
parent: 10a509292cda51aabf3630bf30ceb990d3bf3108 [diff]
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
index 9be212f..61bcd4b3 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll

@@ -62,5 +62,5 @@
 declare i32 @llvm.r600.read.tidig.x() #3
 
 attributes #0 = { nounwind }
-attributes #1 = { nounwind noduplicate }
+attributes #1 = { nounwind convergent }
 attributes #3 = { nounwind readnone }

diff --git a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
index 67738f7..f8a7422 100644
--- a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll

@@ -2,7 +2,7 @@
 ; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI -mattr=+promote-alloca < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
 
 declare i32 @llvm.SI.tid() nounwind readnone
-declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate
+declare void @llvm.AMDGPU.barrier.local() nounwind convergent
 
 ; The required pointer calculations for the alloca'd actually requires
 ; an add and won't be folded into the addressing, which fails with a
@@ -35,7 +35,7 @@
   %alloca_ptr = getelementptr [4 x i32], [4 x i32]* %alloca, i32 1, i32 %b
   store i32 %result, i32* %alloca_ptr, align 4
   ; Dummy call
-  call void @llvm.AMDGPU.barrier.local() nounwind noduplicate
+  call void @llvm.AMDGPU.barrier.local() nounwind convergent
   %reload = load i32, i32* %alloca_ptr, align 4
   %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
   store i32 %reload, i32 addrspace(1)* %out_ptr, align 4

diff --git a/llvm/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll b/llvm/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll
index 8f63a58..171883e 100644
--- a/llvm/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll
+++ b/llvm/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll

@@ -49,4 +49,4 @@
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
-attributes #2 = { noduplicate nounwind }
+attributes #2 = { convergent nounwind }

diff --git a/llvm/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll b/llvm/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
index ac17df1..e657991 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll

@@ -66,5 +66,5 @@
 }
 
 attributes #0 = { nounwind readnone }
-attributes #1 = { noduplicate nounwind }
+attributes #1 = { convergent nounwind }
 attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }

diff --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
index ffa45f6..7d6eddb 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll

@@ -122,4 +122,4 @@
 
 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind }
-attributes #2 = { nounwind noduplicate convergent }
+attributes #2 = { nounwind convergent }

diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
index 579f989..5170d9c 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll

@@ -505,9 +505,9 @@
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.r600.read.tidig.y() #1
 
-; Function Attrs: noduplicate nounwind
+; Function Attrs: convergent nounwind
 declare void @llvm.AMDGPU.barrier.local() #2
 
 attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
-attributes #2 = { noduplicate nounwind }
+attributes #2 = { convergent nounwind }

diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll b/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll
index 8073426..0061aaf 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll

@@ -229,9 +229,9 @@
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.r600.read.tidig.y() #1
 
-; Function Attrs: noduplicate nounwind
+; Function Attrs: convergent nounwind
 declare void @llvm.AMDGPU.barrier.local() #2
 
 attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
-attributes #2 = { noduplicate nounwind }
+attributes #2 = { convergent nounwind }

diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2st64.ll b/llvm/test/CodeGen/AMDGPU/ds_read2st64.ll
index 00de55b..4a0571e 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2st64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2st64.ll

@@ -264,9 +264,5 @@
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.r600.read.tidig.y() #1
 
-; Function Attrs: noduplicate nounwind
-declare void @llvm.AMDGPU.barrier.local() #2
-
 attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
-attributes #2 = { noduplicate nounwind }

diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
index b408459..9d3a293 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll

@@ -431,9 +431,9 @@
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.r600.read.tidig.y() #1
 
-; Function Attrs: noduplicate nounwind
+; Function Attrs: convergent nounwind
 declare void @llvm.AMDGPU.barrier.local() #2
 
 attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
-attributes #2 = { noduplicate nounwind }
+attributes #2 = { convergent nounwind }

diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2st64.ll b/llvm/test/CodeGen/AMDGPU/ds_write2st64.ll
index 358aa6a..5a1024c 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2st64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2st64.ll

@@ -109,9 +109,9 @@
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.r600.read.tidig.y() #1
 
-; Function Attrs: noduplicate nounwind
+; Function Attrs: convergent nounwind
 declare void @llvm.AMDGPU.barrier.local() #2
 
 attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
-attributes #2 = { noduplicate nounwind }
+attributes #2 = { convergent nounwind }

diff --git a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
index d65b757..86e0c07 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll

@@ -131,5 +131,5 @@
 declare i32 @llvm.r600.read.tidig.x() #3
 
 attributes #0 = { nounwind }
-attributes #1 = { nounwind noduplicate }
+attributes #1 = { nounwind convergent }
 attributes #3 = { nounwind readnone }

diff --git a/llvm/test/CodeGen/AMDGPU/indirect-private-64.ll b/llvm/test/CodeGen/AMDGPU/indirect-private-64.ll
index 99e8d62..2a3b29f 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-private-64.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-private-64.ll

@@ -4,7 +4,7 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
 
 
-declare void @llvm.AMDGPU.barrier.local() noduplicate nounwind
+declare void @llvm.AMDGPU.barrier.local() convergent nounwind
 
 ; SI-LABEL: {{^}}private_access_f64_alloca:
 
@@ -18,7 +18,7 @@
   %array = alloca double, i32 16, align 8
   %ptr = getelementptr double, double* %array, i32 %b
   store double %val, double* %ptr, align 8
-  call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
+  call void @llvm.AMDGPU.barrier.local() convergent nounwind
   %result = load double, double* %ptr, align 8
   store double %result, double addrspace(1)* %out, align 8
   ret void
@@ -38,7 +38,7 @@
   %array = alloca <2 x double>, i32 16, align 16
   %ptr = getelementptr <2 x double>, <2 x double>* %array, i32 %b
   store <2 x double> %val, <2 x double>* %ptr, align 16
-  call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
+  call void @llvm.AMDGPU.barrier.local() convergent nounwind
   %result = load <2 x double>, <2 x double>* %ptr, align 16
   store <2 x double> %result, <2 x double> addrspace(1)* %out, align 16
   ret void
@@ -56,7 +56,7 @@
   %array = alloca i64, i32 16, align 8
   %ptr = getelementptr i64, i64* %array, i32 %b
   store i64 %val, i64* %ptr, align 8
-  call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
+  call void @llvm.AMDGPU.barrier.local() convergent nounwind
   %result = load i64, i64* %ptr, align 8
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
@@ -76,7 +76,7 @@
   %array = alloca <2 x i64>, i32 16, align 16
   %ptr = getelementptr <2 x i64>, <2 x i64>* %array, i32 %b
   store <2 x i64> %val, <2 x i64>* %ptr, align 16
-  call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
+  call void @llvm.AMDGPU.barrier.local() convergent nounwind
   %result = load <2 x i64>, <2 x i64>* %ptr, align 16
   store <2 x i64> %result, <2 x i64> addrspace(1)* %out, align 16
   ret void

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll
index f948c98..7dc094e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll

@@ -4,7 +4,6 @@
 ; FIXME: Enable for VI.
 
 declare i32 @llvm.r600.read.tidig.x() nounwind readnone
-declare void @llvm.AMDGPU.barrier.global() nounwind noduplicate
 declare float @llvm.AMDGPU.div.fmas.f32(float, float, float, i1) nounwind readnone
 declare double @llvm.AMDGPU.div.fmas.f64(double, double, double, i1) nounwind readnone
 

diff --git a/llvm/test/CodeGen/AMDGPU/merge-stores.ll b/llvm/test/CodeGen/AMDGPU/merge-stores.ll
index 8b01c96..65b454b 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/merge-stores.ll

@@ -708,4 +708,4 @@
 declare void @llvm.AMDGPU.barrier.local() #1
 
 attributes #0 = { nounwind }
-attributes #1 = { noduplicate nounwind }
+attributes #1 = { convergent nounwind }

diff --git a/llvm/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll b/llvm/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll
index 3863afd..e4b16c0 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll

@@ -3,7 +3,7 @@
 ; RUN: llc -O0 -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI
 ; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI
 
-declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate
+declare void @llvm.AMDGPU.barrier.local() nounwind convergent
 
 
 ; SI-LABEL: {{^}}main(

diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
index 35e9ff8..bc766db 100644
--- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll

@@ -234,4 +234,4 @@
 
 attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
 attributes #1 = { "ShaderType"="1" nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
-attributes #2 = { nounwind noduplicate }
+attributes #2 = { nounwind convergent }

diff --git a/llvm/test/CodeGen/AMDGPU/store-barrier.ll b/llvm/test/CodeGen/AMDGPU/store-barrier.ll
index 4a72b4d0..ba4049f 100644
--- a/llvm/test/CodeGen/AMDGPU/store-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-barrier.ll

@@ -36,7 +36,7 @@
   ret void
 }
 
-; Function Attrs: noduplicate nounwind
+; Function Attrs: convergent nounwind
 declare void @llvm.AMDGPU.barrier.local() #2
 
-attributes #2 = { noduplicate nounwind }
+attributes #2 = { convergent nounwind }

diff --git a/llvm/test/CodeGen/AMDGPU/wait.ll b/llvm/test/CodeGen/AMDGPU/wait.ll
index 5aaf002..107e84b 100644
--- a/llvm/test/CodeGen/AMDGPU/wait.ll
+++ b/llvm/test/CodeGen/AMDGPU/wait.ll

@@ -70,7 +70,7 @@
 }
 
 
-; Function Attrs: noduplicate nounwind
+; Function Attrs: convergent nounwind
 declare void @llvm.AMDGPU.barrier.global() #1
 
 ; Function Attrs: nounwind readnone
@@ -79,7 +79,7 @@
 declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
 
 attributes #0 = { "ShaderType"="1" }
-attributes #1 = { noduplicate nounwind }
+attributes #1 = { convergent nounwind }
 attributes #2 = { nounwind readnone }
 
 !0 = !{!1, !1, i64 0, i32 1}
commit	2aed6ca1d33468f7e6f8509213a1b9dcd8026ec1	[log] [tgz]
author	Matt Arsenault <Matthew.Arsenault@amd.com>	Sat Dec 19 01:46:41 2015 +0000
committer	Matt Arsenault <Matthew.Arsenault@amd.com>	Sat Dec 19 01:46:41 2015 +0000
tree	a4575401ecb619e067cc2b4fed2a83056b80163c
parent	10a509292cda51aabf3630bf30ceb990d3bf3108 [diff]