[AMDGPU] Packed thread ids in function call ABI

Differential Revision: https://reviews.llvm.org/D63851

llvm-svn: 364619
diff --git a/llvm/test/CodeGen/AMDGPU/call-constexpr.ll b/llvm/test/CodeGen/AMDGPU/call-constexpr.ll
index 0ebf11d..3044ff8 100644
--- a/llvm/test/CodeGen/AMDGPU/call-constexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-constexpr.ll
@@ -65,6 +65,7 @@
 
 ; GCN-LABEL: {{^}}use_workitem_id_x:
 ; GCN: s_waitcnt
+; GCN-NEXT: v_and_b32_e32 v1, 0x3ff, v1
 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0
 ; GCN-NEXT: s_setpc_b64
 define hidden i32 @use_workitem_id_x(i32 %arg0) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
index dbab269..09bb3b4 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -1,8 +1,9 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
 
 ; GCN-LABEL: {{^}}use_workitem_id_x:
 ; GCN: s_waitcnt
-; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0
+; GCN: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v0
+; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
 define void @use_workitem_id_x() #1 {
@@ -13,7 +14,8 @@
 
 ; GCN-LABEL: {{^}}use_workitem_id_y:
 ; GCN: s_waitcnt
-; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0
+; GCN: v_bfe_u32 [[ID:v[0-9]+]], v0, 10, 10
+; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
 define void @use_workitem_id_y() #1 {
@@ -24,7 +26,8 @@
 
 ; GCN-LABEL: {{^}}use_workitem_id_z:
 ; GCN: s_waitcnt
-; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0
+; GCN: v_bfe_u32 [[ID:v[0-9]+]], v0, 20, 10
+; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
 define void @use_workitem_id_z() #1 {
@@ -35,8 +38,10 @@
 
 ; GCN-LABEL: {{^}}use_workitem_id_xy:
 ; GCN: s_waitcnt
-; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0
-; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v1
+; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
+; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
+; GCN-DAG: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
+; GCN-DAG: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]]
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
 define void @use_workitem_id_xy() #1 {
@@ -49,9 +54,12 @@
 
 ; GCN-LABEL: {{^}}use_workitem_id_xyz:
 ; GCN: s_waitcnt
-; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0
-; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v1
-; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v2
+; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
+; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
+; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
+; GCN-DAG: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
+; GCN-DAG: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]]
+; GCN-DAG: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]]
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
 define void @use_workitem_id_xyz() #1 {
@@ -66,8 +74,10 @@
 
 ; GCN-LABEL: {{^}}use_workitem_id_xz:
 ; GCN: s_waitcnt
-; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0
-; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v1
+; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
+; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
+; GCN-DAG: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
+; GCN-DAG: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]]
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
 define void @use_workitem_id_xz() #1 {
@@ -80,8 +90,10 @@
 
 ; GCN-LABEL: {{^}}use_workitem_id_yz:
 ; GCN: s_waitcnt
-; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v0
-; GCN-NEXT: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, v1
+; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
+; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
+; GCN-DAG: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]]
+; GCN-DAG: flat_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]]
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
 define void @use_workitem_id_yz() #1 {
@@ -108,7 +120,7 @@
 
 ; GCN-NOT: v0
 ; GCN-NOT: v1
-; GCN: v_mov_b32_e32 v0, v1
+; GCN: v_lshlrev_b32_e32 v0, 10, v1
 ; GCN-NOT: v0
 ; GCN-NOT: v1
 ; GCN: s_swappc_b64
@@ -122,15 +134,72 @@
 
 ; GCN-NOT: v0
 ; GCN-NOT: v2
-; GCN: v_mov_b32_e32 v0, v2
+; GCN: v_lshlrev_b32_e32 v0, 20, v2
 ; GCN-NOT: v0
-; GCN-NOT: v2
+; GCN-NOT: v1
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kern_indirect_use_workitem_id_z() #1 {
   call void @use_workitem_id_z()
   ret void
 }
 
+; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xy:
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
+; GCN: v_or_b32_e32 v0, v0, [[IDY]]
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kern_indirect_use_workitem_id_xy() #1 {
+  call void @use_workitem_id_xy()
+  ret void
+}
+
+; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xz:
+; GCN-NOT: v0
+; GCN-NOT: v2
+; GCN: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
+; GCN: v_or_b32_e32 v0, v0, [[IDZ]]
+; GCN-NOT: v0
+; GCN-NOT: v2
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kern_indirect_use_workitem_id_xz() #1 {
+  call void @use_workitem_id_xz()
+  ret void
+}
+
+; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_yz:
+; GCN-NOT: v1
+; GCN-NOT: v2
+; GCN-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
+; GCN-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
+; GCN: v_or_b32_e32 v0, [[IDY]], [[IDZ]]
+; GCN-NOT: v1
+; GCN-NOT: v2
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kern_indirect_use_workitem_id_yz() #1 {
+  call void @use_workitem_id_yz()
+  ret void
+}
+
+; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xyz:
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN-NOT: v2
+; GCN-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
+; GCN-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
+; GCN-DAG: v_or_b32_e32 v0, v0, [[IDY]]
+; GCN-DAG: v_or_b32_e32 v0, v0, [[IDZ]]
+; GCN-NOT: v0
+; GCN-NOT: v1
+; GCN-NOT: v2
+; GCN: s_swappc_b64
+define amdgpu_kernel void @kern_indirect_use_workitem_id_xyz() #1 {
+  call void @use_workitem_id_xyz()
+  ret void
+}
+
 ; GCN-LABEL: {{^}}func_indirect_use_workitem_id_x:
 ; GCN-NOT: v0
 ; GCN: s_swappc_b64
@@ -160,8 +229,9 @@
 
 ; GCN-LABEL: {{^}}other_arg_use_workitem_id_x:
 ; GCN: s_waitcnt
-; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
-; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
+; GCN-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v1
+; GCN-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; GCN-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]]
 define void @other_arg_use_workitem_id_x(i32 %arg0) #1 {
   %val = call i32 @llvm.amdgcn.workitem.id.x()
   store volatile i32 %arg0, i32 addrspace(1)* undef
@@ -171,8 +241,9 @@
 
 ; GCN-LABEL: {{^}}other_arg_use_workitem_id_y:
 ; GCN: s_waitcnt
-; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
-; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
+; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 10, 10
+; GCN-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; GCN-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]]
 define void @other_arg_use_workitem_id_y(i32 %arg0) #1 {
   %val = call i32 @llvm.amdgcn.workitem.id.y()
   store volatile i32 %arg0, i32 addrspace(1)* undef
@@ -182,8 +253,9 @@
 
 ; GCN-LABEL: {{^}}other_arg_use_workitem_id_z:
 ; GCN: s_waitcnt
-; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
-; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
+; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 20, 10
+; GCN-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; GCN-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]]
 define void @other_arg_use_workitem_id_z(i32 %arg0) #1 {
   %val = call i32 @llvm.amdgcn.workitem.id.z()
   store volatile i32 %arg0, i32 addrspace(1)* undef
@@ -207,6 +279,7 @@
 ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_y:
 ; GCN: enable_vgpr_workitem_id = 1
 
+; GCN: v_lshlrev_b32_e32 v1, 10, v1
 ; GCN-NOT: v1
 ; GCN: v_mov_b32_e32 v0, 0x22b
 ; GCN-NOT: v1
@@ -221,7 +294,7 @@
 ; GCN: enable_vgpr_workitem_id = 2
 
 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b
-; GCN-DAG: v_mov_b32_e32 v1, v2
+; GCN-DAG: v_lshlrev_b32_e32 v1, 20, v2
 ; GCN: s_swappc_b64
 ; GCN-NOT: v0
 define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_z() #1 {
@@ -232,6 +305,7 @@
 ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x:
 ; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}}
+; GCN: v_and_b32_e32 v32, 0x3ff, v32
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
 
 ; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -357,6 +431,7 @@
 ; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GCN-NEXT: s_waitcnt
+; GCN-NEXT: v_and_b32_e32 v32, 0x3ff, v32
 ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32
 ; GCN: buffer_load_dword v0, off, s[0:3], s32{{$}}
 ; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
@@ -469,15 +544,18 @@
 }
 
 ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_xyz:
-; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}}
+; GCN: v_and_b32_e32 v32, 0x3ff, v32
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4{{$}}
+; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}}
+; GCN: v_bfe_u32 v32, v32, 10, 10
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8{{$}}
+; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}}
+; GCN: v_bfe_u32 v32, v32, 20, 10
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
 
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
 define void @too_many_args_use_workitem_id_xyz(
@@ -531,19 +609,19 @@
   ret void
 }
 
-; frame[0] = ID X
-; frame[1] = ID Y
-; frame[2] = ID Z
+; frame[0] = ID { Z, Y, X }
 
 ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_xyz:
 ; GCN: enable_vgpr_workitem_id = 2
 
-; GCN: s_mov_b32 s33, s7
-; GCN: s_mov_b32 s32, s33
+; GCN-DAG: s_mov_b32 s33, s7
+; GCN-DAG: s_mov_b32 s32, s33
 
-; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32{{$}}
-; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:4
-; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:8
+; GCN-DAG: v_lshlrev_b32_e32 v1, 10, v1
+; GCN-DAG: v_or_b32_e32 v0, v0, v1
+; GCN-DAG: v_lshlrev_b32_e32 v2, 20, v2
+; GCN-DAG: v_or_b32_e32 v0, v0, v2
+; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}}
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 {
   call void @too_many_args_use_workitem_id_xyz(
@@ -560,19 +638,19 @@
 
 ; workitem ID X in register, yz on stack
 ; v31 = workitem ID X
-; frame[0] = workitem Y
-; frame[1] = workitem Z
+; frame[0] = workitem { Z, Y, X }
 
 ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_stack_yz:
-; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31
-; GCN: buffer_load_dword v31, off, s[0:3], s32{{$}}
-; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31
-; GCN: buffer_load_dword v31, off, s[0:3], s32 offset:4{{$}}
-; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31
+; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31
+; GCN-DAG: flat_store_dword v[0:1], [[IDX]]
+; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10
+; GCN-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, [[IDY]]
+; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
+; GCN-DAG: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, [[IDZ]]
 
 ; GCN: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
-; GCN: ScratchSize: 12
+; GCN: ScratchSize: 8
 define void @too_many_args_use_workitem_id_x_stack_yz(
   i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
   i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
@@ -623,18 +701,18 @@
   ret void
 }
 
-; frame[0] = ID Y
-; frame[1] = ID Z
-
 ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_stack_yz:
 ; GCN: enable_vgpr_workitem_id = 2
 
 ; GCN: s_mov_b32 s33, s7
-; GCN: s_mov_b32 s32, s33
 
-; GCN-DAG: v_mov_b32_e32 v31, v0
-; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32{{$}}
-; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:4
+; GCN-NOT: v0
+; GCN-DAG: v_lshlrev_b32_e32 v1, 10, v1
+; GCN-DAG: v_or_b32_e32 v0, v0, v1
+; GCN-DAG: v_lshlrev_b32_e32 v2, 20, v2
+; GCN-DAG: v_or_b32_e32 v31, v0, v2
+
+; GCN: s_mov_b32 s32, s33
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_stack_yz() #1 {
   call void @too_many_args_use_workitem_id_x_stack_yz(
diff --git a/llvm/test/CodeGen/AMDGPU/zext-lid.ll b/llvm/test/CodeGen/AMDGPU/zext-lid.ll
index e257980..4e2e4d52 100644
--- a/llvm/test/CodeGen/AMDGPU/zext-lid.ll
+++ b/llvm/test/CodeGen/AMDGPU/zext-lid.ll
@@ -1,8 +1,9 @@
-; RUN: llc -march=amdgcn < %s | FileCheck %s
-; RUN: llc -O0 -march=amdgcn < %s | FileCheck %s
+; RUN: llc -march=amdgcn < %s | FileCheck -enable-var-scope -check-prefixes=GCN,O2 %s
+; RUN: llc -O0 -march=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-intrinsics < %s | FileCheck -check-prefix=OPT %s
 
-; CHECK-NOT: and_b32
+; GCN-LABEL: {{^}}zext_grp_size_128:
+; GCN-NOT: and_b32
 
 ; OPT-LABEL: @zext_grp_size_128
 ; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0
@@ -24,6 +25,9 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}zext_grp_size_32x4x1:
+; GCN-NOT: and_b32
+
 ; OPT-LABEL: @zext_grp_size_32x4x1
 ; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2
 ; OPT: tail call i32 @llvm.amdgcn.workitem.id.y(), !range !3
@@ -44,6 +48,9 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}zext_grp_size_1x1x1:
+; GCN-NOT: and_b32
+
 ; When EarlyCSE is not run this call produces a range max with 0 active bits,
 ; which is a special case as an AssertZext from width 0 is invalid.
 ; OPT-LABEL: @zext_grp_size_1x1x1
@@ -55,6 +62,9 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}zext_grp_size_512:
+; GCN-NOT: and_b32
+
 ; OPT-LABEL: @zext_grp_size_512
 ; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !6
 ; OPT: tail call i32 @llvm.amdgcn.workitem.id.y(), !range !6
@@ -75,6 +85,11 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}func_test_workitem_id_x_known_max_range:
+; O2-NOT: and_b32
+; O2: v_and_b32_e32 v{{[0-9]+}}, 0x3ff,
+; O2-NOT: and_b32
+
 ; OPT-LABEL: @func_test_workitem_id_x_known_max_range(
 ; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0
 define void @func_test_workitem_id_x_known_max_range(i32 addrspace(1)* nocapture %out) #0 {
@@ -85,6 +100,11 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}func_test_workitem_id_x_default_range:
+; O2-NOT: and_b32
+; O2: v_and_b32_e32 v{{[0-9]+}}, 0x3ff,
+; O2-NOT: and_b32
+
 ; OPT-LABEL: @func_test_workitem_id_x_default_range(
 ; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !7
 define void @func_test_workitem_id_x_default_range(i32 addrspace(1)* nocapture %out) #4 {