[AMDGPU] DPP combiner: recognize identities for more opcodes

Summary:
This allows the DPP combiner to kick in more often. For example, the
exclusive scan generated by the atomic optimizer for a divergent atomic
add used to look like this:

        v_mov_b32_e32 v3, v1
        v_mov_b32_e32 v5, v1
        v_mov_b32_e32 v6, v1
        v_mov_b32_dpp v3, v2  wave_shr:1 row_mask:0xf bank_mask:0xf
        s_nop 1
        v_add_u32_dpp v4, v3, v3  row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
        v_mov_b32_dpp v5, v3  row_shr:2 row_mask:0xf bank_mask:0xf
        v_mov_b32_dpp v6, v3  row_shr:3 row_mask:0xf bank_mask:0xf
        v_add3_u32 v3, v4, v5, v6
        v_mov_b32_e32 v4, v1
        s_nop 1
        v_mov_b32_dpp v4, v3  row_shr:4 row_mask:0xf bank_mask:0xe
        v_add_u32_e32 v3, v3, v4
        v_mov_b32_e32 v4, v1
        s_nop 1
        v_mov_b32_dpp v4, v3  row_shr:8 row_mask:0xf bank_mask:0xc
        v_add_u32_e32 v3, v3, v4
        v_mov_b32_e32 v4, v1
        s_nop 1
        v_mov_b32_dpp v4, v3  row_bcast:15 row_mask:0xa bank_mask:0xf
        v_add_u32_e32 v3, v3, v4
        s_nop 1
        v_mov_b32_dpp v1, v3  row_bcast:31 row_mask:0xc bank_mask:0xf
        v_add_u32_e32 v1, v3, v1
        v_add_u32_e32 v1, v2, v1
        v_readlane_b32 s0, v1, 63

But now most of the DPP movs are combined into adds:

        v_mov_b32_e32 v3, v1
        v_mov_b32_e32 v5, v1
        s_nop 0
        v_mov_b32_dpp v3, v2  wave_shr:1 row_mask:0xf bank_mask:0xf
        s_nop 1
        v_add_u32_dpp v4, v3, v3  row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0
        v_mov_b32_dpp v5, v3  row_shr:2 row_mask:0xf bank_mask:0xf
        v_mov_b32_dpp v1, v3  row_shr:3 row_mask:0xf bank_mask:0xf
        v_add3_u32 v1, v4, v5, v1
        s_nop 1
        v_add_u32_dpp v1, v1, v1  row_shr:4 row_mask:0xf bank_mask:0xe
        s_nop 1
        v_add_u32_dpp v1, v1, v1  row_shr:8 row_mask:0xf bank_mask:0xc
        s_nop 1
        v_add_u32_dpp v1, v1, v1  row_bcast:15 row_mask:0xa bank_mask:0xf
        s_nop 1
        v_add_u32_dpp v1, v1, v1  row_bcast:31 row_mask:0xc bank_mask:0xf
        v_add_u32_e32 v1, v2, v1
        v_readlane_b32 s0, v1, 63

Reviewers: arsenm, vpykhtin

Subscribers: kzhuravl, nemanjai, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, kbarton, MaskRay, jfb, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D64207

llvm-svn: 365211
diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index 536dc54..7348b5b 100644
--- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -253,33 +253,46 @@
   switch (OrigMIOp) {
   default: break;
   case AMDGPU::V_ADD_U32_e32:
+  case AMDGPU::V_ADD_U32_e64:
   case AMDGPU::V_ADD_I32_e32:
+  case AMDGPU::V_ADD_I32_e64:
   case AMDGPU::V_OR_B32_e32:
+  case AMDGPU::V_OR_B32_e64:
   case AMDGPU::V_SUBREV_U32_e32:
+  case AMDGPU::V_SUBREV_U32_e64:
   case AMDGPU::V_SUBREV_I32_e32:
+  case AMDGPU::V_SUBREV_I32_e64:
   case AMDGPU::V_MAX_U32_e32:
+  case AMDGPU::V_MAX_U32_e64:
   case AMDGPU::V_XOR_B32_e32:
+  case AMDGPU::V_XOR_B32_e64:
     if (OldOpnd->getImm() == 0)
       return true;
     break;
   case AMDGPU::V_AND_B32_e32:
+  case AMDGPU::V_AND_B32_e64:
   case AMDGPU::V_MIN_U32_e32:
+  case AMDGPU::V_MIN_U32_e64:
     if (static_cast<uint32_t>(OldOpnd->getImm()) ==
         std::numeric_limits<uint32_t>::max())
       return true;
     break;
   case AMDGPU::V_MIN_I32_e32:
+  case AMDGPU::V_MIN_I32_e64:
     if (static_cast<int32_t>(OldOpnd->getImm()) ==
         std::numeric_limits<int32_t>::max())
       return true;
     break;
   case AMDGPU::V_MAX_I32_e32:
+  case AMDGPU::V_MAX_I32_e64:
     if (static_cast<int32_t>(OldOpnd->getImm()) ==
         std::numeric_limits<int32_t>::min())
       return true;
     break;
   case AMDGPU::V_MUL_I32_I24_e32:
+  case AMDGPU::V_MUL_I32_I24_e64:
   case AMDGPU::V_MUL_U32_U24_e32:
+  case AMDGPU::V_MUL_U32_U24_e64:
     if (OldOpnd->getImm() == 1)
       return true;
     break;