AMDGPU: Use s_addk_i32 / s_mulk_i32

llvm-svn: 266506
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index b48b9ad..ad3f63d 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -198,6 +198,10 @@
                                    Orig.isInternalRead());
 }
 
+static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
+  return isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4);
+}
+
 bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
   MachineRegisterInfo &MRI = MF.getRegInfo();
   const SIInstrInfo *TII =
@@ -214,18 +218,6 @@
       Next = std::next(I);
       MachineInstr &MI = *I;
 
-      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
-      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
-        const MachineOperand &Src = MI.getOperand(1);
-
-        if (Src.isImm()) {
-          if (isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4))
-            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
-        }
-
-        continue;
-      }
-
       if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
         // If this has a literal constant source that is the same as the
         // reversed bits of an inline immediate, replace with a bitreverse of
@@ -250,6 +242,47 @@
         }
       }
 
+      // FIXME: We also need to consider movs of constant operands since
+      // immediate operands are not folded if they have more than one use, and
+      // the operand folding pass is unaware if the immediate will be free since
+      // it won't know if the src == dest constraint will end up being
+      // satisfied.
+      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
+          MI.getOpcode() == AMDGPU::S_MUL_I32) {
+        const MachineOperand &Dest = MI.getOperand(0);
+        const MachineOperand &Src0 = MI.getOperand(1);
+        const MachineOperand &Src1 = MI.getOperand(2);
+
+        // FIXME: This could work better if hints worked with subregisters. If
+        // we have a vector add of a constant, we usually don't get the correct
+        // allocation due to the subregister usage.
+        if (TargetRegisterInfo::isVirtualRegister(Dest.getReg()) &&
+            Src0.isReg()) {
+          MRI.setRegAllocationHint(Dest.getReg(), 0, Src0.getReg());
+          continue;
+        }
+
+        if (Src0.isReg() && Src0.getReg() == Dest.getReg()) {
+          if (Src1.isImm() && isKImmOperand(TII, Src1)) {
+            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
+              AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;
+
+            MI.setDesc(TII->get(Opc));
+            MI.tieOperands(0, 1);
+          }
+        }
+      }
+
+      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
+      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
+        const MachineOperand &Src = MI.getOperand(1);
+
+        if (Src.isImm() && isKImmOperand(TII, Src))
+          MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
+
+        continue;
+      }
+
       if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
         continue;