R600: Turn TEX/VTX into native instructions

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@180756 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
index 0f811b1..36f2c15 100644
--- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -142,6 +142,7 @@
   if (isFCOp(MI.getOpcode())){
     EmitFCInstr(MI, OS);
   } else if (MI.getOpcode() == AMDGPU::RETURN ||
+    MI.getOpcode() == AMDGPU::FETCH_CLAUSE ||
     MI.getOpcode() == AMDGPU::BUNDLE ||
     MI.getOpcode() == AMDGPU::KILL) {
     return;
@@ -166,10 +167,13 @@
     case AMDGPU::TEX_VTX_TEXBUF : {
       uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups);
       uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset
+      InstWord2 |= 1 << 19;
 
-      EmitByte(INSTR_VTX, OS);
+      EmitByte(INSTR_NATIVE, OS);
       Emit(InstWord01, OS);
+      EmitByte(INSTR_NATIVE, OS);
       Emit(InstWord2, OS);
+      Emit((u_int32_t) 0, OS);
       break;
     }
     case AMDGPU::TEX_LD:
@@ -241,9 +245,11 @@
           SrcSelect[ELEMENT_W] << 29 | Offsets[0] << 0 | Offsets[1] << 5 |
           Offsets[2] << 10;
 
-      EmitByte(INSTR_TEX, OS);
+      EmitByte(INSTR_NATIVE, OS);
       Emit(Word01, OS);
+      EmitByte(INSTR_NATIVE, OS);
       Emit(Word2, OS);
+      Emit((u_int32_t) 0, OS);
       break;
     }
     case AMDGPU::CF_ALU:
@@ -253,13 +259,13 @@
       Emit(Inst, OS);
       break;
     }
-    case AMDGPU::CF_TC_EG:
-    case AMDGPU::CF_VC_EG:
     case AMDGPU::CF_CALL_FS_EG:
-    case AMDGPU::CF_TC_R600:
-    case AMDGPU::CF_VC_R600:
     case AMDGPU::CF_CALL_FS_R600:
       return;
+    case AMDGPU::CF_TC_EG:
+    case AMDGPU::CF_VC_EG:
+    case AMDGPU::CF_TC_R600:
+    case AMDGPU::CF_VC_R600:
     case AMDGPU::WHILE_LOOP_EG:
     case AMDGPU::END_LOOP_EG:
     case AMDGPU::LOOP_BREAK_EG:
diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp
index f978612..611d61a 100644
--- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp
@@ -30,6 +30,8 @@
 class R600ControlFlowFinalizer : public MachineFunctionPass {
 
 private:
+  typedef std::pair<MachineInstr *, std::vector<MachineInstr *> > ClauseFile;
+
   enum ControlFlowInstruction {
     CF_TC,
     CF_VC,
@@ -105,28 +107,44 @@
     return TII->get(Opcode);
   }
 
-  MachineBasicBlock::iterator
-  MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
-      unsigned CfAddress) const {
+  ClauseFile
+  MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
+      const {
     MachineBasicBlock::iterator ClauseHead = I;
+    std::vector<MachineInstr *> ClauseContent;
     unsigned AluInstCount = 0;
     bool IsTex = TII->usesTextureCache(ClauseHead);
     for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
       if (IsTrivialInst(I))
         continue;
+      if (AluInstCount > MaxFetchInst)
+        break;
       if ((IsTex && !TII->usesTextureCache(I)) ||
           (!IsTex && !TII->usesVertexCache(I)))
         break;
       AluInstCount ++;
-      if (AluInstCount > MaxFetchInst)
-        break;
+      ClauseContent.push_back(I);
     }
-    BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead),
+    MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead),
         getHWInstrDesc(IsTex?CF_TC:CF_VC))
-        .addImm(CfAddress) // ADDR
-        .addImm(AluInstCount); // COUNT
-    return I;
+        .addImm(0) // ADDR
+        .addImm(AluInstCount - 1); // COUNT
+    return ClauseFile(MIb, ClauseContent);
   }
+
+  void
+  EmitFetchClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
+      unsigned &CfCount) {
+    CounterPropagateAddr(Clause.first, CfCount);
+    MachineBasicBlock *BB = Clause.first->getParent();
+    BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::FETCH_CLAUSE))
+        .addImm(CfCount);
+    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
+      BB->splice(InsertPos, BB, Clause.second[i]);
+    }
+    CfCount += 2 * Clause.second.size();
+  }
+
   void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const {
     MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm());
   }
@@ -182,11 +200,12 @@
             getHWInstrDesc(CF_CALL_FS));
         CfCount++;
       }
+      std::vector<ClauseFile> FetchClauses;
       for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
           I != E;) {
         if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) {
           DEBUG(dbgs() << CfCount << ":"; I->dump(););
-          I = MakeFetchClause(MBB, I, 0);
+          FetchClauses.push_back(MakeFetchClause(MBB, I));
           CfCount++;
           continue;
         }
@@ -307,6 +326,8 @@
             BuildMI(MBB, I, MBB.findDebugLoc(MI), TII->get(AMDGPU::PAD));
             CfCount++;
           }
+          for (unsigned i = 0, e = FetchClauses.size(); i < e; i++)
+            EmitFetchClause(I, FetchClauses[i], CfCount);
         }
         default:
           break;
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index 18760cb..09728f8 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -477,6 +477,7 @@
     let FETCH_WHOLE_QUAD = 0;
     let ALT_CONST = 0;
     let SAMPLER_INDEX_MODE = 0;
+    let RESOURCE_INDEX_MODE = 0;
 
     let COORD_TYPE_X = 0;
     let COORD_TYPE_Y = 0;
@@ -928,6 +929,13 @@
 def CF_ALU : ALU_CLAUSE<8, "ALU">;
 def CF_ALU_PUSH_BEFORE : ALU_CLAUSE<9, "ALU_PUSH_BEFORE">;
 
+def FETCH_CLAUSE : AMDGPUInst <(outs),
+(ins i32imm:$addr), "Fetch clause starting at $addr:", [] > {
+  field bits<8> Inst;
+  bits<8> num;
+  let Inst = num;
+}
+
 def PAD : AMDGPUInst <(outs), (ins), "PAD", [] > {
   field bits<64> Inst;
 }