Add preliminary implementation for GPGPU code generation.

Translate the selected parallel loop body into a PTX string and run it with the
CUDA driver API. This preliminary implementation is limited to the following
special test cases:

  - Only 2-dimensional parallel loops are supported, optionally with a single
    innermost non-parallel loop.
  - Write memory accesses are supported to only one array per SCoP (see the
    example loop nest below).
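
To give a concrete picture, the kind of loop nest this preliminary support
targets looks roughly like the following hypothetical example (names and sizes
are made up; it is not taken from the test suite): two parallel outer loops, at
most one sequential innermost loop, and a single written array:

  enum { N = 64, M = 64, K = 64 };

  void matmul(float A[N][K], float B[K][M], float C[N][M]) {
    for (int i = 0; i < N; i++)        /* parallel */
      for (int j = 0; j < M; j++) {    /* parallel */
        C[i][j] = 0;                   /* C is the only written array */
        for (int k = 0; k < K; k++)    /* non-parallel */
          C[i][j] += A[i][k] * B[k][j];
      }
  }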

The patch was committed with smaller changes to the build system:

There is now a flag to enable GPU code generation explicitly. This was required
because the llvm.codegen() patch must be applied to the LLVM sources for this
feature to compile correctly. Also, enabling GPU code generation no longer
requires CUDA. This requirement was removed to allow 'make polly-test' runs
even without an installed CUDA runtime.
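
As noted above, the generated host code drives the kernel through the CUDA
driver API, so a CUDA installation is only needed when the compiled program is
actually executed. For reference, a rough sketch of the driver API sequence
performed on the host side is shown below; it is only an illustration (the
function name, PTXString, KernelName and the launch parameters are
placeholders), not the literal code emitted by this patch:

  #include <cstddef>
  #include <cuda.h>

  // Illustrative host-side launch sequence; error checking omitted.
  void runPTXKernel(const char *PTXString, const char *KernelName,
                    float *HostData, size_t Bytes,
                    unsigned GridX, unsigned GridY,
                    unsigned BlockX, unsigned BlockY) {
    CUdevice Device;
    CUcontext Context;
    CUmodule Module;
    CUfunction Kernel;
    CUdeviceptr DeviceData;

    cuInit(0);
    cuDeviceGet(&Device, 0);
    cuCtxCreate(&Context, 0, Device);

    // JIT-compile the PTX string that was embedded into the host module.
    cuModuleLoadData(&Module, PTXString);
    cuModuleGetFunction(&Kernel, Module, KernelName);

    // Copy the single written array over, launch the kernel, copy back.
    cuMemAlloc(&DeviceData, Bytes);
    cuMemcpyHtoD(DeviceData, HostData, Bytes);
    void *Args[] = { &DeviceData };
    cuLaunchKernel(Kernel, GridX, GridY, 1, BlockX, BlockY, 1,
                   0 /* shared mem */, 0 /* stream */, Args, 0);
    cuMemcpyDtoH(HostData, DeviceData, Bytes);

    cuMemFree(DeviceData);
    cuCtxDestroy(Context);
  }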

Contributed by:  Yabin Hu  <yabin.hwu@gmail.com>

llvm-svn: 161239
diff --git a/polly/lib/CodeGen/CodeGeneration.cpp b/polly/lib/CodeGen/CodeGeneration.cpp
index 9cb708f..5395607 100644
--- a/polly/lib/CodeGen/CodeGeneration.cpp
+++ b/polly/lib/CodeGen/CodeGeneration.cpp
@@ -31,6 +31,7 @@
 #include "polly/CodeGen/CodeGeneration.h"
 #include "polly/CodeGen/BlockGenerators.h"
 #include "polly/CodeGen/LoopGenerators.h"
+#include "polly/CodeGen/PTXGenerator.h"
 #include "polly/CodeGen/Utils.h"
 #include "polly/Support/GICHelper.h"
 
@@ -65,6 +66,19 @@
        cl::value_desc("OpenMP code generation enabled if true"),
        cl::init(false), cl::ZeroOrMore);
 
+#ifdef GPU_CODEGEN
+static cl::opt<bool>
+GPGPU("enable-polly-gpgpu",
+       cl::desc("Generate GPU parallel code"), cl::Hidden,
+       cl::value_desc("GPGPU code generation enabled if true"),
+       cl::init(false), cl::ZeroOrMore);
+
+static cl::opt<std::string>
+GPUTriple("polly-gpgpu-triple",
+       cl::desc("Target triple for GPU code generation"),
+       cl::Hidden, cl::init(""));
+#endif /* GPU_CODEGEN */
+
 static cl::opt<bool>
 AtLeastOnce("enable-polly-atLeastOnce",
        cl::desc("Give polly the hint, that every loop is executed at least"
@@ -284,6 +298,27 @@
   /// statement.
   void codegenForOpenMP(const clast_for *f);
 
+#ifdef GPU_CODEGEN
+  /// @brief Create GPGPU device memory access values.
+  ///
+  /// Create the list of values that will become parameters of the GPGPU
+  /// subfunction. These parameters represent device memory base addresses;
+  /// the size in bytes of the written array is returned in OutputBytes.
+  SetVector<Value*> getGPUValues(unsigned &OutputBytes);
+
+  /// @brief Create a GPU parallel for loop.
+  ///
+  /// The body of the loop is translated into a PTX kernel and executed on
+  /// the GPU through the CUDA driver API.
+  void codegenForGPGPU(const clast_for *F);
+
+  /// @brief Get the innermost parallel loop body and the loop nest info.
+  const clast_stmt *getScheduleInfo(const clast_for *F,
+                                    std::vector<int> &NumIters,
+                                    unsigned &LoopDepth,
+                                    unsigned &NonPLoopDepth);
+#endif /* GPU_CODEGEN */
+
   /// @brief Check if a loop is parallel
   ///
   /// Detect if a clast_for loop can be executed in parallel.
@@ -530,6 +565,163 @@
   Builder.SetInsertPoint(AfterLoop);
 }
 
+#ifdef GPU_CODEGEN
+static unsigned getArraySizeInBytes(const ArrayType *AT) {
+  unsigned Bytes = AT->getNumElements();
+  if (const ArrayType *T = dyn_cast<ArrayType>(AT->getElementType()))
+    Bytes *= getArraySizeInBytes(T);
+  else
+    Bytes *= AT->getElementType()->getPrimitiveSizeInBits() / 8;
+
+  return Bytes;
+}
+
+SetVector<Value*> ClastStmtCodeGen::getGPUValues(unsigned &OutputBytes) {
+  SetVector<Value*> Values;
+  OutputBytes = 0;
+  int NumWrites = 0;
+
+  // Record the memory reference base addresses.
+  for (Scop::iterator SI = S->begin(), SE = S->end(); SI != SE; ++SI) {
+    ScopStmt *Stmt = *SI;
+    for (SmallVector<MemoryAccess*, 8>::iterator I = Stmt->memacc_begin(),
+         E = Stmt->memacc_end(); I != E; ++I) {
+      Value *BaseAddr = const_cast<Value*>((*I)->getBaseAddr());
+      Values.insert(BaseAddr);
+
+      // FIXME: We assume that there is one and only one array to be written
+      // in a SCoP.
+      if ((*I)->isWrite()) {
+        ++NumWrites;
+        assert(NumWrites <= 1 &&
+               "We support at most one array to be written in a SCoP.");
+        if (const PointerType * PT =
+            dyn_cast<PointerType>(BaseAddr->getType())) {
+          Type *T = PT->getArrayElementType();
+          const ArrayType *ATy = dyn_cast<ArrayType>(T);
+          OutputBytes = getArraySizeInBytes(ATy);
+        }
+      }
+    }
+  }
+
+  return Values;
+}
+
+const clast_stmt *ClastStmtCodeGen::getScheduleInfo(const clast_for *F,
+                                                    std::vector<int> &NumIters,
+                                                    unsigned &LoopDepth,
+                                                    unsigned &NonPLoopDepth) {
+  clast_stmt *Stmt = (clast_stmt *)F;
+  const clast_for *Result = 0;
+  bool NonParaFlag = false;
+  LoopDepth = 0;
+  NonPLoopDepth = 0;
+
+  while (Stmt) {
+    if (CLAST_STMT_IS_A(Stmt, stmt_for)) {
+      const clast_for *T = (clast_for *) Stmt;
+      if (isParallelFor(T)) {
+        if (!NonParaFlag) {
+          NumIters.push_back(getNumberOfIterations(T));
+          Result = T;
+        }
+      } else
+        NonParaFlag = true;
+
+      Stmt = T->body;
+      LoopDepth++;
+      continue;
+    }
+    Stmt = Stmt->next;
+  }
+
+  assert(NumIters.size() == 4 &&
+         "The loops should be tiled into a 4-level parallel loop nest and an "
+         "innermost non-parallel loop (if it exists).");
+  NonPLoopDepth = LoopDepth - NumIters.size();
+  assert(NonPLoopDepth <= 1
+         && "We support only one innermost non-parallel loop currently.");
+  return (const clast_stmt *)Result->body;
+}
+
+void ClastStmtCodeGen::codegenForGPGPU(const clast_for *F) {
+  BasicBlock::iterator LoopBody;
+  SetVector<Value *> Values;
+  SetVector<Value *> IVS;
+  std::vector<int> NumIterations;
+  PTXGenerator::ValueToValueMapTy VMap;
+
+  assert(!GPUTriple.empty()
+         && "Target triple should be set properly for GPGPU code generation.");
+  PTXGenerator PTXGen(Builder, P, GPUTriple);
+
+  // Get original IVS and ScopStmt
+  unsigned TiledLoopDepth, NonPLoopDepth;
+  const clast_stmt *InnerStmt = getScheduleInfo(F, NumIterations,
+                                                TiledLoopDepth, NonPLoopDepth);
+  const clast_stmt *TmpStmt;
+  const clast_user_stmt *U;
+  const clast_for *InnerFor;
+  if (CLAST_STMT_IS_A(InnerStmt, stmt_for)) {
+    InnerFor = (const clast_for *)InnerStmt;
+    TmpStmt = InnerFor->body;
+  } else
+    TmpStmt = InnerStmt;
+  U = (const clast_user_stmt *) TmpStmt;
+  ScopStmt *Statement = (ScopStmt *) U->statement->usr;
+  for (unsigned i = 0; i < Statement->getNumIterators() - NonPLoopDepth; i++) {
+    const Value* IV = Statement->getInductionVariableForDimension(i);
+    IVS.insert(const_cast<Value *>(IV));
+  }
+
+  unsigned OutBytes;
+  Values = getGPUValues(OutBytes);
+  PTXGen.setOutputBytes(OutBytes);
+  PTXGen.startGeneration(Values, IVS, VMap, &LoopBody);
+
+  BasicBlock::iterator AfterLoop = Builder.GetInsertPoint();
+  Builder.SetInsertPoint(LoopBody);
+
+  BasicBlock *AfterBB = 0;
+  if (NonPLoopDepth) {
+    Value *LowerBound, *UpperBound, *IV, *Stride;
+    Type *IntPtrTy = getIntPtrTy();
+    LowerBound = ExpGen.codegen(InnerFor->LB, IntPtrTy);
+    UpperBound = ExpGen.codegen(InnerFor->UB, IntPtrTy);
+    Stride = Builder.getInt(APInt_from_MPZ(InnerFor->stride));
+    IV = createLoop(LowerBound, UpperBound, Stride, Builder, P, AfterBB);
+    const Value *OldIV_ = Statement->getInductionVariableForDimension(2);
+    Value *OldIV = const_cast<Value *>(OldIV_);
+    VMap.insert(std::make_pair<Value*, Value*>(OldIV, IV));
+  }
+
+  updateWithValueMap(VMap, /* reverse */ false);
+  BlockGenerator::generate(Builder, *Statement, ValueMap, P);
+  updateWithValueMap(VMap, /* reverse */ true);
+
+  if (AfterBB)
+    Builder.SetInsertPoint(AfterBB->begin());
+
+  // FIXME: Replacing the host base addresses with the parameters of the PTX
+  // subfunction should have been done by updateWithValueMap. We use the
+  // following code to avoid affecting other parts of Polly. This should be
+  // fixed later.
+  Function *FN = Builder.GetInsertBlock()->getParent();
+  for (unsigned j = 0; j < Values.size(); j++) {
+    Value *baseAddr = Values[j];
+    for (Function::iterator B = FN->begin(); B != FN->end(); ++B) {
+      for (BasicBlock::iterator I = B->begin(); I != B->end(); ++I)
+        I->replaceUsesOfWith(baseAddr, ValueMap[baseAddr]);
+    }
+  }
+  Builder.SetInsertPoint(AfterLoop);
+  PTXGen.setLaunchingParameters(NumIterations[0], NumIterations[1],
+                                NumIterations[2], NumIterations[3]);
+  PTXGen.finishGeneration(FN);
+}
+#endif
+
 bool ClastStmtCodeGen::isInnermostLoop(const clast_for *f) {
   const clast_stmt *stmt = f->body;
 
@@ -647,6 +839,18 @@
     }
   }
 
+#ifdef GPU_CODEGEN
+  if (GPGPU && isParallelFor(f)) {
+    if (!parallelCodeGeneration) {
+      parallelCodeGeneration = true;
+      parallelLoops.push_back(f->iterator);
+      codegenForGPGPU(f);
+      parallelCodeGeneration = false;
+      return;
+    }
+  }
+#endif
+
   codegenForSequential(f);
 }