Runtime support for compiler kernel fusion.

The runtime will start a separate process to call the new bcc to fuse kernels.

Change-Id: Ia73ea917a126a5055ec97f13d90a5feaafd6a2f5
diff --git a/cpu_ref/rsCpuScriptGroup2.cpp b/cpu_ref/rsCpuScriptGroup2.cpp
index 52cd8a0..90907d0 100644
--- a/cpu_ref/rsCpuScriptGroup2.cpp
+++ b/cpu_ref/rsCpuScriptGroup2.cpp
@@ -1,5 +1,15 @@
 #include "rsCpuScriptGroup2.h"
 
+#include <dlfcn.h>
+
+#include <string>
+#include <vector>
+
+#ifndef RS_COMPATIBILITY_LIB
+#include "bcc/Config/Config.h"
+#include <sys/wait.h>
+#endif
+
 #include "cpu_ref/rsCpuCore.h"
 #include "rsClosure.h"
 #include "rsContext.h"
@@ -7,13 +17,17 @@
 #include "rsCpuScript.h"
 #include "rsScript.h"
 #include "rsScriptGroup2.h"
+#include "rsScriptIntrinsic.h"
+
+using std::string;
+using std::vector;
 
 namespace android {
 namespace renderscript {
 
 namespace {
 
-static const size_t DefaultKernelArgCount = 2;
+const size_t DefaultKernelArgCount = 2;
 
 void groupRoot(const RsExpandKernelParams *kparams, uint32_t xstart,
                uint32_t xend, uint32_t outstep) {
@@ -66,25 +80,45 @@
   mutable_kparams->usr        = &closures;
 }
 
-/*
-  Returns true if closure depends on any closure in batch via a glboal variable
-  TODO: this probably should go into class Closure.
- */
-bool conflict(const list<CPUClosure*> &batch, CPUClosure* closure) {
+}  // namespace
+
+Batch::~Batch() {
+  for (CPUClosure* c : mClosures) {
+    delete c;
+  }
+  if (mScriptObj) {
+    dlclose(mScriptObj);
+  }
+}
+
+bool Batch::conflict(CPUClosure* closure) const {
+  if (mClosures.empty()) {
+    return false;
+  }
+
+  if (closure->mClosure->mKernelID.get() == nullptr ||
+      mClosures.front()->mClosure->mKernelID.get() == nullptr) {
+    // An invoke should be in a batch by itself, so it conflicts with any other
+    // closure.
+    return true;
+  }
+
   for (const auto &p : closure->mClosure->mGlobalDeps) {
     const Closure* dep = p.first;
-    for (CPUClosure* c : batch) {
+    for (CPUClosure* c : mClosures) {
       if (c->mClosure == dep) {
+        ALOGV("ScriptGroup2: closure %p conflicting with closure %p via its global", closure, dep);
         return true;
       }
     }
   }
   for (const auto &p : closure->mClosure->mArgDeps) {
     const Closure* dep = p.first;
-    for (CPUClosure* c : batch) {
+    for (CPUClosure* c : mClosures) {
       if (c->mClosure == dep) {
         for (const auto &p1 : *p.second) {
-          if (p1.second != nullptr) {
+          if (p1.second->get() != nullptr) {
+            ALOGV("ScriptGroup2: closure %p conflicting with closure %p via its arg", closure, dep);
             return true;
           }
         }
@@ -94,12 +128,10 @@
   return false;
 }
 
-}  // namespace
-
 CpuScriptGroup2Impl::CpuScriptGroup2Impl(RsdCpuReferenceImpl *cpuRefImpl,
                                          const ScriptGroupBase *sg) :
     mCpuRefImpl(cpuRefImpl), mGroup((const ScriptGroup2*)(sg)) {
-  list<CPUClosure*>* batch = new list<CPUClosure*>();
+  Batch* batch = new Batch(this);
   for (Closure* closure: mGroup->mClosures) {
     const ScriptKernelID* kernelID = closure->mKernelID.get();
     RsdCpuScriptImpl* si =
@@ -110,32 +142,192 @@
     // TODO: Is mtls.fep.usrLen ever used?
     CPUClosure* cc = new CPUClosure(closure, si, (ExpandFuncTy)mtls.kernel,
                                     mtls.fep.usr, mtls.fep.usrLen);
-    if (conflict(*batch, cc)) {
+    if (batch->conflict(cc)) {
       mBatches.push_back(batch);
-      batch = new list<CPUClosure*>();
+      batch = new Batch(this);
     }
-    batch->push_back(cc);
+
+    batch->mClosures.push_back(cc);
   }
+
   mBatches.push_back(batch);
+
+#ifndef RS_COMPATIBILITY_LIB
+  for (Batch* batch : mBatches) {
+    batch->tryToCreateFusedKernel(mGroup->mCacheDir.c_str());
+  }
+#endif
 }
 
 CpuScriptGroup2Impl::~CpuScriptGroup2Impl() {
-  for (list<CPUClosure*>* batch : mBatches) {
-    for (CPUClosure* c : *batch) {
-      delete c;
-    }
+  for (Batch* batch : mBatches) {
+    delete batch;
   }
 }
 
+namespace {
+
+#ifndef RS_COMPATIBILITY_LIB
+
+string getFileName(string path) {  // Returns the text after the last slash or backslash.
+  const size_t found = path.find_last_of("/\\");  // size_t: do not narrow npos
+  return path.substr(found + 1);  // npos + 1 == 0, so a bare name is returned whole
+}
+
+// Appends the nullptr-terminated bcc argv to *args. Entries borrow from the
+// string parameters, which must outlive *args; kernel slots are heap copies.
+void setupCompileArguments(
+    const vector<string>& inputs, const vector<int>& kernels,
+    const string& output_dir, const string& output_filename,
+    const string& rsLib, vector<const char*>* args) {
+  args->push_back(RsdCpuScriptImpl::BCC_EXE_PATH);
+  args->push_back("-fPIC");
+  args->push_back("-embedRSInfo");
+  args->push_back("-mtriple");
+  args->push_back(DEFAULT_TARGET_TRIPLE_STRING);
+  args->push_back("-bclib");
+  args->push_back(rsLib.c_str());
+  for (const string& input : inputs) args->push_back(input.c_str());
+  for (int kernel : kernels) {
+    args->push_back("-k");
+    // strdup (never freed): a loop-local string would leave argv dangling.
+    args->push_back(strdup(std::to_string(kernel).c_str()));
+  }
+  args->push_back("-output_path");
+  args->push_back(output_dir.c_str());
+  args->push_back("-o");
+  args->push_back(output_filename.c_str());
+  args->push_back(nullptr);
+}
+
+// Joins the first n entries of strs with single spaces; empty when n <= 0.
+string convertListToString(int n, const char* const* strs) {
+  string ret;
+  for (int i = 0; i < n; i++) {
+    if (i > 0) ret.append(" ");
+    ret.append(strs[i]);
+  }
+  return ret;
+}
+
+// Runs bcc in a child process with the nullptr-terminated argv "arguments" and
+// waits for it. Returns true only if bcc exits normally with status 0.
+bool fuseAndCompile(const char** arguments,
+                    const string& commandLine) {
+  const pid_t pid = fork();
+  if (pid == -1) {
+    ALOGE("Couldn't fork for bcc execution");
+    return false;
+  }
+  if (pid == 0) {
+    // Child process
+    ALOGV("Invoking BCC with: %s", commandLine.c_str());
+    execv(RsdCpuScriptImpl::BCC_EXE_PATH, (char* const*)arguments);
+    ALOGE("execv() failed: %s", strerror(errno));
+    abort();
+  }
+  // Parent process. Retry when a signal interrupts the wait; giving up on
+  // EINTR would report a still-running bcc as failed and never reap it.
+  int status = 0;
+  pid_t w;
+  do {
+    w = waitpid(pid, &status, 0);
+  } while (w == -1 && errno == EINTR);
+  if (w == -1) {
+    ALOGE("waitpid() for bcc failed: %s", strerror(errno));
+    return false;
+  }
+  if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
+    ALOGE("bcc terminated unexpectedly");
+    return false;
+  }
+  return true;
+}
+#endif
+
+}  // anonymous namespace
+
+// Tries to fuse all kernels of this batch into one kernel by running bcc on
+// their bitcode files, then links and loads the result and stores it in
+// mExecutable. Returns early, leaving mExecutable untouched, when the batch
+// has fewer than two closures, contains an intrinsic, or any step fails.
+void Batch::tryToCreateFusedKernel(const char *cacheDir) {
+#ifndef RS_COMPATIBILITY_LIB
+  if (mClosures.size() < 2) {
+    ALOGV("Compiler kernel fusion skipped due to only one or zero kernel in"
+          " a script group batch.");
+    return;
+  }
+
+  // Fuse the input kernels and generate native code in an object file.
+  std::vector<string> inputFiles;
+  std::vector<int> slots;
+  for (CPUClosure* cpuClosure : mClosures) {
+    const Closure* closure = cpuClosure->mClosure;
+    const ScriptKernelID* kernelID = closure->mKernelID.get();
+    const Script* script = kernelID->mScript;
+    if (script->isIntrinsic()) {
+      return;
+    }
+    const RsdCpuScriptImpl *cpuScript =
+        (const RsdCpuScriptImpl*)script->mHal.drv;
+    const string& bitcodeFilename = cpuScript->getBitcodeFilePath();
+    inputFiles.push_back(bitcodeFilename);
+    slots.push_back(kernelID->mSlot);
+  }
+
+  // tempnam() returns a malloc'ed name, or nullptr on failure; check it before
+  // building a std::string from it and release it once copied.
+  char* tempName = tempnam(cacheDir, "fused");
+  if (tempName == nullptr) {
+    ALOGE("Failed to create a temporary file name for the fused kernel");
+    return;
+  }
+  const string outputPath(tempName);
+  free(tempName);
+  string outputFileName = getFileName(outputPath);
+  string rsLibPath(SYSLIBPATH"/libclcore.bc");
+  // The argv built below borrows c_str() pointers, so the output directory
+  // needs a named string that outlives it, not a temporary made from cacheDir.
+  const string outputDir(cacheDir);
+  vector<const char*> arguments;
+  setupCompileArguments(inputFiles, slots, outputDir, outputFileName, rsLibPath,
+                        &arguments);
+  string commandLine =
+      convertListToString(arguments.size() - 1, arguments.data());
+  if (!fuseAndCompile(arguments.data(), commandLine)) {
+    return;
+  }
+
+  // Create and load the shared lib.
+  const char* resName = outputFileName.c_str();
+  if (!SharedLibraryUtils::createSharedLibrary(cacheDir, resName)) {
+    ALOGE("Failed to link object file '%s'", resName);
+    return;
+  }
+
+  // Keep the handle in mScriptObj so that ~Batch() can dlclose() it.
+  mScriptObj = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName);
+  if (mScriptObj == nullptr) {
+    ALOGE("Unable to load '%s'", resName);
+    return;
+  }
+
+  mExecutable = ScriptExecutable::createFromSharedObject(
+      nullptr,  // RS context. Unused.
+      mScriptObj);
+#endif  // RS_COMPATIBILITY_LIB
+}
+
 void CpuScriptGroup2Impl::execute() {
-  for (list<CPUClosure*>* batch : mBatches) {
-    setGlobalsForBatch(*batch);
-    runBatch(*batch);
+  for (auto batch : mBatches) {
+    batch->setGlobalsForBatch();
+    batch->run();
   }
 }
 
-void CpuScriptGroup2Impl::setGlobalsForBatch(const list<CPUClosure*>& batch) {
-  for (CPUClosure* cpuClosure : batch) {
+void Batch::setGlobalsForBatch() {
+  for (CPUClosure* cpuClosure : mClosures) {
     const Closure* closure = cpuClosure->mClosure;
     const ScriptKernelID* kernelID = closure->mKernelID.get();
     Script* s = kernelID->mScript;
@@ -152,8 +344,32 @@
   }
 }
 
-void CpuScriptGroup2Impl::runBatch(const list<CPUClosure*>& batch) {
-  for (CPUClosure* cpuClosure : batch) {
+void Batch::run() {
+  if (mExecutable != nullptr) {
+    MTLaunchStruct mtls;
+    const CPUClosure* firstCpuClosure = mClosures.front();
+    const CPUClosure* lastCpuClosure = mClosures.back();
+
+    firstCpuClosure->mSi->forEachMtlsSetup(
+        (const Allocation**)&firstCpuClosure->mClosure->mArgs[0],
+        firstCpuClosure->mClosure->mArgs.size(),
+        lastCpuClosure->mClosure->mReturnValue,
+        nullptr, 0, nullptr, &mtls);
+
+    mtls.script = nullptr;
+    mtls.fep.usr = nullptr;
+    mtls.kernel = mExecutable->getForEachFunction(0);
+
+    mGroup->getCpuRefImpl()->launchThreads(
+        (const Allocation**)&firstCpuClosure->mClosure->mArgs[0],
+        firstCpuClosure->mClosure->mArgs.size(),
+        lastCpuClosure->mClosure->mReturnValue,
+        nullptr, &mtls);
+
+    return;
+  }
+
+  for (CPUClosure* cpuClosure : mClosures) {
     const Closure* closure = cpuClosure->mClosure;
     const ScriptKernelID* kernelID = closure->mKernelID.get();
     cpuClosure->mSi->preLaunch(kernelID->mSlot,
@@ -163,7 +379,7 @@
                                nullptr);
   }
 
-  const CPUClosure* cpuClosure = batch.front();
+  const CPUClosure* cpuClosure = mClosures.front();
   const Closure* closure = cpuClosure->mClosure;
   MTLaunchStruct mtls;
 
@@ -174,12 +390,12 @@
 
       mtls.script = nullptr;
       mtls.kernel = (void (*)())&groupRoot;
-      mtls.fep.usr = &batch;
+      mtls.fep.usr = &mClosures;
 
-      mCpuRefImpl->launchThreads(nullptr, 0, nullptr, nullptr, &mtls);
+      mGroup->getCpuRefImpl()->launchThreads(nullptr, 0, nullptr, nullptr, &mtls);
   }
 
-  for (CPUClosure* cpuClosure : batch) {
+  for (CPUClosure* cpuClosure : mClosures) {
     const Closure* closure = cpuClosure->mClosure;
     const ScriptKernelID* kernelID = closure->mKernelID.get();
     cpuClosure->mSi->postLaunch(kernelID->mSlot,