diff --git a/include/bcc/Renderscript/RSCompilerDriver.h b/include/bcc/Renderscript/RSCompilerDriver.h
index efc1fa4..68f5fd4 100644
--- a/include/bcc/Renderscript/RSCompilerDriver.h
+++ b/include/bcc/Renderscript/RSCompilerDriver.h
@@ -23,6 +23,7 @@
 
 #include "bcinfo/MetadataExtractor.h"
 
+#include <list>
 #include <vector>
 
 namespace bcc {
@@ -120,8 +121,11 @@
 
   bool buildScriptGroup(
       BCCContext& Context, const char* pOutputFilepath, const char* pRuntimePath,
-      const std::vector<const Source*>& sources, const std::vector<int>& slots,
-      bool dumpIR);
+      bool dumpIR, const std::vector<Source*>& sources,
+      const std::list<std::list<std::pair<int, int>>>& toFuse,
+      const std::list<std::string>& fused,
+      const std::list<std::list<std::pair<int, int>>>& invokes,
+      const std::list<std::string>& invokeBatchNames);
 
   // Returns true if script is successfully compiled.
   bool buildForCompatLib(RSScript &pScript, const char *pOut,
diff --git a/include/bcc/Renderscript/RSMetadata.h b/include/bcc/Renderscript/RSMetadata.h
deleted file mode 100644
index 33db022..0000000
--- a/include/bcc/Renderscript/RSMetadata.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright 2015, The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef BCC_RS_METADATA_H
-#define BCC_RS_METADATA_H
-
-#include "stdint.h"
-
-namespace llvm {
-  class Module;
-  class Function;
-}
-
-namespace bcc {
-
-/// @brief Class to manage RenderScript metadata.
-class RSMetadata{
-  llvm::Module &Module;
-
-public:
-
-  /// @brief Create a metadata manager for a specific LLVM module.
-  ///
-  /// @param Module The module to work on.
-  RSMetadata(llvm::Module &Module);
-
-  /// @brief Delete all metadata.
-  void deleteAll();
-
-  /// @brief Add foreach function.
-  ///
-  /// Add metadata to describe a new foreach function.
-  ///
-  /// @param Function The function to mark.
-  /// @param Properties The properties of the function.
-  void markForEachFunction(llvm::Function &Function, uint32_t Properties);
-};
-
-} // end namespace bcc
-
-#endif /* BCC_RS_METADATA_H */
diff --git a/include/bcc/Renderscript/RSScriptGroupFusion.h b/include/bcc/Renderscript/RSScriptGroupFusion.h
index 5478956..51e983a 100644
--- a/include/bcc/Renderscript/RSScriptGroupFusion.h
+++ b/include/bcc/Renderscript/RSScriptGroupFusion.h
@@ -18,6 +18,7 @@
 #define BCC_RS_SCRIPT_GROUP_FUSION_H
 
 #include <vector>
+#include <string>
 
 namespace llvm {
 class Module;
@@ -26,18 +27,23 @@
 namespace bcc {
 
 class Source;
-class RSScript;
 class BCCContext;
 
 /// @brief Fuse kernels
 ///
-/// @param Sources The Sources containing the kernels.
-/// @param Slots The slots where the kernels are located.
-/// @return A script that containing the fused kernels.
-// TODO(yangni): Check FP precision. (http://b/19098612)
-llvm::Module* fuseKernels(BCCContext& Context,
-                          const std::vector<const Source *>& sources,
-                          const std::vector<int>& slots);
+/// @param Context bcc context.
+/// @param sources The Sources containing the kernels.
+/// @param slots The slots where the kernels are located.
+/// @param fusedName Name of the fused kernel to create in mergedModule.
+/// @return True, if kernels are successfully merged. False, otherwise.
+bool fuseKernels(BCCContext& Context,
+                 const std::vector<Source *>& sources,
+                 const std::vector<int>& slots,
+                 const std::string& fusedName,
+                 llvm::Module* mergedModule);
+
+bool renameInvoke(BCCContext& Context, const Source* source, const int slot,
+                  const std::string& newName, llvm::Module* mergedModule);
 }
 
 #endif /* BCC_RS_SCRIPT_GROUP_FUSION_H */
diff --git a/lib/Renderscript/Android.mk b/lib/Renderscript/Android.mk
index 251b32c..8e692f9 100644
--- a/lib/Renderscript/Android.mk
+++ b/lib/Renderscript/Android.mk
@@ -29,7 +29,6 @@
   RSInfoExtractor.cpp \
   RSInfoReader.cpp \
   RSInfoWriter.cpp \
-  RSMetadata.cpp \
   RSScript.cpp \
   RSInvokeHelperPass.cpp \
   RSIsThreadablePass.cpp \
diff --git a/lib/Renderscript/RSCompilerDriver.cpp b/lib/Renderscript/RSCompilerDriver.cpp
index 547700e..21beaa2 100644
--- a/lib/Renderscript/RSCompilerDriver.cpp
+++ b/lib/Renderscript/RSCompilerDriver.cpp
@@ -18,11 +18,13 @@
 
 #include "llvm/IR/AssemblyAnnotationWriter.h"
 #include <llvm/IR/Module.h>
+#include "llvm/Linker/Linker.h"
 #include <llvm/Support/CommandLine.h>
 #include <llvm/Support/Path.h>
 #include <llvm/Support/raw_ostream.h>
 
 #include "bcinfo/BitcodeWrapper.h"
+#include "bcc/Assert.h"
 #include "bcc/BCCContext.h"
 #include "bcc/Compiler.h"
 #include "bcc/Config/Config.h"
@@ -38,6 +40,7 @@
 #include "bcc/Support/Sha1Util.h"
 #include "bcc/Support/OutputFile.h"
 
+#include <sstream>
 #include <string>
 
 #ifdef HAVE_ANDROID_OS
@@ -327,21 +330,86 @@
 }
 
 bool RSCompilerDriver::buildScriptGroup(
-    BCCContext& Context, const char* pOutputFilepath, const char*pRuntimePath,
-    const std::vector<const Source*>& sources, const std::vector<int>& slots,
-    bool dumpIR) {
-  llvm::Module* module = fuseKernels(Context, sources, slots);
-  if (module == nullptr) {
-    return false;
+    BCCContext& Context, const char* pOutputFilepath, const char* pRuntimePath,
+    bool dumpIR, const std::vector<Source*>& sources,
+    const std::list<std::list<std::pair<int, int>>>& toFuse,
+    const std::list<std::string>& fused,
+    const std::list<std::list<std::pair<int, int>>>& invokes,
+    const std::list<std::string>& invokeBatchNames) {
+  // ---------------------------------------------------------------------------
+  // Link all input modules into a single module
+  // ---------------------------------------------------------------------------
+
+  llvm::LLVMContext& context = Context.getLLVMContext();
+  llvm::Module module("Merged Script Group", context);
+
+  llvm::Linker linker(&module);
+  for (Source* source : sources) {
+    if (linker.linkInModule(&source->getModule())) {
+      ALOGE("Linking for module in source failed.");
+      return false;
+    }
   }
 
+  // ---------------------------------------------------------------------------
+  // Create fused kernels
+  // ---------------------------------------------------------------------------
+
+  auto inputIter = toFuse.begin();
+  for (const std::string& nameOfFused : fused) {
+    auto inputKernels = *inputIter++;
+    std::vector<Source*> sourcesToFuse;
+    std::vector<int> slots;
+
+    for (auto p : inputKernels) {
+      sourcesToFuse.push_back(sources[p.first]);
+      slots.push_back(p.second);
+    }
+
+    if (!fuseKernels(Context, sourcesToFuse, slots, nameOfFused, &module)) {
+      return false;
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Rename invokes
+  // ---------------------------------------------------------------------------
+
+  auto invokeIter = invokes.begin();
+  for (const std::string& newName : invokeBatchNames) {
+    auto inputInvoke = *invokeIter++;
+    auto p = inputInvoke.front();
+    Source* source = sources[p.first];
+    int slot = p.second;
+
+    if (!renameInvoke(Context, source, slot, newName, &module)) {
+      return false;
+    }
+  }
+
+  // ---------------------------------------------------------------------------
+  // Compile the new module with fused kernels
+  // ---------------------------------------------------------------------------
+
   const std::unique_ptr<Source> source(
-      Source::CreateFromModule(Context, pOutputFilepath, *module));
+      Source::CreateFromModule(Context, pOutputFilepath, module, true));
   RSScript script(*source);
 
   uint8_t bitcode_sha1[SHA1_DIGEST_LENGTH];
   const char* compileCommandLineToEmbed = "";
-  const char* buildChecksum = nullptr;
+  const char* buildChecksum = "DummyChecksumForScriptGroup";
+  const char* buildFingerprintToEmbed = "";
+
+  RSInfo* info = RSInfo::ExtractFromSource(*source, bitcode_sha1,
+                                           compileCommandLineToEmbed, buildFingerprintToEmbed);
+  if (info == nullptr) {
+    return false;
+  }
+  script.setInfo(info);
+
+  // Embed the info string directly in the ELF
+  script.setEmbedInfo(true);
+  script.setOptimizationLevel(RSScript::kOptLvl3);
 
   llvm::SmallString<80> output_path(pOutputFilepath);
   llvm::sys::path::replace_extension(output_path, ".o");
diff --git a/lib/Renderscript/RSMetadata.cpp b/lib/Renderscript/RSMetadata.cpp
deleted file mode 100644
index 841ade7..0000000
--- a/lib/Renderscript/RSMetadata.cpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright 2015, The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "bcc/Renderscript/RSMetadata.h"
-
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/IR/Module.h"
-
-// Name of metadata node where pragma info resides (should be synced with
-// slang.cpp)
-const llvm::StringRef pragma_metadata_name("#pragma");
-
-/*
- * The following names should be synced with the one appeared in
- * slang_rs_metadata.h.
- */
-
-// Name of metadata node where exported variable names reside
-static const llvm::StringRef
-export_var_metadata_name("#rs_export_var");
-
-// Name of metadata node where exported function names reside
-static const llvm::StringRef
-export_func_metadata_name("#rs_export_func");
-
-// Name of metadata node where exported ForEach name information resides
-static const llvm::StringRef
-export_foreach_name_metadata_name("#rs_export_foreach_name");
-
-// Name of metadata node where exported ForEach signature information resides
-static const llvm::StringRef
-export_foreach_metadata_name("#rs_export_foreach");
-
-// Name of metadata node where RS object slot info resides (should be
-static const llvm::StringRef
-object_slot_metadata_name("#rs_object_slots");
-
-bcc::RSMetadata::RSMetadata(llvm::Module &Module) : Module(Module) {}
-
-void bcc::RSMetadata::deleteAll() {
-   std::vector<llvm::StringRef> MDNames;
-   MDNames.push_back(pragma_metadata_name);
-   MDNames.push_back(export_var_metadata_name);
-   MDNames.push_back(export_func_metadata_name);
-   MDNames.push_back(export_foreach_name_metadata_name);
-   MDNames.push_back(export_foreach_metadata_name);
-   MDNames.push_back(object_slot_metadata_name);
-
-   for (std::vector<llvm::StringRef>::iterator MI = MDNames.begin(),
-                                               ME = MDNames.end();
-        MI != ME; ++MI) {
-     llvm::NamedMDNode *MDNode = Module.getNamedMetadata(*MI);
-     if (MDNode) {
-       MDNode->eraseFromParent();
-     }
-   }
-}
-
-void bcc::RSMetadata::markForEachFunction(llvm::Function &Function,
-  uint32_t Signature) {
-  llvm::NamedMDNode *ExportForEachNameMD;
-  llvm::NamedMDNode *ExportForEachMD;
-
-  llvm::MDString *MDString;
-  llvm::MDNode *MDNode;
-
-  ExportForEachNameMD =
-    Module.getOrInsertNamedMetadata(export_foreach_name_metadata_name);
-  MDString = llvm::MDString::get(Module.getContext(), Function.getName());
-  MDNode = llvm::MDNode::get(Module.getContext(), MDString);
-  ExportForEachNameMD->addOperand(MDNode);
-
-  ExportForEachMD =
-    Module.getOrInsertNamedMetadata(export_foreach_metadata_name);
-  MDString = llvm::MDString::get(Module.getContext(),
-                                 llvm::utostr_32(Signature));
-  MDNode = llvm::MDNode::get(Module.getContext(), MDString);
-  ExportForEachMD->addOperand(MDNode);
-}
diff --git a/lib/Renderscript/RSScriptGroupFusion.cpp b/lib/Renderscript/RSScriptGroupFusion.cpp
index 352f55e..7ee79bf 100644
--- a/lib/Renderscript/RSScriptGroupFusion.cpp
+++ b/lib/Renderscript/RSScriptGroupFusion.cpp
@@ -18,192 +18,298 @@
 
 #include "bcc/Assert.h"
 #include "bcc/BCCContext.h"
-#include "bcc/Renderscript/RSMetadata.h"
-#include "bcc/Renderscript/RSScript.h"
 #include "bcc/Source.h"
 #include "bcc/Support/Log.h"
 #include "bcinfo/MetadataExtractor.h"
-#include "llvm/IR/AssemblyAnnotationWriter.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Module.h"
-#include "llvm/Linker/Linker.h"
-#include "llvm/PassManager.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Support/raw_ostream.h"
-
-#include <map>
-#include <string>
 
 using llvm::Function;
+using llvm::Module;
 
-using std::map;
-using std::pair;
 using std::string;
 
 namespace bcc {
 
 namespace {
 
-struct SourceCompare {
-  bool operator()(const Source* lhs, const Source* rhs) const {
-    return lhs->getName().compare(rhs->getName()) < 0;
-  }
-};
-
-typedef map<const Source*,
-            map<int, pair<const Function*, int>>, SourceCompare> SlotMap;
-
-const Function* getFunction(const Source* source, const int slot) {
-  const llvm::Module* module = &source->getModule();
+const Function* getInvokeFunction(const Source& source, const int slot,
+                                  Module* newModule) {
+  Module* module = const_cast<Module*>(&source.getModule());
   bcinfo::MetadataExtractor metadata(module);
   if (!metadata.extract()) {
     return nullptr;
   }
+  const char* functionName = metadata.getExportFuncNameList()[slot];
+  Function* func = newModule->getFunction(functionName);
+  // Materialize the function so that later the caller can inspect its argument
+  // and return types.
+  newModule->materialize(func);
+  return func;
+}
+
+const Function*
+getFunction(Module* mergedModule, const Source* source, const int slot,
+            uint32_t* signature) {
+  bcinfo::MetadataExtractor metadata(&source->getModule());
+  metadata.extract();
+
   const char* functionName = metadata.getExportForEachNameList()[slot];
-  return module->getFunction(functionName);
-}
-
-llvm::Type* getArgType(const Source* source, const int slot) {
-  const Function* func = getFunction(source, slot);
-  if (func == nullptr) {
+  if (functionName == nullptr) {
     return nullptr;
   }
-  auto argIter = func->getArgumentList().begin();
-  return argIter->getType();
-}
 
-llvm::Type* getReturnType(const Source* source, const int slot) {
-  const Function* func = getFunction(source, slot);
-  if (func == nullptr) {
+  if (metadata.getExportForEachInputCountList()[slot] > 1) {
+    // TODO: Handle multiple inputs.
+    ALOGW("Kernel %s has multiple inputs", functionName);
     return nullptr;
   }
-  return func->getReturnType();
+
+  if (signature != nullptr) {
+    *signature = metadata.getExportForEachSignatureList()[slot];
+  }
+
+  const Function* function = mergedModule->getFunction(functionName);
+
+  return function;
 }
 
-pair<const Function*, int> getFunction(
-    SlotMap& slotMap, llvm::Linker& linker, const Source* source,
-    const int slot) {
-  auto it1 = slotMap.find(source);
-  if (it1 == slotMap.end()) {
-    llvm::Module* module = (llvm::Module*)&source->getModule();
-    if (linker.linkInModule(module)) {
-      ALOGE("Linking for module in source %s failed.",
-            source->getName().c_str());
-      return std::make_pair(nullptr, 0);
-    }
-  }
-  auto &functions = slotMap[source];
+// TODO: Handle the context argument
+constexpr uint32_t ExpectedSignatureBits =
+        bcinfo::MD_SIG_In |
+        bcinfo::MD_SIG_Out |
+        bcinfo::MD_SIG_X |
+        bcinfo::MD_SIG_Y |
+        bcinfo::MD_SIG_Z |
+        bcinfo::MD_SIG_Kernel;
 
-  auto it2 = functions.find(slot);
-  if (it2 == functions.end()) {
+int getFusedFuncSig(const std::vector<Source*>& sources,
+                    const std::vector<int>& slots,
+                    uint32_t* retSig) {
+  *retSig = 0;
+  uint32_t firstSignature = 0;
+  uint32_t signature = 0;
+  auto slotIter = slots.begin();
+  for (const Source* source : sources) {
+    const int slot = *slotIter++;
     bcinfo::MetadataExtractor metadata(&source->getModule());
     metadata.extract();
-    const char* functionName = metadata.getExportForEachNameList()[slot];
-    if (functionName == nullptr) {
-      return std::make_pair(nullptr, 0);
-    }
 
     if (metadata.getExportForEachInputCountList()[slot] > 1) {
-      // TODO: Handle multiple inputs.
-      ALOGW("Kernel %s has multiple inputs", functionName);
-      return std::make_pair(nullptr, 0);
+      // TODO: Handle multiple inputs in kernel fusion.
+      ALOGW("Kernel %d in source %p has multiple inputs", slot, source);
+      return -1;
     }
 
-    const uint32_t signature = metadata.getExportForEachSignatureList()[slot];
-    int dim = 0;
-    if (metadata.hasForEachSignatureX(signature)) {
-      dim++;
-    }
-    if (metadata.hasForEachSignatureY(signature)) {
-      dim++;
+    signature = metadata.getExportForEachSignatureList()[slot];
+    if (signature & ~ExpectedSignatureBits) {
+      ALOGW("Unexpected signature %x seen while fusing kernels", signature);
+      return -1;
     }
 
-    const Function* function = linker.getModule()->getFunction(functionName);
-    it2 = functions.emplace(slot, std::make_pair(function, dim)).first;
+    if (firstSignature == 0) {
+      firstSignature = signature;
+    }
+
+    *retSig |= signature;
   }
-  return it2->second;
+
+  if (!bcinfo::MetadataExtractor::hasForEachSignatureIn(firstSignature)) {
+    *retSig &= ~bcinfo::MD_SIG_In;
+  }
+
+  if (!bcinfo::MetadataExtractor::hasForEachSignatureOut(signature)) {
+    *retSig &= ~bcinfo::MD_SIG_Out;
+  }
+
+  return 0;
+}
+
+// Builds the type of the fused kernel: the first kernel's input type (when the
+// fused signature has an input), one i32 per x/y/z coordinate in the fused
+// signature, and the last kernel's return type. Stores the fused signature in
+// *signature. Returns nullptr if the kernels cannot be fused or looked up in M.
+llvm::FunctionType* getFusedFuncType(bcc::BCCContext& Context,
+                                     const std::vector<Source*>& sources,
+                                     const std::vector<int>& slots,
+                                     Module* M,
+                                     uint32_t* signature) {
+  // front()/back() below are undefined on empty vectors.
+  if (sources.empty() || slots.empty() ||
+      getFusedFuncSig(sources, slots, signature) < 0) {
+    return nullptr;
+  }
+
+  // Fail instead of asserting, so a missing kernel is never dereferenced.
+  const Function* firstF = getFunction(M, sources.front(), slots.front(), nullptr);
+  const Function* lastF = getFunction(M, sources.back(), slots.back(), nullptr);
+  if (firstF == nullptr || lastF == nullptr) {
+    return nullptr;
+  }
+
+  llvm::SmallVector<llvm::Type*, 8> ArgTys;
+  if (bcinfo::MetadataExtractor::hasForEachSignatureIn(*signature)) {
+    ArgTys.push_back(firstF->arg_begin()->getType());
+  }
+
+  llvm::Type* I32Ty = llvm::IntegerType::get(Context.getLLVMContext(), 32);
+  if (bcinfo::MetadataExtractor::hasForEachSignatureX(*signature)) {
+    ArgTys.push_back(I32Ty);
+  }
+  if (bcinfo::MetadataExtractor::hasForEachSignatureY(*signature)) {
+    ArgTys.push_back(I32Ty);
+  }
+  if (bcinfo::MetadataExtractor::hasForEachSignatureZ(*signature)) {
+    ArgTys.push_back(I32Ty);
+  }
+
+  return llvm::FunctionType::get(lastF->getReturnType(), ArgTys, false);
+}
 
 }  // anonymous namespace
 
-llvm::Module*
-fuseKernels(bcc::BCCContext& Context,
-            const std::vector<const Source *>& sources,
-            const std::vector<int>& slots) {
-  bccAssert(sources.size() > 1 && "Need at least two kernels for kernel merging");
+bool fuseKernels(bcc::BCCContext& Context,
+                 const std::vector<Source *>& sources,
+                 const std::vector<int>& slots,
+                 const std::string& fusedName,
+                 Module* mergedModule) {
   bccAssert(sources.size() == slots.size() && "sources and slots differ in size");
 
-  llvm::LLVMContext& context = Context.getLLVMContext();
-  std::unique_ptr<llvm::Module> module(
-      new llvm::Module("Merged ScriptGroup", context));
-  if (module == nullptr) {
-    ALOGE("out of memory while creating module for fused kernels");
-    return nullptr;
-  }
-  llvm::Linker linker(module.get());
-  SlotMap slotMap;
+  uint32_t signature;
 
-  llvm::Type* inputType = getArgType(sources.front(), slots.front());
-  if (inputType == nullptr) {
-    return nullptr;
+  llvm::FunctionType* fusedType =
+          getFusedFuncType(Context, sources, slots, mergedModule, &signature);
+
+  if (fusedType == nullptr) {
+    return false;
   }
-  llvm::Type* returnType = getReturnType(sources.back(), slots.back());
-  if (returnType == nullptr) {
-    return nullptr;
-  }
-  llvm::Type* I32Ty = llvm::IntegerType::get(context, 32);
+
   Function* fusedKernel =
-      (Function*)(module->getOrInsertFunction(
-          "__rs_fused_kernels", returnType, inputType, I32Ty, I32Ty, nullptr));
+          (Function*)(mergedModule->getOrInsertFunction(fusedName, fusedType));
 
-  llvm::BasicBlock* block = llvm::BasicBlock::Create(context, "entry",
-                                                     fusedKernel);
+  llvm::LLVMContext& ctxt = Context.getLLVMContext();
+
+  llvm::BasicBlock* block = llvm::BasicBlock::Create(ctxt, "entry", fusedKernel);
   llvm::IRBuilder<> builder(block);
 
   Function::arg_iterator argIter = fusedKernel->arg_begin();
-  llvm::Value* dataElement = argIter++;
-  dataElement->setName("DataIn");
-  llvm::Value* X = argIter++;
-  X->setName("x");
-  llvm::Value* Y = argIter++;
-  Y->setName("y");
+
+  llvm::Value* dataElement = nullptr;
+  if (bcinfo::MetadataExtractor::hasForEachSignatureIn(signature)) {
+    dataElement = argIter++;
+    dataElement->setName("DataIn");
+  }
+
+  llvm::Value* X = nullptr;
+  if (bcinfo::MetadataExtractor::hasForEachSignatureX(signature)) {
+      X = argIter++;
+      X->setName("x");
+  }
+
+  llvm::Value* Y = nullptr;
+  if (bcinfo::MetadataExtractor::hasForEachSignatureY(signature)) {
+      Y = argIter++;
+      Y->setName("y");
+  }
+
+  llvm::Value* Z = nullptr;
+  if (bcinfo::MetadataExtractor::hasForEachSignatureZ(signature)) {
+      Z = argIter++;
+      Z->setName("z");
+  }
 
   auto slotIter = slots.begin();
   for (const Source* source : sources) {
     int slot = *slotIter++;
 
-    const auto& p = getFunction(slotMap, linker, source, slot);
-    const Function* function = p.first;
+    uint32_t signature;
+    const Function* function = getFunction(mergedModule, source, slot, &signature);
+
     if (function == nullptr) {
-      return nullptr;
+      return false;
     }
-    const int dim = p.second;
 
     std::vector<llvm::Value*> args;
-    args.push_back(dataElement);
-    if (dim > 0) {
+    if (dataElement != nullptr) {
+      args.push_back(dataElement);
+    }
+
+    // TODO: Handle the context argument
+
+    if (bcinfo::MetadataExtractor::hasForEachSignatureX(signature)) {
       args.push_back(X);
-      if (dim > 1) {
-        args.push_back(Y);
-      }
+    }
+
+    if (bcinfo::MetadataExtractor::hasForEachSignatureY(signature)) {
+      args.push_back(Y);
+    }
+
+    if (bcinfo::MetadataExtractor::hasForEachSignatureZ(signature)) {
+      args.push_back(Z);
     }
 
     dataElement = builder.CreateCall((llvm::Value*)function, args);
   }
 
-  builder.CreateRet(dataElement);
+  if (fusedKernel->getReturnType()->isVoidTy()) {
+    builder.CreateRetVoid();
+  } else {
+    builder.CreateRet(dataElement);
+  }
 
-  bcc::RSMetadata metadata(*module);
-  metadata.deleteAll();
-  metadata.markForEachFunction(*fusedKernel, bcinfo::MD_SIG_Kernel
-                               | bcinfo::MD_SIG_In
-                               | bcinfo::MD_SIG_Out
-                               | bcinfo::MD_SIG_X
-                               | bcinfo::MD_SIG_Y);
+  llvm::NamedMDNode* ExportForEachNameMD =
+    mergedModule->getOrInsertNamedMetadata("#rs_export_foreach_name");
 
-  return module.release();
+  llvm::MDString* nameMDStr = llvm::MDString::get(ctxt, fusedName);
+  llvm::MDNode* nameMDNode = llvm::MDNode::get(ctxt, nameMDStr);
+  ExportForEachNameMD->addOperand(nameMDNode);
+
+  llvm::NamedMDNode* ExportForEachMD =
+    mergedModule->getOrInsertNamedMetadata("#rs_export_foreach");
+  llvm::MDString* sigMDStr = llvm::MDString::get(ctxt,
+                                                 llvm::utostr_32(signature));
+  llvm::MDNode* sigMDNode = llvm::MDNode::get(ctxt, sigMDStr);
+  ExportForEachMD->addOperand(sigMDNode);
+
+  return true;
+}
+
+// Adds to module a function named newName that forwards to the invoke function
+// in the given slot of source, and lists newName under "#rs_export_func".
+// Returns false if getInvokeFunction() finds no such function in module.
+bool renameInvoke(BCCContext& Context, const Source* source, const int slot,
+                  const std::string& newName, Module* module) {
+  const llvm::Function* F = getInvokeFunction(*source, slot, module);
+  if (F == nullptr) {
+    ALOGE("Cannot find the invoke function in slot %d", slot);
+    return false;
+  }
+
+  // The wrapper has the same type as the function it forwards to.
+  llvm::Function* newF = llvm::Function::Create(
+          F->getFunctionType(), llvm::GlobalValue::ExternalLinkage, newName,
+          module);
+  llvm::BasicBlock* block = llvm::BasicBlock::Create(Context.getLLVMContext(),
+                                                     "entry", newF);
+  llvm::IRBuilder<> builder(block);
+
+  // Forward every argument rather than assuming there is exactly one.
+  std::vector<llvm::Value*> args;
+  for (auto I = newF->arg_begin(), E = newF->arg_end(); I != E; ++I) {
+    args.push_back(&*I);
+  }
+  builder.CreateCall(const_cast<llvm::Function*>(F), args);
+  builder.CreateRetVoid();
+
+  llvm::NamedMDNode* ExportFuncNameMD =
+          module->getOrInsertNamedMetadata("#rs_export_func");
+  llvm::MDString* strMD = llvm::MDString::get(module->getContext(), newName);
+  llvm::MDNode* nodeMD = llvm::MDNode::get(module->getContext(), strMD);
+  ExportFuncNameMD->addOperand(nodeMD);
+
+  return true;
 }
 
 }  // namespace bcc
diff --git a/tools/bcc/Main.cpp b/tools/bcc/Main.cpp
index fcc44f2..6e2c8a4 100644
--- a/tools/bcc/Main.cpp
+++ b/tools/bcc/Main.cpp
@@ -14,6 +14,10 @@
  * limitations under the License.
  */
 
+#include <iostream>
+#include <list>
+#include <map>
+#include <sstream>
 #include <string>
 #include <vector>
 
@@ -56,9 +60,14 @@
 OptInputFilenames(llvm::cl::Positional, llvm::cl::OneOrMore,
                   llvm::cl::desc("<input bitcode files>"));
 
-llvm::cl::list<int>
-OptKernelSlots("k", llvm::cl::ZeroOrMore,
-               llvm::cl::desc("kernel function slot numbers"));
+llvm::cl::list<std::string>
+OptMergePlans("merge", llvm::cl::ZeroOrMore,
+               llvm::cl::desc("Lists of kernels to merge (as source-and-slot "
+                              "pairs) and names for the final merged kernels"));
+
+llvm::cl::list<std::string>
+OptInvokes("invoke", llvm::cl::ZeroOrMore,
+           llvm::cl::desc("Invocable functions"));
 
 llvm::cl::opt<std::string>
 OptOutputFilename("o", llvm::cl::desc("Specify the output filename"),
@@ -131,35 +140,134 @@
   return;
 }
 
-bool fuseKernels(BCCContext& Context, RSCompilerDriver& RSCD) {
-  if (OptInputFilenames.size() != OptKernelSlots.size()) {
-    llvm::errs() << "Mismatching number of input files and kernel slots.\n";
-    return false;
+// Stores the value of str in *index and returns true if str consists solely of
+// one to nine decimal digits (a range that always fits in an int). Otherwise
+// returns false without calling std::stoi(), which throws on malformed input.
+static bool parseIndex(const std::string& str, int* index) {
+  if (str.empty() || str.size() > 9 ||
+      str.find_first_not_of("0123456789") != std::string::npos) {
+    return false;
+  }
+  *index = std::stoi(str);
+  return true;
+}
+
+// Returns true if every source index listed in plans is below numSources.
+static bool sourcesInRange(
+    const std::list<std::list<std::pair<int, int>>>& plans, size_t numSources) {
+  for (const auto& plan : plans) {
+    for (const auto& sourceAndSlot : plan) {
+      // A negative index converts to a huge value and is rejected as well.
+      if (static_cast<size_t>(sourceAndSlot.first) >= numSources) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+// Parses options of the form "<name>:<source>,<slot>[.<source>,<slot>]...".
+//
+// For each option, appends <name> to batchNames and its list of (source, slot)
+// pairs to sourcesAndSlots. Returns false, after reporting the offending
+// option, if an option does not have that form or lists no pair at all.
+bool extractSourcesAndSlots(const llvm::cl::list<std::string>& optList,
+                            std::list<std::string>* batchNames,
+                            std::list<std::list<std::pair<int, int>>>* sourcesAndSlots) {
+  for (unsigned i = 0; i < optList.size(); ++i) {
+    const std::string& plan = optList[i];
+    // Keep the position as size_type: narrowing it to unsigned loses npos.
+    const std::string::size_type colon = plan.find(':');
+    if (colon == std::string::npos || colon == 0) {
+      llvm::errs() << "Malformed option '" << plan
+                   << "': expected <name>:<source>,<slot>\n";
+      return false;
+    }
+
+    const std::string name = plan.substr(0, colon);
+    std::cerr << "new kernel name: " << name << std::endl;
+
+    std::istringstream iss(plan.substr(colon + 1));
+    std::string s;
+    std::list<std::pair<int, int>> planList;
+    while (getline(iss, s, '.')) {
+      const std::string::size_type comma = s.find(',');
+      if (comma == std::string::npos) {
+        llvm::errs() << "Malformed option '" << plan
+                     << "': no ',' in '" << s << "'\n";
+        return false;
+      }
+
+      const std::string sourceStr = s.substr(0, comma);
+      const std::string slotStr = s.substr(comma + 1);
+      std::cerr << "source " << sourceStr << ", slot " << slotStr << std::endl;
+
+      int source = 0;
+      int slot = 0;
+      if (!parseIndex(sourceStr, &source) || !parseIndex(slotStr, &slot)) {
+        llvm::errs() << "Malformed option '" << plan
+                     << "': '" << s << "' is not <source>,<slot>\n";
+        return false;
+      }
+      planList.push_back(std::make_pair(source, slot));
+    }
+
+    // An empty list would later be read with front() by the fusion code.
+    if (planList.empty()) {
+      llvm::errs() << "Malformed option '" << plan
+                   << "': no <source>,<slot> pair\n";
+      return false;
+    }
+
+    batchNames->push_back(name);
+    sourcesAndSlots->push_back(planList);
   }
+  return true;
+}
 
-  std::vector<const bcc::Source*> sources;
-  std::vector<int> slots;
-
+// Loads every input file, parses the -merge and -invoke options and passes
+// them to RSCompilerDriver::buildScriptGroup(). Returns false on failure.
+bool compileScriptGroup(BCCContext& Context, RSCompilerDriver& RSCD) {
+  std::vector<bcc::Source*> sources;
   for (unsigned i = 0; i < OptInputFilenames.size(); ++i) {
-    const bcc::Source* source =
+    bcc::Source* source =
         bcc::Source::CreateFromFile(Context, OptInputFilenames[i]);
     if (!source) {
       llvm::errs() << "Error loading file '" << OptInputFilenames[i]<< "'\n";
       return false;
     }
-    int slot = OptKernelSlots[i];
-
     sources.push_back(source);
-    slots.push_back(slot);
   }
 
+  std::list<std::string> fusedKernelNames;
+  std::list<std::list<std::pair<int, int>>> sourcesAndSlots;
+  if (!extractSourcesAndSlots(OptMergePlans, &fusedKernelNames,
+                              &sourcesAndSlots)) {
+    return false;
+  }
+
+  std::list<std::string> invokeBatchNames;
+  std::list<std::list<std::pair<int, int>>> invokeSourcesAndSlots;
+  if (!extractSourcesAndSlots(OptInvokes, &invokeBatchNames,
+                              &invokeSourcesAndSlots)) {
+    return false;
+  }
+
+  // buildScriptGroup() uses these indices to subscript sources unchecked.
+  if (!sourcesInRange(sourcesAndSlots, sources.size()) ||
+      !sourcesInRange(invokeSourcesAndSlots, sources.size())) {
+    llvm::errs() << "Source index out of range: only " << sources.size()
+                 << " input file(s) given.\n";
+    return false;
+  }
+
   std::string outputFilepath(OptOutputPath);
   outputFilepath.append("/");
   outputFilepath.append(OptOutputFilename);
 
   bool success = RSCD.buildScriptGroup(
-      Context, outputFilepath.c_str(), OptBCLibFilename.c_str(), sources,
-      slots, true);
+    Context, outputFilepath.c_str(), OptBCLibFilename.c_str(), true,
+    sources, sourcesAndSlots, fusedKernelNames,
+    invokeSourcesAndSlots, invokeBatchNames);
 
   return success;
 }
@@ -266,11 +374,13 @@
     rscdi(&RSCD);
   }
 
-  if (OptInputFilenames.size() > 1) {
-    bool success = fuseKernels(context, RSCD);
+  if (OptMergePlans.size() > 0) {
+    bool success = compileScriptGroup(context, RSCD);
+
     if (!success) {
       return EXIT_FAILURE;
     }
+
     return EXIT_SUCCESS;
   }
 
@@ -294,8 +404,7 @@
     if (!built) {
       return EXIT_FAILURE;
     }
-  }
-  else {
+  } else {
     // embedRSInfo is set.  Use buildForCompatLib to embed RS symbol information
     // into the .rs.info symbol.
     Source *source = Source::CreateFromBuffer(context, OptInputFilenames[0].c_str(),
